In [2]:
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import HistGradientBoostingRegressor

In [3]:
# Read the base test and train splits from disk into DataFrames.
# NOTE(review): both paths are absolute and specific to one Windows user profile;
# the notebook will not run elsewhere until they point at a shared data location.
test_data, train_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        r"C:\Users\leonh\Downloads\test.csv",
        r"C:\Users\leonh\Downloads\train.csv",
    )
)

In [4]:
# Read the supplementary "*_new" files; later cells take the O and P columns from them.
# NOTE(review): absolute, user-specific paths — these only resolve on the author's machine.
test_new, train_new = (
    pd.read_csv(csv_path)
    for csv_path in (
        r"C:\Users\leonh\Downloads\test_new.csv",
        r"C:\Users\leonh\Downloads\train_new.csv",
    )
)

In [5]:
# Work on copies so the frames loaded from disk are never modified
_train = train_data.copy()
_test = test_data.copy()

# Join O and P by index, adding only the columns that are actually absent.
# Joining both when exactly one is already present makes DataFrame.join raise
# ValueError on the overlapping column name.
# NOTE(review): the index join presumes train_new/test_new rows are in the same
# order as train_data/test_data — confirm against how the files were produced.
to_add_train = [c for c in ('O', 'P') if c not in _train.columns]
if to_add_train:
    _train = _train.join(train_new[to_add_train])
to_add_test = [c for c in ('O', 'P') if c not in _test.columns]
if to_add_test:
    _test = _test.join(test_new[to_add_test])

# Quick checks: both columns must now be present in each split
missing_train = {'O','P'} - set(_train.columns)
missing_test = {'O','P'} - set(_test.columns)
assert not missing_train, f"Missing in train after join: {missing_train}"
assert not missing_test, f"Missing in test after join: {missing_test}"

# Convert time to numeric seconds since epoch if datetime-like or parseable strings
def ensure_time_numeric(df, col='time'):
    """Convert a datetime-like column to whole seconds since the Unix epoch.

    The column is replaced in place on ``df`` and ``df`` is also returned.

    Handled inputs:
      * datetime64 columns, tz-naive or tz-aware, at any time resolution;
      * object / string columns whose values all parse with ``pd.to_datetime``
        (tz-naive strings are interpreted as UTC, offsets are honoured).

    Anything else (missing column, already-numeric column, unparseable text)
    leaves ``df`` untouched. Unparseable text additionally emits a warning.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that may contain ``col``.
    col : str, default 'time'
        Name of the column to convert.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object. The converted column is int64 when every value
        is present; missing timestamps (NaT) become NaN, which makes the
        column float64.
    """
    import warnings

    if col not in df.columns:
        return df

    values = df[col]
    dtype = values.dtype

    # Covers classic object columns as well as pandas' dedicated string dtypes.
    is_textual = (
        pd.api.types.is_object_dtype(dtype)
        or pd.api.types.is_string_dtype(dtype)
    )
    if is_textual:
        try:
            # utc=True yields one tz-aware dtype even for mixed UTC offsets.
            values = pd.to_datetime(values, errors='raise', utc=True)
        except (ValueError, TypeError, OverflowError) as exc:
            # Best effort: keep the column as it is, but say so.
            warnings.warn(
                f"Column {col!r} could not be parsed as datetimes and was left unchanged: {exc}"
            )
            return df

    # is_datetime64_any_dtype accepts tz-aware dtypes, unlike np.issubdtype.
    if not pd.api.types.is_datetime64_any_dtype(values.dtype):
        return df

    if getattr(values.dtype, 'tz', None) is not None:
        # Express in UTC and drop the tz so the naive epoch below can be subtracted.
        values = values.dt.tz_convert(None)

    # Timedelta floor division is independent of the stored resolution and maps
    # NaT to NaN rather than to a huge negative integer.
    df[col] = (values - pd.Timestamp('1970-01-01')) // pd.Timedelta(seconds=1)
    return df

# Run the 'time' conversion helper on both splits (train first, then test)
_train, _test = (ensure_time_numeric(frame, 'time') for frame in (_train, _test))

# Targets and features: every train column that is neither a target nor 'id' is a feature
target_cols = ['Y1', 'Y2']
non_feature_cols = target_cols + ['id']
feature_cols = [name for name in _train.columns if name not in non_feature_cols]

y_train = _train[target_cols]
X_train = _train[feature_cols]
X_test = _test[feature_cols]  # the test split must expose the same feature columns

# Shape summary for a quick visual sanity check
print("Train shape:", X_train.shape, "| Targets:", y_train.shape)
print("Test shape:", X_test.shape)
print("Sample feature columns:", feature_cols[:8], "... total", len(feature_cols))


Train shape: (80000, 17) | Targets: (80000, 2)
Test shape: (15996, 17)
Sample feature columns: ['time', 'A', 'B', 'C', 'D', 'E', 'F', 'G'] ... total 17


In [7]:
# Preprocess: replace missing values in every feature column with that column's median
median_imputer = SimpleImputer(strategy="median")
preprocess = ColumnTransformer(
    [("num", median_imputer, feature_cols)],
    remainder="drop",
)

# Fast, robust multioutput regressor: a seeded gradient-boosting estimator
# wrapped so it can be fitted on the two-column target frame
base_regressor = HistGradientBoostingRegressor(random_state=42)
reg = MultiOutputRegressor(base_regressor)

# Imputation and regression chained into one estimator
model = Pipeline([
    ("preprocess", preprocess),
    ("regressor", reg),
])

model.fit(X_train, y_train)

In [8]:
# Predict on the test features; column 0 is written out as Y1 and column 1 as Y2
pred = model.predict(X_test)
pred_df = pd.DataFrame({"id": _test["id"].values}).assign(
    Y1=pred[:, 0],
    Y2=pred[:, 1],
)

# Save exactly as requested: id, Y1, Y2 with no index column
pred_df.to_csv("scikit_prediction.csv", index=False)

# Plain-text preview of the first ten predictions
preview = pred_df.head(10)
print(preview.to_string(index=False))


 id        Y1        Y2
  1  0.494914 -0.182835
  2 -0.271534 -0.330748
  3 -0.249876 -0.055190
  4 -0.405644  0.276639
  5 -0.903259  0.160204
  6  0.309278  0.142049
  7  1.181707  0.275246
  8 -0.160422 -0.119418
  9 -0.783138  0.003577
 10 -1.199036 -0.078520
