In [97]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.metrics import make_scorer
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import PowerTransformer, RobustScaler
import xgboost as xgb
import joblib
import os

In [98]:
# Directory that holds the challenge CSV files. It can be overridden with the
# QUANTCHALLENGE_DATA_DIR environment variable; the default is the location this
# notebook was originally written against. Resolved inside this cell so the cell
# does not rely on configuration defined anywhere else.
DATA_DIR = os.environ.get(
    'QUANTCHALLENGE_DATA_DIR',
    'C:\\Users\\sword\\OneDrive - The University of Melbourne\\Desktop\\Quantchallenge\\Quantchallenge_25\\Data',
)

# The training columns are read from two files and placed side by side.
df1 = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
df2 = pd.read_csv(os.path.join(DATA_DIR, 'train_new.csv'))

# pd.concat(axis=1) aligns on the index and pads the shorter frame with NaN, so a
# row-count mismatch would silently produce misaligned training rows.
if len(df1) != len(df2):
    raise ValueError(
        f"train.csv has {len(df1)} rows but train_new.csv has {len(df2)}; "
        "cannot join them column-wise."
    )
merged = pd.concat([df1, df2], axis = 1)

# Canonical column order: timestamp, features A..P, then the two targets.
new_order = ['time','A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Y1', 'Y2']
df_train = merged[new_order]

In [99]:
# Directory that holds the challenge CSV files. It can be overridden with the
# QUANTCHALLENGE_DATA_DIR environment variable; the default is the location this
# notebook was originally written against. Resolved inside this cell so the cell
# does not rely on configuration defined anywhere else.
DATA_DIR = os.environ.get(
    'QUANTCHALLENGE_DATA_DIR',
    'C:\\Users\\sword\\OneDrive - The University of Melbourne\\Desktop\\Quantchallenge\\Quantchallenge_25\\Data',
)

# The test columns are read from two files and placed side by side.
df1_test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))
df2_test = pd.read_csv(os.path.join(DATA_DIR, 'test_new.csv'))

# pd.concat(axis=1) aligns on the index and pads the shorter frame with NaN, so a
# row-count mismatch would silently produce misaligned test rows.
if len(df1_test) != len(df2_test):
    raise ValueError(
        f"test.csv has {len(df1_test)} rows but test_new.csv has {len(df2_test)}; "
        "cannot join them column-wise."
    )
merged_test = pd.concat([df1_test, df2_test], axis = 1)

# Same column order as the training frame, minus the targets.
new_order_test = ['time','A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P']
df_test = merged_test[new_order_test]

In [100]:
# Two disjoint feature groups; each group is fed to its own model further down.
g1_cols = list('ABDFIKLO')
g2_cols = list('CEGHJMNP')

# Every feature column, 'A' through 'P', in alphabetical order.
FEATURE_COLS = [chr(code) for code in range(ord('A'), ord('P') + 1)]

In [101]:
# Scaler applied to the g2 feature group (fitted on the training frame, reused on test).
robust_scaler = RobustScaler()

# Yeo-Johnson power transform with standardized output, applied to the g1 feature
# group (fitted on the training frame, reused on test).
pt = PowerTransformer(standardize=True, method='yeo-johnson')


In [102]:
# Rebuild the frame from the untouched `merged` data instead of mutating df_train
# in place. Re-running this cell therefore neither fails on the already-dropped
# 'time' column nor applies the transforms to already-transformed values.
df_train = (
    merged[new_order]
    .drop(columns=['time'])
    .interpolate(method='linear')
)
# NOTE(review): the interpolation runs over every column, including the targets
# Y1/Y2 — confirm that filling missing targets this way is intended.

# Binarize P at 0.5. P is also part of g2_cols, so it is scaled again below.
df_train['P'] = (df_train['P'] > 0.5).astype(int)

# Fit the transformers on the training data; the same fitted objects are reused
# for the test frame.
df_train[g1_cols] = pt.fit_transform(df_train[g1_cols])
df_train[g2_cols] = robust_scaler.fit_transform(df_train[g2_cols])

In [103]:
def calculate_r2(y_true, y_pred):
    """Coefficient of determination (R^2) for a single target.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values, same length as ``y_true``.

    Returns
    -------
    float
        ``1 - SS_res / SS_tot``. When ``y_true`` is constant (``SS_tot == 0``)
        the ratio is undefined, so 1.0 is returned if the predictions match
        the observations and 0.0 otherwise.
    """
    # Coerce to float arrays so plain lists/tuples work and pandas objects are
    # compared positionally rather than by index label.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    ss_res = np.sum((y_true - y_pred) ** 2)
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
    if ss_tot == 0:
        return 1.0 if np.allclose(y_true, y_pred) else 0.0
    return 1 - (ss_res / ss_tot)

def mean_r2_multi(y_true, y_pred):
    """Average the per-target R^2 over all target columns.

    Parameters
    ----------
    y_true : array-like, shape (n_samples, n_targets) or (n_samples,)
        Observed values. A 1-D input is treated as a single target.
    y_pred : array-like, same shape as ``y_true``
        Predicted values.

    Returns
    -------
    float
        Unweighted mean of ``calculate_r2`` evaluated column by column.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` do not have the same shape.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Promote a single 1-D target to one column instead of failing on shape[1].
    if y_true.ndim == 1:
        y_true = y_true.reshape(-1, 1)
    if y_pred.ndim == 1:
        y_pred = y_pred.reshape(-1, 1)

    # Mismatched shapes would otherwise be scored on a silent subset of columns
    # or broadcast into a meaningless result.
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {y_true.shape} and {y_pred.shape}"
        )

    per_target = [
        calculate_r2(y_true[:, t], y_pred[:, t])
        for t in range(y_true.shape[1])
    ]
    return float(np.mean(per_target))

# Scorer for model selection: mean per-target R^2, higher is better.
# The metric function is passed directly rather than through a lambda: the
# wrapper added nothing, and a lambda cannot be serialized by standard pickle,
# which would make any object holding this scorer impossible to dump.
multi_r2_scorer = make_scorer(mean_r2_multi, greater_is_better=True)

In [104]:
# Rebuild the frame from the untouched `merged_test` data instead of mutating
# df_test in place. Re-running this cell therefore neither fails on the
# already-dropped 'time' column nor applies the transforms a second time.
df_test = (
    merged_test[new_order_test]
    .drop(columns=['time'])
    .interpolate(method='linear')
)

# Binarize P at 0.5, as done for the training frame. P is also part of g2_cols,
# so it is scaled again below.
df_test['P'] = (df_test['P'] > 0.5).astype(int)

# Apply the transformers fitted on the training data (transform only, no refit).
df_test[g1_cols] = pt.transform(df_test[g1_cols])
df_test[g2_cols] = robust_scaler.transform(df_test[g2_cols])

In [105]:
# Feature matrices for the two per-group models and the two-column target matrix.
X_g1 = df_train[g1_cols].values
X_g2 = df_train[g2_cols].values
y = np.column_stack([df_train['Y1'].values, df_train['Y2'].values])

X_test_g1 = df_test[g1_cols].values
X_test_g2 = df_test[g2_cols].values

# Row identifiers for the submission file.
# df_test only ever contains the columns listed in new_order_test, which do not
# include 'id', so checking df_test alone could never find it. Fall back to the
# merged test frame, which still carries every column read from disk.
if 'id' in df_test.columns:
    test_ids = df_test['id'].astype(str).values
elif 'id' in merged_test.columns:
    id_values = merged_test.loc[:, 'id']
    if isinstance(id_values, pd.DataFrame):
        # More than one source file supplied an 'id' column; use the first one.
        id_values = id_values.iloc[:, 0]
    test_ids = id_values.astype(str).values
else:
    # No identifier column available: use the positional row index.
    test_ids = df_test.index.astype(str).values

In [106]:
# Hold out 15% of the rows for validation. Both feature groups and the targets
# are split in a single call so the rows stay aligned.
Xg1_tr, Xg1_val, Xg2_tr, Xg2_val, y_tr, y_val = train_test_split(
    X_g1, X_g2, y, test_size=0.15, random_state=42
)

# One XGBoost regressor per target, wrapped so both targets are fitted together.
base_xgb = xgb.XGBRegressor(objective='reg:squarederror', random_state=42, n_jobs=1, verbosity=0)
multi_base = MultiOutputRegressor(base_xgb, n_jobs=-1)

# 2^5 = 32 hyper-parameter combinations; the same setting is applied to both targets.
param_grid = {
    'estimator__n_estimators': [100, 300],
    'estimator__max_depth': [3, 5],
    'estimator__learning_rate': [0.05, 0.1],
    'estimator__subsample': [0.7, 1.0],
    'estimator__colsample_bytree': [0.7, 1.0],
}

cv = KFold(n_splits=4, shuffle=True, random_state=42)


def _make_grid_search(estimator):
    """Wrap `estimator` in the grid-search configuration shared by both feature groups."""
    return GridSearchCV(
        estimator=estimator,
        param_grid=param_grid,
        scoring=multi_r2_scorer,
        cv=cv,
        n_jobs=-1,
        verbose=2,
        refit=True,
    )


# Model G1: trained on the g1 feature group.
grid = _make_grid_search(multi_base)

print("Fitting Model G1 (g1_cols) ...")
grid.fit(Xg1_tr, y_tr)
best_model_g1 = grid.best_estimator_
print("G1 best params:", grid.best_params_)

# Model G2: trained on the g2 feature group, with its own fresh estimator.
print("Fitting Model G2 (g2_cols) ...")
grid2 = _make_grid_search(
    MultiOutputRegressor(
        xgb.XGBRegressor(objective='reg:squarederror', random_state=42, n_jobs=1, verbosity=0),
        n_jobs=-1,
    )
)
grid2.fit(Xg2_tr, y_tr)
best_model_g2 = grid2.best_estimator_
print("G2 best params:", grid2.best_params_)

Fitting Model G1 (g1_cols) ...
Fitting 4 folds for each of 32 candidates, totalling 128 fits
G1 best params: {'estimator__colsample_bytree': 0.7, 'estimator__learning_rate': 0.05, 'estimator__max_depth': 5, 'estimator__n_estimators': 100, 'estimator__subsample': 1.0}
Fitting Model G2 (g2_cols) ...
Fitting 4 folds for each of 32 candidates, totalling 128 fits
G2 best params: {'estimator__colsample_bytree': 0.7, 'estimator__learning_rate': 0.05, 'estimator__max_depth': 3, 'estimator__n_estimators': 300, 'estimator__subsample': 1.0}


In [107]:
# Score both tuned models on the held-out validation rows.
y_val_pred_g1 = best_model_g1.predict(Xg1_val)
y_val_pred_g2 = best_model_g2.predict(Xg2_val)

# Per-target R2 (column 0 is Y1, column 1 is Y2) and their mean, for each model.
r2_y1_g1, r2_y2_g1 = (calculate_r2(y_val[:, t], y_val_pred_g1[:, t]) for t in (0, 1))
mean_r2_g1 = np.mean([r2_y1_g1, r2_y2_g1])

r2_y1_g2, r2_y2_g2 = (calculate_r2(y_val[:, t], y_val_pred_g2[:, t]) for t in (0, 1))
mean_r2_g2 = np.mean([r2_y1_g2, r2_y2_g2])

print(f"G1 validation R2s: Y1={r2_y1_g1:.5f}, Y2={r2_y2_g1:.5f}, mean={mean_r2_g1:.5f}")
print(f"G2 validation R2s: Y1={r2_y1_g2:.5f}, Y2={r2_y2_g2:.5f}, mean={mean_r2_g2:.5f}")

# Ensemble weights are proportional to each model's mean validation R2, with
# negative scores clipped to zero; if both are zero the models are weighted equally.
# NOTE(review): one scalar weight per model is shared by both targets even though
# the per-target R2s can differ a lot between models — consider per-target weights.
score_g1 = max(mean_r2_g1, 0.0)
score_g2 = max(mean_r2_g2, 0.0)
total_score = score_g1 + score_g2
if total_score == 0:
    w1, w2 = 0.5, 0.5
else:
    w1 = score_g1 / total_score
    w2 = score_g2 / total_score

print(f"Ensemble weights -> G1: {w1:.3f}, G2: {w2:.3f}")

G1 validation R2s: Y1=0.04522, Y2=0.69598, mean=0.37060
G2 validation R2s: Y1=0.77454, Y2=0.38238, mean=0.57846
Ensemble weights -> G1: 0.390, G2: 0.610


In [108]:
# Blend the two models' test predictions with the validation-derived weights.
pred_g1_test = best_model_g1.predict(X_test_g1)
pred_g2_test = best_model_g2.predict(X_test_g2)
y_test_ensemble = w1 * pred_g1_test + w2 * pred_g2_test

# Safety net: replace any NaN prediction with the mean of the matching training target.
nan_rows, nan_cols = np.nonzero(np.isnan(y_test_ensemble))
if nan_rows.size > 0:
    print("Warning: NaNs found in ensemble predictions; filling with train target means.")
    y_mean = np.nanmean(y, axis=0)
    # Each NaN cell receives the mean of its own target column; filled in place.
    y_test_ensemble[nan_rows, nan_cols] = y_mean[nan_cols]

In [109]:
# Submission frame: one row per test id, with the blended predictions for both targets.
out_df = pd.DataFrame({'id': test_ids}).assign(
    Y1=y_test_ensemble[:, 0],
    Y2=y_test_ensemble[:, 1],
)

# Refuse to continue if any prediction is missing.
has_missing_prediction = out_df[['Y1', 'Y2']].isna().to_numpy().any()
assert not has_missing_prediction, "Predictions contain NaNs!"

In [110]:
# Write the submission file to the working directory, 8 decimal places per value.
output_path = 'predictions_ensemble_g1g2.csv'
out_df.to_csv(output_path, index=False, float_format='%.8f')

# Report the size of the file on disk as a quick sanity check.
size_mb = os.path.getsize(output_path) / 1024**2
print(f"Wrote predictions to {output_path} (size MB: {size_mb:.3f})")

Wrote predictions to predictions_ensemble_g1g2.csv (size MB: 0.452)


In [111]:
# Final summary: validation score of each model and the weights used for the blend.
print("Done.")
print(f"Validation mean R2s: G1= {mean_r2_g1!s}  G2= {mean_r2_g2!s}")
print(f"Ensemble final weights: {(w1, w2)!s}")

Done.
Validation mean R2s: G1= 0.3706000110695048  G2= 0.5784620931490871
Ensemble final weights: (np.float64(0.39049079024669037), np.float64(0.6095092097533096))
