In [None]:
# ============================================================
# XGBoost + User Feature Cleaning (Direct Model)
# ============================================================

import os
# Set GPU device (change to "-1" if you don't have a GPU)
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

import numpy as np
import pandas as pd
import warnings

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

# Keep the training log readable by silencing library warnings.
warnings.filterwarnings('ignore')

# -----------------------------
# 0) Configuration
# -----------------------------
# Input location and the two column names the rest of the cell relies on.
DATA_DIR = "/rds/rds-lxu/ml_datasets/exam_score_predict"
TRAIN_PATH = DATA_DIR + "/train.csv"
TEST_PATH = DATA_DIR + "/test.csv"

ID_COL = "id"
TARGET = "exam_score"

# -----------------------------
# 1) Load Data
# -----------------------------
print("Loading data...")
train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)

# Regression target as a float array, plus the row identifiers of both splits.
y = train_df[TARGET].astype(float).to_numpy()
train_ids = train_df[ID_COL].to_numpy()
test_ids = test_df[ID_COL].to_numpy()

# Raw feature frames: everything except the identifier (and, for train, the target).
train_X0 = train_df.drop([TARGET, ID_COL], axis=1).copy()
test_X0 = test_df.drop([ID_COL], axis=1).copy()

# -----------------------------
# 2) APPLY USER FEATURE CLEANING
# -----------------------------
# Columns removed before modelling. The commented-out line is a lighter
# alternative that keeps course / gender / exam_difficulty.
cols_to_drop = ["age", "internet_access", "course", "gender", "exam_difficulty"]
# cols_to_drop = ["age", "internet_access"]

print("\n--- Applying Feature Cleaning ---")
# errors='ignore' lets the cell run even when a listed column is absent, so
# report what was really removed instead of echoing the request.
dropped_cols = [c for c in cols_to_drop if c in train_X0.columns]
missing_cols = [c for c in cols_to_drop if c not in train_X0.columns]

train_X = train_X0.drop(columns=cols_to_drop, errors='ignore')
test_X  = test_X0.drop(columns=cols_to_drop, errors='ignore')

print(f"Dropped: {dropped_cols}")
if missing_cols:
    print(f"Not present (skipped): {missing_cols}")

# -----------------------------
# 3) One-Hot Encoding
# -----------------------------
print("\nOne-Hot Encoding...")
# Encode train and test together so both end up with the same dummy columns.
all_X = pd.concat([train_X, test_X], axis=0, ignore_index=True)

obj_cols = all_X.select_dtypes(include=["object"]).columns
# fillna must run BEFORE astype(str): casting first turns NaN into the literal
# string "nan", after which there is nothing left for fillna to replace.
all_X[obj_cols] = all_X[obj_cols].fillna("missing").astype(str)
# Remaining (numeric) gaps are imputed with zero.
all_X = all_X.fillna(0.0)

all_X_enc = pd.get_dummies(all_X, columns=obj_cols, drop_first=False)

# Split back to Train/Test - No stacking features added here.
# Force a float matrix: get_dummies may emit boolean indicator columns, and a
# frame mixing bool with float columns can collapse to an object-dtype array
# when converted without an explicit dtype.
X_train = all_X_enc.iloc[:len(train_X)].to_numpy(dtype=float)
X_test  = all_X_enc.iloc[len(train_X):].to_numpy(dtype=float)

print(f"Final Input Dimension: {X_train.shape[1]}")

# -----------------------------
# 4) XGBoost CV Training (GPU)
# -----------------------------
# Hyper-parameters shared by every fold model; the final refit reuses them too.
xgb_params = {
    "n_estimators": 30000,
    "learning_rate": 0.02,
    "max_depth": 6,
    "min_child_weight": 3.0,
    "subsample": 0.85,
    "colsample_bytree": 0.85,
    "reg_lambda": 2.0,
    "reg_alpha": 0.1,
    "gamma": 0.0,
    "objective": "reg:squarederror",
    "eval_metric": "rmse",
    "tree_method": "hist",           # Required for GPU
    "device": "cuda:0",              # GPU Device
    "early_stopping_rounds": 200,
    "verbosity": 0,
}

print("\nStarting XGBoost CV Training (No Stacking)...")
N_SPLITS = 5
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

# Out-of-fold predictions for train rows, one row of test predictions per
# fold, and the tree count at which each fold stopped improving.
oof_xgb = np.zeros(len(X_train), dtype=float)
test_pred_folds = np.zeros((N_SPLITS, len(X_test)), dtype=float)
best_iters = []

for fold, (fit_rows, hold_rows) in enumerate(kf.split(X_train), start=1):
    X_fit, X_hold = X_train[fit_rows], X_train[hold_rows]
    y_fit, y_hold = y[fit_rows], y[hold_rows]

    # Each fold gets its own seed; the held-out part drives early stopping.
    model = XGBRegressor(random_state=42 + fold, **xgb_params)
    model.fit(X_fit, y_fit, eval_set=[(X_hold, y_hold)], verbose=500)

    oof_xgb[hold_rows] = model.predict(X_hold)
    test_pred_folds[fold - 1] = model.predict(X_test)

    # best_iteration is zero-based, so +1 turns it into a number of trees.
    best_iters.append(int(model.best_iteration) + 1)
    fold_mse = mean_squared_error(y_hold, oof_xgb[hold_rows])
    rmse_fold = float(np.sqrt(fold_mse))
    print(f"[Fold {fold}] best_iter={best_iters[-1]}  RMSE={rmse_fold:.6f}")

# Score over all out-of-fold predictions at once.
rmse_oof = float(np.sqrt(mean_squared_error(y, oof_xgb)))
print(f"\nOOF RMSE (Direct XGBoost): {rmse_oof:.6f}")

# -----------------------------
# 5) Final Fit and Submission
# -----------------------------
# Refit on every training row with a fixed tree budget: the (truncated) mean of
# the per-fold best iteration counts.
best_n = int(np.mean(best_iters))
print(f"Retraining on full dataset with {best_n} estimators...")

# Same settings as in CV, minus early stopping (there is no validation set
# here), with the tree count and verbosity overridden.
final_params = {
    key: value
    for key, value in xgb_params.items()
    if key != "early_stopping_rounds"
}
final_params.update(n_estimators=best_n, verbosity=1)

final_model = XGBRegressor(random_state=123, **final_params)
final_model.fit(X_train, y, verbose=False)

# Ensure predictions stay within the 0-100 score range.
test_pred = np.clip(final_model.predict(X_test), 0.0, 100.0)

out_path = f"{DATA_DIR}/submission_xgb_no_stacking.csv"
submission = pd.DataFrame({ID_COL: test_ids, TARGET: test_pred})
submission.to_csv(out_path, index=False)

print("\nWrote:", out_path)
print(submission.head())

Loading data...

--- Applying Feature Cleaning ---
Dropped: ['age', 'internet_access']

One-Hot Encoding...
Final Input Dimension: 27

Starting XGBoost CV Training (No Stacking)...
[0]	validation_0-rmse:18.59631
[500]	validation_0-rmse:8.78324
[1000]	validation_0-rmse:8.75860
[1500]	validation_0-rmse:8.74625
[2000]	validation_0-rmse:8.73984
[2500]	validation_0-rmse:8.73657
[3000]	validation_0-rmse:8.73489
[3500]	validation_0-rmse:8.73423
[3746]	validation_0-rmse:8.73446
[Fold 1] best_iter=3547  RMSE=8.734093
[0]	validation_0-rmse:18.64005
[500]	validation_0-rmse:8.79616
[1000]	validation_0-rmse:8.77037
[1500]	validation_0-rmse:8.75793
[2000]	validation_0-rmse:8.75066
[2500]	validation_0-rmse:8.74707
[3000]	validation_0-rmse:8.74523
[3500]	validation_0-rmse:8.74425
[4000]	validation_0-rmse:8.74364
[4196]	validation_0-rmse:8.74372
[Fold 2] best_iter=3997  RMSE=8.743622
[0]	validation_0-rmse:18.81782
[500]	validation_0-rmse:8.78591
[1000]	validation_0-rmse:8.75992
[1500]	validation_0-rmse

In [2]:
import os
import numpy as np
import pandas as pd
import xgboost as xgb
import warnings
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import RidgeCV
from sklearn.preprocessing import StandardScaler, TargetEncoder
from sklearn.pipeline import Pipeline

# Silence library warnings for the remainder of the session.
warnings.filterwarnings('ignore')
# Use GPU 0
# NOTE(review): the first cell of this notebook selects device "1", and this
# assignment happens after xgboost is imported - confirm the intended GPU is
# really picked up when both cells run in the same kernel.
os.environ.update(CUDA_VISIBLE_DEVICES="0")

# -----------------------------
# 0) Configuration
# -----------------------------
DATA_DIR = "/rds/rds-lxu/ml_datasets/exam_score_predict"
TRAIN_PATH, TEST_PATH = (f"{DATA_DIR}/{split}.csv" for split in ("train", "test"))
SEED = 1003      # seeds the fold split, the target encoder and XGBoost
N_SPLITS = 10    # 10-fold for maximum stability

# -----------------------------
# 1) Advanced Feature Engineering
# -----------------------------
def advanced_fe(df, course_means=None):
    """Return a copy of ``df`` extended with hand-crafted features.

    The input frame is never modified. A ``student_id`` column, if present, is
    removed from the copy; all other input columns are kept and the new
    columns are appended after them.

    Required input columns: ``course``, ``study_hours``, ``class_attendance``,
    ``sleep_hours``, ``sleep_quality``, ``facility_rating`` and
    ``exam_difficulty``.

    Added columns: ``course_mean_study``, ``study_vs_course_avg``,
    ``study_hours_2``, ``attendance_2``, ``log_study``, ``efficiency``,
    ``study_per_sleep``, ``sleep_quality_num``, ``facility_rating_num``,
    ``difficulty_num``, ``study_sin`` and ``study_cos``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw rows to featurize.
    course_means : mapping or pandas.Series, optional
        Pre-computed mean ``study_hours`` per ``course`` value. When given,
        ``course_mean_study`` is looked up from it instead of being recomputed
        from ``df``, so a test frame can reuse statistics measured on the
        training frame. Courses absent from the mapping receive the mean of
        the supplied values. When omitted (the default) the per-course mean is
        computed from ``df`` itself, exactly as before.

    Returns
    -------
    pandas.DataFrame
        The featurized copy.
    """
    output = df.copy()

    # Drop IDs (errors='ignore' makes this a no-op when the column is absent)
    output = output.drop(columns=['student_id'], errors='ignore')

    # 1. Peer Group Features (Crucial for school data)
    if course_means is None:
        output['course_mean_study'] = output.groupby('course')['study_hours'].transform('mean')
    else:
        lookup = pd.Series(course_means, dtype=float)
        # Unknown courses fall back to the average of the supplied means.
        output['course_mean_study'] = (
            output['course'].map(lookup).astype(float).fillna(lookup.mean())
        )
    output['study_vs_course_avg'] = output['study_hours'] - output['course_mean_study']

    # 2. Polynomials & Logs
    output['study_hours_2'] = output['study_hours'] ** 2
    output['attendance_2'] = output['class_attendance'] ** 2
    output['log_study'] = np.log1p(output['study_hours'])

    # 3. Ratios & Efficiency
    # The +1 / +epsilon offsets keep the denominators away from zero when
    # sleep_hours is 0.
    epsilon = 1e-5
    output['efficiency'] = (output['study_hours'] * output['class_attendance']) / (output['sleep_hours'] + 1)
    output['study_per_sleep'] = output['study_hours'] / (output['sleep_hours'] + epsilon)

    # 4. Ordinal Mapping
    # Labels missing from a map (including NaN) fall back to the middle level, 1.
    ordinal_specs = (
        ('sleep_quality_num', 'sleep_quality', {'poor': 0, 'average': 1, 'good': 2}),
        ('facility_rating_num', 'facility_rating', {'low': 0, 'medium': 1, 'high': 2}),
        ('difficulty_num', 'exam_difficulty', {'easy': 0, 'medium': 1, 'hard': 2}),
    )
    for new_col, source_col, level_map in ordinal_specs:
        output[new_col] = output[source_col].map(level_map).fillna(1).astype(int)

    # 5. Trigonometric (Cyclical): sine/cosine of study_hours with period 24
    angle = 2 * np.pi * output['study_hours'] / 24
    output['study_sin'] = np.sin(angle)
    output['study_cos'] = np.cos(angle)

    return output

# -----------------------------
# 2) Data Preparation
# -----------------------------
print("Loading and Engineering Features...")
train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)
TARGET = 'exam_score'

# Featurize both splits, then remove the identifier (and the target for train).
train_X_raw = advanced_fe(train_df).drop(columns=['id', TARGET])
test_X_raw  = advanced_fe(test_df).drop(columns=['id'])

# The per-course study-hour mean is a learned statistic: the test rows must see
# the values measured on the training data, not means recomputed from the test
# set, otherwise the same course carries a different feature value at
# prediction time than it did during training.
train_course_means = train_df.groupby('course')['study_hours'].mean()
test_X_raw['course_mean_study'] = (
    test_df['course']
    .map(train_course_means)
    .astype(float)
    # Courses never seen in training fall back to the overall training mean.
    .fillna(train_df['study_hours'].mean())
)
test_X_raw['study_vs_course_avg'] = test_df['study_hours'] - test_X_raw['course_mean_study']

y = train_df[TARGET].values

# Columns treated as categorical by both the Ridge and the XGBoost stage.
cat_cols = ['gender', 'course', 'internet_access', 'sleep_quality', 
            'study_method', 'facility_rating', 'exam_difficulty']

# -----------------------------
# 3) Phase 1: Ridge Stacking
# -----------------------------
print("PHASE 1: Generating Ridge Meta-Features (10-Fold OOF)...")
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
ridge_oof = np.zeros(len(train_X_raw))    # out-of-fold Ridge prediction per train row
ridge_test = np.zeros(len(test_X_raw))    # fold-averaged Ridge prediction per test row


def _linear_design(frame, encoded_cats):
    """Stack the non-categorical columns of ``frame`` next to ``encoded_cats``."""
    return np.hstack([frame.drop(cat_cols, axis=1).values, encoded_cats])


for fold, (tr_idx, va_idx) in enumerate(kf.split(train_X_raw), 1):
    fit_part = train_X_raw.iloc[tr_idx]
    hold_part = train_X_raw.iloc[va_idx]
    fit_target = y[tr_idx]

    # The linear model needs numeric inputs: target-encode the categoricals,
    # fitting the encoder on this fold's training part only.
    te = TargetEncoder(target_type='continuous', random_state=SEED)
    fit_cats = te.fit_transform(fit_part[cat_cols], fit_target)
    hold_cats = te.transform(hold_part[cat_cols])
    test_cats = te.transform(test_X_raw[cat_cols])

    # Standardise, then Ridge with its penalty chosen from a log-spaced grid.
    model = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('ridge', RidgeCV(alphas=np.logspace(-3, 3, 10))),
    ])
    model.fit(_linear_design(fit_part, fit_cats), fit_target)

    ridge_oof[va_idx] = model.predict(_linear_design(hold_part, hold_cats))
    # Every fold contributes an equal share of the test-set prediction.
    ridge_test += model.predict(_linear_design(test_X_raw, test_cats)) / N_SPLITS
    print(f"  Ridge Fold {fold} Complete")

# Append Ridge prediction to XGBoost features
train_X_raw['ridge_pred'] = ridge_oof
test_X_raw['ridge_pred'] = ridge_test

# Set categoricals for XGBoost
# Train and test must share ONE category list per column. Casting each frame
# to 'category' on its own derives the integer codes from the labels observed
# in that frame, so a label missing from (or extra in) one split would shift
# the codes and the model would read test rows under the wrong category.
# The training labels define the dtype; a test label never seen in training
# becomes NaN (missing).
for col in cat_cols:
    shared_dtype = pd.CategoricalDtype(
        categories=train_X_raw[col].astype('category').cat.categories
    )
    train_X_raw[col] = train_X_raw[col].astype(shared_dtype)
    test_X_raw[col] = test_X_raw[col].astype(shared_dtype)

# -----------------------------
# 4) Phase 2: Regularized XGBoost
# -----------------------------
# L1/L2 Regularization formula:
# Loss = MSE + gamma*T + 0.5*lambda*||w||^2 + alpha*||w||
xgb_params = dict(
    n_estimators=20000,
    learning_rate=0.005,
    max_depth=7,                # Depth control
    min_child_weight=10,        # Prevents nodes from modeling small noise
    subsample=0.8,              # Row sampling
    colsample_bytree=0.6,       # Feature sampling
    reg_lambda=15.0,            # Strong L2 regularization
    reg_alpha=1.0,              # L1 regularization
    gamma=0.2,                  # Min loss reduction to split
    tree_method='hist',
    device='cuda',
    enable_categorical=True,
    early_stopping_rounds=150,
    eval_metric='rmse',
    random_state=SEED,
)

print("\nPHASE 2: Training Regularized XGBoost...")
xgb_oof = np.zeros(len(train_X_raw))    # out-of-fold prediction per train row
xgb_test = np.zeros(len(test_X_raw))    # fold-averaged prediction per test row

# Reuses the Phase 1 splitter; the frames now carry the extra ridge_pred column.
for fold, (fit_rows, hold_rows) in enumerate(kf.split(train_X_raw), start=1):
    X_fit = train_X_raw.iloc[fit_rows]
    X_hold = train_X_raw.iloc[hold_rows]
    y_fit = y[fit_rows]
    y_hold = y[hold_rows]

    # The held-out part doubles as the early-stopping evaluation set.
    model = xgb.XGBRegressor(**xgb_params)
    model.fit(X_fit, y_fit, eval_set=[(X_hold, y_hold)], verbose=1000)

    xgb_oof[hold_rows] = model.predict(X_hold)
    xgb_test += model.predict(test_X_raw) / N_SPLITS

    fold_rmse = np.sqrt(mean_squared_error(y_hold, xgb_oof[hold_rows]))
    print(f"  Fold {fold} RMSE: {fold_rmse:.5f}")

overall_rmse = np.sqrt(mean_squared_error(y, xgb_oof))
print(f"\nFINAL OOF RMSE: {overall_rmse:.5f}")

# -----------------------------
# 5) Submission
# -----------------------------
# Start from the test ids and attach the fold-averaged prediction, limited to
# the 0-100 range, then write the file to the working directory.
clipped_scores = np.clip(xgb_test, 0, 100)
submission = test_df[['id']].copy()
submission['exam_score'] = clipped_scores
submission.to_csv("submission_final_stack.csv", index=False)
print("Submission saved successfully.")

Loading and Engineering Features...
PHASE 1: Generating Ridge Meta-Features (10-Fold OOF)...
  Ridge Fold 1 Complete
  Ridge Fold 2 Complete
  Ridge Fold 3 Complete
  Ridge Fold 4 Complete
  Ridge Fold 5 Complete
  Ridge Fold 6 Complete
  Ridge Fold 7 Complete
  Ridge Fold 8 Complete
  Ridge Fold 9 Complete
  Ridge Fold 10 Complete

PHASE 2: Training Regularized XGBoost...
[0]	validation_0-rmse:18.84292
[1000]	validation_0-rmse:8.76928
[2000]	validation_0-rmse:8.72962
[3000]	validation_0-rmse:8.70686
[4000]	validation_0-rmse:8.69247
[5000]	validation_0-rmse:8.68204
[6000]	validation_0-rmse:8.67494
[7000]	validation_0-rmse:8.66956
[8000]	validation_0-rmse:8.66525
[9000]	validation_0-rmse:8.66201
[10000]	validation_0-rmse:8.65970
[11000]	validation_0-rmse:8.65807
[12000]	validation_0-rmse:8.65642
[13000]	validation_0-rmse:8.65513
[14000]	validation_0-rmse:8.65424
[15000]	validation_0-rmse:8.65366
[15550]	validation_0-rmse:8.65343
  Fold 1 RMSE: 8.65342
[0]	validation_0-rmse:18.88933
[100