In [1]:
# ==========================================================
# 1. IMPORTS & CONFIGURATION
# ==========================================================
import numpy as np
import pandas as pd
import lightgbm as lgb
import warnings
import gc
from sklearn.model_selection import KFold
from sklearn.linear_model import RidgeCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# Settings
warnings.filterwarnings('ignore')
RANDOM_STATE = 42
N_SPLITS = 5 

# --- FIXED PARAMETERS TO PREVENT CPU EXPLOSION ---
LGB_PARAMS = {
    'objective': 'regression',
    'metric': 'rmse',
    'n_estimators': 8000,
    'learning_rate': 0.015,
    'num_leaves': 31,
    'max_depth': 8,
    'min_child_samples': 50,
    'subsample': 0.8,
    'colsample_bytree': 0.7,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    
    # CRITICAL FIXES
    'n_jobs': 8,               # <--- CHANGED from -1 to 8 to prevent thread explosion
    'device': 'gpu',           # Try to use GPU
    'gpu_platform_id': 0,
    'gpu_device_id': 0,
    
    'random_state': RANDOM_STATE,
    'verbosity': -1
}

# ==========================================================
# 2. DATA LOADING & MEMORY OPTIMIZATION
# ==========================================================
print("--- Loading Data ---")
DATA_DIR = "/rds/rds-lxu/ml_datasets/exam_score_predict"

# Load columns
df_train = pd.read_csv(f'{DATA_DIR}/train.csv')
df_test = pd.read_csv(f'{DATA_DIR}/test.csv')
original = pd.read_csv(f'{DATA_DIR}/Exam_Score_Prediction.csv')

# Handle ID columns
df_train = df_train.set_index('id')
df_test = df_test.set_index('id')
if 'student_id' in original.columns:
    original = original.set_index('student_id')

# Merge
df_train = pd.concat([original, df_train], axis=0).reset_index(drop=True)

# Separate Target
y = df_train['exam_score'].values.astype(np.float32)
df_train = df_train.drop(columns=['exam_score'])

print(f"Train Shape: {df_train.shape}")
gc.collect()

# ==========================================================
# 3. FEATURE ENGINEERING
# ==========================================================
def preprocess(df):
    df = df.copy()
    
    # 1. Ordinal Mapping
    mappings = {
        'sleep_quality': {'poor': 1, 'average': 2, 'good': 3},
        'facility_rating': {'low': 1, 'medium': 2, 'high': 3},
        'exam_difficulty': {'easy': 1, 'moderate': 2, 'hard': 3}
    }
    for col, mapping in mappings.items():
        if col in df.columns:
            df[f'{col}_score'] = df[col].map(mapping).fillna(2).astype(np.float32)

    # 2. Interactions
    if 'study_hours' in df.columns and 'sleep_quality_score' in df.columns:
        df['study_efficiency'] = df['study_hours'] * df['sleep_quality_score']
    if 'class_attendance' in df.columns and 'facility_rating_score' in df.columns:
        df['attendance_impact'] = df['class_attendance'] * df['facility_rating_score']
    if 'study_hours' in df.columns:
        df['study_hours_sq'] = df['study_hours'] ** 2

    # 3. Formula Feature
    is_good = (df['sleep_quality'] == 'good').astype(np.float32)
    is_poor = (df['sleep_quality'] == 'poor').astype(np.float32)
    is_coaching = (df['study_method'] == 'coaching').astype(np.float32)
    is_high_fac = (df['facility_rating'] == 'high').astype(np.float32)
    
    df['formula'] = (6 * df['study_hours'] + 
                     0.35 * df['class_attendance'] + 
                     1.5 * df['sleep_hours'] +
                     5 * is_good - 5 * is_poor +
                     10 * is_coaching + 
                     4 * is_high_fac)

    # 4. Convert Categoricals to 'category' dtype for LightGBM
    cat_cols = ['gender', 'course', 'study_method', 'internet_access', 
                'sleep_quality', 'facility_rating', 'exam_difficulty']
    
    for col in cat_cols:
        if col in df.columns:
            df[col] = df[col].astype('category')
            
    return df

print("--- Processing Features ---")
X = preprocess(df_train)
X_test = preprocess(df_test)

# Identify Numeric Columns for Ridge (Skip categories to save RAM)
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()

# Optimize Memory: Downcast floats
for col in num_cols:
    X[col] = X[col].astype(np.float32)
    X_test[col] = X_test[col].astype(np.float32)

gc.collect()

# ==========================================================
# 4. LIGHTWEIGHT RIDGE STACKING (Numeric Only)
# ==========================================================
print("\n--- Generating Ridge Meta-Feature (Numeric Only) ---")

# Scale numerics for Ridge
scaler = StandardScaler()
X_num_scaled = scaler.fit_transform(X[num_cols].fillna(0))
X_test_num_scaled = scaler.transform(X_test[num_cols].fillna(0))

kf_ridge = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
ridge_oof = np.zeros((X.shape[0],), dtype=np.float32)
ridge_test_preds = np.zeros((X_test.shape[0], 5), dtype=np.float32)

model_ridge = RidgeCV(alphas=[0.1, 1.0, 10.0], scoring='neg_root_mean_squared_error')

for fold, (idx_tr, idx_va) in enumerate(kf_ridge.split(X_num_scaled, y)):
    model_ridge.fit(X_num_scaled[idx_tr], y[idx_tr])
    ridge_oof[idx_va] = model_ridge.predict(X_num_scaled[idx_va])
    ridge_test_preds[:, fold] = model_ridge.predict(X_test_num_scaled)

# Add Ridge Prediction as Feature
X['ridge_pred'] = ridge_oof
X_test['ridge_pred'] = ridge_test_preds.mean(axis=1)

# Clean up to save RAM
del X_num_scaled, X_test_num_scaled, ridge_test_preds
gc.collect()

print(f"Final Data Shape: {X.shape}")

# ==========================================================
# 5. LIGHTGBM TRAINING (GPU)
# ==========================================================
print("\n--- Training LightGBM ---")
# Check if GPU is actually being used by creating a small test model
try:
    lgb.train({'device': 'gpu'}, lgb.Dataset(np.array([[1]]), np.array([1])), num_boost_round=1)
    print("GPU Detected and Enabled.")
except Exception as e:
    print("WARNING: GPU init failed. Falling back to CPU with n_jobs=8.")
    print(f"Error: {e}")
    LGB_PARAMS['device'] = 'cpu'

kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)

lgb_oof_preds = np.zeros(X.shape[0])
lgb_test_preds = np.zeros((X_test.shape[0], N_SPLITS))

for fold, (idx_tr, idx_va) in enumerate(kf.split(X, y)):
    X_tr, y_tr = X.iloc[idx_tr], y[idx_tr]
    X_va, y_va = X.iloc[idx_va], y[idx_va]
    
    model = lgb.LGBMRegressor(**LGB_PARAMS)
    
    callbacks = [
        lgb.early_stopping(stopping_rounds=100, verbose=True),
        lgb.log_evaluation(period=1000)
    ]
    
    model.fit(
        X_tr, y_tr,
        eval_set=[(X_va, y_va)],
        eval_metric='rmse',
        callbacks=callbacks
    )
    
    lgb_oof_preds[idx_va] = model.predict(X_va)
    lgb_test_preds[:, fold] = model.predict(X_test)
    
    gc.collect() 

# ==========================================================
# 6. SUBMISSION
# ==========================================================
final_rmse = np.sqrt(mean_squared_error(y, lgb_oof_preds))
print(f"\nOverall CV RMSE: {final_rmse:.5f}")

submission = pd.DataFrame({
    'id': df_test.index,
    'exam_score': np.clip(lgb_test_preds.mean(axis=1), 0, 100)
})

submission.to_csv(f'{DATA_DIR}/submission_lgb_refined.csv', index=False)
print(f"Saved submission to '{DATA_DIR}/submission_lgb_refined.csv'")

--- Loading Data ---
Train Shape: (650000, 11)
--- Processing Features ---

--- Generating Ridge Meta-Feature (Numeric Only) ---
Final Data Shape: (650000, 19)

--- Training LightGBM ---
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 1, number of used features: 0
[LightGBM] [Info] Using GPU Device: NVIDIA RTX PRO 6000 Blackwell Workstation Edition, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 16 bins...




[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Start training from score 1.000000
GPU Detected and Enabled.
Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 8.78803
[2000]	valid_0's rmse: 8.76142
[3000]	valid_0's rmse: 8.7475
[4000]	valid_0's rmse: 8.74057
[5000]	valid_0's rmse: 8.73669
[6000]	valid_0's rmse: 8.73434
[7000]	valid_0's rmse: 8.73258
Early stopping, best iteration is:
[7564]	valid_0's rmse: 8.73149
Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 8.84038
[2000]	valid_0's rmse: 8.81213
[3000]	valid_0's rmse: 8.80114
[4000]	valid_0's rmse: 8.79424
[5000]	valid_0's rmse: 8.78956
[6000]	valid_0's rmse: 8.78512
[7000]	valid_0's rmse: 8.78245
[8000]	valid_0's rmse: 8.77927
Did not meet early stopping. Best iteration is:
[7998]	valid_0's rmse: 8.77926
Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 8.78539
[2000]	valid_0's rmse: 8.75846
[3000]	valid_0's 

In [2]:
# ==========================================================
# 1. IMPORTS & CONFIGURATION
# ==========================================================
import numpy as np
import pandas as pd
import lightgbm as lgb
import warnings
import gc
from sklearn.model_selection import KFold
from sklearn.linear_model import RidgeCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# Settings
warnings.filterwarnings('ignore')
RANDOM_STATE = 42
N_SPLITS = 5 

# --- HYPERPARAMETER ZOO (Diversity Strategy) ---
# We will rotate through these 5 sets to force diversity
PARAMS_ZOO = [
    # Set 1: Baseline (Balanced)
    {
        'learning_rate': 0.015, 'num_leaves': 31, 'max_depth': 8, 
        'subsample': 0.8, 'colsample_bytree': 0.7, 'min_child_samples': 50,
        'reg_alpha': 0.1, 'reg_lambda': 0.1
    },
    # Set 2: Deep & Aggressive (Catches complex patterns)
    {
        'learning_rate': 0.01, 'num_leaves': 63, 'max_depth': 12, 
        'subsample': 0.9, 'colsample_bytree': 0.8, 'min_child_samples': 30,
        'reg_alpha': 0.05, 'reg_lambda': 0.05
    },
    # Set 3: Shallow & Conservative (Prevents overfitting)
    {
        'learning_rate': 0.02, 'num_leaves': 15, 'max_depth': 6, 
        'subsample': 0.7, 'colsample_bytree': 0.6, 'min_child_samples': 100,
        'reg_alpha': 1.0, 'reg_lambda': 1.0
    },
    # Set 4: Wide Search (High colsample, low subsample)
    {
        'learning_rate': 0.015, 'num_leaves': 45, 'max_depth': 10, 
        'subsample': 0.6, 'colsample_bytree': 0.9, 'min_child_samples': 40,
        'reg_alpha': 0.5, 'reg_lambda': 0.5
    },
    # Set 5: Regularization Heavy (L1/L2 focus)
    {
        'learning_rate': 0.012, 'num_leaves': 31, 'max_depth': 8, 
        'subsample': 0.8, 'colsample_bytree': 0.7, 'min_child_samples': 60,
        'reg_alpha': 5.0, 'reg_lambda': 5.0
    }
]

# Common Fixed Params
FIXED_PARAMS = {
    'objective': 'regression',
    'metric': 'rmse',
    'n_estimators': 8000,
    'n_jobs': 8,               # Prevent CPU Explosion
    'device': 'gpu',           # GPU Enabled
    'gpu_platform_id': 0,
    'gpu_device_id': 0,
    'random_state': RANDOM_STATE,
    'verbosity': -1
}

# ==========================================================
# 2. DATA LOADING & MEMORY OPTIMIZATION
# ==========================================================
print("--- Loading Data ---")
DATA_DIR = "/rds/rds-lxu/ml_datasets/exam_score_predict"

df_train = pd.read_csv(f'{DATA_DIR}/train.csv')
df_test = pd.read_csv(f'{DATA_DIR}/test.csv')
original = pd.read_csv(f'{DATA_DIR}/Exam_Score_Prediction.csv')

df_train = df_train.set_index('id')
df_test = df_test.set_index('id')
if 'student_id' in original.columns:
    original = original.set_index('student_id')

df_train = pd.concat([original, df_train], axis=0).reset_index(drop=True)

y = df_train['exam_score'].values.astype(np.float32)
df_train = df_train.drop(columns=['exam_score'])

print(f"Train Shape: {df_train.shape}")
gc.collect()

# ==========================================================
# 3. FEATURE ENGINEERING
# ==========================================================
def preprocess(df):
    df = df.copy()
    
    # Ordinal
    mappings = {
        'sleep_quality': {'poor': 1, 'average': 2, 'good': 3},
        'facility_rating': {'low': 1, 'medium': 2, 'high': 3},
        'exam_difficulty': {'easy': 1, 'moderate': 2, 'hard': 3}
    }
    for col, mapping in mappings.items():
        if col in df.columns:
            df[f'{col}_score'] = df[col].map(mapping).fillna(2).astype(np.float32)

    # Interactions
    if 'study_hours' in df.columns and 'sleep_quality_score' in df.columns:
        df['study_efficiency'] = df['study_hours'] * df['sleep_quality_score']
    if 'class_attendance' in df.columns and 'facility_rating_score' in df.columns:
        df['attendance_impact'] = df['class_attendance'] * df['facility_rating_score']
    if 'study_hours' in df.columns:
        df['study_hours_sq'] = df['study_hours'] ** 2

    # Formula
    is_good = (df['sleep_quality'] == 'good').astype(np.float32)
    is_poor = (df['sleep_quality'] == 'poor').astype(np.float32)
    is_coaching = (df['study_method'] == 'coaching').astype(np.float32)
    is_high_fac = (df['facility_rating'] == 'high').astype(np.float32)
    
    df['formula'] = (6 * df['study_hours'] + 
                     0.35 * df['class_attendance'] + 
                     1.5 * df['sleep_hours'] +
                     5 * is_good - 5 * is_poor +
                     10 * is_coaching + 
                     4 * is_high_fac)

    # Convert Categoricals to 'category'
    cat_cols = ['gender', 'course', 'study_method', 'internet_access', 
                'sleep_quality', 'facility_rating', 'exam_difficulty']
    
    for col in cat_cols:
        if col in df.columns:
            df[col] = df[col].astype('category')
            
    return df

print("--- Processing Features ---")
X = preprocess(df_train)
X_test = preprocess(df_test)

num_cols = X.select_dtypes(include=[np.number]).columns.tolist()

# Downcast
for col in num_cols:
    X[col] = X[col].astype(np.float32)
    X_test[col] = X_test[col].astype(np.float32)

gc.collect()

# ==========================================================
# 4. LIGHTWEIGHT RIDGE STACKING
# ==========================================================
print("\n--- Generating Ridge Meta-Feature ---")
scaler = StandardScaler()
X_num_scaled = scaler.fit_transform(X[num_cols].fillna(0))
X_test_num_scaled = scaler.transform(X_test[num_cols].fillna(0))

kf_ridge = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
ridge_oof = np.zeros((X.shape[0],), dtype=np.float32)
ridge_test_preds = np.zeros((X_test.shape[0], 5), dtype=np.float32)
model_ridge = RidgeCV(alphas=[0.1, 1.0, 10.0])

for fold, (idx_tr, idx_va) in enumerate(kf_ridge.split(X_num_scaled, y)):
    model_ridge.fit(X_num_scaled[idx_tr], y[idx_tr])
    ridge_oof[idx_va] = model_ridge.predict(X_num_scaled[idx_va])
    ridge_test_preds[:, fold] = model_ridge.predict(X_test_num_scaled)

X['ridge_pred'] = ridge_oof
X_test['ridge_pred'] = ridge_test_preds.mean(axis=1)

del X_num_scaled, X_test_num_scaled, ridge_test_preds
gc.collect()

# ==========================================================
# 5. DIVERSE LIGHTGBM TRAINING (GPU)
# ==========================================================
print("\n--- Training Diverse LightGBM Ensemble ---")

# Check GPU
try:
    lgb.train({'device': 'gpu'}, lgb.Dataset(np.array([[1]]), np.array([1])), num_boost_round=1)
    print("GPU Detected.")
except:
    print("WARNING: GPU init failed. Using CPU.")
    FIXED_PARAMS['device'] = 'cpu'

kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)

lgb_oof_preds = np.zeros(X.shape[0])
lgb_test_preds = np.zeros((X_test.shape[0], N_SPLITS))

for fold, (idx_tr, idx_va) in enumerate(kf.split(X, y)):
    X_tr, y_tr = X.iloc[idx_tr], y[idx_tr]
    X_va, y_va = X.iloc[idx_va], y[idx_va]
    
    # --- SELECT DIVERSE PARAMS FOR THIS FOLD ---
    current_params = {**FIXED_PARAMS, **PARAMS_ZOO[fold % len(PARAMS_ZOO)]}
    print(f"\nFold {fold+1} Params: Depth={current_params['max_depth']}, Leaves={current_params['num_leaves']}")
    
    model = lgb.LGBMRegressor(**current_params)
    
    callbacks = [
        lgb.early_stopping(stopping_rounds=100, verbose=True),
        lgb.log_evaluation(period=1000)
    ]
    
    model.fit(
        X_tr, y_tr,
        eval_set=[(X_va, y_va)],
        eval_metric='rmse',
        callbacks=callbacks
    )
    
    lgb_oof_preds[idx_va] = model.predict(X_va)
    lgb_test_preds[:, fold] = model.predict(X_test)
    
    gc.collect()

# ==========================================================
# 6. SUBMISSION
# ==========================================================
final_rmse = np.sqrt(mean_squared_error(y, lgb_oof_preds))
print(f"\nOverall CV RMSE: {final_rmse:.5f}")

submission = pd.DataFrame({
    'id': df_test.index,
    'exam_score': np.clip(lgb_test_preds.mean(axis=1), 0, 100)
})

submission.to_csv(f'{DATA_DIR}/submission_diverse_ensemble.csv', index=False)
print(f"Saved submission to '{DATA_DIR}/submission_diverse_ensemble.csv'")

--- Loading Data ---
Train Shape: (650000, 11)
--- Processing Features ---

--- Generating Ridge Meta-Feature ---

--- Training Diverse LightGBM Ensemble ---
GPU Detected.

Fold 1 Params: Depth=8, Leaves=31
Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 8.78803
[2000]	valid_0's rmse: 8.76142
[3000]	valid_0's rmse: 8.7475
[4000]	valid_0's rmse: 8.74058
[5000]	valid_0's rmse: 8.73672
[6000]	valid_0's rmse: 8.73429
[7000]	valid_0's rmse: 8.73246
Early stopping, best iteration is:
[7133]	valid_0's rmse: 8.73218

Fold 2 Params: Depth=12, Leaves=63
Training until validation scores don't improve for 100 rounds
[1000]	valid_0's rmse: 8.83723
[2000]	valid_0's rmse: 8.81251
[3000]	valid_0's rmse: 8.80154
[4000]	valid_0's rmse: 8.79488
[5000]	valid_0's rmse: 8.79097
[6000]	valid_0's rmse: 8.78775
Early stopping, best iteration is:
[6754]	valid_0's rmse: 8.78617

Fold 3 Params: Depth=6, Leaves=15
Training until validation scores don't improve for 100 rounds
[1