In [1]:
# ============================================================
# Tabular Transformer for Exam Score Prediction
# Architecture: Feature Tokenizer + Transformer Encoder
# ============================================================

import os
import gc
import random
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error

# -----------------------------
# Configuration
# -----------------------------
DATA_DIR = "/rds/rds-lxu/ml_datasets/exam_score_predict"
TRAIN_PATH = f"{DATA_DIR}/train.csv"
TEST_PATH  = f"{DATA_DIR}/test.csv"
SUB_PATH   = f"{DATA_DIR}/submission_transformer.csv"

# Hyperparameters
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
SEED = 42
N_SPLITS = 3
BATCH_SIZE = 1024
EPOCHS = 30
LEARNING_RATE = 1e-3
WEIGHT_DECAY = 1e-4
PATIENCE = 5        # Early stopping rounds

# Model Config
D_MODEL = 64        # Embedding dimension
N_HEADS = 4         # Attention heads
N_LAYERS = 3        # Transformer blocks
DROPOUT = 0.1

def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

seed_everything(SEED)

# -----------------------------
# 1) Data Loading & Feature Engineering
# -----------------------------
print("Loading Data...")
train_df = pd.read_csv(TRAIN_PATH)
test_df  = pd.read_csv(TEST_PATH)

# (Same Feature Engineering as before - Crucial for consistency)
LUT = {
    "sleep_quality": {"good": 5, "average": 0, "poor": -5},
    "facility_rating": {"high": 4, "medium": 0, "low": -4},
    "study_method": {"coaching": 10, "mixed": 5, "group study": 2, "online videos": 1, "self-study": 0},
}

def preprocess_df(df):
    df = df.copy()
    # String normalization
    cat_cols = ["sleep_quality", "facility_rating", "study_method", "gender", 
                "course", "internet_access", "exam_difficulty"]
    for c in cat_cols:
        if c in df.columns:
            df[c] = df[c].astype(str).str.strip()

    # Manual Scoring
    sq = df["sleep_quality"].map(LUT["sleep_quality"]).fillna(0.0)
    sm = df["study_method"].map(LUT["study_method"]).fillna(0.0)
    fr = df["facility_rating"].map(LUT["facility_rating"]).fillna(0.0)

    df["manual_formula"] = (
        6.0 * df["study_hours"]
        + 0.35 * df["class_attendance"]
        + 1.5 * df["sleep_hours"]
        + sq + sm + fr
    )
    
    # Interactions
    df["study_att"] = df["study_hours"] * df["class_attendance"]
    df["study_div_sleep"] = df["study_hours"] / (df["sleep_hours"] + 1e-4)
    
    # Trig
    for p in [12, 14]:
        df[f"sin_study_{p}"] = np.sin(2 * np.pi * df["study_hours"] / p)
    
    return df

train_df = preprocess_df(train_df)
test_df  = preprocess_df(test_df)

target = train_df["exam_score"].values.astype(np.float32)
train_df = train_df.drop(columns=["exam_score", "id"])
test_ids = test_df["id"].values
test_df  = test_df.drop(columns=["id"])

# -----------------------------
# 2) Preprocessing for Neural Net
# -----------------------------
# Identify columns
cat_cols = [c for c in train_df.columns if train_df[c].dtype == 'object']
num_cols = [c for c in train_df.columns if c not in cat_cols]

print(f"Categorical: {len(cat_cols)}, Numerical: {len(num_cols)}")

# Fill NaNs (Neural Nets hate NaNs)
for c in num_cols:
    train_df[c] = train_df[c].fillna(train_df[c].mean())
    test_df[c] = test_df[c].fillna(train_df[c].mean())

for c in cat_cols:
    train_df[c] = train_df[c].fillna("MISSING")
    test_df[c]  = test_df[c].fillna("MISSING")

# Scale Numerics (StandardScaler is vital for convergence)
scaler = StandardScaler()
train_df[num_cols] = scaler.fit_transform(train_df[num_cols])
test_df[num_cols]  = scaler.transform(test_df[num_cols])

# Encode Categoricals (LabelEncoder for Embeddings)
# We map each unique category to an integer id: 0, 1, 2...
cat_cardinalities = []
for c in cat_cols:
    le = LabelEncoder()
    # Fit on both to ensure we cover all categories
    full_list = pd.concat([train_df[c], test_df[c]], axis=0).astype(str)
    le.fit(full_list)
    train_df[c] = le.transform(train_df[c].astype(str))
    test_df[c]  = le.transform(test_df[c].astype(str))
    cat_cardinalities.append(len(le.classes_))

X_train_full = train_df.values.astype(np.float32)
X_test_full  = test_df.values.astype(np.float32)

# Indices for the model to know which columns are which
cat_idxs = [train_df.columns.get_loc(c) for c in cat_cols]
num_idxs = [train_df.columns.get_loc(c) for c in num_cols]

# -----------------------------
# 3) PyTorch Dataset
# -----------------------------
class TabularDataset(Dataset):
    def __init__(self, X, y=None):
        self.X = X
        self.y = y
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        # Return features and target
        if self.y is not None:
            return torch.tensor(self.X[idx], dtype=torch.float32), torch.tensor(self.y[idx], dtype=torch.float32)
        else:
            return torch.tensor(self.X[idx], dtype=torch.float32)

# -----------------------------
# 4) Transformer Model
# -----------------------------
class ExamTransformer(nn.Module):
    def __init__(self, 
                 num_cols_idx, 
                 cat_cols_idx, 
                 cat_counts, 
                 d_model=64, 
                 n_heads=4, 
                 n_layers=3, 
                 dropout=0.1):
        super().__init__()
        
        self.num_idx = num_cols_idx
        self.cat_idx = cat_cols_idx
        
        # --- Feature Tokenizer ---
        # 1. Numerical Embedding: Project scalar -> vector
        self.num_embeddings = nn.ModuleList([
            nn.Sequential(
                nn.Linear(1, d_model),
                nn.ReLU(),
                nn.Linear(d_model, d_model)
            ) for _ in num_cols_idx
        ])
        
        # 2. Categorical Embedding: Integer -> vector
        self.cat_embeddings = nn.ModuleList([
            nn.Embedding(count, d_model) for count in cat_counts
        ])
        
        # [CLS] Token to aggregate info
        self.cls_token = nn.Parameter(torch.zeros(1, 1, d_model))
        
        # --- Transformer Encoder ---
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model, 
            nhead=n_heads, 
            dim_feedforward=d_model*4, 
            dropout=dropout, 
            batch_first=True,
            norm_first=True  # Pre-Norm is often more stable
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)
        
        # --- Head ---
        self.head = nn.Sequential(
            nn.LayerNorm(d_model),
            nn.Linear(d_model, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )
        
    def forward(self, x):
        batch_size = x.shape[0]
        
        # Process Numerical features
        # x shape: [Batch, Total_Features]
        num_inputs = [x[:, i].unsqueeze(-1) for i in self.num_idx] # List of [Batch, 1]
        num_embeds = [emb(v) for emb, v in zip(self.num_embeddings, num_inputs)] # List of [Batch, D]
        num_embeds = torch.stack(num_embeds, dim=1) # [Batch, N_num, D]
        
        # Process Categorical features
        cat_inputs = [x[:, i].long() for i in self.cat_idx] # List of [Batch]
        cat_embeds = [emb(v) for emb, v in zip(self.cat_embeddings, cat_inputs)] # List of [Batch, D]
        cat_embeds = torch.stack(cat_embeds, dim=1) # [Batch, N_cat, D]
        
        # Concatenate all features -> Sequence
        # Sequence: [CLS] + [Num Features] + [Cat Features]
        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        
        # Combine
        sequence = torch.cat([cls_tokens, num_embeds, cat_embeds], dim=1) 
        
        # Transformer Pass
        output = self.transformer(sequence)
        
        # Take the [CLS] token output (index 0)
        cls_output = output[:, 0, :]
        
        # Final prediction
        pred = self.head(cls_output)
        return pred.squeeze(-1)

# -----------------------------
# 5) Training Loop
# -----------------------------
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

oof_preds = np.zeros(len(train_df))
test_preds_folds = np.zeros((N_SPLITS, len(test_df)))

print(f"Starting {N_SPLITS}-Fold CV on Device: {DEVICE}")

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_full)):
    print(f"\n--- Fold {fold+1}/{N_SPLITS} ---")
    
    # Prepare DataLoaders
    train_dataset = TabularDataset(X_train_full[train_idx], target[train_idx])
    val_dataset   = TabularDataset(X_train_full[val_idx], target[val_idx])
    
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader   = DataLoader(val_dataset, batch_size=BATCH_SIZE*2, shuffle=False)
    
    # Initialize Model
    model = ExamTransformer(
        num_cols_idx=num_idxs,
        cat_cols_idx=cat_idxs,
        cat_counts=cat_cardinalities,
        d_model=D_MODEL,
        n_heads=N_HEADS,
        n_layers=N_LAYERS,
        dropout=DROPOUT
    ).to(DEVICE)
    
    optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
    criterion = nn.MSELoss()
    scheduler = optim.lr_scheduler.OneCycleLR(
        optimizer, max_lr=LEARNING_RATE, steps_per_epoch=len(train_loader), epochs=EPOCHS
    )
    
    # Training Loop
    best_loss = float('inf')
    patience_counter = 0
    best_model_state = None
    
    for epoch in range(EPOCHS):
        model.train()
        train_loss = 0
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(DEVICE), y_batch.to(DEVICE)
            
            optimizer.zero_grad()
            preds = model(X_batch)
            loss = criterion(preds, y_batch)
            loss.backward()
            optimizer.step()
            scheduler.step()
            train_loss += loss.item()
            
        # Validation
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch, y_batch = X_batch.to(DEVICE), y_batch.to(DEVICE)
                preds = model(X_batch)
                loss = criterion(preds, y_batch)
                val_loss += loss.item()
        
        train_loss /= len(train_loader)
        val_loss /= len(val_loader)
        val_rmse = np.sqrt(val_loss)
        
        # Simple logging
        if epoch % 5 == 0:
            print(f"Epoch {epoch}: Train MSE={train_loss:.4f}, Val RMSE={val_rmse:.4f}")
            
        # Early Stopping Check
        if val_loss < best_loss:
            best_loss = val_loss
            best_model_state = model.state_dict()
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= PATIENCE:
                print("Early stopping triggered.")
                break
    
    # Load best model for inference
    model.load_state_dict(best_model_state)
    model.eval()
    
    # Predict OOF
    val_preds = []
    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            X_batch = X_batch.to(DEVICE)
            preds = model(X_batch)
            val_preds.extend(preds.cpu().numpy())
    oof_preds[val_idx] = np.array(val_preds)
    
    # Predict Test
    test_dataset = TabularDataset(X_test_full)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE*2, shuffle=False)
    
    fold_test_preds = []
    with torch.no_grad():
        for X_batch in test_loader:
            X_batch = X_batch.to(DEVICE)
            preds = model(X_batch)
            fold_test_preds.extend(preds.cpu().numpy())
    test_preds_folds[fold] = np.array(fold_test_preds)
    
    print(f"Fold {fold+1} Best RMSE: {np.sqrt(best_loss):.5f}")

# -----------------------------
# 6) Final Submission
# -----------------------------
final_rmse = np.sqrt(mean_squared_error(target, oof_preds))
print(f"\n>>> Transformer CV RMSE: {final_rmse:.5f}")

avg_test_pred = test_preds_folds.mean(axis=0)
avg_test_pred = np.clip(avg_test_pred, 0, 100)

submission = pd.DataFrame({
    "id": test_ids,
    "exam_score": avg_test_pred
})
submission.to_csv(SUB_PATH, index=False)
print(f"Saved Transformer submission to {SUB_PATH}")

Loading Data...
Categorical: 7, Numerical: 9
Starting 3-Fold CV on Device: cuda

--- Fold 1/3 ---




Epoch 0: Train MSE=4034.1796, Val RMSE=62.1150
Epoch 5: Train MSE=80.1888, Val RMSE=8.9571
Epoch 10: Train MSE=79.1263, Val RMSE=8.9743
Early stopping triggered.
Fold 1 Best RMSE: 8.85346

--- Fold 2/3 ---




Epoch 0: Train MSE=3915.6929, Val RMSE=60.3187
Epoch 5: Train MSE=80.1495, Val RMSE=9.0064
Epoch 10: Train MSE=79.1396, Val RMSE=8.9473
Early stopping triggered.
Fold 2 Best RMSE: 8.89013

--- Fold 3/3 ---
Epoch 0: Train MSE=3955.8551, Val RMSE=60.7932
Epoch 5: Train MSE=79.7508, Val RMSE=8.8974
Epoch 10: Train MSE=78.9539, Val RMSE=8.9316
Early stopping triggered.
Fold 3 Best RMSE: 8.88403

>>> Transformer CV RMSE: 8.91703
Saved Transformer submission to /rds/rds-lxu/ml_datasets/exam_score_predict/submission_transformer.csv


In [1]:
# ============================================================
# Analytic Solution: Polynomial Ridge Regression
# Method: Normal Equation with Regularization (Closed-Form)
# "No Training" loops - just one Matrix Calculation.
# ============================================================

import numpy as np
import pandas as pd
import scipy.linalg as la  # Pure Linear Algebra

# -----------------------------
# Configuration
# -----------------------------
DATA_DIR = "/rds/rds-lxu/ml_datasets/exam_score_predict"
TRAIN_PATH = f"{DATA_DIR}/train.csv"
TEST_PATH  = f"{DATA_DIR}/test.csv"
SUB_PATH   = f"{DATA_DIR}/submission_analytic.csv"

# Regularization Strength (Lambda)
# Prevents "Singular Matrix" crashes. 
# Higher = simpler model, Lower = complex fit.
LAMBDA = 15.0 

# -----------------------------
# 1) Robust Data Loading
# -----------------------------
train_df = pd.read_csv(TRAIN_PATH)
test_df  = pd.read_csv(TEST_PATH)

y_train = train_df["exam_score"].values
test_ids = test_df["id"].values

# -----------------------------
# 2) Manual Feature Engineering
# -----------------------------
# We map everything to numbers. No "One-Hot" expansion to save memory.
LUT = {
    "sleep_quality": {"good": 1, "average": 0, "poor": -1},
    "facility_rating": {"high": 1, "medium": 0, "low": -1},
    "study_method": {"coaching": 2, "mixed": 1, "group study": 0.5, "online videos": 0.5, "self-study": 0},
    "exam_difficulty": {"hard": -1, "medium": 0, "easy": 1},
    "internet_access": {"yes": 1, "no": 0},
    "gender": {"Male": 0, "Female": 1} 
}

def get_feature_matrix(df):
    # Make a copy to avoid warnings
    df = df.copy()
    
    # 1. Map Categoricals to Ordinals
    for col, mapping in LUT.items():
        if col in df.columns:
            # Map knowns, fill unknowns with 0 (neutral)
            df[col] = df[col].map(mapping).fillna(0)

    # 2. Select Base Features (Numerical + Mapped)
    features = [
        "study_hours", "class_attendance", "sleep_hours", 
        "previous_exam_score", "tutoring_sessions", "physical_activity",
        "sleep_quality", "facility_rating", "study_method", 
        "exam_difficulty", "internet_access"
    ]
    # Filter for columns that actually exist
    use_cols = [c for c in features if c in df.columns]
    
    # 3. Extract Matrix
    X = df[use_cols].values.astype(np.float64)
    
    # 4. Fill NaNs (Mean Imputation)
    # Fast numpy way to fill NaNs with column means
    col_mean = np.nanmean(X, axis=0)
    inds = np.where(np.isnan(X))
    X[inds] = np.take(col_mean, inds[1])
    
    # 5. Polynomial Expansion (The "Optimization")
    # We add Squared terms to capture non-linearities (Curves)
    # e.g., "study_hours^2" helps model burnout
    X_poly = X ** 2
    
    # 6. Interaction Terms (Manual)
    # study * attendance is often powerful
    if "study_hours" in df.columns and "class_attendance" in df.columns:
        interaction = (df["study_hours"] * df["class_attendance"]).values.reshape(-1, 1)
        X = np.hstack([X, X_poly, interaction])
    else:
        X = np.hstack([X, X_poly])
        
    # 7. Add Bias Term (Intercept) column of 1s
    ones = np.ones((X.shape[0], 1))
    X = np.hstack([ones, X])
    
    return X

print("Constructing Matrices...")
X_train = get_feature_matrix(train_df)
X_test  = get_feature_matrix(test_df)

print(f"Matrix Shape: {X_train.shape}")

# -----------------------------
# 3) The Solution (Normal Equation)
# -----------------------------
# We solve: (X.T @ X + lambda*I) * w = X.T @ y

print("Calculating analytic solution...")

# A = X^T * X
A = X_train.T @ X_train

# Add Regularization (Ridge) to the diagonal
# This makes the matrix invertible (Prevents Crash)
I = np.eye(A.shape[0])
A_ridge = A + (LAMBDA * I)

# b = X^T * y
b = X_train.T @ y_train

# Solve for weights w using Cholesky solve (Faster/Stable than inversion)
# Aw = b  =>  w = solve(A, b)
try:
    w = la.solve(A_ridge, b)
    print("Solved successfully.")
except la.LinAlgError:
    # Fallback to least squares if singular (unlikely with Lambda)
    w = la.lstsq(A_ridge, b)[0]
    print("Solved using least squares approximation.")

# -----------------------------
# 4) Prediction
# -----------------------------
# y_pred = X_test @ w
predictions = X_test @ w

# Clip reasonable range (0-100)
predictions = np.clip(predictions, 0, 100)

# -----------------------------
# 5) Submission
# -----------------------------
submission = pd.DataFrame({
    "id": test_ids,
    "exam_score": predictions
})

submission.to_csv(SUB_PATH, index=False)
print(f"Submission saved to {SUB_PATH}")
print(submission.head())

Constructing Matrices...
Matrix Shape: (630000, 18)
Calculating analytic solution...
Solved successfully.
Submission saved to /rds/rds-lxu/ml_datasets/exam_score_predict/submission_analytic.csv
       id  exam_score
0  630000   71.705458
1  630001   69.715685
2  630002   86.974607
3  630003   54.442378
4  630004   47.074184
