In [3]:
#loading augmented train data and other files
import json
import numpy as np
import pandas as pd

# Load metric names
with open('/kaggle/input/da5401-2025-data-challenge/metric_names.json', 'r') as f:
    metric_names = json.load(f)

# --- FIX 1: Create the missing metric_map ---
metric_map = {name: i for i, name in enumerate(metric_names)}
print("Created metric_map dictionary.")

# Load metric embeddings
metric_embeddings = np.load('/kaggle/input/da5401-2025-data-challenge/metric_name_embeddings.npy')

# Load train data
aug_train_df = pd.read_csv('/kaggle/input/final-aug/augmented_train_dataset_400 (1).csv')

# --- FIX 2: Copy aug_train_df to train_df before using it ---
train_df = aug_train_df.copy()
print("Copied aug_train_df to train_df.")

# Load test data
with open('/kaggle/input/da5401-2025-data-challenge/test_data.json', 'r') as f:
    test_data = json.load(f)
test_df = pd.DataFrame(test_data)

# Check basic info and missing data
print("Train dataset info:")
print(train_df.info())
print("\nMissing values in train data:")
print(train_df.isnull().sum())

print("\n" + "="*50)
print("Test dataset info:")
print(test_df.info())
print("\nMissing values in test data:")
print(test_df.isnull().sum())

# Normalize text columns (lowercase) and handle missing values
for col in ['user_prompt', 'system_prompt', 'response']:
    train_df[col] = train_df[col].fillna('').str.lower()
    test_df[col] = test_df[col].fillna('').str.lower()

# Convert score to numeric type (float)
train_df['score'] = pd.to_numeric(train_df['score'], errors='coerce')

# Check for any remaining missing scores
print("\n" + "="*50)
print(f"Missing scores after conversion: {train_df['score'].isnull().sum()}")

print(f"Full training set size: {len(train_df)}")
print(f"Test set size: {len(test_df)}")
print(f"Number of unique metrics: {len(metric_names)}")
print(f"Metric embeddings shape: {metric_embeddings.shape}")

Created metric_map dictionary.
Copied aug_train_df to train_df.
Train dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7806 entries, 0 to 7805
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   metric_name    7806 non-null   object
 1   score          7806 non-null   int64 
 2   user_prompt    7806 non-null   object
 3   response       7805 non-null   object
 4   system_prompt  6126 non-null   object
dtypes: int64(1), object(4)
memory usage: 305.1+ KB
None

Missing values in train data:
metric_name         0
score               0
user_prompt         0
response            1
system_prompt    1680
dtype: int64

Test dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3638 entries, 0 to 3637
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   metric_name    3638 non-null   object
 1   user_prompt    3638 non-null   object


In [32]:
len(aug_train_df)

7798

In [2]:
!pip install --upgrade transformers sentence-transformers

Collecting transformers
  Downloading transformers-4.57.1-py3-none-any.whl.metadata (43 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m44.0/44.0 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Collecting sentence-transformers
  Downloading sentence_transformers-5.1.2-py3-none-any.whl.metadata (16 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Downloading tokenizers-0.22.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=

In [6]:
from huggingface_hub import login
import os
from kaggle_secrets import UserSecretsClient

# Log in to Hugging Face
try:
    user_secrets = UserSecretsClient()
    hf_token = user_secrets.get_secret("HF_TOKEN")
    login(token=hf_token)
    print("Hugging Face login successful.")
except:
    print("Hugging Face login failed. Make sure HF_TOKEN is set as a Kaggle Secret.")

Hugging Face login successful.


In [7]:
# Final Safety Filter
# Drop rows where prompt or response is less than 15 characters (likely noise/fragments)
print(len(aug_train_df))
aug_train_df = aug_train_df[aug_train_df['user_prompt'].str.len() > 15]
aug_train_df = aug_train_df[aug_train_df['response'].str.len() > 15]

print(f"Final dataset size: {len(aug_train_df)}")

7806
Final dataset size: 7798


  return op(a, b)


In [8]:
#generating embeddings
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, mean_squared_error
from sklearn.linear_model import Lasso
import matplotlib.pyplot as plt
import seaborn as sns
from sentence_transformers import SentenceTransformer
import torch 
import warnings

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

# --- 1. Load Prerequisite Data (Assumed to be in memory) ---
# Make sure these variables are loaded in your environment first:
# aug_train_df: pd.DataFrame (Your augmented training data)
# test_df: pd.DataFrame (The original test data)
# metric_map: dict (Mapping from metric_name to index)
# metric_embeddings: np.array (Embeddings for all metrics)

# --- 2. Initialize Your Chosen Encoder (FIXED) ---
print("Loading multilingual-e5-large encoder...")
# Explicitly assign to the first available GPU. This is the robust method.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Assigning encoder to device: {device}")
encoder = SentenceTransformer("intfloat/multilingual-e5-large", device=device)
print("Encoder loaded.")

# --- 3. Create Copies and Clean Data ---
print("Cleaning data...")
train_df = aug_train_df.copy()
test_df_proc = test_df.copy()

# Clean train_df
for col in ['user_prompt', 'response']:
    train_df[col] = train_df[col].fillna('').str.lower()
train_df['score'] = pd.to_numeric(train_df['score'], errors='coerce')

# Clean test_df_proc
for col in ['user_prompt', 'response']:
    test_df_proc[col] = test_df_proc[col].fillna('').str.lower()

# --- 4. Define Text Function & Get Full Text ---
def get_full_text_no_system(row):
    """Combines prompt/response, *totally dropping* system_prompt."""
    return f"user: {row['user_prompt']} \nai: {row['response']}"

print("Combining text fields...")
train_df['full_text'] = train_df.apply(get_full_text_no_system, axis=1)
test_df_proc['full_text'] = test_df_proc.apply(get_full_text_no_system, axis=1)

# --- 5. Generate and Save Text Embeddings ---
print("Encoding augmented training text (using GPU)...")
# Pass the texts as a list
E_pr_train_aug = encoder.encode(train_df['full_text'].tolist(), show_progress_bar=True)
np.save("E_pr_train_aug.npy", E_pr_train_aug)
print("Saved E_pr_train_aug.npy")

print("Encoding test text (using GPU)...")
E_pr_test_new = encoder.encode(test_df_proc['full_text'].tolist(), show_progress_bar=True)
np.save("E_pr_test_new.npy", E_pr_test_new)
print("Saved E_pr_test_new.npy")

# --- 6. Generate and Save Metric Embeddings ---
print("Mapping metric embeddings...")
train_metric_indices = train_df['metric_name'].map(metric_map).values
E_met_train_aug = metric_embeddings[train_metric_indices]
np.save("E_met_train_aug.npy", E_met_train_aug)
print("Saved E_met_train_aug.npy")

test_metric_indices = test_df_proc['metric_name'].map(metric_map).values
E_met_test_new = metric_embeddings[test_metric_indices]
np.save("E_met_test_new.npy", E_met_test_new)
print("Saved E_met_test_new.npy")

# --- 7. Define Target Variable ---
y_train_float = train_df['score'].values

print("\n--- Data Preparation Complete ---")
print(f"E_pr_train_aug shape: {E_pr_train_aug.shape}")
print(f"E_met_train_aug shape: {E_met_train_aug.shape}")
print(f"E_pr_test_new shape: {E_pr_test_new.shape}")
print(f"E_met_test_new shape: {E_met_test_new.shape}")

2025-11-19 11:31:48.269873: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1763551908.694208      48 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1763551908.820375      48 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

Loading multilingual-e5-large encoder...
Assigning encoder to device: cuda


modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/690 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/201 [00:00<?, ?B/s]

Encoder loaded.
Cleaning data...
Combining text fields...
Encoding augmented training text (using GPU)...


Batches:   0%|          | 0/244 [00:00<?, ?it/s]

Saved E_pr_train_aug.npy
Encoding test text (using GPU)...


Batches:   0%|          | 0/114 [00:00<?, ?it/s]

Saved E_pr_test_new.npy
Mapping metric embeddings...
Saved E_met_train_aug.npy
Saved E_met_test_new.npy

--- Data Preparation Complete ---
E_pr_train_aug shape: (7798, 1024)
E_met_train_aug shape: (7798, 768)
E_pr_test_new shape: (3638, 1024)
E_met_test_new shape: (3638, 768)


In [49]:
#Model 1 - single layer MLP
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset, WeightedRandomSampler
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import warnings

# Suppress warnings
warnings.filterwarnings('ignore')


print("--- Starting Strategy: Shallow 'Bottleneck' MLP ---")

# --- 1. Load Embeddings ---
print("Loading pre-computed embeddings...")
try:
    E_pr_train_aug = np.load("E_pr_train_aug.npy")
    E_pr_test_new = np.load("E_pr_test_new.npy")
    E_met_train_aug = np.load("E_met_train_aug.npy")
    E_met_test_new = np.load("E_met_test_new.npy")
except FileNotFoundError as e:
    print(f"Error: {e}")
    raise

# --- 2. Create & Scale Features ---
print("Creating full concatenated features (1792 dims)...")
X_train_full = np.hstack([E_pr_train_aug, E_met_train_aug])
X_test_full = np.hstack([E_pr_test_new, E_met_test_new])

# Neural Networks require scaling
scaler = StandardScaler()
X_train_full = scaler.fit_transform(X_train_full)
X_test_full = scaler.transform(X_test_full)

# --- 3. Define the Shallow Model ---
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
INPUT_DIM = X_train_full.shape[1] # 1792

class ShallowBottleneckMLP(nn.Module):
    def __init__(self):
        super(ShallowBottleneckMLP, self).__init__()
        # The "Bottleneck": Compress 1792 dimensions down to just 16
        self.layer1 = nn.Linear(INPUT_DIM, 16) 
        self.bn1 = nn.BatchNorm1d(16)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5) # High dropout
        self.output = nn.Linear(16, 1) # Predict score

    def forward(self, x):
        x = self.layer1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.dropout(x)
        return self.output(x)

# --- 4. Setup Sampling Weights (Inverse Frequency) ---
# Calculate weights for the sampler so low scores are picked more often
score_counts = train_df['score'].value_counts().sort_index()
class_weights = score_counts / score_counts
# Map weights to every sample in the training set
sample_weights = train_df['score'].map(class_weights).values
sample_weights = torch.DoubleTensor(sample_weights)

# --- 5. K-Fold Training ---
N_SPLITS = 10
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
BATCH_SIZE = 256
EPOCHS = 15
LR = 0.001

oof_predictions = np.zeros(X_train_full.shape[0])
test_predictions = np.zeros(X_test_full.shape[0])

# Convert test data to tensor once
X_test_tensor = torch.tensor(X_test_full, dtype=torch.float32).to(DEVICE)

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_full, y_train_float)):
    print(f"--- Fold {fold+1}/{N_SPLITS} ---")
    
    # Prepare Data
    X_tr = torch.tensor(X_train_full[train_idx], dtype=torch.float32)
    y_tr = torch.tensor(y_train_float[train_idx], dtype=torch.float32).view(-1, 1)
    X_val = torch.tensor(X_train_full[val_idx], dtype=torch.float32).to(DEVICE)
    y_val = torch.tensor(y_train_float[val_idx], dtype=torch.float32).view(-1, 1).to(DEVICE)
    
    # Create Sampler for THIS fold
    fold_weights = sample_weights[train_idx]
    sampler = WeightedRandomSampler(fold_weights, len(fold_weights))
    
    train_ds = TensorDataset(X_tr, y_tr)
    # Use sampler in DataLoader (shuffle must be False when using sampler)
    train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, sampler=sampler)
    
    # Init Model
    model = ShallowBottleneckMLP().to(DEVICE)
    optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=0.01) # L2 reg
    criterion = nn.MSELoss()
    
    # Training Loop
    model.train()
    for epoch in range(EPOCHS):
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(DEVICE), y_batch.to(DEVICE)
            
            optimizer.zero_grad()
            preds = model(X_batch)
            loss = criterion(preds, y_batch)
            loss.backward()
            optimizer.step()
            
    # Validate
    model.eval()
    with torch.no_grad():
        val_preds = model(X_val).cpu().numpy()
        oof_predictions[val_idx] = val_preds.flatten()
        
        val_rmse = np.sqrt(mean_squared_error(y_val.cpu().numpy(), val_preds))
        print(f"Fold {fold+1} Val RMSE: {val_rmse:.4f}")
        
        # Predict on Test
        test_preds = model(X_test_tensor).cpu().numpy()
        test_predictions += test_preds.flatten() / N_SPLITS

print("\n--- Shallow MLP Training Complete ---")

# --- 6. Post-Processing & Submission ---
oof_rmse_final = np.sqrt(mean_squared_error(y_train_float, oof_predictions))
print(f"OOF RMSE (Shallow MLP): {oof_rmse_final:.4f}")

# Clip and floor predictions
test_predictions = np.clip(test_predictions, 0, 10)
final_predictions = np.floor(test_predictions)
final_predictions_clipped = np.clip(final_predictions, 0, 10)

submission_df = pd.DataFrame({
    'ID': test_df.index + 1,
    'score': final_predictions_clipped
})

submission_df['score'] = submission_df['score'].astype(float)
submission_df.to_csv("submission_shallow_mlp.csv", index=False) 

print("submission_shallow_mlp.csv created successfully!")
print(submission_df.head())
print("\nPrediction value counts:")
print(submission_df['score'].value_counts().sort_index())
print(submission_df['score'].mean())

--- Starting Strategy: Shallow 'Bottleneck' MLP ---
Loading pre-computed embeddings...
Creating full concatenated features (1792 dims)...
--- Fold 1/10 ---
Fold 1 Val RMSE: 3.1567
--- Fold 2/10 ---
Fold 2 Val RMSE: 2.3293
--- Fold 3/10 ---
Fold 3 Val RMSE: 2.8694
--- Fold 4/10 ---
Fold 4 Val RMSE: 2.5451
--- Fold 5/10 ---
Fold 5 Val RMSE: 2.7110
--- Fold 6/10 ---
Fold 6 Val RMSE: 2.5177
--- Fold 7/10 ---
Fold 7 Val RMSE: 2.2596
--- Fold 8/10 ---
Fold 8 Val RMSE: 2.6834
--- Fold 9/10 ---
Fold 9 Val RMSE: 3.0593
--- Fold 10/10 ---
Fold 10 Val RMSE: 2.7143

--- Shallow MLP Training Complete ---
OOF RMSE (Shallow MLP): 2.6985
submission_shallow_mlp.csv created successfully!
   ID  score
0   1    5.0
1   2    6.0
2   3    7.0
3   4    5.0
4   5    1.0

Prediction value counts:
score
0.0       2
1.0      10
2.0      24
3.0      70
4.0     249
5.0     708
6.0    1222
7.0    1017
8.0     316
9.0      20
Name: count, dtype: int64
6.037108301264431


In [76]:
#model 2 2-layer MLP
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset, WeightedRandomSampler
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import warnings

# Suppress warnings
warnings.filterwarnings('ignore')

# --- !! Prerequisite !! ---
# Assumes .npy files are saved and 'y_train_float', 'test_df', 'train_df' are loaded.

print("--- Starting Strategy: Wider MLP (512 Hidden Neurons) ---")

# --- 1. Load Embeddings ---
print("Loading pre-computed embeddings...")
try:
    E_pr_train_aug = np.load("E_pr_train_aug.npy")
    E_pr_test_new = np.load("E_pr_test_new.npy")
    E_met_train_aug = np.load("E_met_train_aug.npy")
    E_met_test_new = np.load("E_met_test_new.npy")
except FileNotFoundError as e:
    print(f"Error: {e}")
    raise

# --- 2. Create & Scale Features ---
print("Creating full concatenated features (1792 dims)...")
X_train_full = np.hstack([E_pr_train_aug, E_met_train_aug])
X_test_full = np.hstack([E_pr_test_new, E_met_test_new])

# Neural Networks require scaling
scaler = StandardScaler()
X_train_full = scaler.fit_transform(X_train_full)
X_test_full = scaler.transform(X_test_full)

# --- 3. Define the Wider Model ---
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
INPUT_DIM = X_train_full.shape[1] # 1792

class WiderMLP(nn.Module):
    def __init__(self):
        super(WiderMLP, self).__init__()
        # WIDER HIDDEN LAYER: 
        # Increased from 16 to 512 to capture more complex non-linear patterns
        self.layer1 = nn.Linear(INPUT_DIM, 20) 
        
        # BatchNorm must match the layer output size (512)
        self.bn1 = nn.BatchNorm1d(20)
        
        # ReLU introduces the non-linearity
        self.relu = nn.ReLU()
        
        # Dropout helps prevent overfitting now that the model has high capacity
        self.dropout = nn.Dropout(0.5) 
        
        # Output layer: Takes 512 inputs and predicts 1 score
        self.output = nn.Linear(20, 1) 

    def forward(self, x):
        x = self.layer1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.dropout(x)
        return self.output(x)

# --- 4. Setup Sampling Weights (Inverse Frequency) ---
score_counts = train_df['score'].value_counts().sort_index()
class_weights = 1 / score_counts
sample_weights = train_df['score'].map(class_weights).values
sample_weights = torch.DoubleTensor(sample_weights)

# --- 5. K-Fold Training ---
N_SPLITS = 10
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
BATCH_SIZE = 256
EPOCHS = 30
LR = 0.0008

oof_predictions = np.zeros(X_train_full.shape[0])
test_predictions = np.zeros(X_test_full.shape[0])

# Convert test data to tensor once
X_test_tensor = torch.tensor(X_test_full, dtype=torch.float32).to(DEVICE)

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_full, y_train_float)):
    print(f"--- Fold {fold+1}/{N_SPLITS} ---")
    
    # Prepare Data
    X_tr = torch.tensor(X_train_full[train_idx], dtype=torch.float32)
    y_tr = torch.tensor(y_train_float[train_idx], dtype=torch.float32).view(-1, 1)
    X_val = torch.tensor(X_train_full[val_idx], dtype=torch.float32).to(DEVICE)
    y_val = torch.tensor(y_train_float[val_idx], dtype=torch.float32).view(-1, 1).to(DEVICE)
    
    # Create Sampler for THIS fold
    fold_weights = sample_weights[train_idx]
    sampler = WeightedRandomSampler(fold_weights, len(fold_weights))
    
    train_ds = TensorDataset(X_tr, y_tr)
    train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, sampler=sampler)
    
    # Init Model (Using the new WiderMLP)
    model = WiderMLP().to(DEVICE)
    optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=0.01)
    criterion = nn.HuberLoss(delta=1.0)
    
    # Training Loop
    model.train()
    for epoch in range(EPOCHS):
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(DEVICE), y_batch.to(DEVICE)
            
            optimizer.zero_grad()
            preds = model(X_batch)
            loss = criterion(preds, y_batch)
            loss.backward()
            optimizer.step()
            
    # Validate
    model.eval()
    with torch.no_grad():
        val_preds = model(X_val).cpu().numpy()
        oof_predictions[val_idx] = val_preds.flatten()
        
        val_rmse = np.sqrt(mean_squared_error(y_val.cpu().numpy(), val_preds))
        print(f"Fold {fold+1} Val RMSE: {val_rmse:.4f}")
        
        # Predict on Test
        test_preds = model(X_test_tensor).cpu().numpy()
        test_predictions += test_preds.flatten() / N_SPLITS

print("\n--- Wider MLP Training Complete ---")

# --- 6. Post-Processing & Submission ---
oof_rmse_final = np.sqrt(mean_squared_error(y_train_float, oof_predictions))
print(f"OOF RMSE (Wider MLP): {oof_rmse_final:.4f}")

# Clip and floor predictions
test_predictions = np.clip(test_predictions, 0, 10)
final_predictions = np.floor(test_predictions)
final_predictions_clipped = np.clip(final_predictions, 0, 10)

submission_df = pd.DataFrame({
    'ID': test_df.index + 1,
    'score': final_predictions_clipped
})

submission_df['score'] = submission_df['score'].astype(float)
submission_df.to_csv("submission_wider_mlp.csv", index=False) 

print("submission_wider_mlp.csv created successfully!")
print(submission_df.head())
print("\nPrediction value counts:")
print(submission_df['score'].value_counts().sort_index())
print(submission_df['score'].mean())

--- Starting Strategy: Wider MLP (512 Hidden Neurons) ---
Loading pre-computed embeddings...
Creating full concatenated features (1792 dims)...
--- Fold 1/10 ---
Fold 1 Val RMSE: 1.6184
--- Fold 2/10 ---
Fold 2 Val RMSE: 1.7397
--- Fold 3/10 ---
Fold 3 Val RMSE: 1.6999
--- Fold 4/10 ---
Fold 4 Val RMSE: 1.7115
--- Fold 5/10 ---
Fold 5 Val RMSE: 1.7368
--- Fold 6/10 ---
Fold 6 Val RMSE: 1.6511
--- Fold 7/10 ---
Fold 7 Val RMSE: 1.7004
--- Fold 8/10 ---
Fold 8 Val RMSE: 1.5639
--- Fold 9/10 ---
Fold 9 Val RMSE: 1.6319
--- Fold 10/10 ---
Fold 10 Val RMSE: 1.5535

--- Wider MLP Training Complete ---
OOF RMSE (Wider MLP): 1.6620
submission_wider_mlp.csv created successfully!
   ID  score
0   1    7.0
1   2    7.0
2   3    8.0
3   4    7.0
4   5    3.0

Prediction value counts:
score
1.0        6
2.0       15
3.0       52
4.0       83
5.0      208
6.0      486
7.0     1040
8.0     1076
9.0      521
10.0     151
Name: count, dtype: int64
7.30263881253436


In [88]:
#Model 3 - MLP regressor
import numpy as np
import pandas as pd
import warnings

# Scikit-Learn Imports
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

# Suppress warnings
warnings.filterwarnings('ignore')

def main():
    print("--- Starting Strategy: Log-Target MLP with PCA (Sklearn) ---")

    # --- 1. Load Pre-computed Embeddings ---
    print("Loading .npy files...")
    try:
        E_pr_train_aug = np.load("E_pr_train_aug.npy")
        E_pr_test_new = np.load("E_pr_test_new.npy")
        E_met_train_aug = np.load("E_met_train_aug.npy")
        E_met_test_new = np.load("E_met_test_new.npy")
    except FileNotFoundError as e:
        print(f"Error loading files: {e}")
        return

    # Concatenate features (1792 dims)
    X_train_arr = np.hstack([E_pr_train_aug, E_met_train_aug])
    X_test_arr  = np.hstack([E_pr_test_new, E_met_test_new])

    print(f"Feature Matrix Shape: {X_train_arr.shape}")

    # Convert to DataFrame to satisfy Pipeline column selection
    # We generate dummy column names: feat_0, feat_1, ...
    feat_cols = [f"feat_{i}" for i in range(X_train_arr.shape[1])]
    X = pd.DataFrame(X_train_arr, columns=feat_cols)
    X_test = pd.DataFrame(X_test_arr, columns=feat_cols)

    # --- 2. Target Variable Setup (Log Transformation) ---
    # Assuming 'y_train_float' is in the global scope
    if 'y_train_float' not in globals():
        print("Error: 'y_train_float' not found in global scope.")
        return
    
    y_raw = y_train_float 
    y = np.log1p(y_raw) # Log transformation - KEY IMPROVEMENT for this strategy

    print(f"Target range (raw): {y_raw.min():.2f} - {y_raw.max():.2f}")
    print(f"Target range (log): {y.min():.4f} - {y.max():.4f}")

    # --- 3. Define Pipeline ---
    print("üèóÔ∏è  Step 3: Building model pipeline...")
    numeric_features = [c for c in X.columns]
    
    # Preprocessing: Scale -> PCA (256 components)
    preprocess_pipe = ColumnTransformer([
        ("scaler", StandardScaler(), numeric_features),
        ("pca", PCA(n_components=256, random_state=42), numeric_features)
    ], remainder="drop")

    # MLP Regressor - tuned hyperparameters from your snippet
    mlp = MLPRegressor(
        hidden_layer_sizes=(256, 128),
        activation="tanh",
        solver="adam",
        learning_rate_init=0.001,
        max_iter=1000,
        random_state=42,
        early_stopping=True,        # Stop if validation score doesn't improve
        validation_fraction=0.1,
        n_iter_no_change=20
    )
    
    mlp_pipe = Pipeline([
        ("preprocess", preprocess_pipe),
        ("model", mlp)
    ])
    print("‚úÖ Pipeline created")

    # --- 4. K-Fold Training ---
    print("="*70)
    print("CROSS-VALIDATION TRAINING")
    print("="*70)

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    fold_rmse_log, fold_rmse_orig = [], []
    test_pred_log = np.zeros(len(X_test))
    
    for fold, (tr_idx, val_idx) in enumerate(kf.split(X), 1):
        print(f"üì¶ Fold {fold}/5 training...", end=" ")
        X_tr, X_val = X.iloc[tr_idx], X.iloc[val_idx]
        y_tr, y_val = y[tr_idx], y[val_idx]
        
        mlp_pipe.fit(X_tr, y_tr)
        
        # Validation Prediction
        y_val_pred_log = mlp_pipe.predict(X_val)
        rmse_log = np.sqrt(mean_squared_error(y_val, y_val_pred_log))
        
        # Inverse transform for reporting "Original" RMSE
        y_val_pred = np.expm1(y_val_pred_log)
        y_val_true = np.expm1(y_val)
        rmse_orig = np.sqrt(mean_squared_error(y_val_true, y_val_pred))
        
        fold_rmse_log.append(rmse_log)
        fold_rmse_orig.append(rmse_orig)
        print(f"Log RMSE={rmse_log:.4f} | Orig RMSE={rmse_orig:.4f}")
        
        # Test Prediction (Accumulate average)
        test_pred_log += mlp_pipe.predict(X_test) / kf.n_splits

    print("\n=== 5-Fold Summary ===")
    print(f"Mean Log RMSE={np.mean(fold_rmse_log):.4f} ¬± {np.std(fold_rmse_log):.4f}")
    print(f"Mean Orig RMSE={np.mean(fold_rmse_orig):.4f} ¬± {np.std(fold_rmse_orig):.4f}")

    # --- 5. Post-Processing & Submission ---
    print("="*70)
    print("GENERATING PREDICTIONS")
    print("="*70)

    # A. Inverse Log Transform
    test_pred = np.expm1(test_pred_log)
    
    # B. Clip to valid range [0, 10]
    test_pred_clipped = np.clip(test_pred, 0, 10)
    
    # C. Floor the values (Discrete integers)
    test_pred_floored = np.floor(test_pred_clipped)
    
    # D. Final Safety Clip
    final_predictions = np.clip(test_pred_floored, 0, 10)

    # Create Submission DataFrame
    if 'test_df' in globals():
        submission_df = pd.DataFrame({
            'ID': test_df.index + 1,
            'score': final_predictions
        })
    else:
        # Fallback if test_df isn't loaded
        submission_df = pd.DataFrame({
            'ID': range(1, len(final_predictions) + 1),
            'score': final_predictions
        })
    
    # Ensure float format
    submission_df['score'] = submission_df['score'].astype(float)
    
    # Save
    filename = "submission_mlp_pca_logtarget.csv"
    submission_df.to_csv(filename, index=False)
    
    # --- 6. Final Analysis ---
    print(f"{filename} created successfully!")
    print("\n--- Head ---")
    print(submission_df.head())
    
    print("\n--- Prediction Value Counts ---")
    print(submission_df['score'].value_counts().sort_index())
    
    print("\n--- Mean Score ---")
    print(f"Mean: {submission_df['score'].mean():.4f}")

if __name__ == "__main__":
    main()

--- Starting Strategy: Log-Target MLP with PCA (Sklearn) ---
Loading .npy files...
Feature Matrix Shape: (7798, 1792)
Target range (raw): 0.00 - 10.00
Target range (log): 0.0000 - 2.3979
üèóÔ∏è  Step 3: Building model pipeline...
‚úÖ Pipeline created
CROSS-VALIDATION TRAINING
üì¶ Fold 1/5 training... Log RMSE=0.1108 | Orig RMSE=0.8444
üì¶ Fold 2/5 training... Log RMSE=0.1308 | Orig RMSE=0.9469
üì¶ Fold 3/5 training... Log RMSE=0.1366 | Orig RMSE=0.9519
üì¶ Fold 4/5 training... Log RMSE=0.1347 | Orig RMSE=0.8493
üì¶ Fold 5/5 training... Log RMSE=0.1605 | Orig RMSE=1.0515

=== 5-Fold Summary ===
Mean Log RMSE=0.1347 ¬± 0.0158
Mean Orig RMSE=0.9288 ¬± 0.0766
GENERATING PREDICTIONS
submission_mlp_pca_logtarget.csv created successfully!

--- Head ---
   ID  score
0   1    9.0
1   2    9.0
2   3    8.0
3   4    9.0
4   5    3.0

--- Prediction Value Counts ---
score
0.0        1
1.0        4
2.0        6
3.0       30
4.0       42
5.0       58
6.0      126
7.0      348
8.0     1492
9.0 