## Overview
* This notebook builds a score prediction model by combining embeddings, anomaly signals, match probabilities, and coherence measures. 
* The goal is to extract meaningful interactions and train a regressor that handles clustered/bimodal target behavior.

# Imports

In [1]:
import os
import numpy as np
import pandas as pd
import lightgbm as lgb

from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import IsolationForest
from sklearn.neural_network import MLPClassifier # The "Scikit-Learn Neural Net"
from sklearn.metrics.pairwise import cosine_similarity

# Random Seed for Reproducibility

In [2]:
np.random.seed(42)
test_ids = np.load('test_ids.npy')

# Load the preprocessed embeddings

In [3]:
# ============================
# Load embeddings and targets
# ============================
y = np.load('train_scores.npy')

# Load ALL embeddings (System included)
train_metric = np.load("train_metric_embedding.npy").astype(np.float32)
train_resp   = np.load("train_response_embedding.npy").astype(np.float32)
train_user   = np.load("train_user_prompt_embedding.npy").astype(np.float32)
train_sys = np.load("train_system_prompt_embedding.npy").astype(np.float32)

test_metric = np.load("test_metric_embedding.npy").astype(np.float32)
test_resp   = np.load("test_response_embedding.npy").astype(np.float32)
test_user   = np.load("test_user_prompt_embedding.npy").astype(np.float32)
test_sys = np.load("test_system_prompt_embedding.npy").astype(np.float32)

# Feature Preparation

* We combine system, user, and response embeddings into a unified text feature space. 
* This helps capture conversation-level semantics and ensures each training row reflects the full context.

In [4]:
# --- 3. PREPARE FEATURES ---
# Concat System + User + Response
X_text_train = np.hstack([train_sys, train_user, train_resp])
X_text_test  = np.hstack([test_sys,  test_user,  test_resp])
X_metric_train = train_metric
X_metric_test  = test_metric

print(f"Text Vector Shape: {X_text_train.shape}")

Text Vector Shape: (5000, 2304)


# Signal Engineering (F1, F2, F3)

* Three complementary signals—anomaly detection, metric–text matching, and semantic coherence—capture different behavioral dimensions. 
* These signals help the model distinguish strong vs weak metric associations.

In [6]:
# --- 4. DEFINE NEW FEATURES (F1, F2, F3) ---

# F1: Anomaly Detection uaing Isolation Forest
# Detects outliers in the text embedding space.
print("Fitting F1 (Isolation Forest)...")
iso_model = IsolationForest(
    n_estimators=100,
    contamination='auto',
    n_jobs=-1,
    random_state=42
)
iso_model.fit(X_text_train)

# F2: Metric–Text Match Predictor (MLPClassifier)
# Uses a small neural network to learn whether a metric embedding
# belongs to a given text embedding.
print("Fitting F2 (MLP Matcher)...")

# 1. Positives (Real Pairs)
X_positives = np.hstack([X_metric_train, X_text_train])
y_positives = np.ones(len(X_positives))

# 2. Fake (negative) pairs – metric with mismatched text
# Using 2× shuffled negatives makes the classifier stricter.
rng = np.random.RandomState(42)
idx_shuf1 = rng.permutation(len(X_text_train))
idx_shuf2 = rng.permutation(len(X_text_train))

X_neg1 = np.hstack([X_metric_train, X_text_train[idx_shuf1]])
X_neg2 = np.hstack([X_metric_train, X_text_train[idx_shuf2]])
y_neg = np.zeros(len(X_neg1) + len(X_neg2))

X_matcher = np.vstack([X_positives, X_neg1, X_neg2])
y_matcher = np.concatenate([y_positives, y_neg])

# MLP: Hidden layers project the high-dim embeddings. 

matcher = MLPClassifier(
    hidden_layer_sizes=(512, 256), # Layers to compress info
    activation='relu',
    solver='adam',
    alpha=0.001,
    batch_size=256,
    learning_rate_init=0.001,
    max_iter=150, # 50 epochs is usually enough
    random_state=42
)
matcher.fit(X_matcher, y_matcher)

# # F3: Cosine Similarity (Semantic Alignment)
def get_s3(u, r):
    num = np.sum(u * r, axis=1)
    den = np.linalg.norm(u, axis=1) * np.linalg.norm(r, axis=1)
    return num / (den + 1e-9)

Fitting F1 (Isolation Forest)...
Fitting F2 (MLP Matcher)...


# Data Augmentation
Synthetic mismatched examples are created to teach the model how low‑quality pairs behave. This helps LightGBM avoid over‑predicting and improves separation between good and bad samples.

In [7]:
# --- 5. DATA AUGMENTATION & NEW FEATURE GENERATION ---
print("Generating New Features for improved model accuracy")

# We create additional training samples so that the regressor
# learns how "bad" or mismatched metric–text pairs behave.
# This helps the model avoid over-predicting scores.
# Number of synthetic samples = size of original training set

n_aug = len(y)
idx_aug = rng.permutation(len(y))

# --- Synthetic (Negative) Data Construction ---
# We keep metric embeddings the same but shuffle text/user/response embeddings.
# This breaks the true pairing and produces intentionally incorrect combinations.

print('Synthetic Data Creation Started...')
aug_metric = X_metric_train
aug_text   = X_text_train[idx_aug]
aug_user   = train_user[idx_aug]
aug_resp   = train_resp[idx_aug]
print('Data Creation Complete...')

# Synthetic scores: force them to be low (0.0 to 3.5).
# This trains LightGBM to recognize and penalize mismatches.

aug_y      = rng.uniform(0.0, 3.5, size=n_aug)

# --- Merge Real + Synthetic Data ---
# Stack real data with synthetic "bad" pairs to create the final training set.
X_text_total = np.vstack([X_text_train, aug_text])
X_metric_total = np.vstack([X_metric_train, aug_metric])
train_user_total = np.vstack([train_user, aug_user])
train_resp_total = np.vstack([train_resp, aug_resp])
y_total = np.concatenate([y, aug_y])

Generating New Features for improved model accuracy
Synthetic Data Creation Started...
Data Creation Complete...


In [8]:
# --- COMPUTE Features ---

# F1: Anomaly
# Isolation Forest outputs a raw anomaly value (higher = more normal, lower = more anomalous).
# We compute this score for both augmented training data and test data.
s1_raw_tr = iso_model.decision_function(X_text_total)
s1_raw_te = iso_model.decision_function(X_text_test)
sc1 = MinMaxScaler()
S1_train = sc1.fit_transform(s1_raw_tr.reshape(-1,1)).flatten()
S1_test  = sc1.transform(s1_raw_te.reshape(-1,1)).flatten()

# F2: Matcher (Probability)
# We combine metric and text embeddings so the matcher can evaluate if they belong together.
X_concat_tr = np.hstack([X_metric_total, X_text_total])
X_concat_te = np.hstack([X_metric_test, X_text_test])
S2_train = matcher.predict_proba(X_concat_tr)[:, 1]
S2_test  = matcher.predict_proba(X_concat_te)[:, 1]

# F3: Coherence
# Computes cosine similarity between user embedding and response embedding.
# Higher value → more semantically aligned interaction.
s3_raw_tr = get_s3(train_user_total, train_resp_total)
s3_raw_te = get_s3(test_user, test_resp)
sc3 = MinMaxScaler()
S3_train = sc3.fit_transform(s3_raw_tr.reshape(-1,1)).flatten()
S3_test  = sc3.transform(s3_raw_te.reshape(-1,1)).flatten()

# Final Regression Model
* Interaction features (S1×S2×S3) highlight cluster boundaries, and LightGBM is tuned to detect multimodal patterns. 
* This produces stable final scores aligned with high‑quality matching behavior.

In [9]:
# --- 6. FINAL REGRESSION ---
def get_feats(s1, s2, s3):
    df = pd.DataFrame({"S1": s1, "S2": s2, "S3": s3})
    
    # Create interaction terms between signals.
    # These help the model capture *joint patterns*:
    # - S1xS2: anomaly + match strength
    # - S2xS3: match strength + semantic coherence
    # - S1xS3: anomaly + coherence
    # - All: full 3-way interaction capturing "perfect agreement"
    # Strong interactions to isolate the "Perfect" cluster (High S1*S2*S3)
    
    df["S1xS2"] = df.S1 * df.S2
    df["S2xS3"] = df.S2 * df.S3
    df["S1xS3"] = df.S1 * df.S3
    df["All"]   = df.S1 * df.S2 * df.S3
    return df

# Build training and test feature matrices

df_train = get_feats(S1_train, S2_train, S3_train)
df_test  = get_feats(S1_test, S2_test, S3_test)

# -------------------- Sample Weighting --------------------
# Count how many samples occur in each rounded score value.
# Rare score levels get higher weight so the model does not ignore them.
counts = pd.Series(np.round(y_total)).value_counts()

# Weight = inverse frequency of that rounded score
# Helps LightGBM treat underrepresented score levels fairly.
weights = pd.Series(np.round(y_total)).map(lambda x: 1.0 / counts.get(x, 1.0)).values

print("Training LightGBM...")
dtrain = lgb.Dataset(df_train, label=y_total, weight=weights)

# Parameters optimized to separate clusters rather than smooth the mean
# -------------------- Model Parameters --------------------
# Settings tuned to help LightGBM separate clusters (low vs high scores)
# instead of treating the target as a smooth regression line.

params = {
    "objective": "regression",
    "metric": "rmse",
    "learning_rate": 0.05, 
    "num_leaves": 63,        # Higher leaves = more complex splits (good for bimodal)
    "feature_fraction": 0.9,
    "bagging_fraction": 0.9,
    "min_data_in_leaf": 10,  # Allow smaller clusters (peaks)
    "seed": 42,
    "verbosity": -1
}


# Train the final LightGBM regressor
model = lgb.train(params, dtrain, num_boost_round=1200)
pred_test = model.predict(df_test)

Training LightGBM...


# Saving the CSV in the required format

In [10]:
# --- 8. EXPORT (No Manual Scaling) ---
final_df = pd.DataFrame({'ids': test_ids, 'score': pred_test})

# Optional: Clip to valid range just to be safe (0-10), but no shifting
final_df['score'] = np.clip(final_df['score'], 0, 10)

final_df.to_csv("stacking_submission_newest.csv", index=False)

print("Submission Saved.")
print(final_df['score'].describe())


Submission Saved.
count    3638.000000
mean        5.636960
std         3.432238
min         0.000000
25%         1.854232
50%         7.080926
75%         8.881930
max        10.000000
Name: score, dtype: float64
