In [11]:
# Personalized Medicine: Redefining Cancer Treatment — Data Loading & Quick Audit
# ------------------------------------------------------------------------------
# Dataset: https://www.kaggle.com/competitions/msk-redefining-cancer-treatment
# Goal: Build a multi-class classifier (9 classes) from tabular + text features.
# Key: Robust loading, schema checks, null audit, and first sanity EDA.

import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk

from pathlib import Path

# Repro + style
sns.set(style="whitegrid")
os.makedirs("figures", exist_ok=True)
os.makedirs("data", exist_ok=True)

nltk.download("punkt", quiet=True)

# ----------------------------
# 1) Paths & safe CSV loading
# ----------------------------
DATA_DIR = Path("Raw/msk-redefining-cancer-treatment")

# Kaggle provides text files named exactly like below (no .csv extension)
train_variants_path = DATA_DIR / "training_variants"
train_text_path = DATA_DIR / "training_text"
test_variants_path = DATA_DIR / "test_variants"  # stage 1 test
test_text_path = DATA_DIR / "test_text"


def read_required_csv(path: Path, **kwargs) -> pd.DataFrame:
    if not path.exists():
        raise FileNotFoundError(
            f"Missing file: {path}. Download from Kaggle competition page."
        )
    return pd.read_csv(path, **kwargs)


# Train variants (ID, Gene, Variation, Class)
train_variants = read_required_csv(train_variants_path)
print(f"✅ training_variants loaded: {train_variants.shape}")

# Train text (||-separated; two columns: ID || Text)
train_text = read_required_csv(
    train_text_path,
    sep=r"\|\|",
    engine="python",
    header=None,
    names=["ID", "Text"],
    skiprows=1,
)
print(f"✅ training_text loaded: {train_text.shape}")

# Test variants/text (for submission)
test_variants = read_required_csv(test_variants_path)
print(f"✅ test_variants loaded: {test_variants.shape}")

test_text = read_required_csv(
    test_text_path,
    sep=r"\|\|",
    engine="python",
    header=None,
    names=["ID", "Text"],
    skiprows=1,
)
print(f"✅ test_text loaded: {test_text.shape}")

# ----------------------------
# 2) Merge & schema checks
# ----------------------------
train_df = pd.merge(train_variants, train_text, how="left", on="ID")
test_df = pd.merge(test_variants, test_text, how="left", on="ID")

print(f"\n🔗 Merged train: {train_df.shape} | Merged test: {test_df.shape}")

# Uniqueness
assert train_df["ID"].is_unique, "Duplicate IDs in train."
assert test_df["ID"].is_unique, "Duplicate IDs in test."

# Expected columns
expected_train_cols = {"ID", "Gene", "Variation", "Class", "Text"}
expected_test_cols = {"ID", "Gene", "Variation", "Text"}

missing_train = expected_train_cols - set(train_df.columns)
missing_test = expected_test_cols - set(test_df.columns)
assert not missing_train, f"Train missing columns: {missing_train}"
assert not missing_test, f"Test missing columns: {missing_test}"

# Null audit
print("\n🔎 Nulls (train):")
print(train_df.isnull().sum())

print("\n🔎 Nulls (test):")
print(test_df.isnull().sum())

# Fill missing text with empty string (keeps rows usable in NLP)
train_df["Text"] = train_df["Text"].fillna("")
test_df["Text"] = test_df["Text"].fillna("")

# ----------------------------
# 3) Quick sanity EDA (concise)
# ----------------------------
# Class distribution (1..9)
class_counts = train_df["Class"].value_counts().sort_index()
print("\n📊 Class distribution (train):")
print(class_counts)

plt.figure(figsize=(8, 4))
sns.barplot(x=class_counts.index, y=class_counts.values, color="steelblue")
plt.title("Class Distribution (Train)")
plt.xlabel("Class")
plt.ylabel("Count")
plt.tight_layout()
plt.savefig("figures/class_distribution.png")
plt.close()

# Gene & Variation cardinality
n_genes = train_df["Gene"].nunique()
n_vars = train_df["Variation"].nunique()
print(f"\n🧬 Unique Genes: {n_genes} | Unique Variations: {n_vars}")

# Text length (characters)
train_df["text_len"] = train_df["Text"].str.len()
print("\n📝 Text length (chars) summary (train):")
print(train_df["text_len"].describe())

plt.figure(figsize=(8, 4))
sns.histplot(train_df["text_len"], bins=40, kde=True, color="salmon")
plt.title("Training Text Length Distribution")
plt.xlabel("Characters")
plt.ylabel("Count")
plt.tight_layout()
plt.savefig("figures/text_length_distribution.png")
plt.close()

# Peek samples for sanity
print("\n👀 Sample rows:")
print(train_df.sample(3, random_state=42)[["ID", "Gene", "Variation", "Class"]])

# ----------------------------
# 4) Save interim merged data
# ----------------------------
train_df.to_csv("data/train_merged.csv", index=False)
test_df.to_csv("data/test_merged.csv", index=False)
print("\n💾 Saved: data/train_merged.csv & data/test_merged.csv")

# ----------------------------
# 🔎 Observations:
# - Robust loading of both tabular & text sources with schema checks.
# - Basic imbalance detection (classes), high-cardinality features (Gene/Variation).
# - Text coverage quantified (lengths) → informs token budgets & truncation for NLP.
# - Artifacts saved (figures + merged CSVs) to ensure reproducibility.

✅ training_variants loaded: (3321, 4)
✅ training_text loaded: (3321, 2)
✅ test_variants loaded: (5668, 3)
✅ test_text loaded: (5668, 2)

🔗 Merged train: (3321, 5) | Merged test: (5668, 4)

🔎 Nulls (train):
ID           0
Gene         0
Variation    0
Class        0
Text         5
dtype: int64

🔎 Nulls (test):
ID           0
Gene         0
Variation    0
Text         1
dtype: int64

📊 Class distribution (train):
Class
1    568
2    452
3     89
4    686
5    242
6    275
7    953
8     19
9     37
Name: count, dtype: int64

🧬 Unique Genes: 264 | Unique Variations: 2996

📝 Text length (chars) summary (train):
count      3321.000000
mean      63615.965372
std       52189.336718
min           0.000000
25%       31337.000000
50%       46134.000000
75%       80154.000000
max      523393.000000
Name: text_len, dtype: float64

👀 Sample rows:
        ID   Gene         Variation  Class
1057  1057  EWSR1  EWSR1-FEV Fusion      2
812    812  ERCC2             V536M      1
2658  2658  BRCA1        

In [12]:
# =====================================
# Section 2: Data Cleaning
# =====================================
# Goal: Normalize categorical fields, strip whitespace,
#       optimize dtypes for memory, and persist cleaned data.

# ----------------------------
# 1) Normalize categorical/tabular features
# ----------------------------
train_df["Class"] = train_df["Class"].astype("category")  # target: categorical
train_df["Gene"] = train_df["Gene"].str.strip().str.upper()
train_df["Variation"] = train_df["Variation"].str.strip().str.upper()
train_df["Text"] = train_df["Text"].fillna("").str.strip()

test_df["Gene"] = test_df["Gene"].str.strip().str.upper()
test_df["Variation"] = test_df["Variation"].str.strip().str.upper()
test_df["Text"] = test_df["Text"].fillna("").str.strip()

# ----------------------------
# 2) Memory optimization
# ----------------------------
train_mem = train_df.memory_usage(deep=True).sum() / (1024**2)
test_mem = test_df.memory_usage(deep=True).sum() / (1024**2)
print(f"💾 Train memory usage: {train_mem:.2f} MB")
print(f"💾 Test memory usage : {test_mem:.2f} MB")

# ----------------------------
# 3) Quick sanity re-checks
# ----------------------------
print("\n🔎 After cleaning:")
print(
    "Unique Genes:",
    train_df["Gene"].nunique(),
    "| Unique Variations:",
    train_df["Variation"].nunique(),
)
print("Nulls (train):", train_df.isnull().sum().sum())
print("Nulls (test) :", test_df.isnull().sum().sum())

# ----------------------------
# 4) Persist cleaned artifacts
# ----------------------------
train_df.to_pickle("data/train_clean.pkl")
test_df.to_pickle("data/test_clean.pkl")
print("✅ Saved cleaned datasets → data/train_clean.pkl | data/test_clean.pkl")

# ----------------------------
# 🔎 Observations:
# - Gene & Variation standardized to UPPERCASE → avoids duplicates due to casing/spacing.
# - Missing Text handled via empty string → safe for NLP vectorizers/embeddings.
# - Target `Class` converted to categorical → efficient for stratified CV & modeling.
# - Memory footprint reduced → important for CPU-only Kaggle kernels.

💾 Train memory usage: 402.95 MB
💾 Test memory usage : 611.09 MB

🔎 After cleaning:
Unique Genes: 264 | Unique Variations: 2996
Nulls (train): 0
Nulls (test) : 0
✅ Saved cleaned datasets → data/train_clean.pkl | data/test_clean.pkl


In [13]:
# =====================================
# Section 3: Exploratory Data Analysis (EDA)
# =====================================
# Goal: Understand class imbalance, gene/variation patterns, and text coverage.

from nltk.tokenize import word_tokenize

# ----------------------------
# 1) Class Distribution
# ----------------------------
class_counts = train_df["Class"].value_counts().sort_index()
class_proportions = train_df["Class"].value_counts(normalize=True).sort_index()

print("📊 Class Counts:\n", class_counts)
print("\n📊 Class Proportions:\n", class_proportions)

plt.figure(figsize=(12, 6))
sns.countplot(data=train_df, x="Class", order=range(1, 10))
plt.title("Class Distribution (⚠️ Imbalanced: Class 7 dominates)")
plt.xlabel("Class")
plt.ylabel("Count")
plt.savefig("figures/class_distribution.png")
plt.close()

# Observation:
# - Dataset is imbalanced, with Class 7 ~30% of samples.
# - Motivates use of stratified CV and resampling (SMOTE).

# ----------------------------
# 2) Gene & Variation
# ----------------------------
n_genes, n_variations = train_df["Gene"].nunique(), train_df["Variation"].nunique()
print(f"🧬 Unique Genes: {n_genes} | Unique Variations: {n_variations}")

top_genes = train_df["Gene"].value_counts().head(10)
plt.figure(figsize=(12, 6))
sns.barplot(x=top_genes.values, y=top_genes.index)
plt.title("Top 10 Genes (TP53, BRCA1 frequent in cancer mutations)")
plt.xlabel("Count")
plt.ylabel("Gene")
plt.savefig("figures/top_genes.png")
plt.close()

# ----------------------------
# 3) Text Analysis
# ----------------------------
train_df["Text_Length_Chars"] = train_df["Text"].str.len()
train_df["Text_Length_Tokens"] = train_df["Text"].apply(
    lambda x: len(word_tokenize(x)) if x else 0
)

print("\n📝 Text Length Stats (Chars):")
print(train_df["Text_Length_Chars"].describe())

print("\n📝 Text Length Stats (Tokens):")
print(train_df["Text_Length_Tokens"].describe())

plt.figure(figsize=(12, 6))
sns.histplot(train_df["Text_Length_Tokens"], bins=50, kde=True, color="teal")
plt.title("Text Length Distribution (~3k–5k tokens typical)")
plt.xlabel("Tokens per abstract")
plt.ylabel("Frequency")
plt.savefig("figures/text_length_tokens.png")
plt.close()

# Vocabulary size (quick proxy for NLP complexity)
all_text = " ".join(train_df["Text"].dropna())
tokens = word_tokenize(all_text.lower())
print(f"📚 Unique Vocabulary Size: {len(set(tokens))}")

# ----------------------------
# 4) Gene-Class Relationships
# ----------------------------
gene_class = pd.crosstab(train_df["Gene"], train_df["Class"], normalize="index")
plt.figure(figsize=(12, 8))
sns.heatmap(gene_class.loc[top_genes.index], cmap="coolwarm", annot=True, fmt=".2f")
plt.title("Class Distribution by Top Genes")
plt.savefig("figures/gene_class_heatmap.png")
plt.close()

# Observation:
# - Some genes (e.g., TP53, BRCA1) strongly associated with specific classes.
# - Confirms biological relevance of Gene as predictive feature.

# ----------------------------
# 5) Text Length by Class
# ----------------------------
plt.figure(figsize=(12, 6))
sns.boxplot(x="Class", y="Text_Length_Tokens", data=train_df)
plt.title("Text Length by Class (longer abstracts for some classes)")
plt.savefig("figures/text_length_by_class.png")
plt.close()


# ----------------------------
# 6) Variation Type Categorization
# ----------------------------
def extract_mutation_type(var):
    if re.match(r"^[A-Z]\d+[A-Z*]$", var):  # e.g., P309S
        return "Missense"
    elif "del" in var.lower() or "deletion" in var.lower():
        return "Deletion"
    elif "ins" in var.lower() or "insertion" in var.lower():
        return "Insertion"
    elif "fusion" in var.lower():
        return "Fusion"
    elif "trunc" in var.lower() or "truncating" in var.lower():
        return "Truncating"
    return "Other"


train_df["Variation_Type"] = train_df["Variation"].apply(extract_mutation_type)

plt.figure(figsize=(12, 6))
sns.countplot(x="Class", hue="Variation_Type", data=train_df, palette="Set2")
plt.title("Variation Type by Class (Missense dominates across classes)")
plt.savefig("figures/variation_type_by_class.png")
plt.close()

# Observation:
# - Missense mutations dominate, consistent with gain-of-function drivers.
# - Deletions/Insertions rarer but possibly more discriminative.

📊 Class Counts:
 Class
1    568
2    452
3     89
4    686
5    242
6    275
7    953
8     19
9     37
Name: count, dtype: int64

📊 Class Proportions:
 Class
1    0.171033
2    0.136104
3    0.026799
4    0.206564
5    0.072870
6    0.082806
7    0.286962
8    0.005721
9    0.011141
Name: proportion, dtype: float64
🧬 Unique Genes: 264 | Unique Variations: 2996


  plt.savefig("figures/class_distribution.png")
  plt.savefig("figures/class_distribution.png")



📝 Text Length Stats (Chars):
count      3321.000000
mean      63615.864198
std       52189.334222
min           0.000000
25%       31336.000000
50%       46134.000000
75%       80153.000000
max      523393.000000
Name: Text_Length_Chars, dtype: float64

📝 Text Length Stats (Tokens):
count     3321.000000
mean     11285.350798
std       9291.543813
min          0.000000
25%       5569.000000
50%       8202.000000
75%      14187.000000
max      91380.000000
Name: Text_Length_Tokens, dtype: float64
📚 Unique Vocabulary Size: 280884


In [14]:
# =====================================
# Section 4: Feature Engineering
# =====================================
# Goal: Build hybrid feature space combining:
# - Gene & Variation tabular features
# - Text embeddings (TF-IDF baseline + MiniLM SOTA)

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sentence_transformers import SentenceTransformer

stop_words = set(stopwords.words("english"))

# ----------------------------
# 1) Gene Features
# ----------------------------
# Frequency encoding of Gene occurrence
gene_freq = train_df["Gene"].value_counts(normalize=True)
train_df["Gene_Freq"] = train_df["Gene"].map(gene_freq).fillna(0)
test_df["Gene_Freq"] = test_df["Gene"].map(gene_freq).fillna(0)  # unseen → 0

# ----------------------------
# 2) Variation Features
# ----------------------------
# Mutation type (categorical → one-hot)
train_df["Variation_Type"] = train_df["Variation"].apply(extract_mutation_type)
test_df["Variation_Type"] = test_df["Variation"].apply(extract_mutation_type)

ohe_var_type = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
var_type_ohe = ohe_var_type.fit_transform(train_df[["Variation_Type"]])
var_type_ohe_df = pd.DataFrame(
    var_type_ohe, columns=ohe_var_type.get_feature_names_out()
)

test_var_type = ohe_var_type.transform(test_df[["Variation_Type"]])
test_var_type_df = pd.DataFrame(
    test_var_type, columns=ohe_var_type.get_feature_names_out()
)


# Mutation position (numeric extraction from variation string)
def extract_position(var):
    match = re.search(r"\d+", var)
    return int(match.group()) if match else -1


train_df["Variation_Pos"] = train_df["Variation"].apply(extract_position)
test_df["Variation_Pos"] = test_df["Variation"].apply(extract_position)

# Variation frequency encoding
var_freq = train_df["Variation"].value_counts(normalize=True)
train_df["Var_Freq"] = train_df["Variation"].map(var_freq).fillna(0)
test_df["Var_Freq"] = test_df["Variation"].map(var_freq).fillna(0)


# ----------------------------
# 3) Text Preprocessing
# ----------------------------
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"[^a-z\s]", " ", text)  # keep only letters & spaces
    text = " ".join(word for word in text.split() if word not in stop_words)
    return text


train_df["Text_Clean"] = train_df["Text"].apply(preprocess_text)
test_df["Text_Clean"] = test_df["Text"].apply(preprocess_text)

# ----------------------------
# 4) Text Representations
# ----------------------------
# (a) Baseline TF-IDF (5k bi-grams)
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), min_df=3)
text_tfidf = tfidf.fit_transform(train_df["Text_Clean"])
text_tfidf_df = pd.DataFrame(
    text_tfidf.toarray(), columns=tfidf.get_feature_names_out()
)

# (b) MiniLM Embeddings (384-dim, fast & strong for biomedical text)
model = SentenceTransformer("all-MiniLM-L6-v2")

train_embeds = model.encode(
    train_df["Text_Clean"].tolist(), batch_size=32, show_progress_bar=True
)
train_embeds_df = pd.DataFrame(
    train_embeds, columns=[f"MiniLM_{i}" for i in range(train_embeds.shape[1])]
)

test_embeds = model.encode(
    test_df["Text_Clean"].tolist(), batch_size=32, show_progress_bar=True
)
test_embeds_df = pd.DataFrame(
    test_embeds, columns=[f"MiniLM_{i}" for i in range(test_embeds.shape[1])]
)

# ----------------------------
# 5) Final Feature Matrix
# ----------------------------
X = pd.concat(
    [
        train_df[["Gene_Freq", "Variation_Pos", "Var_Freq"]],
        var_type_ohe_df,
        train_embeds_df,  # MiniLM replaces TF-IDF for final model
    ],
    axis=1,
)

X_test = pd.concat(
    [
        test_df[["Gene_Freq", "Variation_Pos", "Var_Freq"]],
        test_var_type_df,
        test_embeds_df,
    ],
    axis=1,
)

X.fillna(0, inplace=True)
X_test.fillna(0, inplace=True)

# Target (convert to 0–8 range)
y = train_df["Class"].astype(int) - 1

# ----------------------------
# 6) Save artifacts
# ----------------------------
X.to_pickle("data/X_mini.pkl")
X_test.to_pickle("data/X_test_mini.pkl")
np.save("data/y.npy", y)

print(
    f"✅ Feature Engineering complete → Shapes: X={X.shape}, X_test={X_test.shape}, y={y.shape}"
)

Batches: 100%|██████████| 104/104 [02:28<00:00,  1.43s/it]
Batches: 100%|██████████| 178/178 [04:12<00:00,  1.42s/it]

✅ Feature Engineering complete → Shapes: X=(3321, 393), X_test=(5668, 393), y=(3321,)





In [None]:
# =====================================
# Section 5: Modeling & Evaluation
# =====================================
# Goal: Train robust multi-class classifier (9 classes)
# - Pipeline: SMOTE + XGBoost
# - CV metrics: log loss, F1-macro
# - Hyperparameter tuning (Optuna)
# - Explainability (SHAP)
# - Ablation study (text vs tabular)

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.metrics import (
    f1_score,
    make_scorer,
    confusion_matrix,
    precision_recall_curve,
    log_loss,
)
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from imblearn.pipeline import Pipeline
import shap
import optuna

# ----------------------------
# 1) Baseline Pipeline
# ----------------------------
pipe = Pipeline(
    [
        ("smote", SMOTE(random_state=42)),
        (
            "model",
            XGBClassifier(
                objective="multi:softprob",
                eval_metric="mlogloss",
                n_jobs=-1,
                random_state=42,
            ),
        ),
    ]
)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = {
    "log_loss": "neg_log_loss",
    "f1_macro": make_scorer(f1_score, average="macro"),
}

results = cross_validate(
    pipe, X, y, cv=skf, scoring=scoring, return_train_score=True, n_jobs=-1
)

print(
    f"CV Log Loss (Test): {-results['test_log_loss'].mean():.3f} ± {results['test_log_loss'].std():.3f}"
)
print(
    f"CV F1 Macro (Test): {results['test_f1_macro'].mean():.3f} ± {results['test_f1_macro'].std():.3f}"
)
print(f"CV Log Loss (Train): {-results['train_log_loss'].mean():.3f} (overfit check)")


# ----------------------------
# 2) Hyperparameter Tuning (Optuna)
# ----------------------------
def objective(trial):
    pipe_opt = Pipeline(
        [
            ("smote", SMOTE(random_state=42)),
            (
                "model",
                XGBClassifier(
                    max_depth=trial.suggest_int("max_depth", 3, 10),
                    learning_rate=trial.suggest_float("learning_rate", 0.01, 0.3),
                    n_estimators=trial.suggest_int("n_estimators", 50, 300),
                    objective="multi:softprob",
                    eval_metric="mlogloss",
                    n_jobs=-1,
                    random_state=42,
                ),
            ),
        ]
    )
    scores = cross_validate(pipe_opt, X, y, cv=skf, scoring="neg_log_loss", n_jobs=-1)
    return -scores["test_score"].mean()


study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=20, n_jobs=-1)

print("Best Params:", study.best_params)
print("Best CV Log Loss:", study.best_value)

# Refit final model
best_params = study.best_params
pipe.named_steps["model"].set_params(**best_params)
pipe.fit(X, y)

# ----------------------------
# 3) Explainability (SHAP)
# ----------------------------
explainer = shap.TreeExplainer(pipe.named_steps["model"])
shap_values = explainer.shap_values(X)

shap.summary_plot(shap_values, X, show=False)
plt.title("SHAP Feature Importance (MiniLM embeddings dominate)")
plt.savefig("figures/shap_importance.png")
plt.close()

# ----------------------------
# 4) Validation on Hold-out
# ----------------------------
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
pipe.fit(X_train, y_train)

y_preds = pipe.predict(X_val)
y_probs = pipe.predict_proba(X_val)

cm = confusion_matrix(y_val, y_preds)
plt.figure(figsize=(10, 8))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=range(1, 10),
    yticklabels=range(1, 10),
)
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.savefig("figures/confusion_matrix.png")
plt.close()

# Precision-Recall curves
plt.figure(figsize=(12, 8))
for i in range(9):
    precision, recall, _ = precision_recall_curve(y_val == i, y_probs[:, i])
    plt.plot(recall, precision, label=f"Class {i+1}")
plt.title("Precision-Recall Curves")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.legend()
plt.savefig("figures/precision_recall_curves.png")
plt.close()

# ----------------------------
# 5) Ablation Study
# ----------------------------
ablation_results = []
feature_groups = {
    "Full": X,
    "No MiniLM": X[[c for c in X.columns if not c.startswith("MiniLM")]],
    "No Tabular": X[[c for c in X.columns if c.startswith("MiniLM")]],
    "Only Gene Freq": X[["Gene_Freq"]],
}

for name, X_sub in feature_groups.items():
    scores = cross_validate(pipe, X_sub, y, cv=skf, scoring="neg_log_loss", n_jobs=-1)
    ablation_results.append(
        {
            "Features": name,
            "CV Log Loss": -scores["test_score"].mean(),
            "CV Log Loss Std": scores["test_score"].std(),
        }
    )

ablation_df = pd.DataFrame(ablation_results)
print("\nAblation Study Results:\n", ablation_df)

plt.figure(figsize=(10, 6))
sns.barplot(x="CV Log Loss", y="Features", data=ablation_df)
plt.title("Ablation Study: Feature Group Impact")
plt.savefig("figures/ablation_study.png")
plt.close()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

CV Log Loss (Test): 1.390 ± 0.075
CV F1 Macro (Test): 0.514 ± 0.045
CV Log Loss (Train): 0.129 (overfit check)


[I 2025-09-10 13:06:17,325] Trial 4 finished with value: 1.1150720146569846 and parameters: {'max_depth': 6, 'learning_rate': 0.07684522233458667, 'n_estimators': 86}. Best is trial 4 with value: 1.1150720146569846.
[I 2025-09-10 13:06:42,079] Trial 1 finished with value: 1.21049501526583 and parameters: {'max_depth': 5, 'learning_rate': 0.2131435688694639, 'n_estimators': 89}. Best is trial 4 with value: 1.1150720146569846.
[I 2025-09-10 13:06:46,788] Trial 7 finished with value: 1.149937293448442 and parameters: {'max_depth': 3, 'learning_rate': 0.284854751082224, 'n_estimators': 105}. Best is trial 4 with value: 1.1150720146569846.
[I 2025-09-10 13:07:00,975] Trial 3 finished with value: 1.6261859452165144 and parameters: {'max_depth': 7, 'learning_rate': 0.23886247652589798, 'n_estimators': 267}. Best is trial 4 with value: 1.1150720146569846.
[I 2025-09-10 13:07:18,512] Trial 5 finished with value: 1.2027507702823104 and parameters: {'max_depth': 3, 'learning_rate': 0.294459496569

Best Params: {'max_depth': 5, 'learning_rate': 0.057849572446618355, 'n_estimators': 104}
Best CV Log Loss: 1.1070038168397571





Ablation Study Results:
          Features  CV Log Loss  CV Log Loss Std
0            Full     1.107004         0.034651
1       No MiniLM     1.447153         0.040429
2      No Tabular     1.208260         0.041141
3  Only Gene Freq     1.628369         0.041559



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x="CV Log Loss", y="Features", data=ablation_df, palette="viridis")


<Figure size 640x480 with 0 Axes>

In [18]:
# =====================================
# Section 6: Inference & Submission
# =====================================

# ----------------------------
# 1) Baseline: TF-IDF Pipeline
# ----------------------------
X_tfidf = pd.concat(
    [
        train_df[["Gene_Freq", "Variation_Pos", "Var_Freq"]],
        var_type_ohe_df,
        text_tfidf_df,
    ],
    axis=1,
)
X_tfidf.fillna(0, inplace=True)

pipe_tfidf = Pipeline(
    [
        ("smote", SMOTE(random_state=42)),
        (
            "model",
            XGBClassifier(
                objective="multi:softprob",
                eval_metric="mlogloss",
                n_jobs=-1,
                random_state=42,
            ),
        ),
    ]
)

scores_tfidf = cross_validate(pipe_tfidf, X_tfidf, y, cv=skf, scoring="neg_log_loss")
print(f"📊 TF-IDF CV Log Loss: {-scores_tfidf['test_score'].mean():.3f}")

# ----------------------------
# 2) Ensemble: MiniLM + TF-IDF
# ----------------------------
pipe.fit(X, y)
pipe_tfidf.fit(X_tfidf, y)

# Validation set for ensemble comparison
X_train_tfidf, X_val_tfidf, y_train_tfidf, y_val_tfidf = train_test_split(
    X_tfidf, y, test_size=0.2, random_state=42, stratify=y
)
val_index = y_val.index

val_probs_mini = pipe.predict_proba(X.loc[val_index])
val_probs_tfidf = pipe_tfidf.predict_proba(X_val_tfidf.loc[val_index])
ensemble_probs = (val_probs_mini + val_probs_tfidf) / 2

print("📊 Ensemble Log Loss:", log_loss(y_val, ensemble_probs))

# ----------------------------
# 3) Inference on Test Set
# ----------------------------
test_probs = pipe.predict_proba(X_test)  # MiniLM-based final pipeline

submission = pd.DataFrame(test_probs, columns=[f"class{i+1}" for i in range(9)])
submission.insert(0, "ID", test_df["ID"])
submission.to_csv("submission.csv", index=False)

print("✅ Submission generated: submission.csv")
print("Format preview:\n", submission.head(3))



📊 TF-IDF CV Log Loss: 1.213




📊 Ensemble Log Loss: 0.29384796927779067
✅ Submission generated: submission.csv
Format preview:
    ID    class1    class2    class3    class4    class5    class6    class7  \
0   0  0.022920  0.122392  0.040850  0.072435  0.024257  0.061330  0.629683   
1   1  0.152938  0.215377  0.021443  0.109514  0.104614  0.231969  0.133993   
2   2  0.107440  0.322793  0.024154  0.025870  0.078737  0.166525  0.234342   

     class8    class9  
0  0.016359  0.009774  
1  0.014098  0.016054  
2  0.029229  0.010910  
