<a href="https://colab.research.google.com/github/likezhu7-prog/Classification-Assignment-/blob/main/Copy_of_model_comparison.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ===============================================
# 0. Imports
# ===============================================
import pandas as pd
import numpy as np
import torch

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_predict
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report
)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from xgboost import XGBClassifier

from transformers import AutoTokenizer, AutoModel
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# ===============================================
# 1. Load dataset
# ===============================================
# Read the CSV once, then pull out the text column (as str) and the label column (as int).
DATA_PATH = "cancer_cleaned_trimmed.csv"
df = pd.read_csv(DATA_PATH)
X, y = df["cleaned_text"].astype(str), df["label"].astype(int)

In [None]:
# ===============================================
# 2. Train / Validation / Test split
# ===============================================
# Two stratified cuts: 40% is held out first, then that hold-out is halved,
# which yields a 60% / 20% / 20% train / validation / test partition.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    test_size=0.40,
    random_state=42,
    stratify=y,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.50,
    random_state=42,
    stratify=y_temp,
)

# Report how many samples ended up in each partition.
for split_name, split_X in (("Train", X_train), ("Validation", X_val), ("Test", X_test)):
    print(f"{split_name} size:", len(split_X))

In [None]:
# ===============================================
# 3. TF-IDF + Traditional Models (10-fold CV)
# ===============================================

# Single source of truth for the vectorizer settings, so that every pipeline can
# own an independent (but identically configured) TfidfVectorizer instead of all
# pipelines pointing at one shared, mutable instance.
TFIDF_PARAMS = dict(
    ngram_range=(1, 2),
    min_df=3,
    max_df=0.9,
    sublinear_tf=True,
)

# Kept so that any cell referring to `tfidf` directly still works.
tfidf = TfidfVectorizer(**TFIDF_PARAMS)

# Seed for the stochastic estimators so repeated runs give identical scores
# (same value as the seed used for the data splits and for SGD).
SEED = 42

models = {
    "SVM (LinearSVC)": LinearSVC(random_state=SEED),
    "Random Forest": RandomForestClassifier(
        n_estimators=300, n_jobs=-1, random_state=SEED
    ),
    "Naive Bayes": MultinomialNB(),
    "kNN": KNeighborsClassifier(n_neighbors=5, metric="cosine"),
    # NOTE(review): XGBoost expects class labels encoded as 0..num_class-1 —
    # confirm the `label` column satisfies this.
    "XGBoost": XGBClassifier(
        n_estimators=300,
        learning_rate=0.1,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        objective="multi:softmax",
        num_class=len(np.unique(y)),
        eval_metric="mlogloss",
        n_jobs=-1,
        random_state=SEED,  # subsample/colsample_bytree < 1 make training stochastic
    ),
    "SGD": SGDClassifier(loss="hinge", random_state=SEED),
}

# One pipeline per classifier; each gets its own fresh vectorizer so fitting one
# pipeline can never overwrite the fitted vocabulary of another.
pipelines = {
    name: Pipeline([
        ("tfidf", TfidfVectorizer(**TFIDF_PARAMS)),
        ("clf", model),
    ])
    for name, model in models.items()
}

def evaluate_models(pipelines, X_train, y_train, cv=10):
    """Score each pipeline using stratified, shuffled k-fold cross-validated predictions.

    Parameters
    ----------
    pipelines : dict
        Maps a display name to an estimator accepted by ``cross_val_predict``.
    X_train, y_train
        Features and targets passed to ``cross_val_predict`` and the metric functions.
    cv : int, default 10
        Number of folds for the ``StratifiedKFold`` splitter (seeded with 42).

    Returns
    -------
    (pandas.DataFrame, dict)
        A frame with one row per pipeline (columns ``Model``, ``Accuracy``,
        ``Precision_macro``, ``F1_macro``) and a dict mapping each pipeline name
        to its confusion matrix over the cross-validated predictions.
    """
    folds = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)

    rows = []
    matrices = {}

    for label, estimator in pipelines.items():
        print(f"\n=== Evaluating {label} ===")

        # Every sample is predicted by a model that did not see it during fitting.
        oof_pred = cross_val_predict(estimator, X_train, y_train, cv=folds, n_jobs=-1)

        row = {
            "Model": label,
            "Accuracy": accuracy_score(y_train, oof_pred),
            "Precision_macro": precision_score(y_train, oof_pred, average="macro"),
            "F1_macro": f1_score(y_train, oof_pred, average="macro"),
        }
        rows.append(row)
        matrices[label] = confusion_matrix(y_train, oof_pred)

        print(f"Accuracy: {row['Accuracy']:.4f}")
        print(f"Macro Precision: {row['Precision_macro']:.4f}")
        print(f"Macro F1: {row['F1_macro']:.4f}")

    return pd.DataFrame(rows), matrices

# Cross-validate every traditional pipeline on the training split, then list
# the models from best to worst macro F1.
results_df, confusion_mats = evaluate_models(pipelines, X_train, y_train)
ranked_results = results_df.sort_values(by="F1_macro", ascending=False)
print("\n=== Traditional Models Summary ===")
print(ranked_results)

In [None]:
# ===============================================
# 4. BERT: Prepare model
# ===============================================
# Use the GPU when PyTorch can see one; otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

# Load the pretrained tokenizer and encoder, move the encoder onto the chosen
# device and switch it to evaluation mode.
bert_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(bert_name)
bert_model = AutoModel.from_pretrained(bert_name)
bert_model = bert_model.to(device)
bert_model.eval()

In [None]:
# ===============================================
# 5. BERT Encoding Function
# ===============================================
def bert_encode(texts, batch_size=16, max_length=256):
    """Embed texts as the last-layer hidden state at token position 0.

    Uses the notebook-level ``tokenizer``, ``bert_model`` and ``device``.
    Position 0 is the ``[CLS]`` token for BERT tokenizers.

    Parameters
    ----------
    texts : str or iterable of str
        A single document or a collection of documents. Any iterable that is
        not already a list/tuple (e.g. a pandas Series) is materialised as a list.
    batch_size : int, default 16
        Number of texts tokenized and encoded per forward pass.
    max_length : int, default 256
        Texts are truncated to at most this many tokens.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_texts, hidden_size)``; ``(0, hidden_size)`` when
        ``texts`` is empty.
    """
    if isinstance(texts, str):
        # list("some text") would split the string into single characters and
        # embed each one; treat a bare string as one document instead.
        texts = [texts]
    elif not isinstance(texts, (list, tuple)):
        texts = list(texts)

    if len(texts) == 0:
        # np.vstack([]) raises ValueError; return an empty matrix that still has
        # the right width so it can be stacked with other embeddings.
        return np.empty((0, bert_model.config.hidden_size), dtype=np.float32)

    all_embeddings = []
    # Inference only: no gradients are needed.
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch = list(texts[start:start + batch_size])
            enc = tokenizer(
                batch, padding=True, truncation=True,
                max_length=max_length, return_tensors="pt"
            )
            # Move every tokenizer output tensor to the model's device.
            enc = {k: v.to(device) for k, v in enc.items()}
            outputs = bert_model(**enc)
            cls_vec = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            all_embeddings.append(cls_vec)

    return np.vstack(all_embeddings)

In [None]:
# ===============================================
# 6. BERT encode Train / Val / Test
# ===============================================
print("\nEncoding texts using BERT...")
# Encode the three splits in the same order as before: train, validation, test.
X_train_bert, X_val_bert, X_test_bert = (
    bert_encode(split) for split in (X_train, X_val, X_test)
)

# Show the embedding matrix shape for each split.
for tag, emb in (("Train", X_train_bert), ("Val", X_val_bert), ("Test", X_test_bert)):
    print(f"BERT {tag}:", emb.shape)

In [None]:
# ===============================================
# 7. BERT + Logistic Regression
# ===============================================
from sklearn.linear_model import LogisticRegression

# `multi_class="multinomial"` is deliberately not passed: the argument has been
# deprecated since scikit-learn 1.5 and is scheduled for removal. With the default
# solver, a multinomial model is already what gets fitted for 3 or more classes.
# NOTE(review): this assumes the label column has 3+ classes — confirm.
clf_bert = LogisticRegression(max_iter=1000, n_jobs=-1)
clf_bert.fit(X_train_bert, y_train)

# Score on the held-out validation split (used for model selection, not final reporting).
y_val_pred = clf_bert.predict(X_val_bert)
val_acc = accuracy_score(y_val, y_val_pred)
val_precision = precision_score(y_val, y_val_pred, average="macro")
val_f1 = f1_score(y_val, y_val_pred, average="macro")

print("\n=== BERT Validation Results ===")
print("Validation Accuracy:", round(val_acc, 4))
print("Validation Macro Precision:", round(val_precision, 4))
print("Validation Macro F1:", round(val_f1, 4))
print("\nValidation classification report:")
print(classification_report(y_val, y_val_pred, digits=3))


In [None]:
# ===============================================
# 8. Final training on Train+Val, test on Test (BERT)
# ===============================================

# Stack train and validation embeddings/labels in the same order so rows stay aligned.
X_trainval_bert = np.vstack([X_train_bert, X_val_bert])
y_trainval = pd.concat([y_train, y_val])

# `multi_class="multinomial"` is deliberately not passed: the argument has been
# deprecated since scikit-learn 1.5 and is scheduled for removal. With the default
# solver, a multinomial model is already what gets fitted for 3 or more classes.
# NOTE(review): this assumes the label column has 3+ classes — confirm.
clf_bert_final = LogisticRegression(
    max_iter=1000,
    n_jobs=-1,
)
clf_bert_final.fit(X_trainval_bert, y_trainval)

# Test evaluation for BERT
y_test_pred_bert = clf_bert_final.predict(X_test_bert)

bert_test_acc = accuracy_score(y_test, y_test_pred_bert)
bert_test_precision = precision_score(y_test, y_test_pred_bert, average="macro")
bert_test_f1 = f1_score(y_test, y_test_pred_bert, average="macro")

print("\n=== BERT Test Results ===")
print("Test Accuracy:", round(bert_test_acc, 4))
print("Test Macro Precision:", round(bert_test_precision, 4))
print("Test Macro F1:", round(bert_test_f1, 4))
print("\nClassification Report (Test):")
print(classification_report(y_test, y_test_pred_bert, digits=3))



In [None]:
# ===============================================
# 9. Model Selection Table (for choosing champion model)
# ===============================================

# Rows come from two different evaluation protocols, recorded in "Source":
#   - traditional models: 10-fold CV on Train
#   - BERT: performance on the Validation set
bert_selection_row = {
    "Model": "BERT + Logistic Regression",
    "Accuracy": val_acc,
    "Precision_macro": val_precision,
    "F1_macro": val_f1,
    "Source": "Validation",
}

# Tag the CV results on a copy (results_df itself is left untouched), then append the BERT row.
model_selection_df = pd.concat(
    [
        results_df.assign(Source="CV on Train"),
        pd.DataFrame([bert_selection_row]),
    ],
    ignore_index=True,
)

print("\n=== Model Selection ===")
print(model_selection_df.sort_values(by="F1_macro", ascending=False))


In [None]:
# ===== Final Testing for Champion Model: SVM =====

# Fresh pipeline with the same TF-IDF settings used during cross-validation,
# so the champion is fitted only on Train + Validation.
svm_champion = Pipeline([
    ("tfidf", TfidfVectorizer(
        ngram_range=(1, 2),
        min_df=3,
        max_df=0.9,
        sublinear_tf=True
    )),
    # Fixed seed: LinearSVC shuffles the data in its dual coordinate-descent
    # solver, so an unseeded model can give slightly different test metrics
    # from one run to the next.
    ("clf", LinearSVC(random_state=42))
])

# Train + Validation
X_trainval = pd.concat([X_train, X_val])
y_trainval = pd.concat([y_train, y_val])

# Train champion
svm_champion.fit(X_trainval, y_trainval)

# Final Test predictions
y_test_pred_svm = svm_champion.predict(X_test)

# Metrics on the held-out test split
test_acc_svm = accuracy_score(y_test, y_test_pred_svm)
test_precision_svm = precision_score(y_test, y_test_pred_svm, average="macro")
test_f1_svm = f1_score(y_test, y_test_pred_svm, average="macro")

print("=== Final Test Results: SVM (Champion Model) ===")
print("Test Accuracy:", test_acc_svm)
print("Test Macro Precision:", test_precision_svm)
print("Test Macro F1:", test_f1_svm)
print("\nClassification Report:\n")
print(classification_report(y_test, y_test_pred_svm, digits=3))


In [None]:
# ===== Confusion Matrix for SVM =====
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# NOTE(review): tick labels assume the sorted integer labels correspond to the
# classes in exactly this order — confirm against the label encoding.
class_names = ["Colon_Cancer", "Liver_Cancer", "Lung_Cancer", "Stomach_Cancer", "Thyroid_Cancer"]

cm = confusion_matrix(y_test, y_test_pred_svm)

# Draw the counts as an annotated heatmap on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_title("Confusion Matrix - SVM")
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
fig.tight_layout()
plt.show()
