<a href="https://colab.research.google.com/github/likezhu7-prog/Classification-Assignment-/blob/main/model_comparison.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
# ===============================================
# 1. Load dataset
# ===============================================
df = pd.read_csv("merged_cancer.csv")
X = df["cleaned_text"].astype(str)
y = df["label"].astype(int)

In [11]:
# ===============================================
# 2. Train / Validation / Test split
# ===============================================
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.40, random_state=42, stratify=y
)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=42, stratify=y_temp
)

print("Train size:", len(X_train))
print("Validation size:", len(X_val))
print("Test size:", len(X_test))

Train size: 600
Validation size: 200
Test size: 200


In [9]:
# ===============================================
# 0. Imports
# ===============================================
import pandas as pd
import numpy as np
import torch

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_predict
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report
)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from xgboost import XGBClassifier

from transformers import AutoTokenizer, AutoModel
import seaborn as sns
import matplotlib.pyplot as plt

In [27]:
# ===============================================
# 3. TF-IDF + Traditional Models (10-fold CV)
# ===============================================

tfidf = TfidfVectorizer(
    ngram_range=(1,2),
    min_df=3,
    max_df=0.9,
    sublinear_tf=True
)

models = {
    "SVM (LinearSVC)": LinearSVC(),
    "Random Forest": RandomForestClassifier(n_estimators=300, n_jobs=-1),
    "Naive Bayes": MultinomialNB(),
    "kNN": KNeighborsClassifier(n_neighbors=5, metric="cosine"),
    "XGBoost": XGBClassifier(
        n_estimators=300,
        learning_rate=0.1,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        objective="multi:softmax",
        num_class=len(np.unique(y)),
        eval_metric="mlogloss",
        n_jobs=-1
    ),
    "SGD": SGDClassifier(loss="hinge", random_state=42)
}

pipelines = {
    name: Pipeline([
        ("tfidf", tfidf),
        ("clf", model)
    ])
    for name, model in models.items()
}

def evaluate_models(pipelines, X_train, y_train, cv=10):
    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)

    results = []
    confusion_mats = {}

    for name, pipe in pipelines.items():
        print(f"\n=== Evaluating {name} ===")

        y_pred = cross_val_predict(pipe, X_train, y_train, cv=skf, n_jobs=-1)

        acc = accuracy_score(y_train, y_pred)
        prec = precision_score(y_train, y_pred, average="macro")
        f1 = f1_score(y_train, y_pred, average="macro")

        results.append({
            "Model": name,
            "Accuracy": acc,
            "Precision_macro": prec,
            "F1_macro": f1
        })

        confusion_mats[name] = confusion_matrix(y_train, y_pred)

        print(f"Accuracy: {acc:.4f}")
        print(f"Macro Precision: {prec:.4f}")
        print(f"Macro F1: {f1:.4f}")

    return pd.DataFrame(results), confusion_mats

results_df, confusion_mats = evaluate_models(pipelines, X_train, y_train)
print("\n=== Traditional Models Summary ===")
print(results_df.sort_values(by="F1_macro", ascending=False))


=== Evaluating SVM (LinearSVC) ===
Accuracy: 0.7583
Macro Precision: 0.7518
Macro F1: 0.7546

=== Evaluating Random Forest ===
Accuracy: 0.7217
Macro Precision: 0.7261
Macro F1: 0.7223

=== Evaluating Naive Bayes ===
Accuracy: 0.7183
Macro Precision: 0.7003
Macro F1: 0.7039

=== Evaluating kNN ===
Accuracy: 0.6933
Macro Precision: 0.6801
Macro F1: 0.6790

=== Evaluating XGBoost ===
Accuracy: 0.7017
Macro Precision: 0.7026
Macro F1: 0.7015

=== Evaluating SGD ===
Accuracy: 0.7467
Macro Precision: 0.7452
Macro F1: 0.7456

=== Traditional Models Summary ===
             Model  Accuracy  Precision_macro  F1_macro
0  SVM (LinearSVC)  0.758333         0.751774  0.754645
5              SGD  0.746667         0.745245  0.745608
1    Random Forest  0.721667         0.726062  0.722341
2      Naive Bayes  0.718333         0.700294  0.703898
4          XGBoost  0.701667         0.702634  0.701497
3              kNN  0.693333         0.680061  0.678983


In [14]:
# ===============================================
# 4. BERT: Prepare model
# ===============================================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

bert_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(bert_name)
bert_model = AutoModel.from_pretrained(bert_name).to(device)
bert_model.eval()

Using device: cpu


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]



vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertModel LOAD REPORT from: bert-base-uncased
Key                                        | Status     |  | 
-------------------------------------------+------------+--+-
cls.predictions.transform.LayerNorm.weight | UNEXPECTED |  | 
cls.predictions.transform.dense.weight     | UNEXPECTED |  | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED |  | 
cls.seq_relationship.bias                  | UNEXPECTED |  | 
cls.seq_relationship.weight                | UNEXPECTED |  | 
cls.predictions.transform.dense.bias       | UNEXPECTED |  | 
cls.predictions.bias                       | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [15]:
# ===============================================
# 5. BERT Encoding Function
# ===============================================
def bert_encode(texts, batch_size=16, max_length=256):
    all_embeddings = []
    if not isinstance(texts, (list, tuple)):
        texts = list(texts)

    with torch.no_grad():
        for i in range(0, len(texts), batch_size):
            batch = texts[i:i + batch_size]
            enc = tokenizer(
                batch, padding=True, truncation=True,
                max_length=max_length, return_tensors="pt"
            )
            enc = {k: v.to(device) for k, v in enc.items()}
            outputs = bert_model(**enc)
            cls_vec = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            all_embeddings.append(cls_vec)

    return np.vstack(all_embeddings)

In [16]:
# ===============================================
# 6. BERT encode Train / Val / Test
# ===============================================
print("\nEncoding texts using BERT...")
X_train_bert = bert_encode(X_train)
X_val_bert = bert_encode(X_val)
X_test_bert = bert_encode(X_test)

print("BERT Train:", X_train_bert.shape)
print("BERT Val:", X_val_bert.shape)
print("BERT Test:", X_test_bert.shape)


Encoding texts using BERT...
BERT Train: (600, 768)
BERT Val: (200, 768)
BERT Test: (200, 768)


In [17]:
# ===============================================
# 7. BERT + Logistic Regression
# ===============================================
from sklearn.linear_model import LogisticRegression

clf_bert = LogisticRegression(max_iter=1000, n_jobs=-1, multi_class="multinomial")
clf_bert.fit(X_train_bert, y_train)

y_val_pred = clf_bert.predict(X_val_bert)
val_acc = accuracy_score(y_val, y_val_pred)
val_f1 = f1_score(y_val, y_val_pred, average="macro")

print("\n=== BERT Validation Results ===")
print("Validation Accuracy:", round(val_acc, 4))
print("Validation Macro F1:", round(val_f1, 4))
print(classification_report(y_val, y_val_pred))




=== BERT Validation Results ===
Validation Accuracy: 0.635
Validation Macro F1: 0.6435
              precision    recall  f1-score   support

           0       0.43      0.57      0.49        40
           1       0.78      0.72      0.75        40
           2       0.89      0.62      0.74        40
           3       0.85      0.88      0.86        40
           4       0.37      0.38      0.37        40

    accuracy                           0.64       200
   macro avg       0.67      0.64      0.64       200
weighted avg       0.67      0.64      0.64       200



In [25]:
# ===============================================
# 8. Final training on Train+Val, test on Test
# ===============================================

from sklearn.metrics import precision_score

X_trainval_bert = np.vstack([X_train_bert, X_val_bert])
y_trainval = pd.concat([y_train, y_val])

clf_bert_final = LogisticRegression(
    max_iter=1000,
    n_jobs=-1,
    multi_class="multinomial"
)
clf_bert_final.fit(X_trainval_bert, y_trainval)

# Test evaluation
y_test_pred = clf_bert_final.predict(X_test_bert)

test_acc = accuracy_score(y_test, y_test_pred)
test_precision = precision_score(y_test, y_test_pred, average="macro")
test_f1 = f1_score(y_test, y_test_pred, average="macro")

print("\n=== BERT Test Results ===")
print("Test Accuracy:", round(test_acc, 4))
print("Test Macro Precision:", round(test_precision, 4))
print("Test Macro F1:", round(test_f1, 4))
print("\nClassification Report (Test):")
print(classification_report(y_test, y_test_pred, digits=3))





=== BERT Test Results ===
Test Accuracy: 0.72
Test Macro Precision: 0.7145
Test Macro F1: 0.7157

Classification Report (Test):
              precision    recall  f1-score   support

           0      0.556     0.625     0.588        40
           1      0.895     0.850     0.872        40
           2      0.814     0.875     0.843        40
           3      0.854     0.875     0.864        40
           4      0.455     0.375     0.411        40

    accuracy                          0.720       200
   macro avg      0.714     0.720     0.716       200
weighted avg      0.714     0.720     0.716       200



In [29]:
# ===============================================
# 9. Add BERT to unified result table
# ===============================================

results_df = results_df[results_df["Model"] != "BERT + Logistic Regression"].copy()

bert_row = {
    "Model": "BERT + Logistic Regression",
    "Accuracy": test_acc,
    "Precision_macro": test_precision,
    "F1_macro": test_f1
}

results_df = pd.concat(
    [results_df, pd.DataFrame([bert_row])],
    ignore_index=True
)

print("\n=== All Models ===")
print(results_df.sort_values(by="F1_macro", ascending=False))


=== All Models ===
                        Model  Accuracy  Precision_macro  F1_macro
0             SVM (LinearSVC)  0.758333         0.751774  0.754645
5                         SGD  0.746667         0.745245  0.745608
1               Random Forest  0.721667         0.726062  0.722341
6  BERT + Logistic Regression  0.720000         0.714490  0.715712
2                 Naive Bayes  0.718333         0.700294  0.703898
4                     XGBoost  0.701667         0.702634  0.701497
3                         kNN  0.693333         0.680061  0.678983


In [31]:
# ===== Final Testing for Champion Model: SVM =====

from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

svm_champion = Pipeline([
    ("tfidf", TfidfVectorizer(
        ngram_range=(1,2),
        min_df=3,
        max_df=0.9,
        sublinear_tf=True
    )),
    ("clf", LinearSVC())
])

# Train + Validation
X_trainval = pd.concat([X_train, X_val])
y_trainval = pd.concat([y_train, y_val])

# Train champion
svm_champion.fit(X_trainval, y_trainval)

# Final Test predictions
y_test_pred_svm = svm_champion.predict(X_test)

# Metrics
from sklearn.metrics import accuracy_score, precision_score, f1_score, classification_report

test_acc_svm = accuracy_score(y_test, y_test_pred_svm)
test_precision_svm = precision_score(y_test, y_test_pred_svm, average="macro")
test_f1_svm = f1_score(y_test, y_test_pred_svm, average="macro")

print("=== Final Test Results: SVM (Champion Model) ===")
print("Test Accuracy:", test_acc_svm)
print("Test Macro Precision:", test_precision_svm)
print("Test Macro F1:", test_f1_svm)
print("\nClassification Report:\n")
print(classification_report(y_test, y_test_pred_svm, digits=3))


=== Final Test Results: SVM (Champion Model) ===
Test Accuracy: 0.805
Test Macro Precision: 0.7983971967165245
Test Macro F1: 0.7990755676148934

Classification Report:

              precision    recall  f1-score   support

           0      0.588     0.500     0.541        40
           1      1.000     1.000     1.000        40
           2      0.755     0.925     0.831        40
           3      1.000     1.000     1.000        40
           4      0.649     0.600     0.623        40

    accuracy                          0.805       200
   macro avg      0.798     0.805     0.799       200
weighted avg      0.798     0.805     0.799       200

