In [None]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

In [None]:
# Load the merged corpus; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="merged_cancer.csv")

In [None]:
# Documents (features) and class labels (targets) used by every model below.
# Missing texts are replaced with "" *before* the string cast: astype(str) on
# its own can turn NaN into the literal text "nan", which the TF-IDF step
# would then count as a real word.
X_text = df["cleaned_text"].fillna("").astype(str)
y = df["label"]

In [None]:
# Quick sanity check of corpus size and class balance before modelling.
n_docs = len(X_text)
label_counts = y.value_counts()

print(f"Number of documents: {n_docs}")
print("Label distribution:", label_counts, sep="\n ")

Number of documents: 1000
Label distribution:
 label
0    200
1    200
2    200
3    200
4    200
Name: count, dtype: int64


In [None]:
def make_tfidf_pipeline(clf, *, max_features=5000, ngram_range=(1, 2),
                        stop_words="english"):
    """Build a two-step pipeline: TF-IDF vectorizer followed by ``clf``.

    Parameters
    ----------
    clf : estimator
        Classifier placed as the final ``"clf"`` step. It is used as passed
        (not copied).
    max_features : int or None, default=5000
        Vocabulary size cap forwarded to ``TfidfVectorizer``.
    ngram_range : tuple of (int, int), default=(1, 2)
        N-gram range forwarded to ``TfidfVectorizer``; the default uses
        uni-grams and bi-grams.
    stop_words : str, list or None, default="english"
        Stop-word setting forwarded to ``TfidfVectorizer``.

    Returns
    -------
    Pipeline
        Unfitted pipeline with steps ``"tfidf"`` and ``"clf"``.
    """
    # Keeping the vectorizer inside the pipeline means it is re-fitted on the
    # training portion only whenever the pipeline itself is fitted.
    vectorizer = TfidfVectorizer(
        max_features=max_features,
        ngram_range=ngram_range,
        stop_words=stop_words,
    )
    return Pipeline([
        ("tfidf", vectorizer),
        ("clf", clf),
    ])

In [None]:
# Candidate classifiers, each wrapped in the same TF-IDF front end so the
# comparison differs only in the final estimator.
# Every estimator that uses randomness is given the same seed; LinearSVC is
# included because its solver uses a random number generator, so leaving it
# unseeded can make the SVM scores vary between runs.
models = {
    "SVM_TFIDF": make_tfidf_pipeline(LinearSVC(random_state=42)),
    "RandomForest_TFIDF": make_tfidf_pipeline(
        RandomForestClassifier(n_estimators=200, random_state=42)
    ),
    "NaiveBayes_TFIDF": make_tfidf_pipeline(MultinomialNB()),
    "kNN_TFIDF": make_tfidf_pipeline(KNeighborsClassifier(n_neighbors=5)),
    "SGD_TFIDF": make_tfidf_pipeline(
        SGDClassifier(loss="hinge", random_state=42)
    )
}

In [None]:
# Shuffled, stratified 10-fold splitter with a fixed seed, shared by all models.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Maps model name -> summary statistics and raw per-fold accuracies.
cv_results = {}

for name in models:
    pipe = models[name]
    print(f"\nTraining and evaluating model: {name}")

    scores = cross_val_score(pipe, X_text, y, cv=skf, scoring="accuracy")
    mean_acc = scores.mean()
    std_acc = scores.std()  # NumPy default (population) standard deviation

    cv_results[name] = dict(
        mean_accuracy=mean_acc,
        std_accuracy=std_acc,
        all_scores=scores,
    )
    print(f"{name}: mean accuracy = {mean_acc:.4f} (+/- {std_acc:.4f})")


Training and evaluating model: SVM_TFIDF
SVM_TFIDF: mean accuracy = 0.7760 (+/- 0.0284)

Training and evaluating model: RandomForest_TFIDF
RandomForest_TFIDF: mean accuracy = 0.7610 (+/- 0.0362)

Training and evaluating model: NaiveBayes_TFIDF
NaiveBayes_TFIDF: mean accuracy = 0.7460 (+/- 0.0276)

Training and evaluating model: kNN_TFIDF
kNN_TFIDF: mean accuracy = 0.6990 (+/- 0.0212)

Training and evaluating model: SGD_TFIDF
SGD_TFIDF: mean accuracy = 0.7720 (+/- 0.0256)


In [None]:
# Hold-out evaluation of the model with the highest mean cross-validated
# accuracy.
# NOTE(review): the cross-validation comparison was run on the full
# X_text / y, so the 20% test split below was also seen during model
# selection. Treat these scores as an optimistic estimate; for an unbiased
# figure, carve out the test set before running the comparison.
champion_name = "SVM_TFIDF"

# Seeded so that repeated runs of this cell give the same fitted model:
# LinearSVC's solver uses a random number generator.
champion = make_tfidf_pipeline(LinearSVC(random_state=42))

# Stratified 80/20 split keeps the class proportions equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X_text, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

champion.fit(X_train, y_train)
y_pred = champion.predict(X_test)

print(f"\n=== Final evaluation for champion model ({champion_name}) ===")
print("Classification report:\n")
print(classification_report(y_test, y_pred))

# Rows are true labels, columns are predicted labels.
print("Confusion matrix:\n")
print(confusion_matrix(y_test, y_pred))


=== Final evaluation for champion model (SVM_TFIDF) ===
Classification report:

              precision    recall  f1-score   support

           0       0.56      0.55      0.56        40
           1       0.97      0.97      0.97        40
           2       0.82      0.93      0.87        40
           3       1.00      1.00      1.00        40
           4       0.56      0.50      0.53        40

    accuracy                           0.79       200
   macro avg       0.78      0.79      0.79       200
weighted avg       0.78      0.79      0.79       200

Confusion matrix:

[[22  0  3  0 15]
 [ 1 39  0  0  0]
 [ 2  0 37  0  1]
 [ 0  0  0 40  0]
 [14  1  5  0 20]]
