│   │     • Build a TF-IDF pipeline  
│   │     • Train & compare:  
│   │         1. Logistic Regression (softmax)  
│   │         2. Decision Tree  
│   │         3. Random Forest  
│   │         4. k-NN  
│   │         5. Naïve Bayes  
│   │     • Report accuracy & macro-F1 for each  

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
import warnings
# Silences every warning for the rest of the session (this includes
# scikit-learn convergence and deprecation warnings) -- remove when debugging.
warnings.filterwarnings('ignore')

# 1. Load and prepare data
def _add_combined_text(df):
    """Return *df* with a ``combined_text`` column, without blank-text rows.

    ``combined_text`` is the headline and the short description joined by a
    single space; missing values in either column are treated as ''.
    """
    combined = df["headline"].fillna('') + " " + df["short_description"].fillna('')
    df = df.assign(combined_text=combined)
    # When both source columns are empty the joined text is just the separator,
    # so compare the stripped text rather than the raw value.
    return df[df["combined_text"].str.strip() != ""]


def load_data(train_path="../data/processed/train.csv", test_path="../data/processed/test.csv"):
    """Load the train/test CSVs and return model-ready texts and labels.

    Args:
        train_path: CSV file with ``headline``, ``short_description`` and
            ``category`` columns used for training.
        test_path: CSV file with the same columns used for evaluation.

    Returns:
        A 5-tuple ``(train_texts, test_texts, y_train, y_test, le)`` where the
        texts are the ``combined_text`` Series of each split, ``y_train`` /
        ``y_test`` are the integer-encoded ``category`` values and ``le`` is
        the ``LabelEncoder`` fitted on the training categories.

    Raises:
        ValueError: raised by ``LabelEncoder.transform`` if the test split
            contains a category that does not occur in the training split.
    """
    train_df = _add_combined_text(pd.read_csv(train_path))
    test_df = _add_combined_text(pd.read_csv(test_path))

    # Fit the encoder on the training labels only; the test labels are mapped
    # with the same class-to-integer assignment.
    le = LabelEncoder()
    y_train = le.fit_transform(train_df["category"])
    y_test = le.transform(test_df["category"])

    return train_df["combined_text"], test_df["combined_text"], y_train, y_test, le
# 2. Vectorize text using TF-IDF
def vectorize_text(train_texts, test_texts):
    """Turn the train and test texts into TF-IDF feature matrices.

    The vectorizer uses unigrams and bigrams, drops English stop words and
    caps the vocabulary at 10,000 features. It is fitted on ``train_texts``
    only and then applied unchanged to ``test_texts``.

    Returns:
        ``(X_train, X_test)`` -- the transformed training and test matrices.
    """
    tfidf_options = {
        "max_features": 10000,
        "ngram_range": (1, 2),
        "stop_words": 'english',
    }
    tfidf = TfidfVectorizer(**tfidf_options)

    # Fitting happens on the training texts; the test texts are only transformed.
    train_matrix = tfidf.fit_transform(train_texts)
    test_matrix = tfidf.transform(test_texts)
    return train_matrix, test_matrix

# 3. Train and evaluate a model
def train_and_evaluate(model, model_name, X_train, y_train, X_test, y_test, label_encoder):
    """Fit *model*, predict on the test split and print its metrics.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        model_name: Display name printed in the report header.
        X_train, y_train: Training features and integer-encoded labels.
        X_test, y_test: Test features and integer-encoded labels.
        label_encoder: Fitted encoder whose ``classes_`` supply the row names
            of the per-class report.

    Prints the accuracy, the macro-averaged F1 and the per-class
    classification report. Returns ``None``.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='macro')

    # Pass every encoded class explicitly. Without ``labels`` the report infers
    # the classes from y_test/y_pred and raises ValueError when that set is
    # smaller than ``target_names`` (a class missing from the test split and
    # never predicted). Such classes now appear as rows with zero support;
    # the headline macro F1 above is still computed over observed labels only.
    all_labels = list(range(len(label_encoder.classes_)))
    report = classification_report(
        y_test,
        y_pred,
        labels=all_labels,
        target_names=label_encoder.classes_,
    )

    print(f"📌 {model_name}")
    print(f"Accuracy: {acc:.4f} | Macro F1: {f1:.4f}")
    print("Classification Report:\n", report)
    print("=" * 80)
# 4. Load and preprocess data
# load_data() returns the combined texts of both splits, the encoded labels and
# the fitted LabelEncoder (kept as `le`; its classes_ label the report rows).
train_texts, test_texts, y_train, y_test, le = load_data()
# X_train / X_test are the TF-IDF matrices consumed by every model cell below.
X_train, X_test = vectorize_text(train_texts, test_texts)


Use the code below to train all five models one after another and print a report for each.

In [12]:

# 5. Define models
# Each entry is (estimator, display name); the name is printed as the report header.
models = [
    # `multi_class` is deprecated in scikit-learn >= 1.5. With solver='saga' and
    # more than two classes the multinomial (softmax) loss is already the
    # default, so the argument is dropped without changing the fitted model.
    (LogisticRegression(max_iter=1000, solver='saga', random_state=42), "Logistic Regression"),
    (DecisionTreeClassifier(max_depth=None, random_state=42, min_samples_split=5), "Decision Tree"),
    (RandomForestClassifier(n_estimators=200, random_state=42, max_depth=200, min_samples_split=5), "Random Forest"),
    (KNeighborsClassifier(n_neighbors=1), "k-Nearest Neighbors"),
    (MultinomialNB(), "Naive Bayes"),
]

# 6. Train and evaluate each model
for clf, name in models:
    train_and_evaluate(clf, name, X_train, y_train, X_test, y_test, le)

📌 Logistic Regression
Accuracy: 0.5942 | Macro F1: 0.4456
Classification Report:
                 precision    recall  f1-score   support

          ARTS       0.41      0.19      0.26       302
ARTS & CULTURE       0.39      0.13      0.19       268
  BLACK VOICES       0.51      0.35      0.41       917
      BUSINESS       0.50      0.47      0.48      1198
       COLLEGE       0.47      0.32      0.38       229
        COMEDY       0.58      0.39      0.47      1080
         CRIME       0.54      0.53      0.54       712
CULTURE & ARTS       0.68      0.22      0.33       215
       DIVORCE       0.83      0.66      0.73       685
     EDUCATION       0.44      0.27      0.34       203
 ENTERTAINMENT       0.55      0.77      0.64      3473
   ENVIRONMENT       0.59      0.21      0.31       289
         FIFTY       0.56      0.14      0.23       280
  FOOD & DRINK       0.61      0.72      0.66      1268
     GOOD NEWS       0.45      0.13      0.20       280
         GREEN       

To work on each model individually and tune its hyperparameters, use the code below.

In [None]:
# `multi_class` is deprecated in scikit-learn >= 1.5. With solver='saga' and a
# multiclass target the multinomial (softmax) loss is already the default, so
# omitting the argument keeps the same model.
model_lr = LogisticRegression(max_iter=1000, solver='saga', random_state=42)
train_and_evaluate(model_lr, "Logistic Regression", X_train, y_train, X_test, y_test, le)
# Time to train model: 7.5 seconds

📌 Logistic Regression
Accuracy: 0.5942 | Macro F1: 0.4456
Classification Report:
                 precision    recall  f1-score   support

          ARTS       0.41      0.19      0.26       302
ARTS & CULTURE       0.39      0.13      0.19       268
  BLACK VOICES       0.51      0.35      0.41       917
      BUSINESS       0.50      0.47      0.48      1198
       COLLEGE       0.47      0.32      0.38       229
        COMEDY       0.58      0.39      0.47      1080
         CRIME       0.54      0.53      0.54       712
CULTURE & ARTS       0.68      0.22      0.33       215
       DIVORCE       0.83      0.66      0.73       685
     EDUCATION       0.44      0.27      0.34       203
 ENTERTAINMENT       0.55      0.77      0.64      3473
   ENVIRONMENT       0.59      0.21      0.31       289
         FIFTY       0.56      0.14      0.23       280
  FOOD & DRINK       0.61      0.72      0.66      1268
     GOOD NEWS       0.45      0.13      0.20       280
         GREEN       

In [None]:
# Decision tree with no depth limit; a node needs at least 5 samples to be split.
model_dt = DecisionTreeClassifier(
    min_samples_split=5,
    max_depth=None,
    random_state=42,
)
train_and_evaluate(
    model_dt, "Decision Tree",
    X_train, y_train,
    X_test, y_test,
    le,
)
# Time to train model: 1 min 52.5 seconds

📌 Decision Tree
Accuracy: 0.4189 | Macro F1: 0.2877
Classification Report:
                 precision    recall  f1-score   support

          ARTS       0.11      0.12      0.12       302
ARTS & CULTURE       0.09      0.10      0.10       268
  BLACK VOICES       0.24      0.26      0.25       917
      BUSINESS       0.25      0.29      0.27      1198
       COLLEGE       0.20      0.18      0.19       229
        COMEDY       0.25      0.30      0.27      1080
         CRIME       0.27      0.29      0.28       712
CULTURE & ARTS       0.22      0.20      0.21       215
       DIVORCE       0.57      0.53      0.55       685
     EDUCATION       0.19      0.20      0.19       203
 ENTERTAINMENT       0.43      0.49      0.46      3473
   ENVIRONMENT       0.20      0.15      0.17       289
         FIFTY       0.10      0.08      0.09       280
  FOOD & DRINK       0.41      0.46      0.43      1268
     GOOD NEWS       0.05      0.05      0.05       280
         GREEN       0.20  

In [20]:
# Random forest: 200 trees, each limited to depth 200, splitting only nodes
# that hold at least 5 samples.
model_rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=200,
    min_samples_split=5,
    random_state=42,
)
train_and_evaluate(
    model_rf, "Random Forest",
    X_train, y_train,
    X_test, y_test,
    le,
)
# Time to train model: (not recorded)

📌 Random Forest
Accuracy: 0.5008 | Macro F1: 0.3141
Classification Report:
                 precision    recall  f1-score   support

          ARTS       0.34      0.09      0.14       302
ARTS & CULTURE       0.50      0.03      0.06       268
  BLACK VOICES       0.52      0.25      0.34       917
      BUSINESS       0.47      0.26      0.33      1198
       COLLEGE       0.44      0.18      0.25       229
        COMEDY       0.73      0.28      0.41      1080
         CRIME       0.46      0.39      0.42       712
CULTURE & ARTS       0.71      0.16      0.27       215
       DIVORCE       0.87      0.59      0.70       685
     EDUCATION       0.35      0.13      0.19       203
 ENTERTAINMENT       0.30      0.71      0.42      3473
   ENVIRONMENT       0.94      0.11      0.20       289
         FIFTY       0.44      0.01      0.03       280
  FOOD & DRINK       0.56      0.58      0.57      1268
     GOOD NEWS       0.33      0.01      0.02       280
         GREEN       0.43  

In [9]:
# 1-nearest-neighbour classifier: each test row takes the label of its single
# closest training row.
model_knn = KNeighborsClassifier(
    n_neighbors=1,
)
train_and_evaluate(
    model_knn, "k-Nearest Neighbors",
    X_train, y_train,
    X_test, y_test,
    le,
)

📌 k-Nearest Neighbors
Accuracy: 0.1761 | Macro F1: 0.1562
Classification Report:
                 precision    recall  f1-score   support

          ARTS       0.18      0.04      0.07       302
ARTS & CULTURE       0.21      0.02      0.04       268
  BLACK VOICES       0.42      0.11      0.17       917
      BUSINESS       0.42      0.10      0.16      1198
       COLLEGE       0.14      0.04      0.07       229
        COMEDY       0.29      0.15      0.19      1080
         CRIME       0.36      0.06      0.10       712
CULTURE & ARTS       0.53      0.04      0.07       215
       DIVORCE       0.71      0.16      0.26       685
     EDUCATION       0.31      0.08      0.13       203
 ENTERTAINMENT       0.60      0.23      0.33      3473
   ENVIRONMENT       0.55      0.10      0.17       289
         FIFTY       0.09      0.03      0.04       280
  FOOD & DRINK       0.60      0.18      0.28      1268
     GOOD NEWS       0.09      0.02      0.03       280
         GREEN       

In [None]:
# Multinomial Naive Bayes with the library's default settings.
model_nb = MultinomialNB()
train_and_evaluate(
    model=model_nb,
    model_name="Naive Bayes",
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    label_encoder=le,
)

📌 Naive Bayes
Accuracy: 0.5294 | Macro F1: 0.2966
Classification Report:
                 precision    recall  f1-score   support

          ARTS       0.56      0.02      0.03       302
ARTS & CULTURE       0.67      0.01      0.01       268
  BLACK VOICES       0.58      0.18      0.27       917
      BUSINESS       0.48      0.34      0.40      1198
       COLLEGE       0.38      0.02      0.04       229
        COMEDY       0.67      0.26      0.38      1080
         CRIME       0.53      0.52      0.52       712
CULTURE & ARTS       0.85      0.05      0.10       215
       DIVORCE       0.85      0.46      0.60       685
     EDUCATION       0.75      0.01      0.03       203
 ENTERTAINMENT       0.48      0.79      0.60      3473
   ENVIRONMENT       0.94      0.10      0.18       289
         FIFTY       0.00      0.00      0.00       280
  FOOD & DRINK       0.57      0.72      0.64      1268
     GOOD NEWS       0.82      0.03      0.06       280
         GREEN       0.47    