In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix


def load_data():
    """Load the sport and politics newsgroups as a binary classification task.

    Fetches five categories of the 20 newsgroups text dataset (two
    ``rec.sport.*`` groups and three ``talk.politics.*`` groups) with
    headers, footers and quotes removed, then collapses the per-newsgroup
    targets into two classes. The class distribution is printed.

    Returns:
        tuple: ``(X, y_binary)`` where ``X`` is ``dataset.data`` (the raw
        documents) and ``y_binary`` is a NumPy array holding 0 for sport
        documents and 1 for politics documents.
    """
    # Only the sport and politics topics out of the 20 available newsgroups.
    categories = [
        'rec.sport.baseball',
        'rec.sport.hockey',
        'talk.politics.misc',
        'talk.politics.guns',
        'talk.politics.mideast'
    ]

    # Fetch the desired subset of the 20 newsgroups text dataset.
    dataset = fetch_20newsgroups(
        subset='all',
        categories=categories,
        remove=('headers', 'footers', 'quotes')
    )

    X = dataset.data
    y = dataset.target

    # Look the sport classes up by name instead of hard-coding their integer
    # ids: the ids follow the order of dataset.target_names, so a literal
    # [0, 1] would silently mislabel documents if the category list changed.
    sport_indices = [
        index
        for index, name in enumerate(dataset.target_names)
        if name.startswith('rec.sport.')
    ]

    # Convert to binary labels: 0 = sport, 1 = politics.
    y_binary = np.where(np.isin(y, sport_indices), 0, 1)

    print("Class Distribution:", Counter(y_binary))
    return X, y_binary


def get_features(X_train, X_test, feature_type):
    """Vectorize train and test documents with the requested representation.

    Args:
        X_train: Training documents; the vectorizer is fitted on these.
        X_test: Test documents; transformed with the already-fitted
            vectorizer, so they do not influence the fit.
        feature_type: One of ``"bow"`` (bag of words), ``"tfidf"``
            (TF-IDF) or ``"tfidf_bigram"`` (TF-IDF over unigrams and
            bigrams). English stop words are removed in every case.

    Returns:
        tuple: ``(X_train_vec, X_test_vec)``, the vectorized train and test
        documents.

    Raises:
        ValueError: If ``feature_type`` is not one of the supported names.
    """
    # Factories are lazy so that only the requested vectorizer is built.
    factories = (
        ("bow", lambda: CountVectorizer(stop_words='english')),
        ("tfidf", lambda: TfidfVectorizer(stop_words='english')),
        (
            "tfidf_bigram",
            lambda: TfidfVectorizer(stop_words='english', ngram_range=(1, 2)),
        ),
    )

    build = next(
        (factory for name, factory in factories if feature_type == name),
        None,
    )
    if build is None:
        raise ValueError("Invalid feature type")

    text_vectorizer = build()
    train_matrix = text_vectorizer.fit_transform(X_train)
    test_matrix = text_vectorizer.transform(X_test)
    return train_matrix, test_matrix


def plot_confusion_matrix(cm, model_name, feature_type):
    """Draw a binary confusion matrix and save it as a PNG under ``results/``.

    Args:
        cm: Confusion matrix indexed as ``cm[i, j]``; the four cells with
            ``i, j`` in ``{0, 1}`` are annotated with their values.
        model_name: Model identifier, used in the title and the file name.
        feature_type: Feature identifier, used in the title and the file
            name.

    The image is written to ``results/{model_name}_{feature_type}_cm.png``
    (the directory is created when missing) and the figure is then closed.
    """
    tick_positions = [0, 1]
    class_names = ["Sport", "Politics"]
    output_path = f"results/{model_name}_{feature_type}_cm.png"

    plt.figure()
    plt.imshow(cm)
    plt.title(f"{model_name} - {feature_type}")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")

    # Binary class labels on both axes.
    plt.xticks(tick_positions, class_names)
    plt.yticks(tick_positions, class_names)

    # Annotate every cell; text coordinates are (x, y) = (column, row).
    for row, col in np.ndindex(2, 2):
        plt.text(col, row, cm[row, col], ha='center')

    plt.colorbar()

    os.makedirs("results", exist_ok=True)
    plt.savefig(output_path)
    plt.close()


def main():
    """Train and evaluate every classifier on every feature representation.

    Loads the binary sport/politics data (0: sports, 1: politics), makes a
    stratified 80/20 train/test split, and for each combination of feature
    type and model fits on the training part, predicts on the test part,
    saves a confusion-matrix plot and prints accuracy and F1 score.
    """
    texts, labels = load_data()

    # 80-20 split; stratifying on the labels preserves the class
    # distribution in both parts.
    split = train_test_split(
        texts, labels,
        test_size=0.2,
        random_state=42,
        stratify=labels
    )
    train_texts, test_texts, train_labels, test_labels = split

    classifiers = {
        "Naive_Bayes": MultinomialNB(),
        "Logistic_Regression": LogisticRegression(max_iter=2000),
        "Linear_SVM": LinearSVC()
    }

    for representation in ["bow", "tfidf", "tfidf_bigram"]:

        # Convert the text to numerical features once per representation.
        train_matrix, test_matrix = get_features(
            train_texts, test_texts, representation
        )

        for clf_name, clf in classifiers.items():

            # Fit on the training part, predict on the held-out part.
            clf.fit(train_matrix, train_labels)
            predictions = clf.predict(test_matrix)

            accuracy = accuracy_score(test_labels, predictions)
            f1_value = f1_score(test_labels, predictions)

            matrix = confusion_matrix(test_labels, predictions)
            plot_confusion_matrix(matrix, clf_name, representation)

            print(
                f"\n{clf_name} | {representation}",
                f"Accuracy : {accuracy:.4f}",
                f"F1 Score : {f1_value:.4f}",
                "-" * 40,
                sep="\n",
            )

# Run the experiment only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Class Distribution: Counter({1: 2625, 0: 1993})

Naive_Bayes | bow
Accuracy : 0.9686
F1 Score : 0.9730
----------------------------------------

Logistic_Regression | bow
Accuracy : 0.9502
F1 Score : 0.9569
----------------------------------------

Linear_SVM | bow
Accuracy : 0.9286
F1 Score : 0.9368
----------------------------------------

Naive_Bayes | tfidf
Accuracy : 0.9502
F1 Score : 0.9578
----------------------------------------

Logistic_Regression | tfidf
Accuracy : 0.9470
F1 Score : 0.9551
----------------------------------------

Linear_SVM | tfidf
Accuracy : 0.9632
F1 Score : 0.9682
----------------------------------------

Naive_Bayes | tfidf_bigram
Accuracy : 0.9221
F1 Score : 0.9355
----------------------------------------

Logistic_Regression | tfidf_bigram
Accuracy : 0.9416
F1 Score : 0.9507
----------------------------------------

Linear_SVM | tfidf_bigram
Accuracy : 0.9610
F1 Score : 0.9664
----------------------------------------
