#### BoW -> Naive Bayes 

Using a Bag of Words approach with a Gaussian Naive Bayes classifier

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import polars as pl
from sklearn import metrics
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_validate
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.svm import LinearSVC

In [2]:
# Load the cleaned article CSV and carve it into train / test frames that
# hold only the article text and its publication label.
DATA_PATH = "../../../../data/all-the-news-2-1-SMALL-CLEANED.csv"
df = pl.read_csv(DATA_PATH)

# Narrow to the three columns this notebook uses.
df_cleaned = df.select(["clean_article", "publication", "split"])

# The "split" column says which partition a row belongs to; once a partition
# has been filtered out, the column is no longer needed and is dropped.
df_train = df_cleaned.filter(pl.col("split") == "train").drop("split")
df_test = df_cleaned.filter(pl.col("split") == "test").drop("split")

In [3]:
# Grab the text and label columns of each partition once, up front.
train_texts, train_publishers = df_train['clean_article'], df_train['publication']
test_texts, test_publishers = df_test['clean_article'], df_test['publication']

# Bag-of-words features: the vocabulary (capped at 1000 terms, English stop
# words removed) is learned from the training articles only, and the test
# articles are then counted against that same vocabulary. Both count
# matrices are converted to dense arrays.
vectorizer = CountVectorizer(stop_words='english', max_features=1000)
X_train = vectorizer.fit_transform(train_texts).toarray()
X_test = vectorizer.transform(test_texts).toarray()

# Integer-encode the publisher names; the mapping is fitted on the training
# labels and reused unchanged for the test labels.
le = LabelEncoder()
y_train = le.fit_transform(train_publishers)
y_test = le.transform(test_publishers)

In [4]:
# Fit a Gaussian Naive Bayes classifier on the bag-of-words training features.
# NOTE(review): GaussianNB models each feature as continuous; for raw word
# counts MultinomialNB is the more common choice -- confirm this is intended.
gnb = GaussianNB()
gnb.fit(X=X_train, y=y_train)

#### Results!

In [5]:
# Evaluate the fitted Gaussian Naive Bayes model (gnb) on the held-out test split.
# (classification_report, confusion_matrix and pandas are imported in the
# notebook's top import cell.)
y_pred = gnb.predict(X_test)

# Pin the label order explicitly so that the report rows and the confusion
# matrix rows both line up with le.classes_, even if some publication never
# appears in y_test / y_pred. Without `labels=`, classification_report derives
# the label set from the data and fails when it does not match target_names.
class_ids = np.arange(len(le.classes_))

# Precision / recall / F1 per publication
report = classification_report(
    y_test, y_pred, labels=class_ids, target_names=le.classes_
)
print(report)

# Per-class accuracy from the confusion matrix: rows are true labels, so the
# diagonal holds the correctly classified count and the row sum is the number
# of test articles for that publication.
cm = confusion_matrix(y_test, y_pred, labels=class_ids)
hits = cm.diagonal()
total_true = cm.sum(axis=1)

# A publication with no test articles has undefined accuracy: leave NaN there
# instead of triggering a divide-by-zero warning.
acc_per_class = np.divide(
    hits,
    total_true,
    out=np.full(total_true.shape, np.nan),
    where=total_true > 0,
)

# Summary table, best-classified publications first
acc_df = pd.DataFrame({
    "publication": le.classes_,
    "n_test":      total_true,
    "correct":     hits,
    "accuracy":    acc_per_class.round(3)
}).sort_values("accuracy", ascending=False)

display(acc_df.style.bar(subset=["accuracy"], vmin=0, vmax=1, color='#66c2a5'))


                    precision    recall  f1-score   support

     Buzzfeed News       0.41      0.25      0.31      1000
               CNN       0.48      0.24      0.32      1000
         Economist       0.87      0.72      0.79      1000
          Fox News       0.71      0.70      0.71      1000
            People       0.38      0.85      0.52      1000
          Politico       0.45      0.51      0.48      1000
           Reuters       0.71      0.77      0.74      1000
          The Hill       0.63      0.61      0.62      1000
The New York Times       0.67      0.41      0.51      1000
              Vice       0.43      0.44      0.44      1000

          accuracy                           0.55     10000
         macro avg       0.57      0.55      0.54     10000
      weighted avg       0.57      0.55      0.54     10000



Unnamed: 0,publication,n_test,correct,accuracy
4,People,1000,853,0.853
6,Reuters,1000,768,0.768
2,Economist,1000,723,0.723
3,Fox News,1000,703,0.703
7,The Hill,1000,613,0.613
5,Politico,1000,505,0.505
9,Vice,1000,440,0.44
8,The New York Times,1000,408,0.408
0,Buzzfeed News,1000,246,0.246
1,CNN,1000,244,0.244
