#### Baselines 

Trying out some simple baselines and a rule based classifier 

In [3]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_validate
from sklearn.dummy import DummyClassifier
from sklearn import metrics
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    confusion_matrix
)
import re, random
from collections import Counter

In [4]:
DATA_PATH = "../../data/all-the-news-2-1-SMALL-CLEANED.csv"
df = pd.read_csv(DATA_PATH)
# Define features and labels
X = df["clean_article"]         # The news articles
y = df["publication"]     # The publishers (labels)


In [5]:
# Update scoring metrics for multiclass classification using "macro" averaging
scoring_metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']

# Loop through the dummy classifier strategies:
# - "most_frequent": always predicts the most common class.
# - "stratified": random predictions following the training set's class distribution.
# - "uniform": completely random predictions.
for strategy in ["most_frequent", "stratified", "uniform"]:
    print(f"--- Strategy: {strategy} ---")
    
    # Initialize the dummy classifier with the current strategy
    dummy_classifier = DummyClassifier(strategy=strategy, random_state=42)
    
    # Run 5-fold cross-validation
    scores = cross_validate(dummy_classifier, X, y, cv=5, scoring=scoring_metrics)
    
    # Print the mean of each scoring metric across folds
    for metric, score in scores.items():
        print(f"{metric}: {score.mean()}")
    print()

--- Strategy: most_frequent ---


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


fit_time: 0.02360363006591797
score_time: 0.12673234939575195
test_accuracy: 0.1
test_precision_macro: 0.01
test_recall_macro: 0.1
test_f1_macro: 0.01818181818181818

--- Strategy: stratified ---
fit_time: 0.022503185272216796
score_time: 0.14541869163513182
test_accuracy: 0.0982
test_precision_macro: 0.09821202435466034
test_recall_macro: 0.09820000000000001
test_f1_macro: 0.09819609169624727

--- Strategy: uniform ---
fit_time: 0.02249875068664551
score_time: 0.14110960960388183
test_accuracy: 0.10264999999999999
test_precision_macro: 0.10261707377007004
test_recall_macro: 0.10264999999999999
test_f1_macro: 0.10262713172068871



In [6]:
import re, random

class FewRuleClassifier:
    def __init__(self, seed=None):
        self.pubs = [
            "The New York Times","The Hill","People","CNN","Vice",
            "Fox News","BuzzFeed News","Politico","The Economist","Reuters"
        ]
        if seed is not None:
            random.seed(seed)

    def classify(self, text):

        # 1) Reuters — matches "(Reuters) –" or CITY – at top
        if re.match(r'.{0,40}\(Reuters\)\s+[–-]', text) or \
           re.match(r'^[A-Z][A-Z ]{2,30}[–-]', text):
            return "Reuters"

        # 2) The Economist — ≥3 distinct Brit-spellings
        brit_words = re.findall(
            r'\b(colou?r|organis|recognis|analys|centre|defence|licence)\w*\b',
            text.lower()
        )
        if len(set(brit_words)) >= 3:
            return "The Economist"

        # 3) People — lots of “!” *and* short sentences
        words = re.findall(r'\w+', text)
        sents = [s for s in re.split(r'[.!?]+', text) if s.strip()]
        if words:
            exc_ratio = text.count('!') / len(words)
            avg_sent  = len(words) / len(sents) if sents else 99
            if exc_ratio > 0.015 and avg_sent < 15:
                return "People"

        # 4) fallback: pure uniform random
        return random.choice(self.pubs)


In [9]:
clf = FewRuleClassifier()
preds = [clf.classify(txt) for txt in df["clean_article"]]

from sklearn.metrics import accuracy_score, classification_report
print("Accuracy:", accuracy_score(df["publication"], preds))
print(classification_report(df["publication"], preds))

Accuracy: 0.08207


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                    precision    recall  f1-score   support

     BuzzFeed News       0.00      0.00      0.00         0
     Buzzfeed News       0.00      0.00      0.00     10000
               CNN       0.10      0.10      0.10     10000
         Economist       0.00      0.00      0.00     10000
          Fox News       0.10      0.10      0.10     10000
            People       0.12      0.13      0.12     10000
          Politico       0.10      0.10      0.10     10000
           Reuters       0.10      0.10      0.10     10000
     The Economist       0.00      0.00      0.00         0
          The Hill       0.10      0.10      0.10     10000
The New York Times       0.10      0.10      0.10     10000
              Vice       0.10      0.10      0.10     10000

          accuracy                           0.08    100000
         macro avg       0.07      0.07      0.07    100000
      weighted avg       0.08      0.08      0.08    100000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
