In [2]:
! pip install gensim

import polars as pl
import pandas as pd
import numpy as np
from gensim.models import Word2Vec, Phrases, phrases
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from nltk.tokenize import wordpunct_tokenize
from tqdm.auto import tqdm
import multiprocessing, warnings, re, string, os

warnings.filterwarnings("ignore")
tqdm.pandas()

Collecting gensim
  Downloading gensim-4.3.3-cp310-cp310-win_amd64.whl (24.0 MB)
     --------------------------------------- 24.0/24.0 MB 54.7 MB/s eta 0:00:00
Collecting smart-open>=1.8.1
  Downloading smart_open-7.1.0-py3-none-any.whl (61 kB)
     ---------------------------------------- 61.7/61.7 kB ? eta 0:00:00
Installing collected packages: smart-open, gensim
Successfully installed gensim-4.3.3 smart-open-7.1.0

[notice] A new release of pip available: 22.1.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


  from .autonotebook import tqdm as notebook_tqdm


In [10]:
# read in the data and only select the article and the publication
# and getting them split into training and testing
DATA_PATH = "../../../../data/all-the-news-2-1-SMALL-CLEANED.csv"
df = pd.read_csv(DATA_PATH)
print(df.head(3)[["publication", "clean_article", "split"]])

# Split provided by the file
df_train = df[df["split"] == "train"].reset_index(drop=True)
df_test  = df[df["split"] == "test"].reset_index(drop=True)

          publication                                      clean_article  \
0  The New York Times   a love of [NAME] and slap bracelets, [NAME] s...   
1  The New York Times  warm, occasionally downright balmy, weather, a...   
2  The New York Times  dably confused. When he was a boy, Havana was ...   

   split  
0  train  
1  train  
2  train  


In [11]:
def simple_tokenizer(text: str):
    text = text.lower()
    # remove punctuation but keep intra‑word ’ characters if any
    text = re.sub(f"[{re.escape(string.punctuation)}]", " ", text)
    return wordpunct_tokenize(text)

# Tokenize every article (lazy eval with progress bar)
train_tokens = df_train["clean_article"].progress_map(simple_tokenizer)
test_tokens  = df_test["clean_article"].map(simple_tokenizer)

# Learn bigram phrases from training corpus
bigram_phrases = Phrases(train_tokens, min_count=5, threshold=10)
bigram_phraser = phrases.Phraser(bigram_phrases)

# Apply bigrams
train_tokens = train_tokens.apply(lambda x: bigram_phraser[x])
test_tokens  = test_tokens.apply(lambda x: bigram_phraser[x])

100%|██████████| 90000/90000 [00:17<00:00, 5037.05it/s]


In [12]:
EMBED_DIM  = 200        # vector size
WINDOW     = 5
MIN_COUNT  = 3
SG         = 1          # 1 = skip‑gram, 0 = CBOW
EPOCHS     = 10

from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec
from tqdm.auto import tqdm

class EpochLogger(CallbackAny2Vec):
    """Logs loss & shows a tqdm bar for each epoch."""
    def __init__(self, total_epochs):
        self.epoch     = 0
        self.pbar      = tqdm(total=total_epochs, desc="Word2Vec epochs")

    def on_epoch_end(self, model):
        loss = model.get_latest_training_loss()
        self.pbar.set_postfix({"loss": f"{loss:.2f}"})
        self.pbar.update(1)
        self.epoch += 1
        if self.epoch == self.pbar.total:
            self.pbar.close()

logger = EpochLogger(EPOCHS)

w2v_model = Word2Vec(
    sentences=list(train_tokens),
    vector_size=EMBED_DIM,
    window=WINDOW,
    min_count=MIN_COUNT,
    workers=multiprocessing.cpu_count() - 1,
    sg=SG,
    epochs=EPOCHS,
    compute_loss=True,      # required to query loss
    callbacks=[logger],
)

Word2Vec epochs: 100%|██████████| 10/10 [05:39<00:00, 34.00s/it, loss=69519680.00]


In [13]:
def sent_vector(tokens, model, dim):
    """Average the word vectors for tokens present in the model’s vocab.
       Returns a zero‑vector if no token is in the vocab."""
    valid_vecs = [model.wv[t] for t in tokens if t in model.wv]
    if not valid_vecs:
        return np.zeros(dim)
    return np.mean(valid_vecs, axis=0)

# Vectorize train & test articles
X_train = np.vstack([sent_vector(tok, w2v_model, EMBED_DIM) for tok in tqdm(train_tokens)])
X_test  = np.vstack([sent_vector(tok, w2v_model, EMBED_DIM) for tok in tqdm(test_tokens)])

100%|██████████| 90000/90000 [01:02<00:00, 1433.18it/s]
100%|██████████| 10000/10000 [00:06<00:00, 1432.76it/s]


In [14]:
lbl = LabelEncoder()
y_train = lbl.fit_transform(df_train["publication"])
y_test  = lbl.transform(df_test["publication"])

In [19]:
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report
import warnings
from sklearn.exceptions import ConvergenceWarning

warnings.filterwarnings("ignore", category=ConvergenceWarning)

clf = LinearSVC(               
    C=1.0,                         
    max_iter=1000,                  
    multi_class="ovr", 
    verbose=2                      
)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(f"\nAccuracy: {accuracy_score(y_test, y_pred):.4f}\n")
print(classification_report(y_test, y_pred, target_names=lbl.classes_))

[LibLinear]
Accuracy: 0.7751

                    precision    recall  f1-score   support

     Buzzfeed News       0.63      0.50      0.56      1000
               CNN       0.67      0.72      0.69      1000
         Economist       0.88      0.95      0.91      1000
          Fox News       0.76      0.73      0.74      1000
            People       0.81      0.91      0.86      1000
          Politico       0.65      0.70      0.68      1000
           Reuters       0.90      0.91      0.90      1000
          The Hill       0.87      0.78      0.82      1000
The New York Times       0.79      0.72      0.76      1000
              Vice       0.78      0.81      0.80      1000

          accuracy                           0.78     10000
         macro avg       0.77      0.78      0.77     10000
      weighted avg       0.77      0.78      0.77     10000



In [20]:
from sklearn.metrics import confusion_matrix

# ❶ Confusion‑matrix rows = true labels, cols = predicted labels
cm = confusion_matrix(y_test, y_pred, labels=range(len(lbl.classes_)))

# ❷ Diagonal elements are “hits” for each class
hits          = cm.diagonal()
total_true    = cm.sum(axis=1)
acc_per_class = hits / total_true

acc_df = pd.DataFrame({
    "publication": lbl.classes_,
    "n_test":      total_true,
    "correct":     hits,
    "accuracy":    acc_per_class.round(3)
}).sort_values("accuracy", ascending=False)

display(acc_df.style.bar(subset=["accuracy"], vmin=0, vmax=1, color='#66c2a5'))

Unnamed: 0,publication,n_test,correct,accuracy
2,Economist,1000,950,0.95
4,People,1000,914,0.914
6,Reuters,1000,911,0.911
9,Vice,1000,813,0.813
7,The Hill,1000,784,0.784
3,Fox News,1000,728,0.728
1,CNN,1000,725,0.725
8,The New York Times,1000,723,0.723
5,Politico,1000,705,0.705
0,Buzzfeed News,1000,498,0.498
