In [1]:
!pip install -q pandas scikit-learn gensim nltk tqdm

In [2]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec, Phrases, phrases
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from nltk.tokenize import wordpunct_tokenize
from tqdm.auto import tqdm
import multiprocessing, warnings, re, string, os

warnings.filterwarnings("ignore")
tqdm.pandas()

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
DATA_PATH = "../../../../data/all-the-news-2-1-SMALL-CLEANED.csv"

df = pd.read_csv(DATA_PATH)
print(df.head(3)[["publication", "clean_article", "split"]])

# Split provided by the file
train_df = df[df["split"] == "train"].reset_index(drop=True)
test_df  = df[df["split"] == "test"].reset_index(drop=True)

print(f"Train rows: {len(train_df):,}  •  Test rows: {len(test_df):,}")

          publication                                      clean_article  \
0  The New York Times   a love of [NAME] and slap bracelets, [NAME] s...   
1  The New York Times  warm, occasionally downright balmy, weather, a...   
2  The New York Times  dably confused. When he was a boy, Havana was ...   

   split  
0  train  
1  train  
2  train  
Train rows: 90,000  •  Test rows: 10,000


In [4]:
def simple_tokenizer(text: str):
    text = text.lower()
    # remove punctuation but keep intra‑word ’ characters if any
    text = re.sub(f"[{re.escape(string.punctuation)}]", " ", text)
    return wordpunct_tokenize(text)

# Tokenize every article (lazy eval with progress bar)
train_tokens = train_df["clean_article"].progress_map(simple_tokenizer)
test_tokens  = test_df["clean_article"].map(simple_tokenizer)

# Learn bigram phrases from training corpus
bigram_phrases = Phrases(train_tokens, min_count=5, threshold=10)
bigram_phraser = phrases.Phraser(bigram_phrases)

# Apply bigrams
train_tokens = train_tokens.apply(lambda x: bigram_phraser[x])
test_tokens  = test_tokens.apply(lambda x: bigram_phraser[x])

100%|██████████| 90000/90000 [00:10<00:00, 8598.52it/s] 


In [5]:
EMBED_DIM  = 200        # vector size
WINDOW     = 5
MIN_COUNT  = 3
SG         = 1          # 1 = skip‑gram, 0 = CBOW
EPOCHS     = 10

from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec
from tqdm.auto import tqdm

class EpochLogger(CallbackAny2Vec):
    """Logs loss & shows a tqdm bar for each epoch."""
    def __init__(self, total_epochs):
        self.epoch     = 0
        self.pbar      = tqdm(total=total_epochs, desc="Word2Vec epochs")

    def on_epoch_end(self, model):
        loss = model.get_latest_training_loss()
        self.pbar.set_postfix({"loss": f"{loss:.2f}"})
        self.pbar.update(1)
        self.epoch += 1
        if self.epoch == self.pbar.total:
            self.pbar.close()

logger = EpochLogger(EPOCHS)

w2v_model = Word2Vec(
    sentences=list(train_tokens),
    vector_size=EMBED_DIM,
    window=WINDOW,
    min_count=MIN_COUNT,
    workers=multiprocessing.cpu_count() - 1,
    sg=SG,
    epochs=EPOCHS,
    compute_loss=True,      # required to query loss
    callbacks=[logger],
)

Word2Vec epochs: 100%|██████████| 10/10 [03:54<00:00, 23.44s/it, loss=70370616.00]


In [6]:
def sent_vector(tokens, model, dim):
    """Average the word vectors for tokens present in the model’s vocab.
       Returns a zero‑vector if no token is in the vocab."""
    valid_vecs = [model.wv[t] for t in tokens if t in model.wv]
    if not valid_vecs:
        return np.zeros(dim)
    return np.mean(valid_vecs, axis=0)

# Vectorize train & test articles
X_train = np.vstack([sent_vector(tok, w2v_model, EMBED_DIM) for tok in tqdm(train_tokens)])
X_test  = np.vstack([sent_vector(tok, w2v_model, EMBED_DIM) for tok in tqdm(test_tokens)])

100%|██████████| 90000/90000 [00:29<00:00, 3057.06it/s]
100%|██████████| 10000/10000 [00:03<00:00, 2916.63it/s]


In [7]:
lbl = LabelEncoder()
y_train = lbl.fit_transform(train_df["publication"])
y_test  = lbl.transform(test_df["publication"])

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import warnings
from sklearn.exceptions import ConvergenceWarning

warnings.filterwarnings("ignore", category=ConvergenceWarning)

print("➜ fitting multinomial logistic regression …")

clf = LogisticRegression(
    max_iter=500,
    n_jobs=-1,
    multi_class="multinomial",
    solver="saga",   # <── swap lbfgs → saga for built‑in progress
    verbose=2        # 0 = silent, 1 = compact, 2 = detailed
)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(f"\nAccuracy: {accuracy_score(y_test, y_pred):.4f}\n")
print(classification_report(y_test, y_pred, target_names=lbl.classes_))


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 14 concurrent workers.


➜ fitting multinomial logistic regression …
Epoch 1, change: 1
Epoch 2, change: 0.22528766
Epoch 3, change: 0.10609978
Epoch 4, change: 0.064357772
Epoch 5, change: 0.037169386
Epoch 6, change: 0.027772887
Epoch 7, change: 0.021195205
Epoch 8, change: 0.016564978
Epoch 9, change: 0.011365883
Epoch 10, change: 0.0095630484
Epoch 11, change: 0.0078384262
Epoch 12, change: 0.0067749382
Epoch 13, change: 0.0060203695
Epoch 14, change: 0.0054770941
Epoch 15, change: 0.0049634045
Epoch 16, change: 0.0046122763
Epoch 17, change: 0.0043170792
Epoch 18, change: 0.0040159337
Epoch 19, change: 0.0037848926
Epoch 20, change: 0.0036282705
Epoch 21, change: 0.0033824567
Epoch 22, change: 0.0031073925
Epoch 23, change: 0.0028573468
Epoch 24, change: 0.0026119046
Epoch 25, change: 0.0024275014
Epoch 26, change: 0.0022909422
Epoch 27, change: 0.0021519039
Epoch 28, change: 0.0020571619
Epoch 29, change: 0.0019627516
Epoch 30, change: 0.0017946761
Epoch 31, change: 0.0017236859
Epoch 32, change: 0.00159

In [9]:
from sklearn.metrics import confusion_matrix

# ❶ Confusion‑matrix rows = true labels, cols = predicted labels
cm = confusion_matrix(y_test, y_pred, labels=range(len(lbl.classes_)))

# ❷ Diagonal elements are “hits” for each class
hits          = cm.diagonal()
total_true    = cm.sum(axis=1)
acc_per_class = hits / total_true

acc_df = pd.DataFrame({
    "publication": lbl.classes_,
    "n_test":      total_true,
    "correct":     hits,
    "accuracy":    acc_per_class.round(3)
}).sort_values("accuracy", ascending=False)

display(acc_df.style.bar(subset=["accuracy"], vmin=0, vmax=1, color='#66c2a5'))

Unnamed: 0,publication,n_test,correct,accuracy
2,Economist,1000,929,0.929
4,People,1000,889,0.889
6,Reuters,1000,846,0.846
9,Vice,1000,792,0.792
7,The Hill,1000,739,0.739
5,Politico,1000,705,0.705
1,CNN,1000,703,0.703
8,The New York Times,1000,694,0.694
3,Fox News,1000,672,0.672
0,Buzzfeed News,1000,545,0.545


In [10]:
import joblib, pickle, pathlib
MODEL_DIR = pathlib.Path("models"); MODEL_DIR.mkdir(exist_ok=True)

w2v_model.save(MODEL_DIR / "news_w2v.model")
joblib.dump(clf, MODEL_DIR / "logreg.pkl")
pickle.dump(lbl, open(MODEL_DIR / "label_encoder.pkl", "wb"))
print("Models saved to", MODEL_DIR.resolve())

AttributeError: 'PosixPath' object has no attribute 'endswith'