#### Word2Vec -> Logistic Regression

Using Word2Vec embeddings with Logistic Regression

In [1]:
!pip install -q pandas scikit-learn gensim nltk tqdm

In [2]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec, Phrases, phrases
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from nltk.tokenize import wordpunct_tokenize
from tqdm.auto import tqdm
import multiprocessing, warnings, re, string, os

warnings.filterwarnings("ignore")
tqdm.pandas()

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
DATA_PATH = "../../../../data/all-the-news-2-1-SMALL-CLEANED.csv"

df = pd.read_csv(DATA_PATH)
print(df.head(3)[["publication", "clean_article", "split"]])

# Split provided by the file
train_df = df[df["split"] == "train"].reset_index(drop=True)
test_df  = df[df["split"] == "test"].reset_index(drop=True)

print(f"Train rows: {len(train_df):,}  •  Test rows: {len(test_df):,}")

          publication                                      clean_article  \
0  The New York Times   a love of [NAME] and slap bracelets, [NAME] s...   
1  The New York Times  warm, occasionally downright balmy, weather, a...   
2  The New York Times  dably confused. When he was a boy, Havana was ...   

   split  
0  train  
1  train  
2  train  
Train rows: 90,000  •  Test rows: 10,000


In [None]:
def simple_tokenizer(text: str):
    text = text.lower()
    # remove punctuation but keep intra‑word ’ characters if any
    text = re.sub(f"[{re.escape(string.punctuation)}]", " ", text)
    return wordpunct_tokenize(text)

# Tokenize every article (lazy eval with progress bar)
train_tokens = train_df["clean_article"].progress_map(simple_tokenizer)
test_tokens  = test_df["clean_article"].map(simple_tokenizer)

# Learn bigram phrases from training corpus e.g "united states" -> "united_states"
bigram_phrases = Phrases(train_tokens, min_count=5, threshold=10)
bigram_phraser = phrases.Phraser(bigram_phrases)

# merge common tokens into 1 token
train_tokens = train_tokens.apply(lambda x: bigram_phraser[x])
test_tokens  = test_tokens.apply(lambda x: bigram_phraser[x])

100%|██████████| 90000/90000 [00:10<00:00, 8405.58it/s] 


In [5]:
EMBED_DIM  = 200        # vector size
WINDOW     = 5
MIN_COUNT  = 3
SG         = 1          # 1 = skip‑gram, 0 = CBOW
EPOCHS     = 10

# Add some logging to the Word2Vec model
from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec
from tqdm.auto import tqdm

class EpochLogger(CallbackAny2Vec):
    """Logs loss & shows a tqdm bar for each epoch."""
    def __init__(self, total_epochs):
        self.epoch     = 0
        self.pbar      = tqdm(total=total_epochs, desc="Word2Vec epochs")

    def on_epoch_end(self, model):
        loss = model.get_latest_training_loss()
        self.pbar.set_postfix({"loss": f"{loss:.2f}"})
        self.pbar.update(1)
        self.epoch += 1
        if self.epoch == self.pbar.total:
            self.pbar.close()

logger = EpochLogger(EPOCHS)

# Initialize the Word2Vec model
w2v_model = Word2Vec(
    sentences=list(train_tokens),
    vector_size=EMBED_DIM,
    window=WINDOW,
    min_count=MIN_COUNT,
    workers=multiprocessing.cpu_count() - 1,
    sg=SG,
    epochs=EPOCHS,
    compute_loss=True,      # required to query loss
    callbacks=[logger],
)

Word2Vec epochs: 100%|██████████| 10/10 [04:01<00:00, 24.15s/it, loss=70295008.00]


In [6]:
# Convert tokens to a vector
def sent_vector(tokens, model, dim):
    """Average the word vectors for tokens present in the model’s vocab.
       Returns a zero‑vector if no token is in the vocab."""
    valid_vecs = [model.wv[t] for t in tokens if t in model.wv]
    if not valid_vecs:
        return np.zeros(dim)
    return np.mean(valid_vecs, axis=0)

# Vectorize train & test articles
X_train = np.vstack([sent_vector(tok, w2v_model, EMBED_DIM) for tok in tqdm(train_tokens)])
X_test  = np.vstack([sent_vector(tok, w2v_model, EMBED_DIM) for tok in tqdm(test_tokens)])

100%|██████████| 90000/90000 [00:31<00:00, 2880.61it/s]
100%|██████████| 10000/10000 [00:03<00:00, 2723.11it/s]


In [7]:
lbl = LabelEncoder()
y_train = lbl.fit_transform(train_df["publication"])
y_test  = lbl.transform(test_df["publication"])

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import warnings
from sklearn.exceptions import ConvergenceWarning

warnings.filterwarnings("ignore", category=ConvergenceWarning)

print("➜ fitting multinomial logistic regression …")

# Fit the model
clf = LogisticRegression(
    max_iter=500,
    n_jobs=-1,
    multi_class="multinomial",
    solver="saga",   # <── swap lbfgs → saga for built‑in progress
    verbose=2        # 0 = silent, 1 = compact, 2 = detailed
)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print(f"\nAccuracy: {accuracy_score(y_test, y_pred):.4f}\n")
print(classification_report(y_test, y_pred, target_names=lbl.classes_))


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 14 concurrent workers.


➜ fitting multinomial logistic regression …
Epoch 1, change: 1
Epoch 2, change: 0.16569284
Epoch 3, change: 0.07678894
Epoch 4, change: 0.054561283
Epoch 5, change: 0.035400048
Epoch 6, change: 0.025132937
Epoch 7, change: 0.019676723
Epoch 8, change: 0.014282386
Epoch 9, change: 0.01162727
Epoch 10, change: 0.0092766006
Epoch 11, change: 0.00705009
Epoch 12, change: 0.0064125452
Epoch 13, change: 0.0059442748
Epoch 14, change: 0.0056426586
Epoch 15, change: 0.0052593956
Epoch 16, change: 0.0048964145
Epoch 17, change: 0.0044792225
Epoch 18, change: 0.0042592487
Epoch 19, change: 0.0040646056
Epoch 20, change: 0.0037165384
Epoch 21, change: 0.0036619639
Epoch 22, change: 0.003476426
Epoch 23, change: 0.0032403467
Epoch 24, change: 0.0030288748
Epoch 25, change: 0.0027148959
Epoch 26, change: 0.0025430794
Epoch 27, change: 0.0024351915
Epoch 28, change: 0.0023053747
Epoch 29, change: 0.0022139149
Epoch 30, change: 0.0020882462
Epoch 31, change: 0.001911258
Epoch 32, change: 0.0018391768

#### Results! 

In [9]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# Compute per-class metrics
precision, recall, f1, support = precision_recall_fscore_support(
    y_test, y_pred, labels=range(len(lbl.classes_)), zero_division=0
)

# Compute accuracy per class (same as before using confusion matrix)
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred, labels=range(len(lbl.classes_)))
hits = cm.diagonal()
total_true = cm.sum(axis=1)
acc_per_class = hits / total_true

# Create a DataFrame with all metrics
metrics_df = pd.DataFrame({
    "Publication": lbl.classes_,
    "Accuracy": acc_per_class.round(3),
    "Precision": precision.round(3),
    "Recall": recall.round(3),
    "F1": f1.round(3)
}).sort_values("Publication")

# Add a Top-Line summary row
topline_precision, topline_recall, topline_f1, _ = precision_recall_fscore_support(
    y_test, y_pred, average='macro', zero_division=0
)
topline_accuracy = accuracy_score(y_test, y_pred)

topline = pd.DataFrame([{
    "Publication": "**Top-Line**",
    "Accuracy": round(topline_accuracy, 3),
    "Precision": round(topline_precision, 3),
    "Recall": round(topline_recall, 3),
    "F1": round(topline_f1, 3)
}])

# Append Top-Line row
metrics_df = pd.concat([metrics_df, topline], ignore_index=True)

# Display
display(metrics_df.style.bar(subset=["Accuracy", "Precision", "Recall", "F1"], vmin=0, vmax=1, color='#66c2a5'))


Unnamed: 0,Publication,Accuracy,Precision,Recall,F1
0,Buzzfeed News,0.539,0.57,0.539,0.554
1,CNN,0.708,0.652,0.708,0.679
2,Economist,0.932,0.844,0.932,0.886
3,Fox News,0.695,0.777,0.695,0.734
4,People,0.892,0.815,0.892,0.852
5,Politico,0.717,0.637,0.717,0.675
6,Reuters,0.84,0.851,0.84,0.845
7,The Hill,0.747,0.848,0.747,0.794
8,The New York Times,0.675,0.771,0.675,0.72
9,Vice,0.788,0.783,0.788,0.785
