# Question 1: Text Vectorization and Supervised Learning (English)

Working with the `ag_news` dataset — news articles categorized into four topics:
- World (0)
- Sports (1)
- Business (2)
- Sci/Tech (3)

In [1]:
import re
import pandas as pd
from datasets import load_dataset
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

RANDOM_SEED = 42

## 1.1 Data Preparation

Load the `ag_news` dataset from Hugging Face and convert to pandas DataFrames.

In [2]:
dataset = load_dataset("ag_news")

train_df = dataset["train"].to_pandas()
test_df = dataset["test"].to_pandas()

print(f"Training set size: {len(train_df)}")
print(f"Testing set size:  {len(test_df)}")
train_df.head()

Training set size: 120000
Testing set size:  7600


Unnamed: 0,text,label
0,Wall St. Bears Claw Back Into the Black (Reute...,2
1,Carlyle Looks Toward Commercial Aerospace (Reu...,2
2,Oil and Economy Cloud Stocks' Outlook (Reuters...,2
3,Iraq Halts Oil Exports from Main Southern Pipe...,2
4,"Oil prices soar to all-time record, posing new...",2


## 1.2(a) Preprocessing

- Convert to lowercase
- Remove punctuation and special characters
- Remove standard English stop words

In [3]:
def preprocess(text: str) -> str:
    """Lowercase, strip punctuation/special chars, collapse whitespace."""
    text = text.lower()
    text = re.sub(r"[^a-z\s]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

train_df["clean"] = train_df["text"].apply(preprocess)
test_df["clean"] = test_df["text"].apply(preprocess)

train_df[["text", "clean"]].head()

Unnamed: 0,text,clean
0,Wall St. Bears Claw Back Into the Black (Reute...,wall st bears claw back into the black reuters...
1,Carlyle Looks Toward Commercial Aerospace (Reu...,carlyle looks toward commercial aerospace reut...
2,Oil and Economy Cloud Stocks' Outlook (Reuters...,oil and economy cloud stocks outlook reuters r...
3,Iraq Halts Oil Exports from Main Southern Pipe...,iraq halts oil exports from main southern pipe...
4,"Oil prices soar to all-time record, posing new...",oil prices soar to all time record posing new ...


## 1.2(b) Vectorization — Bag of Words

In [4]:
bow_vectorizer = CountVectorizer(stop_words="english")

X_train_bow = bow_vectorizer.fit_transform(train_df["clean"])
X_test_bow = bow_vectorizer.transform(test_df["clean"])

bow_vocab = bow_vectorizer.get_feature_names_out()
print(f"BoW dictionary size: {len(bow_vocab)}")

# Top 10 most frequent words in the training corpus
bow_word_counts = X_train_bow.sum(axis=0).A1  # dense 1-d array
top10_bow_idx = bow_word_counts.argsort()[::-1][:10]
print("\nTop 10 words (BoW):")
for rank, idx in enumerate(top10_bow_idx, 1):
    print(f"  {rank}. {bow_vocab[idx]}  (count: {int(bow_word_counts[idx])})")

BoW dictionary size: 61488

Top 10 words (BoW):
  1. new  (count: 21428)
  2. said  (count: 20267)
  3. reuters  (count: 19340)
  4. ap  (count: 16277)
  5. gt  (count: 13239)
  6. lt  (count: 13183)
  7. year  (count: 9773)
  8. quot  (count: 9596)
  9. world  (count: 8634)
  10. company  (count: 7656)


## 1.2(b) Vectorization — TF-IDF

In [5]:
tfidf_vectorizer = TfidfVectorizer(stop_words="english")

X_train_tfidf = tfidf_vectorizer.fit_transform(train_df["clean"])
X_test_tfidf = tfidf_vectorizer.transform(test_df["clean"])

tfidf_vocab = tfidf_vectorizer.get_feature_names_out()
print(f"TF-IDF dictionary size: {len(tfidf_vocab)}")

# Top 10 words by total TF-IDF weight across training corpus
tfidf_word_weights = X_train_tfidf.sum(axis=0).A1
top10_tfidf_idx = tfidf_word_weights.argsort()[::-1][:10]
print("\nTop 10 words (TF-IDF):")
for rank, idx in enumerate(top10_tfidf_idx, 1):
    print(f"  {rank}. {tfidf_vocab[idx]}  (total weight: {tfidf_word_weights[idx]:.2f})")

TF-IDF dictionary size: 61488

Top 10 words (TF-IDF):
  1. new  (total weight: 1653.94)
  2. reuters  (total weight: 1576.99)
  3. ap  (total weight: 1559.36)
  4. said  (total weight: 1489.33)
  5. gt  (total weight: 1163.33)
  6. lt  (total weight: 1160.21)
  7. quot  (total weight: 1030.11)
  8. year  (total weight: 958.83)
  9. oil  (total weight: 924.37)
  10. world  (total weight: 887.45)


## 1.2(c) Modeling — Logistic Regression with BoW features

In [6]:
y_train = train_df["label"]
y_test = test_df["label"]

clf_bow = LogisticRegression(max_iter=1000, random_state=RANDOM_SEED)
clf_bow.fit(X_train_bow, y_train)

bow_train_acc = accuracy_score(y_train, clf_bow.predict(X_train_bow))
bow_test_acc = accuracy_score(y_test, clf_bow.predict(X_test_bow))

print("=== BoW + Logistic Regression ===")
print(f"Training accuracy: {bow_train_acc:.4f}")
print(f"Testing accuracy:  {bow_test_acc:.4f}")

=== BoW + Logistic Regression ===
Training accuracy: 0.9779
Testing accuracy:  0.9070


## 1.2(c) Modeling — Logistic Regression with TF-IDF features

In [7]:
clf_tfidf = LogisticRegression(max_iter=1000, random_state=RANDOM_SEED)
clf_tfidf.fit(X_train_tfidf, y_train)

tfidf_train_acc = accuracy_score(y_train, clf_tfidf.predict(X_train_tfidf))
tfidf_test_acc = accuracy_score(y_test, clf_tfidf.predict(X_test_tfidf))

print("=== TF-IDF + Logistic Regression ===")
print(f"Training accuracy: {tfidf_train_acc:.4f}")
print(f"Testing accuracy:  {tfidf_test_acc:.4f}")

=== TF-IDF + Logistic Regression ===
Training accuracy: 0.9431
Testing accuracy:  0.9161


## Summary

In [8]:
summary = pd.DataFrame({
    "Method": ["BoW", "TF-IDF"],
    "Dictionary Size": [len(bow_vocab), len(tfidf_vocab)],
    "Train Accuracy": [f"{bow_train_acc:.4f}", f"{tfidf_train_acc:.4f}"],
    "Test Accuracy": [f"{bow_test_acc:.4f}", f"{tfidf_test_acc:.4f}"],
})
summary

Unnamed: 0,Method,Dictionary Size,Train Accuracy,Test Accuracy
0,BoW,61488,0.9779,0.907
1,TF-IDF,61488,0.9431,0.9161
