In [1]:
import numpy as np
import pandas as pd
import re
import string
import math

import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ruzgh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load the BBC news dataset; the preview printed below shows two columns,
# `category` and `text`.
data = pd.read_csv("bbc-text.csv")
# Plain-text preview of the first rows.
print(data.head())

        category                                               text
0           tech  tv future in the hands of viewers with home th...
1       business  worldcom boss  left books alone  former worldc...
2          sport  tigers wary of farrell  gamble  leicester say ...
3          sport  yeading face newcastle in fa cup premiership s...
4  entertainment  ocean s twelve raids box office ocean s twelve...


# Data Preprocessing:
1. Clean the text data by removing punctuation, converting to lowercase, and removing stop words.
2. Tokenize the text into individual words.


In [3]:
# NLTK's English stop-word list, held as a set so that the per-token
# membership test in preprocess() is a hash lookup.
stop_words = set(stopwords.words("english"))

def preprocess(text, stopword_set=None):
    """Normalise a raw document and split it into content-word tokens.

    Steps, in order: lowercase, replace every ASCII punctuation character
    and every run of digits with a space, split on whitespace, and drop
    stop words.

    Args:
        text: The raw document as a string.
        stopword_set: Optional collection of lowercase words to discard.
            When omitted, the notebook-level ``stop_words`` set is used.

    Returns:
        A list of lowercase tokens in their original order.
    """
    if stopword_set is None:
        stopword_set = stop_words
    # Lowercase
    text = text.lower()
    # Remove symbols. string.punctuation has to be escaped before it is
    # placed inside a character class: left raw, its "\]" pair is parsed as
    # an escaped "]" and the backslash itself is never matched.
    text = re.sub(f"[{re.escape(string.punctuation)}]", " ", text)
    # Remove numbers
    text = re.sub(r"\d+", " ", text)
    # Tokenize
    tokens = text.split()
    # Remove stopwords
    tokens = [t for t in tokens if t not in stopword_set]
    return tokens

# Tokenise every article; the resulting token lists go into a new "tokens" column.
data["tokens"] = data["text"].apply(preprocess)

In [4]:
# Display the frame to check the new "tokens" column next to the raw text.
data

Unnamed: 0,category,text,tokens
0,tech,tv future in the hands of viewers with home th...,"[tv, future, hands, viewers, home, theatre, sy..."
1,business,worldcom boss left books alone former worldc...,"[worldcom, boss, left, books, alone, former, w..."
2,sport,tigers wary of farrell gamble leicester say ...,"[tigers, wary, farrell, gamble, leicester, say..."
3,sport,yeading face newcastle in fa cup premiership s...,"[yeading, face, newcastle, fa, cup, premiershi..."
4,entertainment,ocean s twelve raids box office ocean s twelve...,"[ocean, twelve, raids, box, office, ocean, twe..."
...,...,...,...
2220,business,cars pull down us retail figures us retail sal...,"[cars, pull, us, retail, figures, us, retail, ..."
2221,politics,kilroy unveils immigration policy ex-chatshow ...,"[kilroy, unveils, immigration, policy, ex, cha..."
2222,entertainment,rem announce new glasgow concert us band rem h...,"[rem, announce, new, glasgow, concert, us, ban..."
2223,politics,how political squabbles snowball it s become c...,"[political, squabbles, snowball, become, commo..."


In [5]:
# Build vocab: gather every distinct token in the corpus, then give each one
# an integer index following sorted order.
vocab_set = {token for doc in data["tokens"] for token in doc}

sorted_vocab = sorted(vocab_set)
vocab = dict(zip(sorted_vocab, range(len(sorted_vocab))))

In [6]:
# Peek at the first ten (word, index) pairs of the vocabulary.
list(vocab.items())[:10]

[('aa', 0),
 ('aaa', 1),
 ('aaas', 2),
 ('aac', 3),
 ('aadc', 4),
 ('aaliyah', 5),
 ('aaltra', 6),
 ('aamir', 7),
 ('aan', 8),
 ('aara', 9)]

In [7]:
# Number of distinct tokens in the vocabulary.
len(vocab)

27758

# Implement Bag of Words

In [8]:
# docs to BoW: one row per document, one column per vocabulary word, each
# cell holding how many times that word occurs in that document.
# The matrix is allocated once up front. Building a nested Python list of
# len(docs) * len(vocab) ints and converting it afterwards produces the same
# array but needs several times more memory along the way.
X_bow = np.zeros((len(data["tokens"]), len(vocab)), dtype=int)
for row, doc in enumerate(data["tokens"]):
    for token in doc:
        col = vocab.get(token)
        # Tokens missing from the vocabulary are ignored.
        if col is not None:
            X_bow[row, col] += 1

X_bow

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], shape=(2225, 27758))

In [9]:
# Category label of each article, taken from the same frame as the tokens.
y = data["category"]

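# Implement TF-IDF

Term frequency is the share of a document's tokens taken up by a word, $\mathrm{tf}(w, d) = \mathrm{count}(w, d) / |d|$. Inverse document frequency is smoothed as $\mathrm{idf}(w) = \ln\big(N / (1 + \mathrm{df}(w))\big)$, where $N$ is the number of documents and $\mathrm{df}(w)$ is the number of documents that contain $w$. The TF-IDF score is their product.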

In [10]:
# IDF
# Number of documents in the corpus.
N = len(data["tokens"])
# word -> inverse document frequency; filled in by a later cell.
idf = {}

# Every distinct token in the corpus; filled in by the next cell.
all_tokens = set()


In [11]:
# Gather every distinct token; set.update adds a whole document's tokens at once.
for doc in data["tokens"]:
    all_tokens.update(doc)

In [12]:
# Document frequency: the number of documents each word appears in.
# A single pass over the corpus, counting each word at most once per document
# (hence set(doc)), replaces a scan of every document's token list for every
# vocabulary word.
doc_freq = {}
for doc in data["tokens"]:
    for word in set(doc):
        doc_freq[word] = doc_freq.get(word, 0) + 1

# Smoothed IDF, log(N / (1 + df)). Because of the +1, a word that occurs in
# every document comes out slightly negative rather than zero.
for word in all_tokens:
    idf[word] = math.log(N / (1 + doc_freq.get(word, 0)))

# IDF values laid out in the vocabulary's iteration order.
idf_array = np.array([idf[word] for word in vocab.keys()])


In [13]:
from collections import Counter
import numpy as np

# TF-IDF matrix: one row per document, one column per vocabulary word.
# Allocated once as a float array instead of growing a nested Python list of
# len(docs) * len(vocab) entries and converting it at the end.
X_tfidf = np.zeros((len(data["tokens"]), len(vocab)))

for row, tokens in enumerate(data["tokens"]):
    counts = Counter(tokens)            # word frequencies
    total_terms = len(tokens)           # total terms in the document

    # An empty document has no counts, so the division below is never
    # reached with total_terms == 0.
    for word, count in counts.items():
        col = vocab.get(word)
        if col is not None:
            tf = count / total_terms    # tf: share of the document's tokens
            X_tfidf[row, col] = tf * idf[word]  # TF-IDF


## Analysis:
1. For a given category, find the top 10 words with the highest average TF-IDF scores.
2. Identify words that have high TF scores but low IDF scores, and vice versa.




In [14]:
# Distinct category labels, sorted so that loops over them run in the same
# order on every kernel start. Iterating a bare set of strings gives an order
# that can change between Python processes because str hashes are randomised.
categories = sorted(set(y))

# category -> list of its top (word, average TF-IDF) pairs; filled in below.
category_avg_tfidf = {}

In [15]:
# 1
# Labels as a plain array so each category's rows can be picked with a boolean mask.
labels = np.asarray(y)

for category in categories:
    # Mean TF-IDF of every vocabulary word over this category's documents.
    avg_tfidf = X_tfidf[labels == category].mean(axis=0)
    word_avg_tfidf = {word: score for word, score in zip(vocab, avg_tfidf)}

    # Highest average score first; keep the ten best.
    ranked = sorted(word_avg_tfidf.items(), key=lambda pair: pair[1], reverse=True)
    top_words = ranked[:10]

    category_avg_tfidf[category] = top_words

for category, top_words in category_avg_tfidf.items():
    print(f"Category: {category}")
    for word, score in top_words:
        print(f"  {word}: {score:.4f}")

Category: tech
  mobile: 0.0115
  software: 0.0106
  users: 0.0104
  microsoft: 0.0090
  technology: 0.0090
  people: 0.0087
  computer: 0.0083
  net: 0.0082
  digital: 0.0080
  broadband: 0.0080
Category: sport
  england: 0.0099
  game: 0.0095
  cup: 0.0089
  win: 0.0085
  match: 0.0085
  injury: 0.0080
  chelsea: 0.0079
  club: 0.0078
  team: 0.0074
  season: 0.0073
Category: business
  bn: 0.0158
  bank: 0.0097
  growth: 0.0094
  oil: 0.0090
  economy: 0.0088
  sales: 0.0085
  shares: 0.0084
  company: 0.0082
  us: 0.0081
  market: 0.0080
Category: entertainment
  film: 0.0249
  best: 0.0125
  awards: 0.0100
  show: 0.0099
  music: 0.0097
  band: 0.0096
  award: 0.0093
  festival: 0.0091
  album: 0.0088
  actor: 0.0082
Category: politics
  labour: 0.0165
  mr: 0.0164
  election: 0.0142
  blair: 0.0141
  party: 0.0136
  government: 0.0104
  brown: 0.0097
  howard: 0.0086
  minister: 0.0083
  tory: 0.0073


In [16]:
# 2
# Corpus-wide term frequency: each word's share of all token occurrences.
global_tf = np.sum(X_bow, axis=0) / np.sum(X_bow)

# The medians are the high/low thresholds. They do not depend on the word
# being examined, so they are computed once here instead of on every
# iteration of the loop below.
median_tf = np.median(global_tf)
median_idf = np.median(idf_array)

high_tf_low_idf = []
high_idf_low_tf = []

for word, idx in vocab.items():
    word_tf = global_tf[idx]
    word_idf = idf_array[idx]

    # High TF, Low IDF
    if word_tf > median_tf and word_idf < median_idf:
        high_tf_low_idf.append((word, word_tf, word_idf))

    # Low TF, High IDF
    if word_tf < median_tf and word_idf > median_idf:
        high_idf_low_tf.append((word, word_tf, word_idf))

# Sort and keep top 10
# First list: highest TF first, ties broken by lowest IDF.
# Second list: highest IDF first, ties broken by lowest TF.
high_tf_low_idf_sorted = sorted(high_tf_low_idf, key=lambda x: (x[1], -x[2]), reverse=True)[:10]
high_idf_low_tf_sorted = sorted(high_idf_low_tf, key=lambda x: (x[2], -x[1]), reverse=True)[:10]

# Results
print("\nWords with High TF, Low IDF:")
for word, tf_val, idf_val in high_tf_low_idf_sorted:
    print(f"  {word}: TF={tf_val:.4f}, IDF={idf_val:.4f}")

print("\nWords with Low TF, High IDF:")
for word, tf_val, idf_val in high_idf_low_tf_sorted:
    print(f"  {word}: TF={tf_val:.4f}, IDF={idf_val:.4f}")




Words with High TF, Low IDF:
  said: TF=0.0149, IDF=0.1637
  mr: TF=0.0062, IDF=1.0342
  would: TF=0.0053, IDF=0.6626
  year: TF=0.0047, IDF=0.6250
  also: TF=0.0044, IDF=0.5639
  people: TF=0.0042, IDF=1.0242
  new: TF=0.0041, IDF=0.8179
  us: TF=0.0040, IDF=0.9682
  one: TF=0.0039, IDF=0.7284
  could: TF=0.0031, IDF=0.9321

Words with Low TF, High IDF:
  aa: TF=0.0000, IDF=7.0144
  aaltra: TF=0.0000, IDF=7.0144
  aamir: TF=0.0000, IDF=7.0144
  aan: TF=0.0000, IDF=7.0144
  aara: TF=0.0000, IDF=7.0144
  aarhus: TF=0.0000, IDF=7.0144
  abate: TF=0.0000, IDF=7.0144
  abatement: TF=0.0000, IDF=7.0144
  abbot: TF=0.0000, IDF=7.0144
  abbreviated: TF=0.0000, IDF=7.0144
