# BBC News Article Classification using Custom Bag of Words and TF-IDF Implementations

In [35]:
import numpy as np
import pandas as pd
import string
import re
import nltk

In [36]:
df = pd.read_csv('bbc-text.csv')
df.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2225 entries, 0 to 2224
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   category  2225 non-null   object
 1   text      2225 non-null   object
dtypes: object(2)
memory usage: 34.9+ KB


### 1. Data Preprocessing

#### Lowercase, remove punctuation and stopwords

In [38]:
special_chars = string.punctuation
df['text'] = df['text'].apply(lambda x: str(x).lower())
df['text'] = df['text'].apply(lambda x:re.sub(r'[{}]'.format(re.escape(special_chars)), '', str(x)))
df.text.head()

0    tv future in the hands of viewers with home th...
1    worldcom boss  left books alone  former worldc...
2    tigers wary of farrell  gamble  leicester say ...
3    yeading face newcastle in fa cup premiership s...
4    ocean s twelve raids box office ocean s twelve...
Name: text, dtype: object

In [39]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jasmin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [40]:
stopwords = set(nltk.corpus.stopwords.words('english'))

In [41]:
df['text'] = df['text'].apply(lambda x:' '.join([word for word in str(x).split() if word not in stopwords]))
df.text

0       tv future hands viewers home theatre systems p...
1       worldcom boss left books alone former worldcom...
2       tigers wary farrell gamble leicester say rushe...
3       yeading face newcastle fa cup premiership side...
4       ocean twelve raids box office ocean twelve cri...
                              ...                        
2220    cars pull us retail figures us retail sales fe...
2221    kilroy unveils immigration policy exchatshow h...
2222    rem announce new glasgow concert us band rem a...
2223    political squabbles snowball become commonplac...
2224    souness delight euro progress boss graeme soun...
Name: text, Length: 2225, dtype: object

tokenize

In [42]:
articles = df.text.tolist()
articles

['tv future hands viewers home theatre systems plasma highdefinition tvs digital video recorders moving living room way people watch tv radically different five years time according expert panel gathered annual consumer electronics show las vegas discuss new technologies impact one favourite pastimes us leading trend programmes content delivered viewers via home networks cable satellite telecoms companies broadband service providers front rooms portable devices one talkedabout technologies ces digital personal video recorders dvr pvr settop boxes like us tivo uk sky system allow people record store play pause forward wind tv programmes want essentially technology allows much personalised tv also builtin highdefinition tv sets big business japan us slower take europe lack highdefinition programming people forward wind adverts also forget abiding network channel schedules putting together alacarte entertainment us networks cable satellite companies worried means terms advertising revenue

In [43]:
split_articles = [article.split() for article in articles]
split_articles

[['tv',
  'future',
  'hands',
  'viewers',
  'home',
  'theatre',
  'systems',
  'plasma',
  'highdefinition',
  'tvs',
  'digital',
  'video',
  'recorders',
  'moving',
  'living',
  'room',
  'way',
  'people',
  'watch',
  'tv',
  'radically',
  'different',
  'five',
  'years',
  'time',
  'according',
  'expert',
  'panel',
  'gathered',
  'annual',
  'consumer',
  'electronics',
  'show',
  'las',
  'vegas',
  'discuss',
  'new',
  'technologies',
  'impact',
  'one',
  'favourite',
  'pastimes',
  'us',
  'leading',
  'trend',
  'programmes',
  'content',
  'delivered',
  'viewers',
  'via',
  'home',
  'networks',
  'cable',
  'satellite',
  'telecoms',
  'companies',
  'broadband',
  'service',
  'providers',
  'front',
  'rooms',
  'portable',
  'devices',
  'one',
  'talkedabout',
  'technologies',
  'ces',
  'digital',
  'personal',
  'video',
  'recorders',
  'dvr',
  'pvr',
  'settop',
  'boxes',
  'like',
  'us',
  'tivo',
  'uk',
  'sky',
  'system',
  'allow',
  'peo

In [44]:
tokens = [token for article in split_articles for token in article]
len(tokens)

494175

In [45]:
tokens = list(set(tokens))
len(tokens)

32955

### 2. Implement the bag of words

create a func to build a vocab from the training set

In [46]:
def build_vocabulary(tokenised_articles):
    vocabulary = {}
    for article in tokenised_articles:
        for token in article:
            count = vocabulary.get(token, 0)
            vocabulary[token] = count + 1
    return vocabulary


In [47]:
vocab = build_vocabulary(split_articles)
vocab

{'tv': 503,
 'future': 373,
 'hands': 86,
 'viewers': 87,
 'home': 647,
 'theatre': 94,
 'systems': 134,
 'plasma': 14,
 'highdefinition': 84,
 'tvs': 19,
 'digital': 415,
 'video': 345,
 'recorders': 20,
 'moving': 75,
 'living': 80,
 'room': 62,
 'way': 726,
 'people': 2043,
 'watch': 111,
 'radically': 10,
 'different': 257,
 'five': 482,
 'years': 1001,
 'time': 1147,
 'according': 422,
 'expert': 29,
 'panel': 79,
 'gathered': 38,
 'annual': 168,
 'consumer': 215,
 'electronics': 83,
 'show': 593,
 'las': 31,
 'vegas': 29,
 'discuss': 59,
 'new': 1970,
 'technologies': 116,
 'impact': 131,
 'one': 1763,
 'favourite': 110,
 'pastimes': 1,
 'us': 1924,
 'leading': 148,
 'trend': 67,
 'programmes': 101,
 'content': 219,
 'delivered': 50,
 'via': 129,
 'networks': 188,
 'cable': 71,
 'satellite': 47,
 'telecoms': 63,
 'companies': 428,
 'broadband': 256,
 'service': 453,
 'providers': 34,
 'front': 115,
 'rooms': 14,
 'portable': 97,
 'devices': 161,
 'talkedabout': 1,
 'ces': 37,
 'p

Implement a func to create a bag of words

In [48]:
def build_bag_of_words(tokenised_articles, vocabulary):
    num_docs = len(tokenised_articles)
    vocab_size = len(vocabulary)
    bow_matrix = np.zeros((num_docs, vocab_size), dtype=int)

    for i, doc in enumerate(tokenised_articles):
        for word in doc:
            if word in vocabulary:
                j = vocab[word]
                bow_matrix[i, j] += 1

    return bow_matrix

In [49]:
bow = build_bag_of_words(split_articles, vocab)

In [50]:
# print(bow)
# print(bow[0])
print(bow.shape)

(2225, 32955)


In [51]:
type(bow)
bow

array([[ 0, 17,  3, ...,  0,  0,  0],
       [ 0,  2,  2, ...,  0,  0,  0],
       [ 0,  1,  2, ...,  0,  0,  0],
       ...,
       [ 0,  8,  2, ...,  0,  0,  0],
       [ 0, 15,  8, ...,  0,  0,  0],
       [ 0,  1,  0, ...,  0,  0,  0]], shape=(2225, 32955))

### 3. TF-IDF

In [52]:
def build_tf(bow_matrix):
    tf_matrix = bow_matrix / bow_matrix.sum(axis=1, keepdims=True)
    return tf_matrix

In [53]:
tf = build_tf(bow)
print(tf)

[[0.         0.04146341 0.00731707 ... 0.         0.         0.        ]
 [0.         0.01052632 0.01052632 ... 0.         0.         0.        ]
 [0.         0.00769231 0.01538462 ... 0.         0.         0.        ]
 ...
 [0.         0.04545455 0.01136364 ... 0.         0.         0.        ]
 [0.         0.03816794 0.02035623 ... 0.         0.         0.        ]
 [0.         0.00826446 0.         ... 0.         0.         0.        ]]


In [54]:
def build_idf(bow_matrix):
    num_docs = bow_matrix.shape[0]
    df_count = np.sum(bow_matrix > 0, axis=0)
    idf = np.log(num_docs / (df_count + 1)) + 1

    return idf

In [55]:
idf = build_idf(bow)

In [56]:
def build_tfidf(tf_matrix, idf_vector):
    tfidf_matrix = tf_matrix * idf_vector
    return tfidf_matrix

In [57]:
tfidf_matrix = build_tfidf(tf, idf)

### 4. Analysis

For a given category, find the top 10 words with the highest average TF-IDF scores.



In [58]:
vocab_list = list(vocab.keys())
categories = df['category'].unique()

for category in categories:
    category_indices = df[df['category'] == category].index
    category_tfidf = tfidf_matrix[category_indices].mean(axis=0)

    top_indices = np.argsort(category_tfidf)[-10:][::-1]
    top_words = [vocab_list[i] for i in top_indices]
    top_scores = category_tfidf[top_indices]

    print(f"\n{category}:")
    for word, score in zip(top_words, top_scores):
        print(f"  {word:15s} - {score:.4f}")


tech:
  hands           - 0.0244
  decide          - 0.0217
  future          - 0.0215
  bandwidth       - 0.0169
  breezes         - 0.0165
  viewers         - 0.0164
  home            - 0.0159
  kind            - 0.0147
  systems         - 0.0147
  theatre         - 0.0145

business:
  future          - 0.0382
  hands           - 0.0223
  breezes         - 0.0197
  viewers         - 0.0175
  home            - 0.0166
  emerged         - 0.0157
  theatre         - 0.0156
  systems         - 0.0149
  triumph         - 0.0139
  plasma          - 0.0135

sport:
  future          - 0.0277
  hands           - 0.0201
  home            - 0.0173
  viewers         - 0.0164
  theatre         - 0.0156
  tvs             - 0.0144
  rochdale        - 0.0144
  blair           - 0.0142
  highdefinition  - 0.0138
  graduate        - 0.0138

entertainment:
  future          - 0.0420
  human           - 0.0343
  hands           - 0.0305
  viewers         - 0.0240
  home            - 0.0214
  systems    

Identify words that have high TF scores but low IDF scores, and vice versa.

In [59]:
avg_tf_per_word = tf.mean(axis=0)
vocab_list = list(vocab.keys())

high tf but low idf, meaning common words that appear freq in many docs

In [60]:
tf_idf_ratio = avg_tf_per_word / (idf + 1e-10)
high_tf_low_idf_indices = np.argsort(tf_idf_ratio)[-20:][::-1]

In [61]:
for idx in high_tf_low_idf_indices:
    word = vocab_list[idx]
    tf_score = avg_tf_per_word[idx]
    idf_score = idf[idx]
    ratio = tf_idf_ratio[idx]
    print(f"{word:<20} {tf_score:<12.6f} {idf_score:<12.6f} {ratio:<15.6f}")

future               0.025311     1.204223     0.021019       
hands                0.020905     1.082405     0.019314       
viewers              0.015550     1.132440     0.013731       
home                 0.014941     1.138100     0.013128       
breezes              0.015231     1.163709     0.013089       
theatre              0.012982     1.162651     0.011166       
systems              0.012046     1.209196     0.009962       
plasma               0.010818     1.241285     0.008715       
tvs                  0.010258     1.271484     0.008068       
highdefinition       0.009953     1.279773     0.007777       
digital              0.009173     1.297165     0.007071       
moving               0.008811     1.314249     0.006704       
video                0.008703     1.303233     0.006678       
recorders            0.008471     1.361502     0.006222       
living               0.007503     1.383022     0.005425       
room                 0.006750     1.418585     0.004759

Filter words that appear at least once

In [62]:
nonzero_mask = avg_tf_per_word > 0
filtered_indices = np.where(nonzero_mask)[0]

high IDF but low TF, meaning rare but specific words


In [63]:
idf_tf_ratio = np.zeros_like(avg_tf_per_word)
idf_tf_ratio[filtered_indices] = idf[filtered_indices] / (avg_tf_per_word[filtered_indices] + 1e-10)

low_tf_high_idf_indices = np.argsort(idf_tf_ratio)[-20:][::-1]

In [66]:
for idx in low_tf_high_idf_indices:
    word = vocab_list[idx]
    tf_score = avg_tf_per_word[idx]
    idf_score = idf[idx]
    ratio = idf_tf_ratio[idx]
    print(f"{word:<20} {tf_score:<12.6f} {idf_score:<12.6f} {ratio:<15.6f}")

communications       0.000279     3.895328     13960.530140   
mediavest            0.000302     4.007032     13287.143630   
likes                0.000328     4.300793     13129.306966   
yet                  0.000348     4.153635     11941.381508   
model                0.000344     3.928389     11405.310200   
reality              0.000349     3.911722     11221.549483   
suit                 0.000354     3.879198     10942.862842   
work                 0.000386     4.044073     10471.889595   
president            0.000378     3.847700     10191.698722   
help                 0.000397     3.737699     9415.313068    
tell                 0.000375     3.444822     9189.887040    
know                 0.000397     3.537028     8911.676883    
experience           0.000395     3.492576     8831.223911    
younger              0.000409     3.583548     8759.615615    
80hour               0.000446     3.879198     8690.697275    
branded              0.000435     3.638608     8358.053

Word importance across corpus

In [65]:
avg_tfidf = tfidf_matrix.mean(axis=0)
top_20_indices = np.argsort(avg_tfidf)[-20:][::-1]

for idx in top_20_indices:
    word = vocab_list[idx]
    score = avg_tfidf[idx]
    print(f"{word:20s} - {score:.6f}")

future               - 0.030480
hands                - 0.022628
breezes              - 0.017725
viewers              - 0.017609
home                 - 0.017005
theatre              - 0.015094
systems              - 0.014566
plasma               - 0.013428
tvs                  - 0.013043
highdefinition       - 0.012738
digital              - 0.011898
moving               - 0.011579
recorders            - 0.011534
video                - 0.011342
appears              - 0.011096
living               - 0.010377
people               - 0.009956
watch                - 0.009729
way                  - 0.009663
years                - 0.009643
