In [None]:
### load libraries ###
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re # data preprocessing, removing punctuation

import matplotlib.pyplot as plt # EDA, visualization
import seaborn as sns # EDA, visualization
from wordcloud import WordCloud # EDA, word cloud visualization

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer # data preprocessing, matrix vectorization
from sklearn.decomposition import NMF # modeling, unsupervised classification
from sklearn.naive_bayes import MultinomialNB # modeling, classified model
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix # model evaluation

In [None]:
### import datasets ###

# Kaggle input paths of the training and testing CSV files
train_file = '/kaggle/input/bbc-news-classification/BBC News Train.csv'
test_file = '/kaggle/input/bbc-news-classification/BBC News Test.csv'

# read both files into DataFrames (train first, then test)
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (train_file, test_file))

In [None]:
# column dtypes and non-null counts, training frame first
for frame in (df_train, df_test):
    frame.info()

# first ten rows of each frame
print(df_train.head(10), df_test.head(10), sep="\n")

# number of training articles per category
print(df_train.Category.value_counts())

In [None]:
# section banner: rule, title, rule (one per line)
print("-" * 50, "STEP 1: Problem & Data Description", "-" * 50, sep="\n")

**Problem**: The original Kaggle competition task for this dataset was to employ both supervised and unsupervised classification methods to predict the category of an article based on its text. For my final assignments, I will:
* **Unsupervised Classification**: Try to improve the performance of my unsupervised classification model by (1) optimizing my stopwords list and (2) tuning my `max_features` hyperparameter. I will also compare my NMF model to an LDA model.

**Data Description**: Included are a testing dataset and a training dataset. The training dataset includes 1490 samples and the testing dataset includes 735 samples. The features included in these datasets are:
* **ArticleId**: int64, unique identifier for unique articles
* **Text**: string, article source text
* **Category**: object; classification of article; one of five: sport, business, politics, entertainment, tech (only in the training dataset, not the testing dataset)

In [None]:
# section banner: rule, title, rule (one per line)
print("-" * 50, "STEP 2a: EDA - Cleaning and Pre-processing", "-" * 50, sep="\n")

In [None]:
### Data Cleaning and Preprocessing: Lowercasing & Punctuation Removal ###
# For text vectorization, preprocessing should include setting all words in lowercase; removing punctuation; removing articles and prepositions ("stop words"); tokenization; and stemming.

# lowercase the article text of both frames in place
for frame in (df_train, df_test):
    frame['Text'] = frame['Text'].str.lower()

# remove punctuation
def remove_punctuation(text):
    """Delete every character that is not a-z, 0-9, whitespace, or '$'.

    Parameters
    ----------
    text : str
        Text to clean. Only lowercase letters are kept, so the text should be
        lowercased first (as this notebook does); uppercase letters would be removed.

    Returns
    -------
    str
        `text` with all other characters dropped (replaced by the empty string).

    Notes
    -----
    The earlier pattern, r'a-z\\s0-9\\$', had no character class, so it only
    matched the literal sequence "a-z<space>0-9$" and left punctuation in place.
    Because the cleaned text now differs, the hand-written topic-to-category maps
    further down should be re-checked against their crosstabs.
    """
    # [^...] is a negated character class: match anything NOT listed, then drop it
    return re.sub(r'[^a-z\s0-9$]', '', text)

# run the punctuation filter over every article, element by element
df_train['Text'] = df_train['Text'].map(remove_punctuation)
df_test['Text'] = df_test['Text'].map(remove_punctuation)

In [None]:
### Data Cleaning and Preprocessing: Stop Word Removal ###
# create function to remove stop words
# network issues when trying to download nltk stopwords, creating list
# Hand-maintained stop word set (used instead of downloading the nltk list).
STOPWORDS = {
    # pronouns
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves',
    'you', 'your', 'yours', 'yourself', 'yourselves',
    'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself',
    'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves',
    # interrogatives and demonstratives
    'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those',
    # forms of be / have / do
    'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
    'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing',
    # articles and conjunctions
    'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while',
    # prepositions
    'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into',
    'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from',
    'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under',
    # adverbs of time / place / manner
    'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how',
    # quantifiers and modifiers
    'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such',
    'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
    # auxiliaries and contraction fragments
    's', 't', 'can', 'will', 'just', 'don', 'should', 'now', 'd', 'll', 'm', 'o',
    're', 've', 'y', 'ain', 'aren', 'couldn', 'didn', 'doesn', 'hadn', 'hasn',
    'haven', 'isn', 'ma', 'mightn', 'mustn', 'needn', 'shan', 'shouldn', 'wasn',
    'weren', 'won', 'wouldn',
    # extra words appended at the end of the list
    'sir', 'mr', 'said',
}

def remove_stopwords(text, stopwords=None):
    """Remove stop words from whitespace-separated text.

    Parameters
    ----------
    text : str
        Text to filter; it is split on whitespace and matched token by token,
        so matching is case-sensitive and punctuation attached to a word
        prevents a match.
    stopwords : collection of str, optional
        Words to drop. Defaults to the notebook-level STOPWORDS set, which keeps
        existing one-argument calls (e.g. ``Series.apply(remove_stopwords)``)
        working unchanged.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    if stopwords is None:
        stopwords = STOPWORDS  # looked up at call time, not at definition time
    return ' '.join(word for word in text.split() if word not in stopwords)

# drop stop words from every article, element by element
df_train['Text'] = df_train['Text'].map(remove_stopwords)
df_test['Text'] = df_test['Text'].map(remove_stopwords)

In [None]:
### Matrix Vectorization ###

# TF-IDF vectorizer with the vocabulary capped at 5000 features
vectorizer = TfidfVectorizer(max_features = 5000)

# learn the vocabulary from the training text, then reuse it for the test text
X_train = vectorizer.fit_transform(df_train['Text'])
X_test = vectorizer.transform(df_test['Text'])

# print checks: matrix shapes and the configured feature cap, one per line
print(X_train.shape, X_test.shape, vectorizer.max_features, sep="\n")

In [None]:
# section banner: rule, title, rule (one per line)
print("-" * 50, "STEP 2b: EDA - Visualization", "-" * 50, sep="\n")

In [None]:
### Exploratory Data Analysis ###
# EDA is used to understand key data characteristics: balance, sparsity, and feature correlation

# Category balance: horizontal bar per category, ordered by value_counts()
category_order = df_train['Category'].value_counts().index
plt.figure(figsize = (10, 6))
sns.countplot(data = df_train, y = 'Category', order = category_order, palette = 'viridis')
plt.title("Article Count per Category")
plt.xlabel("Number of Articles")
plt.ylabel("Category")
plt.show()

# Show most common word in each category (n = 10)
def get_top_n_words(df, category, n = 10):
    """Return the `n` most frequent words among the articles of one category.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Category' column and a 'Text' column.
    category : str
        Value of 'Category' to select.
    n : int, default 10
        Number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Columns 'word' and 'count', sorted by 'count' descending, at most `n` rows.
    """
    category_text = df.loc[df['Category'] == category, 'Text'] # filter for specific category

    # Use a dedicated CountVectorizer. The previous code created one but then called
    # the notebook-level TF-IDF `vectorizer`, which reported summed TF-IDF weights as
    # "counts" and refit that shared vectorizer on a single category as a side effect.
    word_vectorizer = CountVectorizer(max_features = 5000)
    X = word_vectorizer.fit_transform(category_text)

    # column sums of the sparse matrix = total occurrences of each word
    # (summing the sparse matrix directly avoids building a dense copy)
    word_counts = np.asarray(X.sum(axis = 0)).ravel()
    words_df = pd.DataFrame(
        {'word': word_vectorizer.get_feature_names_out(), 'count': word_counts}
    ).sort_values(by = 'count', ascending = False).reset_index(drop = True)

    return words_df.head(n)

categories = df_train['Category'].unique()

# print a header followed by the top-10 table for each category
for category in categories:
    top_words = get_top_n_words(df_train, category, n = 10)
    print(f"\nCategory: {category.upper()}", top_words, sep="\n")

# sparsity check: share of matrix cells that are not stored as non-zero entries
n_docs, n_terms = X_train.shape
sparsity = 1.0 - (X_train.nnz / (n_docs * n_terms))
print(f"\nSparsity of the TF-IDF Matrix: {sparsity:.4f}")

In [None]:
# word clouds (for funsies!)

def _show_wordcloud(text, colormap, title):
    """Generate a word cloud from `text`, display it under `title`, and return it."""
    cloud = WordCloud(width=800, height=400, background_color='white', colormap=colormap).generate(text)

    plt.figure(figsize=(10, 5))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    plt.title(title)
    plt.show()
    return cloud

# business
business_text = ' '.join(df_train[df_train['Category'] == 'business']['Text'])
wordcloud = _show_wordcloud(business_text, 'Greens', "Word Cloud for 'Business' Category")

# sport
sport_text = ' '.join(df_train[df_train['Category'] == 'sport']['Text'])
wordcloud = _show_wordcloud(sport_text, 'Blues', "Word Cloud for 'Sport' Category")

# politics
politics_text = ' '.join(df_train[df_train['Category'] == 'politics']['Text'])
wordcloud = _show_wordcloud(politics_text, 'Reds', "Word Cloud for 'Politics' Category")

# entertainment
entertainment_text = ' '.join(df_train[df_train['Category'] == 'entertainment']['Text'])
wordcloud = _show_wordcloud(entertainment_text, 'Purples', "Word Cloud for 'Entertainment' Category")

# tech
tech_text = ' '.join(df_train[df_train['Category'] == 'tech']['Text'])
wordcloud = _show_wordcloud(tech_text, 'magma', "Word Cloud for 'Tech' Category")


In [None]:
# section banner: rule, title, rule (one per line)
print("-" * 50, "STEP C: Model Building and Training", "-" * 50, sep="\n")

In [None]:
### Unsupervised Model Building and Training ###

## Non-Negative Matrix Factorization ##

# one NMF component per news category
NUM_TOPICS = 5
nmf_model = NMF(n_components=NUM_TOPICS, random_state=42, max_iter=500, tol=0.0001)

# factorize the TF-IDF matrix; W_train is the transformed training data,
# H is the fitted components_ array
W_train = nmf_model.fit_transform(X_train)
H = nmf_model.components_

## NMF Outcomes ##

# each document gets the index of the largest value in its row of W_train
nmf_predictions = W_train.argmax(axis=1)
df_train['NMF_Topic_ID'] = nmf_predictions

# rows: NMF topic id, columns: true category, cells: article counts
topic_category_mapping = pd.crosstab(df_train['NMF_Topic_ID'], df_train['Category'])

# heatmap of the crosstab
plt.figure(figsize=(10, 6))
sns.heatmap(
    topic_category_mapping,
    annot=True,
    fmt='d',
    cmap='Blues',
    linewidths=.5,
    linecolor='black',
)
plt.title('NMF Topic ID vs. True Category Count')
plt.ylabel('NMF Topic ID')
plt.xlabel('True Category')
plt.show()

# hand-assigned topic id -> category label (position in the list = topic id);
# compare with the heatmap above whenever the model or its input changes
final_map = dict(enumerate(['sport', 'politics', 'tech', 'entertainment', 'business']))

# translate topic ids into category labels
df_train['NMF_Predicted_Category'] = df_train['NMF_Topic_ID'].map(final_map)

# accuracy on the training data
accuracy = accuracy_score(df_train['Category'], df_train['NMF_Predicted_Category'])
print(accuracy)

# precision, recall, and F1-score per category
print("\nNMF Classification Report (Training Data):\n")
print(classification_report(df_train['Category'], df_train['NMF_Predicted_Category']))

In [None]:
## NMF Optimization: alternative max_features ##
# NOTE(review): this cell was headed "lower max_features" and its variables carry a
# `_lower` suffix, but 10000 is HIGHER than the baseline cap of 5000 — confirm intent.
# NOTE(review): this cell overwrites df_train['NMF_Topic_ID'],
# df_train['NMF_Predicted_Category'], final_map and accuracy from the baseline cell.

# TF-IDF vectorizer with the vocabulary capped at 10000 features
vectorizer_lower = TfidfVectorizer(max_features = 10000)

# learn the vocabulary from the training text, then reuse it for the test text
X_train_lower = vectorizer_lower.fit_transform(df_train['Text'])
X_test_lower = vectorizer_lower.transform(df_test['Text'])

# print checks: matrix shapes and the configured feature cap, one per line
print(X_train_lower.shape, X_test_lower.shape, vectorizer_lower.max_features, sep="\n")

# same NMF settings as the baseline model
NUM_TOPICS_LOW = 5
nmf_model_lower = NMF(n_components=NUM_TOPICS_LOW, random_state=42, max_iter=500, tol=0.0001)

# factorize the new TF-IDF matrix
W_train_lower = nmf_model_lower.fit_transform(X_train_lower)
H_lower = nmf_model_lower.components_

## NMF Outcomes ##

# each document gets the index of the largest value in its row of W_train_lower
nmf_predictions_lower = W_train_lower.argmax(axis=1)
df_train['NMF_Topic_ID'] = nmf_predictions_lower

# rows: NMF topic id, columns: true category, cells: article counts
topic_category_mapping_lower = pd.crosstab(df_train['NMF_Topic_ID'], df_train['Category'])

# heatmap of the crosstab
plt.figure(figsize=(10, 6))
sns.heatmap(
    topic_category_mapping_lower,
    annot=True,
    fmt='d',
    cmap='Blues',
    linewidths=.5,
    linecolor='black',
)
plt.title('NMF Topic ID vs. True Category Count')
plt.ylabel('NMF Topic ID')
plt.xlabel('True Category')
plt.show()

# hand-assigned topic id -> category label (position in the list = topic id);
# compare with the heatmap above before trusting the scores below
final_map = dict(enumerate(['sport', 'politics', 'tech', 'entertainment', 'business']))

# translate topic ids into category labels
df_train['NMF_Predicted_Category'] = df_train['NMF_Topic_ID'].map(final_map)

# accuracy on the training data
accuracy = accuracy_score(df_train['Category'], df_train['NMF_Predicted_Category'])
print(accuracy)

# precision, recall, and F1-score per category
print("\nNMF Classification Report (Training Data):\n")
print(classification_report(df_train['Category'], df_train['NMF_Predicted_Category']))

In [None]:
## NMF Optimization ##

# refine stopwords

# additional words to drop on top of STOPWORDS
CUSTOM_STOPWORDS = set([
    'year', 'new', 'time', 'make', 'market', 'government', 'country', 
    'england', 'firm', 'world', 'us', 'people', 'best', 'number', 'uk', 'business', 'said'
])

OPTIMIZED_STOPWORDS = STOPWORDS.union(CUSTOM_STOPWORDS)

# copy of the test frame as preprocessed so far (baseline cleaning only)
df_test_nmf_opt = df_test.copy()

# Rebuild the training frame from the raw CSV: df_train['Text'] has already been
# filtered in place with the baseline list, so the optimized list is applied to a
# fresh copy of the text. (A `df_train.copy()` that used to precede this read was
# overwritten immediately and has been removed.)
df_train_nmf_opt = pd.read_csv(train_file)
df_train_nmf_opt['Text'] = df_train_nmf_opt['Text'].str.lower()
df_train_nmf_opt['Text'] = df_train_nmf_opt['Text'].apply(remove_punctuation)

# Apply the new, optimized stopword list
def remove_optimized_stopwords(text):
    """Return `text` without the whitespace-separated tokens found in OPTIMIZED_STOPWORDS."""
    kept_tokens = []
    for token in text.split():
        if token in OPTIMIZED_STOPWORDS:
            continue  # stop word: skip it
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

# filter the re-read training text with the extended stop word set
df_train_nmf_opt['Text'] = df_train_nmf_opt['Text'].map(remove_optimized_stopwords)


## Re-vectorize and tune NMF ##

# TF-IDF with the same 5000-feature cap, fitted on the newly cleaned training text
nmf_opt_vectorizer = TfidfVectorizer(max_features=5000)
X_train_nmf_opt = nmf_opt_vectorizer.fit_transform(df_train_nmf_opt['Text'])

# NMF with five components
NUM_TOPICS_OPT = 5
nmf_opt_model = NMF(n_components=NUM_TOPICS_OPT, random_state=42, max_iter=500, tol=0.0001)

# factorize the new matrix
W_train_opt = nmf_opt_model.fit_transform(X_train_nmf_opt)
H_opt = nmf_opt_model.components_

print(f"\n--- NMF Optimization Training Complete (Topics: {NUM_TOPICS_OPT}) ---")

# vocabulary terms of the optimized vectorizer, used to label topic words
feature_names = nmf_opt_vectorizer.get_feature_names_out()

# Function to display the top words for each topic
def display_topics(model, feature_names, no_top_words):
    """Print one line per topic listing its highest-weighted words.

    For every row of ``model.components_`` the `no_top_words` largest entries are
    looked up in `feature_names` and printed, largest first, as
    ``Topic <index>: <words separated by spaces>``.
    """
    for topic_number, weights in enumerate(model.components_):
        # ascending argsort, reversed, then truncated -> indices of the largest weights
        ranked_positions = weights.argsort()[::-1][:no_top_words]
        words = ' '.join(feature_names[position] for position in ranked_positions)
        print(f"Topic {topic_number}: {words}")

# heading, then the ten strongest words of every optimized-NMF topic
print("\nTop 10 words for each NMF Topic:")
display_topics(model=nmf_opt_model, feature_names=feature_names, no_top_words=10)

In [None]:
## NMF Optimization: Evaluation ##

# each document gets the index of the largest value in its row of W_train_opt
nmf_opt_predictions = W_train_opt.argmax(axis=1)
df_train_nmf_opt['NMF_Topic_ID'] = nmf_opt_predictions

# rows: optimized-NMF topic id, columns: true category, cells: article counts
topic_category_mapping_opt = pd.crosstab(
    df_train_nmf_opt['NMF_Topic_ID'],
    df_train_nmf_opt['Category'],
)

print("\nNMF Topic ID vs. True Category Count (Optimized Model)")
print(topic_category_mapping_opt)

# hand-assigned topic id -> category label (position in the list = topic id);
# this is an assumption that must be checked against the crosstab printed above
final_map_opt = dict(enumerate(['sport', 'politics', 'tech', 'entertainment', 'business']))

# translate topic ids into category labels
df_train_nmf_opt['NMF_Predicted_Category'] = df_train_nmf_opt['NMF_Topic_ID'].map(final_map_opt)

# accuracy on the training data
accuracy_nmf_opt = accuracy_score(
    df_train_nmf_opt['Category'],
    df_train_nmf_opt['NMF_Predicted_Category'],
)
print(f"\nOptimized NMF Accuracy: {accuracy_nmf_opt:.4f}")

# per-category precision, recall, and F1-score
print("\nOptimized NMF Classification Report (Training Data):\n")
print(classification_report(df_train_nmf_opt['Category'], df_train_nmf_opt['NMF_Predicted_Category']))

In [None]:
## Try a Latent Dirichlet Allocation (LDA) Model ##
# fitted on raw term counts rather than TF-IDF weights
from sklearn.decomposition import LatentDirichletAllocation

# CountVectorizer takes its stop words as a list, so unpack the set into one
stop_words_list = [*OPTIMIZED_STOPWORDS]

# count vectors over the optimized-stopword training text, capped at 5000 features
count_vectorizer = CountVectorizer(max_features=5000, stop_words=stop_words_list)
X_train_count = count_vectorizer.fit_transform(df_train_nmf_opt['Text'])

# LDA with five components; max_iter=10 can be tuned
NUM_TOPICS_LDA = 5
lda_model = LatentDirichletAllocation(
    n_components=NUM_TOPICS_LDA,
    max_iter=10,
    learning_method='online',
    random_state=42,
    n_jobs=-1,
)

# fit the model and get the transformed training data
W_train_lda = lda_model.fit_transform(X_train_count)

# each document gets the index of the largest value in its row of W_train_lda
lda_predictions = W_train_lda.argmax(axis=1)

# rows: LDA topic id, columns: true category, cells: article counts
topic_category_mapping_lda = pd.crosstab(lda_predictions, df_train_nmf_opt['Category'])
print("\nLDA Topic ID vs. True Category Count:")
print(topic_category_mapping_lda)

# hand-assigned topic id -> category label (position in the list = topic id);
# this is an assumption that must be checked against the crosstab printed above
lda_final_map = dict(enumerate(['entertainment', 'politics', 'sport', 'tech', 'business']))

# store the topic ids, then translate them into category labels
df_train_nmf_opt['LDA_Topic_ID'] = lda_predictions
df_train_nmf_opt['LDA_Predicted_Category'] = df_train_nmf_opt['LDA_Topic_ID'].map(lda_final_map)

# accuracy on the training data
accuracy_lda = accuracy_score(
    df_train_nmf_opt['Category'],
    df_train_nmf_opt['LDA_Predicted_Category'],
)

# framed accuracy line
print("\n" + "="*50)
print(f"LDA Model Accuracy: {accuracy_lda:.4f}")
print("="*50)

# per-category precision, recall, and F1-score
print("\nLDA Classification Report (Training Data):\n")
print(classification_report(df_train_nmf_opt['Category'], df_train_nmf_opt['LDA_Predicted_Category']))

In [None]:
# 1. Get the feature names (words) from the fitted CountVectorizer; they are passed to
#    display_topics_lda below to turn term indices into words
feature_names_lda = count_vectorizer.get_feature_names_out()

# 2. Function to display the top words for each topic (re-using your NMF function structure)
def display_topics_lda(model, feature_names, no_top_words):
    """Print a header, then one line per topic with its assumed category and top words.

    For every row of ``model.components_`` the `no_top_words` largest entries are
    looked up in `feature_names` and printed, largest first. The category shown in
    parentheses is read from the notebook-level `lda_final_map` ('Unknown' when the
    topic index has no entry) and upper-cased.
    """
    print("\n--- LDA Topic Interpretation ---")
    for topic_number, weights in enumerate(model.components_):
        # ascending argsort, reversed, then truncated -> indices of the largest weights
        ranked_positions = weights.argsort()[::-1][:no_top_words]
        words = ' '.join(feature_names[position] for position in ranked_positions)

        # label taken from the hand-written topic -> category map
        label = lda_final_map.get(topic_number, 'Unknown').upper()

        print(f"Topic {topic_number} ({label}): {words}")

# 3. Show the ten strongest words of every LDA topic
display_topics_lda(model=lda_model, feature_names=feature_names_lda, no_top_words=10)

In [None]:
# section banner: rule, title, rule (one per line)
print("-" * 50, "STEP D: Results and Discussion", "-" * 50, sep="\n")

### **Model Accuracy Results**
* **Baseline NMF** - Accuracy: 0.9235
* **Optimized NMF** - Accuracy: 0.9255
* **Latent Dirichlet Allocation (LDA)** - Accuracy: 0.7866

### **Discussion**
The non-negative matrix factorization (NMF) model performed significantly better than the Latent Dirichlet Allocation (LDA) model. This is largely due to the different vectorization methods employed by each model.

The NMF model uses a TF-IDF vectorization method, which puts weight on **unique, defining keywords** that are common in one category but rare in the other categories. In other words, in a TF-IDF vectorization, the most important differentiating words have the highest weights.

On the other hand, the LDA model uses a count vector, which puts weight simply on the **most frequent words without** considering whether those words are frequent across multiple categories. In other words, in a count vectorization, the highest-weighted words are not the most differentiating words.