<a href="https://colab.research.google.com/github/ihabiba/NLP-Labs/blob/main/Topic_Modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

###Importting necessary libraries

In [None]:
import pandas as pd
import numpy as np

# from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report


###Load 20 Newsgroups Dataset

In [None]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("crawford/20-newsgroups")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/crawford/20-newsgroups?dataset_version_number=1...


100%|██████████| 25.7M/25.7M [00:00<00:00, 139MB/s] 

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/crawford/20-newsgroups/versions/1


In [None]:
import os

base_path = "/root/.cache/kagglehub/datasets/crawford/20-newsgroups/versions/1"
print(os.listdir(base_path))


['rec.autos.txt', 'comp.sys.ibm.pc.hardware.txt', 'talk.religion.misc.txt', 'soc.religion.christian.txt', 'comp.sys.mac.hardware.txt', 'talk.politics.mideast.txt', 'talk.politics.misc.txt', 'comp.graphics.txt', 'rec.sport.baseball.txt', 'comp.os.ms-windows.misc.txt', 'sci.space.txt', 'talk.politics.guns.txt', 'sci.electronics.txt', 'list.csv', 'rec.motorcycles.txt', 'sci.med.txt', 'rec.sport.hockey.txt', 'comp.windows.x.txt', 'sci.crypt.txt', 'alt.atheism.txt', 'misc.forsale.txt']


In [None]:
import os
import pandas as pd

base_path = "/root/.cache/kagglehub/datasets/crawford/20-newsgroups/versions/1"

print("Exists?", os.path.exists(base_path))
print("Files:", os.listdir(base_path)[:5])
print("Has list.csv?", os.path.exists(os.path.join(base_path, "list.csv")))

Exists? True
Files: ['rec.autos.txt', 'comp.sys.ibm.pc.hardware.txt', 'talk.religion.misc.txt', 'soc.religion.christian.txt', 'comp.sys.mac.hardware.txt']
Has list.csv? True


In [None]:
df = pd.read_csv(os.path.join(base_path, "list.csv"))
print(df.shape)
print(df.head())
print(df.columns)

(628, 2)
            newsgroup  document_id
0  talk.religion.misc        82757
1  talk.religion.misc        82758
2  talk.religion.misc        82759
3  talk.religion.misc        82760
4  talk.religion.misc        82763
Index(['newsgroup', 'document_id'], dtype='object')


In [None]:
categories = [
    "rec.autos",
    "comp.graphics",
    "sci.med",
    "talk.politics.guns"
]

label_map = {cat: idx for idx, cat in enumerate(categories)}
label_map


{'rec.autos': 0, 'comp.graphics': 1, 'sci.med': 2, 'talk.politics.guns': 3}

In [None]:
texts = []
labels = []

for cat in categories:
    file_path = os.path.join(base_path, f"{cat}.txt")

    with open(file_path, "r", encoding="latin-1") as f:
        content = f.read()

        # Split documents (empty lines separate documents)
        documents = [doc.strip() for doc in content.split("\n\n") if doc.strip()]

        for doc in documents:
            texts.append(doc)
            labels.append(label_map[cat])

print("Total documents:", len(texts))
print("Sample text:\n", texts[0][:300])
print("Sample label:", labels[0])


Total documents: 62147
Sample text:
 Newsgroup: rec.autos
document_id: 101551
From: cs012055@cs.brown.edu (Hok-Chung Tsang)
Subject: Re: Saturn's Pricing Policy
Sample label: 0


In [None]:
df = pd.DataFrame({
    "text": texts,
    "label": labels
})

print(df.head())
print(df["label"].value_counts())


                                                text  label
0  Newsgroup: rec.autos\ndocument_id: 101551\nFro...      0
1  In article <C4vIr5.L3r@shuksan.ds.boeing.com>,...      0
2  Say, you bought your Saturn at $13k, with a de...      0
3  Moreover, if Saturn really does reduce the dea...      0
4  1) Attract even more people to buy Saturns bec...      0
label
1    21667
3    15068
2    13762
0    11650
Name: count, dtype: int64


###Convert text to suitable representation

In [None]:
vectorizer = CountVectorizer(
    stop_words='english',
    max_features=5000
)

vec_matrix = vectorizer.fit_transform(df['text'])
feature_names = vectorizer.get_feature_names_out()


###Perform LDA modeling

In [None]:
num_topics = 4

lda_model = LatentDirichletAllocation(
    n_components=num_topics,
    random_state=42
)

topic_matrix = lda_model.fit_transform(vec_matrix)


###Display topics and their top words

In [None]:
print("Topics and their top words:")
for topic_idx, topic in enumerate(lda_model.components_):
    top_words = [feature_names[i] for i in topic.argsort()[:-6:-1]]
    print(f"Topic {topic_idx + 1}: {', '.join(top_words)}")


Topics and their top words:
Topic 1: subject, newsgroup, document_id, com, edu
Topic 2: edu, writes, article, com, university
Topic 3: image, graphics, file, jpeg, software
Topic 4: people, don, just, like, car


###Create topic feature DataFrame

In [None]:
topic_features = pd.DataFrame(
    topic_matrix,
    columns=[f"Topic_{i+1}" for i in range(num_topics)]
)

X = topic_features
y = df['label']


###Split data into training and testing set

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


###Train suitable classifier

In [None]:
classifier = MultinomialNB()
classifier.fit(X_train, y_train)


###Evaluate classification model performance

In [None]:
y_pred = classifier.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=categories))


Classification Report:
                    precision    recall  f1-score   support

         rec.autos       0.00      0.00      0.00      2330
     comp.graphics       0.54      0.92      0.68      4334
           sci.med       0.00      0.00      0.00      2752
talk.politics.guns       0.40      0.66      0.50      3014

          accuracy                           0.48     12430
         macro avg       0.23      0.40      0.29     12430
      weighted avg       0.28      0.48      0.36     12430



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Interpretation of Classification Model Performance

The model achieved an overall accuracy of approximately **48%**, indicating moderate performance when using LDA-based topic features for text classification.

**Performance by Category:**

- **comp.graphics** shows strong recall (**0.92**) and a reasonable F1-score (**0.68**), indicating that the model is able to identify most documents from this category. This suggests that graphics-related vocabulary forms a relatively distinct topic representation.

- **talk.politics.guns** performs moderately, with a recall of **0.66** and an F1-score of **0.50**, showing that political language related to firearms is partially captured by the learned topics.

- **rec.autos** and **sci.med** perform poorly, with zero precision and recall. This indicates that the classifier failed to assign any test samples to these categories. A likely reason is that the LDA topic distributions for these categories overlap significantly with other topics, making them indistinguishable for the classifier.

**Overall Analysis:**

These results highlight a limitation of combining LDA with a simple Naive Bayes classifier, especially when using a small number of topics. While LDA captures general themes, it may not preserve fine-grained distinctions needed for accurate classification across all categories. Increasing the number of topics or using a more expressive classifier could improve performance.
