# Name: Reshma Ganesan, Najah Zdafirah 
# ID: IS01082523, IS01082508
# Lab Assignment 3 – Topic Modeling (CISB5123)

In [67]:
# Required Libraries
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from gensim import corpora
from gensim.models import LdaModel, CoherenceModel

In [68]:
# Download NLTK Resources
#   stopwords -> English stop-word list used during preprocessing
#   wordnet   -> data backing WordNetLemmatizer
#   punkt / punkt_tab -> sentence-tokenizer models needed by word_tokenize.
# Older NLTK releases load 'punkt'; recent releases (3.9+) look up
# 'punkt_tab' instead and raise LookupError if it is missing, so request
# both to keep the notebook runnable on a fresh environment.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Username\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Username\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Username\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [69]:
# Step 1: Load Dataset
# Read the CSV, keep only the 'text' column, and discard rows where it is missing.
df = pd.read_csv("news_dataset.csv").loc[:, ['text']].dropna()

In [70]:
# Step 2: Text Preprocessing
import re  # NOTE(review): not referenced in the visible cells; kept in case later cells use it

# Shared NLP helpers, built once and reused for every document.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stop_words = {*stopwords.words('english')}  # a set, so membership checks are cheap

def preprocess_text(text):
    """Convert one raw document string into a list of normalised tokens.

    The text is lowercased and tokenized with ``word_tokenize``. A token is
    kept only if it is purely alphabetic, is not in ``stop_words`` and is
    longer than two characters. Each surviving token is lemmatized and then
    Porter-stemmed.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list of str
        Stemmed tokens, in their original order.
    """
    return [
        stemmer.stem(lemmatizer.lemmatize(token))
        for token in word_tokenize(text.lower())
        if token.isalpha() and token not in stop_words and len(token) > 2
    ]


# One token list per article.
processed_docs = df['text'].apply(preprocess_text)

In [71]:
# Step 3: Create Dictionary and Corpus
# Build the token <-> id mapping from the tokenized articles.
dictionary = corpora.Dictionary(processed_docs)
# Prune the vocabulary: drop tokens seen in fewer than 10 documents or in
# more than 50% of all documents.
dictionary.filter_extremes(no_below=10, no_above=0.5)
# Bag-of-words representation of every article.
corpus = list(map(dictionary.doc2bow, processed_docs))

In [72]:
# Step 4: Train LDA Model with 4 Topics
# Hyperparameters are gathered in one place; random_state is fixed so
# repeated runs train the same model.
lda_settings = dict(num_topics=4, passes=10, random_state=42)
lda_model = LdaModel(corpus=corpus, id2word=dictionary, **lda_settings)

In [73]:
# Step 5: Coherence Score
# c_v coherence is computed from the tokenized texts, not the bag-of-words
# corpus, so the preprocessed documents are passed in alongside the model.
coherence_model = CoherenceModel(
    model=lda_model,
    texts=processed_docs,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_score = coherence_model.get_coherence()
print(f"\nCoherence Score: {coherence_score:.4f}\n")


Coherence Score: 0.4987



In [74]:
# Step 6: Display Top Terms per Topic
print("Top Terms for Each Topic:\n")
# formatted=False returns (topic_id, [(word, weight), ...]) pairs, so the
# words and weights are read directly instead of re-parsing the
# '0.017*"use" + 0.015*"key" + ...' display string. This also avoids a
# backslash inside an f-string expression, which is a SyntaxError on
# Python versions before 3.12.
for idx, terms in lda_model.show_topics(num_topics=4, num_words=10, formatted=False):
    print(f"Topic {idx}:")
    for word, weight in terms:
        # Three decimals, matching the formatted topic string.
        print(f"  - {word} (weight: {weight:.3f})")
    print()


Top Terms for Each Topic:

Topic 0:
  - use (weight: 0.017)
  - key (weight: 0.015)
  - file (weight: 0.010)
  - encrypt (weight: 0.010)
  - system (weight: 0.008)
  - program (weight: 0.008)
  - chip (weight: 0.007)
  - inform (weight: 0.007)
  - secur (weight: 0.006)
  - avail (weight: 0.005)

Topic 1:
  - max (weight: 0.025)
  - use (weight: 0.013)
  - get (weight: 0.010)
  - one (weight: 0.009)
  - would (weight: 0.009)
  - like (weight: 0.008)
  - drive (weight: 0.007)
  - know (weight: 0.006)
  - work (weight: 0.006)
  - problem (weight: 0.006)

Topic 2:
  - would (weight: 0.011)
  - peopl (weight: 0.011)
  - one (weight: 0.010)
  - think (weight: 0.007)
  - say (weight: 0.007)
  - know (weight: 0.006)
  - like (weight: 0.006)
  - right (weight: 0.005)
  - make (weight: 0.005)
  - time (weight: 0.005)

Topic 3:
  - year (weight: 0.010)
  - presid (weight: 0.010)
  - game (weight: 0.008)
  - team (weight: 0.007)
  - new (weight: 0.006)
  - play (weight: 0.006)
  - last (weight: 0.

In [75]:
# Step 7: Optional - Assign dominant topic to each article
# For each bag-of-words document, take the (topic_id, probability) pair with
# the largest probability and keep its topic id.
doc_topics = [
    max(lda_model.get_document_topics(bow), key=lambda pair: pair[1])[0]
    for bow in corpus
]

# doc_topics is aligned positionally with the rows of df.
df['Dominant_Topic'] = doc_topics
print(df.head())


                                                text  Dominant_Topic
0  I was wondering if anyone out there could enli...               1
1  I recently posted an article asking what kind ...               1
2  \nIt depends on your priorities.  A lot of peo...               1
3  an excellent automatic can be found in the sub...               1
4  : Ford and his automobile.  I need information...               1


In [76]:
# Step 8: Interpretation
# Static explanation lines, emitted one per line.
interpretation_lines = (
    "\n--- Coherence Score Interpretation ---",
    "The coherence score is a measure of how interpretable the topics are.",
    "A higher score (close to 1.0) means the topics make more sense semantically.",
)
print(*interpretation_lines, sep="\n")
# Final line embeds the measured score, rounded to three decimals.
rounded_score = round(coherence_score, 3)
print("In this model, a coherence score around", rounded_score,
      "suggests that the LDA model is producing fairly meaningful topics.")


--- Coherence Score Interpretation ---
The coherence score is a measure of how interpretable the topics are.
A higher score (close to 1.0) means the topics make more sense semantically.
In this model, a coherence score around 0.499 suggests that the LDA model is producing fairly meaningful topics.
