# CISB5123 Text Analytics
## Lab Assignment 3 - Topic Modeling

**Group members:**
1. IZZAT HATTA BIN AZIZI SW01082390
2. MUHAMMAD HAKIMI BIN AZIZI SW01082355


In [2]:
# Step 1: Import libraries
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from gensim import corpora
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel

In [3]:
# Step 2: Read the dataset
df = pd.read_csv('Dataset/news_dataset.csv')
df = df[['text']].dropna(subset=['text'])

In [4]:
# Download nltk resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Create a set of English stopwords
stop_words = set(stopwords.words('english'))

# Initialize a WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    if not isinstance(text, str):
        text = str(text)
    
    # Remove email addresses
    text = re.sub(r'\S+@\S+', '', text)
    
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    
    # Remove special characters and digits
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Convert text to lowercase
    text = text.lower()
    
    # Tokenize the text
    tokens = nltk.word_tokenize(text)
    
    # Remove stopwords
    tokens = [token for token in tokens if token not in stop_words]
    
    # Lemmatize the tokens
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    return tokens

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
# Sample FIRST (important)
df_sample = df.sample(n=200, random_state=42)

# Now safely preprocess the sampled text only
sampled_docs = [preprocess_text(doc) for doc in df_sample['text']]

# Proceed as usual
dictionary = corpora.Dictionary(sampled_docs)
corpus = [dictionary.doc2bow(doc) for doc in sampled_docs]


In [6]:
# Step 5: Build the LDA Model
lda_model = LdaModel(corpus=corpus,
                     id2word=dictionary,
                     num_topics=4,
                     random_state=42,
                     update_every=1,
                     chunksize=20,
                     passes=3,
                     alpha='auto',
                     per_word_topics=True)

In [7]:
# Step 6: Evaluate the LDA model using Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=sampled_docs, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()

In [8]:
# Step 7: Interpret the topics
topics = lda_model.print_topics(num_words=10)

# Display topics and coherence score
print("\n=== Topics Found ===")
for topic in topics:
    print(topic)

print(f"\nCoherence Score: {coherence_lda:.4f}")


=== Topics Found ===
(0, '0.010*"law" + 0.008*"technology" + 0.008*"encryption" + 0.007*"program" + 0.006*"new" + 0.006*"one" + 0.006*"muslim" + 0.005*"safety" + 0.005*"device" + 0.005*"case"')
(1, '0.015*"state" + 0.008*"people" + 0.008*"one" + 0.007*"u" + 0.007*"like" + 0.006*"time" + 0.006*"jew" + 0.006*"would" + 0.005*"year" + 0.005*"right"')
(2, '0.013*"health" + 0.012*"age" + 0.009*"medical" + 0.007*"disease" + 0.007*"arizona" + 0.006*"year" + 0.005*"among" + 0.005*"icsucieduincominggeodegif" + 0.005*"problem" + 0.004*"xtermmap"')
(3, '0.014*"armenian" + 0.013*"child" + 0.012*"use" + 0.007*"number" + 0.007*"russian" + 0.006*"gonorrhea" + 0.006*"increased" + 0.005*"reported" + 0.005*"rate" + 0.005*"cdc"')

Coherence Score: 0.3980
