# Lab Assignment 3

1. Nur Husnina Binti Norishak (IS01081121)
2. Nur Khairina Sofea Binti Khaidzir (IS01081122)

### Import the necessary libraries

In [43]:
# For text preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer

# For topic modeling
from gensim import corpora
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel
import pandas as pd

# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Read the data (use only the ‘text’ column)

In [28]:
# Load the dataset
file_path = 'news_dataset.csv'
news_data = pd.read_csv(file_path)

# Use only the 'text' column
texts = news_data['text'].dropna().tolist()


### Perform text pre-processing

In [31]:
# Initialize stop words, stemmer, and lemmatizer
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Function to preprocess text
def preprocess_text(text):
    # Tokenize
    tokens = word_tokenize(text.lower())
    # Filter out non-alphanumeric tokens
    tokens = [token for token in tokens if token.isalnum()] 
    # Remove stop words
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
     # Stemming
    tokens = [stemmer.stem(word) for word in tokens]
    # Lemmatization
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return tokens

# Apply preprocessing to the texts
processed_texts = [preprocess_text(text) for text in texts]
print(processed_texts[0])

['wonder', 'anyon', 'could', 'enlighten', 'car', 'saw', 'day', 'sport', 'car', 'look', 'late', 'earli', 'call', 'bricklin', 'door', 'realli', 'small', 'addit', 'front', 'bumper', 'separ', 'rest', 'bodi', 'know', 'anyon', 'tellm', 'model', 'name', 'engin', 'spec', 'year', 'product', 'car', 'made', 'histori', 'whatev', 'info', 'funki', 'look', 'car', 'plea']


### Perform LDA using Gensim

In [32]:
#create a gensim dictionary object from the preprocessed doc
dictionary = corpora.Dictionary(processed_texts)

#convert each preprocessed document into a BoW representation using the dictionary
corpus = [dictionary.doc2bow(doc) for doc in processed_texts]

In [33]:
#Run LDA
lda_model = LdaModel(corpus, num_topics=4, id2word=dictionary, passes=15)

In [35]:
#empty list to store dominiant topic labels for each doc
article_labels = []

#iterate over each processed_texts
for i, doc in enumerate(processed_texts):
    #for each doc, convert to box representation
    bow = dictionary.doc2bow(doc)
    
    #get list of topic probabilities
    topics = lda_model.get_document_topics(bow)
    
    #determine the topic with highest probability
    dominant_topic = max(topics, key=lambda x: x[1])[0]
    
    #append to the list
    article_labels.append(dominant_topic)

### Interpret the result

In [38]:
#create dataframe
df_result = pd.DataFrame({"Article":texts, "Topic":article_labels})

#print dataframe
print("Table with Articles and Topic:")
print(df_result)
print()

Table with Articles and Topic:
                                                 Article  Topic
0      I was wondering if anyone out there could enli...      3
1      I recently posted an article asking what kind ...      3
2      \nIt depends on your priorities.  A lot of peo...      3
3      an excellent automatic can be found in the sub...      3
4      : Ford and his automobile.  I need information...      3
...                                                  ...    ...
11091  Secrecy in Clipper Chip\n\nThe serial number o...      0
11092  Hi !\n\nI am interested in the source of FEAL ...      0
11093  The actual algorithm is classified, however, t...      0
11094  \n\tThis appears to be generic calling upon th...      3
11095  \nProbably keep quiet and take it, lest they g...      3

[11096 rows x 2 columns]



In [39]:
#print top terms for each topic
for topic_id in range(lda_model.num_topics):
    print(f"Top terms for Topic #{topic_id}:")
    top_terms = lda_model.show_topic(topic_id, topn=10)
    print([term[0] for term in top_terms])
    print()

Top terms for Topic #0:
['use', 'key', 'file', 'system', 'encrypt', 'one', 'program', 'get', 'chip', 'would']

Top terms for Topic #1:
['q', 'x', 'max', 'g', 'r', 'p', 'n', 'db', 'k', 'c']

Top terms for Topic #2:
['game', 'team', 'year', 'play', 'new', 'player', 'win', 'season', 'first', 'leagu']

Top terms for Topic #3:
['would', 'peopl', 'one', 'think', 'say', 'like', 'know', 'go', 'get', 'time']



In [40]:
#print the top terms for each topic with weight
print("Top Terms for Each Topic:")
for idx, topic in lda_model.print_topics():
    print(f"Topic {idx}:")
    terms = [term.strip() for term in topic.split("+")]
    
    for term in terms:
        weight, word = term.split("*")
        print(f"- {word.strip()} (weight: {weight.strip()})")
    print()

Top Terms for Each Topic:
Topic 0:
- "use" (weight: 0.016)
- "key" (weight: 0.009)
- "file" (weight: 0.007)
- "system" (weight: 0.007)
- "encrypt" (weight: 0.006)
- "one" (weight: 0.006)
- "program" (weight: 0.005)
- "get" (weight: 0.005)
- "chip" (weight: 0.005)
- "would" (weight: 0.005)

Topic 1:
- "q" (weight: 0.055)
- "x" (weight: 0.053)
- "max" (weight: 0.048)
- "g" (weight: 0.031)
- "r" (weight: 0.031)
- "p" (weight: 0.026)
- "n" (weight: 0.023)
- "db" (weight: 0.023)
- "k" (weight: 0.017)
- "c" (weight: 0.017)

Topic 2:
- "game" (weight: 0.009)
- "team" (weight: 0.008)
- "year" (weight: 0.007)
- "play" (weight: 0.006)
- "new" (weight: 0.005)
- "player" (weight: 0.005)
- "win" (weight: 0.003)
- "season" (weight: 0.003)
- "first" (weight: 0.003)
- "leagu" (weight: 0.003)

Topic 3:
- "would" (weight: 0.009)
- "peopl" (weight: 0.008)
- "one" (weight: 0.008)
- "think" (weight: 0.006)
- "say" (weight: 0.005)
- "like" (weight: 0.005)
- "know" (weight: 0.005)
- "go" (weight: 0.005)
- "g

### Evaluate the LDA model using Coherence score

In [41]:
# Calculate the coherence score for the LDA model
coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_texts, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()

print(f'Topic Coherence Score (C_V): {coherence_lda:.4f}')

Topic Coherence Score (C_V): 0.6368


###### Notes

Topic 0: Computer Program |
Topic 1: Variable |
Topic 2: E-sport Game |
Topic 3: Opinions


###### A score of 0.6368 means the groups is good at having similar group together