# Example 1

documents = [
 "Rafael Nadal Joins Roger Federer in Missing U.S. Open",
 "Rafael Nadal Is Out of the Australian Open",
 "Biden Announces Virus Measures",
 "Biden's Virus Plans Meet Reality",
 "Where Biden's Virus Plan Stands"
]

#### Import libraries

In [1]:
# Text preprocessing
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Topic modeling
from gensim import corpora
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel

# Download NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to C:\Users\Hafizatul
[nltk_data]     A'fifah\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Hafizatul
[nltk_data]     A'fifah\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Hafizatul
[nltk_data]     A'fifah\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

#### Load Data

In [2]:
documents = [
    "Rafael Nadal Joins Roger Federe in Missing U.S. Open",
    "Rafael Nadal Is Out of the Australian Open",
    "Biden Announces Virus Measures",
    "Biden's Virus Plans Meet Reality",
    "Where Biden's Virus Plan Stands"
]

#### Preprocess Data

In [3]:
stop_words = set(stopwords.words('english')) # Create a set of English stopwords
lemmatizer = WordNetLemmatizer() # Initialize a WordNet lemmatizer

def preprocess_text(text):
    tokens = word_tokenize(text.lower()) # Tokenize the text into words and convert to lowercase
    tokens = [token for token in tokens if token.isalnum()] # Filter out non-alphanumeric tokens
    tokens = [token for token in tokens if token not in stop_words]  # Remove stopwords from the tokens
    tokens = [lemmatizer.lemmatize(token) for token in tokens] # Lemmatize each token
    return tokens # Return the preprocessed tokens


preprocessed_documents = [preprocess_text(doc) for doc in documents] # Preprocess each document in the list
preprocessed_documents

[['rafael', 'nadal', 'join', 'roger', 'federe', 'missing', 'open'],
 ['rafael', 'nadal', 'australian', 'open'],
 ['biden', 'announces', 'virus', 'measure'],
 ['biden', 'virus', 'plan', 'meet', 'reality'],
 ['biden', 'virus', 'plan', 'stand']]

#### Create Document-Term Matrix

In [4]:
dictionary = corpora.Dictionary(preprocessed_documents) # Create a Gensim Dictionary object from the preprocessed documents
corpus = [dictionary.doc2bow(doc) for doc in preprocessed_documents] # Convert each preprocessed document into a bag-of-words representation using the dictionary

#### Run LDA

In [5]:
lda_model = LdaModel(corpus, num_topics=2, id2word=dictionary, passes=15) # Train an LDA modelon the corpus with 2 topics using Gensim's LdaModel class

#### Interpret Results

In [6]:
# empty list to store dominant topic labels for each document
article_labels = []

# iterate over each processed document
for i, doc in enumerate(preprocessed_documents):
    # for each document, convert to bag-of-words representation
    bow = dictionary.doc2bow(doc)
    # get list of topic probabilities
    topics = lda_model.get_document_topics(bow)
    # determine topic with highest probability
    dominant_topic = max(topics, key=lambda x: x[1])[0]
    # append to the list
    article_labels.append(dominant_topic)

In [7]:
df = pd.DataFrame({"Article": documents, "Topic": article_labels})

print("Table with Articles and Topic:")
print(df)
print()

Table with Articles and Topic:
                                             Article  Topic
0  Rafael Nadal Joins Roger Federe in Missing U.S...      0
1         Rafael Nadal Is Out of the Australian Open      0
2                     Biden Announces Virus Measures      1
3                   Biden's Virus Plans Meet Reality      1
4                    Where Biden's Virus Plan Stands      1



In [8]:
print("Top Terms for Each Topic:")
for idx, topic in lda_model.print_topics():
    print(f"Topic {idx}:")
    terms = [term.strip() for term in topic.split("+")]
    for term in terms:
        weight, word = term.split("*")
        print(f"- {word.strip()} (weight: {weight.strip()})")
    print()

Top Terms for Each Topic:
Topic 0:
- "open" (weight: 0.131)
- "nadal" (weight: 0.131)
- "rafael" (weight: 0.131)
- "missing" (weight: 0.079)
- "federe" (weight: 0.079)
- "roger" (weight: 0.079)
- "join" (weight: 0.079)
- "australian" (weight: 0.079)
- "biden" (weight: 0.027)
- "virus" (weight: 0.027)

Topic 1:
- "virus" (weight: 0.166)
- "biden" (weight: 0.166)
- "plan" (weight: 0.119)
- "meet" (weight: 0.071)
- "reality" (weight: 0.071)
- "stand" (weight: 0.071)
- "measure" (weight: 0.071)
- "announces" (weight: 0.071)
- "rafael" (weight: 0.024)
- "australian" (weight: 0.024)



Topic 0 seems to be related around politics and virus, where the weight of terms like "biden" and "virus" are particularly high, indicating their significance in this topic.

Topic 1 seems to be related to tennis, where the weight of terms like "nadal" and "rafael" are relatively high, suggesting a strong association with this topic.

In [9]:
# Calculate the coherence score for the LDA model
coherence_model_lda = CoherenceModel(model=lda_model, texts=preprocessed_documents, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()

# Display the score
print(f'Topic Coherence Score (C_V): {coherence_lda:.4f}')

Topic Coherence Score (C_V): 0.3801
