# Example 1

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from gensim import corpora
from gensim.models import LdaModel

# Fetch the NLTK data packages this notebook relies on: the stop-word lists,
# the "punkt" tokenizer models and the WordNet database.
# NOTE(review): recent NLTK releases look up 'punkt_tab' rather than 'punkt'
# inside word_tokenize — confirm against the installed NLTK version.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ikmalkamil/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ikmalkamil/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ikmalkamil/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Toy corpus of five news headlines: two about tennis (Nadal / Federer) and
# three about Biden's virus response.
documents = [
    "Rafael Nadal Joins Roger Federer in Missing U.S. Open",
    "Rafael Nadal Is Out of the Australian Open",
    "Biden Announces Virus Measures",
    "Biden's Virus Plans Meet Reality",
    "Where Biden's Virus Plan Stands"
]

In [4]:
# English stop words, held in a set for fast membership tests in preprocess_text.
stop_words = set(stopwords.words('english'))
# Single lemmatizer instance shared by every preprocess_text call.
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Turn a raw string into a list of cleaned, lemmatized tokens.

    The text is lower-cased and tokenized with ``word_tokenize``. Tokens that
    are not purely alphanumeric (punctuation, words with apostrophes or dots)
    and tokens found in ``stop_words`` are discarded; the survivors are
    lemmatized with the module-level ``lemmatizer``.

    Args:
        text: The raw document text.

    Returns:
        The list of lemmatized tokens, in their original order.
    """
    return [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(text.lower())
        if word.isalnum() and word not in stop_words
    ]

# Clean every headline; the bare name on the last line displays the result.
preprocessed_documents = list(map(preprocess_text, documents))
preprocessed_documents

[['rafael', 'nadal', 'join', 'roger', 'federer', 'missing', 'open'],
 ['rafael', 'nadal', 'australian', 'open'],
 ['biden', 'announces', 'virus', 'measure'],
 ['biden', 'virus', 'plan', 'meet', 'reality'],
 ['biden', 'virus', 'plan', 'stand']]

In [5]:
# Build the gensim vocabulary: every distinct token gets an integer id.
dictionary = corpora.Dictionary(preprocessed_documents)

In [6]:
# Convert each tokenized headline into its bag-of-words vector.
corpus = list(map(dictionary.doc2bow, preprocessed_documents))

In [7]:
# Train a two-topic LDA model on the bag-of-words corpus (15 passes over the data).
# LDA training is randomised: without a fixed seed the topic numbering and the
# weights change on every kernel restart. random_state pins them so the
# notebook is reproducible (topic ids may differ from earlier unseeded runs).
lda_model = LdaModel(corpus, num_topics=2, id2word=dictionary, passes=15, random_state=42)

In [8]:
# Label every headline with the id of its most probable topic.
# `corpus` already holds the bag-of-words vector of each preprocessed document
# (same order as `documents`), so it is reused here instead of calling
# dictionary.doc2bow() a second time; the unused enumerate index is dropped.
article_labels = []

for bow in corpus:
    # get_document_topics yields (topic_id, probability) pairs for this document.
    topics = lda_model.get_document_topics(bow)
    # Keep the id of the pair with the highest probability.
    dominant_topic = max(topics, key=lambda pair: pair[1])[0]
    article_labels.append(dominant_topic)

In [9]:
import pandas as pd
# Pair each headline with its dominant topic id and print the table,
# followed by one blank line.
df = pd.DataFrame(
    data={"Article": documents, "Topic": article_labels},
)

print("Table with Articles and Topics:", df, "", sep="\n")

Table with Articles and Topics:
                                             Article  Topic
0  Rafael Nadal Joins Roger Federer in Missing U....      0
1         Rafael Nadal Is Out of the Australian Open      0
2                     Biden Announces Virus Measures      1
3                   Biden's Virus Plans Meet Reality      1
4                    Where Biden's Virus Plan Stands      1



In [10]:
# Print the ten highest-weighted words of every topic.
# show_topic() returns (word, probability) pairs directly, so the formatted
# strings from print_topics() no longer have to be split on "+" and "*" —
# parsing that would break on any term containing one of those characters.
# All topics are listed (print_topics() stops at 20 by default).
print("Top Terms for Each Topic :")
for idx in range(lda_model.num_topics):
    print(f"Topic {idx}:")
    for word, weight in lda_model.show_topic(idx, topn=10):
        # Same layout as print_topics(): quoted word, weight with three decimals.
        print(f'- "{word}" (weight: {weight:.3f})')
    print()

Top Terms for Each Topic :
Topic 0:
- "nadal" (weight: 0.131)
- "open" (weight: 0.131)
- "rafael" (weight: 0.131)
- "roger" (weight: 0.079)
- "federer" (weight: 0.079)
- "join" (weight: 0.079)
- "missing" (weight: 0.079)
- "australian" (weight: 0.079)
- "stand" (weight: 0.027)
- "biden" (weight: 0.027)

Topic 1:
- "virus" (weight: 0.166)
- "biden" (weight: 0.166)
- "plan" (weight: 0.119)
- "meet" (weight: 0.071)
- "reality" (weight: 0.071)
- "announces" (weight: 0.071)
- "measure" (weight: 0.071)
- "stand" (weight: 0.071)
- "australian" (weight: 0.024)
- "rafael" (weight: 0.024)



# Example 2

In [11]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from gensim import corpora
from gensim.models import LdaModel
import pandas as pd

# Fetch the NLTK data packages used by this example (stop words, "punkt"
# tokenizer models, WordNet). These are the same three packages requested at
# the start of Example 1.
# NOTE(review): recent NLTK releases look up 'punkt_tab' rather than 'punkt'
# inside word_tokenize — confirm against the installed NLTK version.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ikmalkamil/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ikmalkamil/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ikmalkamil/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [12]:
# Read npr.csv from the working directory and copy its 'Article' column into
# a plain Python list, one entry per row.
# NOTE(review): assumes every 'Article' value is a string; a missing value
# would reach preprocess_text as NaN and fail on .lower() — confirm the CSV
# has no empty cells.
df = pd.read_csv('npr.csv')
documents = df['Article'].tolist()

In [13]:
# English stop words, held in a set for fast membership tests in preprocess_text.
stop_words = set(stopwords.words('english'))
# Single lemmatizer instance shared by every preprocess_text call.
lemmatizer = WordNetLemmatizer() 

def preprocess_text(text):
    """Tokenize, filter and lemmatize one document.

    Args:
        text: The raw article text.

    Returns:
        A list of lemmatized tokens. The text is lower-cased before
        tokenization; tokens that are not purely alphanumeric or that appear
        in ``stop_words`` are left out.
    """
    kept = (
        candidate
        for candidate in word_tokenize(text.lower())
        if candidate.isalnum() and candidate not in stop_words
    )
    return [lemmatizer.lemmatize(candidate) for candidate in kept]

# Clean every article, then print the tokens of the first one as a spot check.
preprocessed_documents = list(map(preprocess_text, documents))
print(preprocessed_documents[0])

['washington', '2016', 'even', 'policy', 'bipartisan', 'politics', 'sense', 'year', 'show', 'little', 'sign', 'ending', 'president', 'obama', 'moved', 'sanction', 'russia', 'alleged', 'interference', 'election', 'concluded', 'republican', 'long', 'called', 'similar', 'severe', 'measure', 'could', 'scarcely', 'bring', 'approve', 'house', 'speaker', 'paul', 'ryan', 'called', 'obama', 'measure', 'appropriate', 'also', 'overdue', 'prime', 'example', 'administration', 'ineffective', 'foreign', 'policy', 'left', 'america', 'weaker', 'eye', 'gop', 'leader', 'sounded', 'much', 'theme', 'urging', 'president', 'obama', 'year', 'take', 'strong', 'action', 'deter', 'russia', 'worldwide', 'aggression', 'including', 'operation', 'wrote', 'devin', 'nunes', 'chairman', 'house', 'intelligence', 'committee', 'week', 'left', 'office', 'president', 'suddenly', 'decided', 'stronger', 'measure', 'indeed', 'appearing', 'cnn', 'frequent', 'obama', 'critic', 'trent', 'frank', 'called', 'much', 'tougher', 'acti

In [14]:
# Build the gensim vocabulary: every distinct token gets an integer id.
dictionary = corpora.Dictionary(preprocessed_documents)

# Prune the vocabulary in place: drop tokens that occur in fewer than 15
# documents or in more than 50% of all documents.
dictionary.filter_extremes(no_below=15, no_above=0.5)

# Bag-of-words vectors. This has to run after filter_extremes so that the
# vectors only contain ids from the pruned vocabulary.
corpus = [dictionary.doc2bow(doc) for doc in preprocessed_documents]

In [15]:
# Train a five-topic LDA model on the bag-of-words corpus (15 passes over the data).
# LDA training is randomised: without a fixed seed the topic numbering and the
# weights change on every kernel restart. random_state pins them so the
# notebook is reproducible (topic ids may differ from earlier unseeded runs).
lda_model = LdaModel(corpus, num_topics=5, id2word=dictionary, passes=15, random_state=42)

In [16]:
# Label every article with the id of its most probable topic.
# `corpus` already holds the bag-of-words vector of each preprocessed article
# (same order as `documents`), so it is reused here instead of calling
# dictionary.doc2bow() a second time; the unused enumerate index is dropped.
article_labels = []

for bow in corpus:
    # get_document_topics yields (topic_id, probability) pairs for this article.
    topics = lda_model.get_document_topics(bow)
    # Keep the id of the pair with the highest probability.
    dominant_topic = max(topics, key=lambda pair: pair[1])[0]
    article_labels.append(dominant_topic)
    
# Pair each article with its dominant topic id and print the table,
# followed by one blank line.
df_result = pd.DataFrame(
    data={"Article": documents, "Topic": article_labels},
)

print("Table with Articles and Topic:", df_result, sep="\n", end="\n\n")

Table with Articles and Topic:
                                                 Article  Topic
0      In the Washington of 2016, even when the polic...      0
1        Donald Trump has used Twitter  —   his prefe...      4
2        Donald Trump is unabashedly praising Russian...      0
3      Updated at 2:50 p. m. ET, Russian President Vl...      4
4      From photography, illustration and video, to d...      2
...                                                  ...    ...
11987  The number of law enforcement officers shot an...      1
11988    Trump is busy these days with victory tours,...      0
11989  It’s always interesting for the Goats and Soda...      3
11990  The election of Donald Trump was a surprise to...      0
11991  Voters in the English city of Sunderland did s...      4

[11992 rows x 2 columns]



In [17]:
# For every topic, print a header, the list of its ten highest-weighted
# words (weights omitted) and a blank separator line.
for topic_id in range(lda_model.num_topics):
    top_terms = lda_model.show_topic(topic_id, topn=10)
    words_only = [word for word, _weight in top_terms]
    print(f"Top terms for Topic #{topic_id}:", words_only, "", sep="\n")

Top terms for Topic #0:
['trump', 'clinton', 'state', 'president', 'republican', 'campaign', 'election', 'vote', 'obama', 'voter']

Top terms for Topic #1:
['police', 'law', 'court', 'state', 'report', 'case', 'department', 'told', 'officer', 'official']

Top terms for Topic #2:
['know', 'thing', 'think', 'life', 'really', 'story', 'back', 'day', 'show', 'go']

Top terms for Topic #3:
['health', 'school', 'study', 'child', 'student', 'percent', 'university', 'care', 'program', 'patient']

Top terms for Topic #4:
['country', 'war', 'government', 'world', 'president', 'china', 'american', 'woman', 'attack', 'group']



In [18]:
# Print the ten highest-weighted words of every topic together with their weights.
# show_topic() returns (word, probability) pairs directly, so the formatted
# strings from print_topics() no longer have to be split on "+" and "*" —
# parsing that would break on any term containing one of those characters.
# All topics are listed (print_topics() stops at 20 by default).
print("Top Terms for Each Topic:")
for idx in range(lda_model.num_topics):
    print(f"Topic {idx}:")
    for word, weight in lda_model.show_topic(idx, topn=10):
        # Same layout as print_topics(): quoted word, weight with three decimals.
        print(f'- "{word}" (weight: {weight:.3f})')
    print()

Top Terms for Each Topic:
Topic 0:
- "trump" (weight: 0.028)
- "clinton" (weight: 0.011)
- "state" (weight: 0.010)
- "president" (weight: 0.009)
- "republican" (weight: 0.008)
- "campaign" (weight: 0.007)
- "election" (weight: 0.006)
- "vote" (weight: 0.005)
- "obama" (weight: 0.005)
- "voter" (weight: 0.005)

Topic 1:
- "police" (weight: 0.008)
- "law" (weight: 0.007)
- "court" (weight: 0.006)
- "state" (weight: 0.006)
- "report" (weight: 0.006)
- "case" (weight: 0.005)
- "department" (weight: 0.005)
- "told" (weight: 0.004)
- "officer" (weight: 0.004)
- "official" (weight: 0.004)

Topic 2:
- "know" (weight: 0.005)
- "thing" (weight: 0.005)
- "think" (weight: 0.005)
- "life" (weight: 0.004)
- "really" (weight: 0.004)
- "story" (weight: 0.003)
- "back" (weight: 0.003)
- "day" (weight: 0.003)
- "show" (weight: 0.003)
- "go" (weight: 0.003)

Topic 3:
- "health" (weight: 0.007)
- "school" (weight: 0.007)
- "study" (weight: 0.006)
- "child" (weight: 0.005)
- "student" (weight: 0.005)
- "pe