In [2]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import gensim
import gensim.corpora as corpora

In [3]:
data = pd.read_csv("news_articles.csv")
data.head()

Unnamed: 0,id,title,content
0,25626,"One Weight-Loss Approach Fits All? No, Not Eve...","Dr. Frank Sacks, a professor of nutrition at H..."
1,19551,South Carolina Stuns Baylor to Reach the Round...,South Carolina’s win over Duke was not only ...
2,25221,"U.S. Presidential Race, Apple, Gene Wilder: Yo...",(Want to get this briefing by email? Here’s th...
3,18026,"His Predecessor Gone, Gambia’s New President F...","BANJUL, Gambia — A week after he was inaugu..."
4,21063,‘Harry Potter and the Cursed Child’ Goes From ...,The biggest book of the summer isn’t a blockbu...


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       100 non-null    int64 
 1   title    100 non-null    object
 2   content  100 non-null    object
dtypes: int64(1), object(2)
memory usage: 2.5+ KB


In [17]:
articles = data["content"]

In [18]:
articles = articles.str.lower().replace(r'[^\w\s]', '', regex=True)

en_stopwords = stopwords.words('english')
articles = articles.apply(lambda words: ' '.join(
        [w for w in words.split() if w not in en_stopwords]
    ))

articles = articles.apply(lambda x: word_tokenize(x))

ps = PorterStemmer()

articles = articles.apply(lambda tokens: [ps.stem(token) for token in tokens])
articles.head(10)

0    [dr, frank, sack, professor, nutrit, harvard, ...
1    [south, carolina, win, duke, surpris, fan, pos...
2    [want, get, brief, email, here, good, even, he...
3    [banjul, gambia, week, inaugur, anoth, countri...
4    [biggest, book, summer, isnt, blockbust, thril...
5    [indianapoli, senat, ted, cruz, texa, desper, ...
6    [west, palm, beach, fla, donald, j, trump, ran...
7    [year, could, end, without, littl, bit, taylor...
8    [washington, speaker, paul, ryan, deliv, parti...
9    [washington, report, death, islam, state, seni...
Name: content, dtype: object

In [20]:
dictionary = corpora.Dictionary(articles)
print(dictionary)

Dictionary<8693 unique tokens: ['10', '100', '108', '15', '155']...>


In [21]:
doc_term = [dictionary.doc2bow(text) for text in articles]

In [23]:
num_topics = 2

In [26]:
lda_model = gensim.models.LdaModel(corpus=doc_term,
                                    id2word=dictionary,
                                    num_topics=num_topics)

In [27]:
lda_model.print_topics(num_topics, num_words=5)

[(0,
  '0.016*"mr" + 0.015*"said" + 0.006*"trump" + 0.005*"would" + 0.005*"year"'),
 (1,
  '0.017*"mr" + 0.015*"said" + 0.005*"trump" + 0.004*"state" + 0.004*"one"')]