# 6.4 LDA

In [1]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import gensim
import gensim.corpora as corpora

### Load Data

In [2]:
data = pd.read_csv("news_articles.csv")

In [3]:
data.head()

Unnamed: 0,id,title,content
0,25626,"One Weight-Loss Approach Fits All? No, Not Eve...","Dr. Frank Sacks, a professor of nutrition at H..."
1,19551,South Carolina Stuns Baylor to Reach the Round...,South Carolina’s win over Duke was not only ...
2,25221,"U.S. Presidential Race, Apple, Gene Wilder: Yo...",(Want to get this briefing by email? Here’s th...
3,18026,"His Predecessor Gone, Gambia’s New President F...","BANJUL, Gambia — A week after he was inaugu..."
4,21063,‘Harry Potter and the Cursed Child’ Goes From ...,The biggest book of the summer isn’t a blockbu...


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       100 non-null    int64 
 1   title    100 non-null    object
 2   content  100 non-null    object
dtypes: int64(1), object(2)
memory usage: 2.5+ KB


### Clean Data

In [6]:
# take just the content of the article, lowercase and remove punctuation
articles = data['content'].str.lower().apply(lambda x: re.sub(r"([^\w\s])", "", x))

# stop word removal
en_stopwords = stopwords.words('english')
articles = articles.apply(lambda x: ' '.join([word for word in x.split() if word not in (en_stopwords)]))

# tokenize
articles = articles.apply(lambda x: word_tokenize(x))

# stemming (done for speed as we have a lot of text)
ps = PorterStemmer()
articles = articles.apply(lambda tokens: [ps.stem(token) for token in tokens])

In [10]:
articles

0     [meet, donald, j, trump, nation, tech, elit, h...
1     [good, morn, want, get, california, today, ema...
2     [washington, good, news, repres, mike, pompeo,...
3     [morrison, colo, surround, geolog, format, cra...
4     [face, might, seem, right, time, invest, turke...
5     [lo, angel, shortli, drop, youngest, daughter,...
6     [indianapoli, senat, ted, cruz, texa, desper, ...
7     [cate, blanchett, prove, strong, draw, broadwa...
8     [cleveland, monday, night, donald, j, trump, b...
9     [good, morn, want, get, california, today, ema...
10    [lo, angel, john, mayer, explain, he, fact, ge...
11    [first, new, episod, presid, donald, j, trump,...
12    [least, 27, peopl, shot, seven, fatal, period,...
13    [tesla, motor, maverick, maker, scrutini, fede...
14    [time, journalist, iraq, sunni, heartland, eva...
15    [washington, presid, trump, plan, take, execut...
16    [nomad, wander, necessari, surviv, came, stock...
17    [st, petersburg, russia, quietli, number, 

### Vectorization

In [11]:
# create dictionary of all words
dictionary = corpora.Dictionary(articles)
print(dictionary)

Dictionary(3892 unique tokens: ['1', '2017', '2018', '3', '4']...)


In [12]:
# vecotize using bag of words into a document term matrix
doc_term = [dictionary.doc2bow(text) for text in articles]

In [13]:
print(doc_term)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 2), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 4), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 5), (26, 1), (27, 1), (28, 2), (29, 1), (30, 4), (31, 1), (32, 2), (33, 1), (34, 1), (35, 1), (36, 1), (37, 3), (38, 1), (39, 1), (40, 1), (41, 3), (42, 1), (43, 1), (44, 1), (45, 3), (46, 1), (47, 1), (48, 3), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 2), (58, 1), (59, 2), (60, 1), (61, 2), (62, 2), (63, 1), (64, 1), (65, 1), (66, 1), (67, 1), (68, 1), (69, 3), (70, 1), (71, 2), (72, 1), (73, 1), (74, 1), (75, 1), (76, 6), (77, 1), (78, 2), (79, 1), (80, 1), (81, 3), (82, 1), (83, 1), (84, 1), (85, 1), (86, 1), (87, 2), (88, 4), (89, 1), (90, 1), (91, 2), (92, 1), (93, 1), (94, 1), (95, 1), (96, 1), (97, 3), (98, 1), (99, 1), (100, 1), (101, 1), (102, 1), (103, 1), (104, 1), (105, 2), (106, 1), (107, 1), (108, 1), (109, 1), (110, 1)

### LDA

In [19]:
# specify number of topics
num_topics = 2

In [None]:
# create LDA model
lda_model = gensim.models.LdaModel(corpus=doc_term,
                                   id2word=dictionary,
                                   num_topics=num_topics)

In [None]:
lda_model.print_topics(num_topics=num_topics, num_words=5)