In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('npr.csv')

In [3]:
df.head()

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


In [4]:
from sklearn.feature_extraction.text import CountVectorizer

**`max_df`**` : float in range [0.0, 1.0] or int, default=1.0`<br>
When building the vocabulary ignore terms that have a document frequency strictly higher than the given threshold (corpus-specific stop words). If float, the parameter represents a proportion of documents, integer absolute counts. This parameter is ignored if vocabulary is not None.

**`min_df`**` : float in range [0.0, 1.0] or int, default=1`<br>
When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. This value is also called cut-off in the literature. If float, the parameter represents a proportion of documents, integer absolute counts. This parameter is ignored if vocabulary is not None.

In [5]:
cv = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')

In [6]:
dtm = cv.fit_transform(df['Article'])

In [7]:
dtm

<11992x54777 sparse matrix of type '<class 'numpy.int64'>'
	with 3033388 stored elements in Compressed Sparse Row format>

In [8]:
from sklearn.decomposition import LatentDirichletAllocation

In [9]:
LDA = LatentDirichletAllocation(n_components=10, random_state=42)

In [10]:
LDA.fit(dtm)

LatentDirichletAllocation(random_state=42)

showing stored words

In [12]:
len(cv.get_feature_names())

54777

In [14]:
import random

In [17]:
for i in range(10):
    id = random.randint(0,54776)
    print(cv.get_feature_names()[id])

fluidity
fiber
cryopreservation
gleaning
fridays
performed
user
sumney
wgbh
schoolteacher


Showing top words per topic

In [18]:
len(LDA.components_)

10

In [19]:
LDA.components_

array([[5.11072577e+00, 1.94461867e+03, 1.00001806e-01, ...,
        1.00005562e-01, 1.00000000e-01, 1.00001005e-01],
       [7.90134677e+00, 9.65122359e+01, 1.00000000e-01, ...,
        1.00000000e-01, 1.00000000e-01, 1.00005679e-01],
       [5.31852874e+00, 3.00433521e+02, 1.00000000e-01, ...,
        6.09994114e+00, 2.09990395e+00, 2.09985534e+00],
       ...,
       [2.70759028e+01, 5.39924878e+02, 1.00000000e-01, ...,
        1.00009334e-01, 1.00000000e-01, 1.00000000e-01],
       [1.00044538e-01, 3.22049903e+02, 1.00000000e-01, ...,
        1.00002615e-01, 1.00002538e-01, 1.00002208e-01],
       [2.79006612e-01, 5.87022720e+02, 1.00000000e-01, ...,
        1.00005607e-01, 1.00006588e-01, 1.00000000e-01]])

In [21]:
len(LDA.components_[0])

54777

In [22]:
single_topic = LDA.components_[0]

In [23]:
single_topic.argsort()

array([18302,  2475, 44967, ..., 10425, 42561, 42993], dtype=int64)

In [24]:
#least representative word of this topic
single_topic[18302]

0.10000000000053799

In [25]:
#most representative word of this topic
single_topic[42993]

5183.8298988765355

In [26]:
#top 10 words for this topic
single_topic.argsort()[-10:]

array([    1, 18349, 33390, 32089, 10421, 31464, 22673, 10425, 42561,
       42993], dtype=int64)

In [27]:
index = single_topic.argsort()[-10:]

In [29]:
for i in index:
    print(cv.get_feature_names()[i])

000
federal
new
money
companies
million
health
company
said
says


In [33]:
for index, topic in enumerate(LDA.components_):
    print(f"The top 15 words for topic #{index}")
    print([cv.get_feature_names()[i] for i in topic.argsort()[-15:]])
    print('\n')

The top 15 words for topic #0
['people', 'care', 'government', 'year', 'insurance', '000', 'federal', 'new', 'money', 'companies', 'million', 'health', 'company', 'said', 'says']


The top 15 words for topic #1
['npr', 'intelligence', 'security', 'new', 'told', 'russian', 'campaign', 'obama', 'news', 'white', 'russia', 'house', 'president', 'said', 'trump']


The top 15 words for topic #2
['know', 'little', 'home', 'make', 'way', 'day', 'water', 'time', 'years', 'people', 'food', 'new', 'just', 'like', 'says']


The top 15 words for topic #3
['don', 'food', 'work', 'day', 'life', 'time', 'family', 'children', 'years', 'just', 'women', 'world', 'like', 'people', 'says']


The top 15 words for topic #4
['supreme', 'order', 'city', 'states', 'federal', 'country', 'president', 'rights', 'government', 'people', 'law', 'state', 'said', 'court', 'says']


The top 15 words for topic #5
['going', 've', 'story', 'life', 'don', 'new', 'way', 'time', 'really', 'know', 'think', 'music', 'people', '

attaching discovered topic labels to original articles

In [34]:
topic_results = LDA.transform(dtm)

In [36]:
topic_results.shape

(11992, 10)

In [37]:
topic_results[0]

array([8.78101114e-03, 9.11263140e-01, 1.57269537e-04, 1.57265808e-04,
       1.57268730e-04, 1.57266519e-04, 1.57271636e-04, 1.57262374e-04,
       7.88549762e-02, 1.57267682e-04])

In [38]:
topic_results[0].round(2)

array([0.01, 0.91, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.08, 0.  ])

In [39]:
topic_results[0].argmax()

1

In [40]:
df['topic'] = topic_results.argmax(axis=1)

In [41]:
df

Unnamed: 0,Article,topic
0,"In the Washington of 2016, even when the polic...",1
1,Donald Trump has used Twitter — his prefe...,1
2,Donald Trump is unabashedly praising Russian...,1
3,"Updated at 2:50 p. m. ET, Russian President Vl...",1
4,"From photography, illustration and video, to d...",6
...,...,...
11987,The number of law enforcement officers shot an...,7
11988,"Trump is busy these days with victory tours,...",1
11989,It’s always interesting for the Goats and Soda...,3
11990,The election of Donald Trump was a surprise to...,8


## We could also use Non-negative Matrix Factorization instead of LDA

In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [43]:
tfidf = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')

In [44]:
dtm = tfidf.fit_transform(df['Article'])

Really, the same procedure for LDA can be use for NMF to label the articles

In [46]:
from sklearn.decomposition import NMF

In [47]:
model = NMF(n_components=10, random_state=42)

In [48]:
model.fit(dtm)



NMF(n_components=10, random_state=42)

In [51]:
len(tfidf.get_feature_names())

54777

In [52]:
for i in range(10):
    id = random.randint(0,54776)
    print(tfidf.get_feature_names()[id])

tilting
nonconforming
oxygen
buffet
eggs
unveils
prophylactic
reyna
teething
otters


In [53]:
len(model.components_)

10

In [54]:
model.components_

array([[0.00076005, 0.26692258, 0.        , ..., 0.00266869, 0.00035543,
        0.        ],
       [0.        , 0.00066787, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.09803908, 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.03090463, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.00429197, 0.04813308, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.00445153, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [59]:
model.components_.shape

(10, 54777)

In [60]:
model.components_[0].argsort()

array([27388, 27031, 27030, ..., 19307, 36283, 42993], dtype=int64)

In [62]:
#least representative word of this topic
model.components_[0].argsort()[27388]

35137

In [61]:
#most representative word of this topic
model.components_[0].argsort()[42993]

30405

In [63]:
#top 10 words for this topic
index = model.components_[0].argsort()[-10:]

In [64]:
for i in index:
    print(tfidf.get_feature_names()[i])

just
company
study
new
percent
like
water
food
people
says


In [65]:
for index, topic in enumerate(model.components_):
    print(f"The top 15 words for topic #{index}")
    print([tfidf.get_feature_names()[i] for i in topic.argsort()[-15:]])
    print('\n')

The top 15 words for topic #0
['year', 'university', 'workers', '000', 'years', 'just', 'company', 'study', 'new', 'percent', 'like', 'water', 'food', 'people', 'says']


The top 15 words for topic #1
['administration', 'cruz', 'election', 'pence', 'gop', 'presidential', 'obama', 'house', 'white', 'republican', 'donald', 'campaign', 'said', 'president', 'trump']


The top 15 words for topic #2
['patients', 'repeal', 'law', 'act', 'republicans', 'tax', 'people', 'plan', 'affordable', 'obamacare', 'coverage', 'medicaid', 'insurance', 'care', 'health']


The top 15 words for topic #3
['assad', 'iran', 'iraq', 'north', 'china', 'aleppo', 'war', 'korea', 'said', 'forces', 'russia', 'military', 'syrian', 'syria', 'isis']


The top 15 words for topic #4
['cruz', 'election', 'primary', 'democrats', 'percent', 'party', 'vote', 'state', 'delegates', 'democratic', 'hillary', 'campaign', 'voters', 'sanders', 'clinton']


The top 15 words for topic #5
['book', 'love', 'women', 'way', 'time', 'life'

let's read in the same file again but to a different df for this NMF study

In [66]:
nmf_df = pd.read_csv('npr.csv')

In [67]:
nmf_df.head()

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


In [68]:
topic_results = LDA.transform(dtm)

In [70]:
nmf_df['topic'] = topic_results.argmax(axis=1)

In [71]:
nmf_df

Unnamed: 0,Article,topic
0,"In the Washington of 2016, even when the polic...",1
1,Donald Trump has used Twitter — his prefe...,1
2,Donald Trump is unabashedly praising Russian...,1
3,"Updated at 2:50 p. m. ET, Russian President Vl...",1
4,"From photography, illustration and video, to d...",2
...,...,...
11987,The number of law enforcement officers shot an...,7
11988,"Trump is busy these days with victory tours,...",1
11989,It’s always interesting for the Goats and Soda...,9
11990,The election of Donald Trump was a surprise to...,8


In [73]:
topic_dict = {0:'Study',1:'election',2:'obamacare',3:'Syria',4:'democratic - party, elections',5:'women - lifestyle',
              6:'education',7:'healthcare',8:'Police',9:'Russia investigation'}

In [74]:
nmf_df['topic label'] = nmf_df['topic'].map(topic_dict)

In [75]:
nmf_df

Unnamed: 0,Article,topic,topic label
0,"In the Washington of 2016, even when the polic...",1,election
1,Donald Trump has used Twitter — his prefe...,1,election
2,Donald Trump is unabashedly praising Russian...,1,election
3,"Updated at 2:50 p. m. ET, Russian President Vl...",1,election
4,"From photography, illustration and video, to d...",2,obamacare
...,...,...,...
11987,The number of law enforcement officers shot an...,7,healthcare
11988,"Trump is busy these days with victory tours,...",1,election
11989,It’s always interesting for the Goats and Soda...,9,Russia investigation
11990,The election of Donald Trump was a surprise to...,8,Police
