In [1]:
import pandas as pd
# Load npr.csv into a DataFrame; its "Article" column is the text that gets vectorized later.
npr = pd.read_csv("npr.csv")

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
# TF-IDF settings: drop English stop words, ignore terms that occur in fewer
# than 2 documents, and ignore terms that occur in more than 95% of documents.
tfidf = TfidfVectorizer(
    stop_words="english",
    min_df=2,
    max_df=0.95,
)

In [5]:
# Fit the vectorizer on the article text and build the TF-IDF matrix in one call
# (rows are articles, columns are vocabulary terms).
dtm = tfidf.fit_transform(npr["Article"]) # document-term matrix

In [6]:
# Echo the sparse matrix summary (shape, dtype, stored-element count).
dtm

<11992x54777 sparse matrix of type '<class 'numpy.float64'>'
	with 3033388 stored elements in Compressed Sparse Row format>

In [7]:
from sklearn.decomposition import NMF

In [8]:
# Seven-component NMF topic model; the seed is pinned so reruns give the same factorization.
nmf = NMF(
    random_state=42,
    n_components=7,
)

In [9]:
# Learn the topic/term factorization from the TF-IDF matrix; the cell echoes the fitted estimator.
nmf.fit(dtm)

NMF(n_components=7, random_state=42)

In [10]:
# Spot-check one vocabulary entry by column index.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer get_feature_names_out() and only fall back on releases that lack it.
feature_names = (
    tfidf.get_feature_names_out()
    if hasattr(tfidf, "get_feature_names_out")
    else tfidf.get_feature_names()
)
feature_names[2300]

'albala'

In [11]:
# Print the 15 highest-weighted vocabulary terms for every NMF topic.
#
# The vocabulary is resolved once, before the loop: the original rebuilt the
# full feature-name list for every printed word. get_feature_names() was also
# removed in scikit-learn 1.2, so use get_feature_names_out() when it exists
# and fall back to the old accessor on older releases.
feature_names = (
    tfidf.get_feature_names_out()
    if hasattr(tfidf, "get_feature_names_out")
    else tfidf.get_feature_names()
)
for index, topic in enumerate(nmf.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    # argsort() is ascending, so the last 15 indices are the largest weights
    # (listed from weakest to strongest).
    print([feature_names[i] for i in topic.argsort()[-15:]])
    print('\n')

THE TOP 15 WORDS FOR TOPIC #0
['new', 'research', 'like', 'patients', 'health', 'disease', 'percent', 'women', 'virus', 'study', 'water', 'food', 'people', 'zika', 'says']


THE TOP 15 WORDS FOR TOPIC #1
['gop', 'pence', 'presidential', 'russia', 'administration', 'election', 'republican', 'obama', 'white', 'house', 'donald', 'campaign', 'said', 'president', 'trump']


THE TOP 15 WORDS FOR TOPIC #2
['senate', 'house', 'people', 'act', 'law', 'tax', 'plan', 'republicans', 'affordable', 'obamacare', 'coverage', 'medicaid', 'insurance', 'care', 'health']


THE TOP 15 WORDS FOR TOPIC #3
['officers', 'syria', 'security', 'department', 'law', 'isis', 'russia', 'government', 'state', 'attack', 'president', 'reports', 'court', 'said', 'police']


THE TOP 15 WORDS FOR TOPIC #4
['primary', 'cruz', 'election', 'democrats', 'percent', 'party', 'delegates', 'vote', 'state', 'democratic', 'hillary', 'campaign', 'voters', 'sanders', 'clinton']


THE TOP 15 WORDS FOR TOPIC #5
['love', 've', 'don', 'al

In [12]:
# Project every article onto the fitted topics: one row per article, one weight per topic.
topic_results = nmf.transform(dtm)

In [13]:
# Topic weights for the first article.
topic_results[0]

array([0.        , 0.12075603, 0.00140297, 0.05919954, 0.01518909,
       0.        , 0.        ])

In [14]:
# Index of the highest-weighted topic for each article.
topic_results.argmax(axis=1)

array([1, 1, 1, ..., 0, 4, 3], dtype=int64)

In [15]:
# Store each article's dominant topic (column index of its largest weight) on the frame.
npr["Topic"] = topic_results.argmax(axis=1)

In [16]:
# Show the frame with the new "Topic" column.
npr

Unnamed: 0,Article,Topic
0,"In the Washington of 2016, even when the polic...",1
1,Donald Trump has used Twitter — his prefe...,1
2,Donald Trump is unabashedly praising Russian...,1
3,"Updated at 2:50 p. m. ET, Russian President Vl...",3
4,"From photography, illustration and video, to d...",6
...,...,...
11987,The number of law enforcement officers shot an...,3
11988,"Trump is busy these days with victory tours,...",1
11989,It’s always interesting for the Goats and Soda...,0
11990,The election of Donald Trump was a surprise to...,4


In [17]:
# Hand-assigned labels for the seven topic ids; a label's position in the list
# is its topic id. Topics 1 and 4 both carry the 'election' label.
my_topics = dict(enumerate([
    'health',    # 0
    'election',  # 1
    'legis',     # 2
    'poli',      # 3
    'election',  # 4
    'music',     # 5
    'edu',       # 6
]))

In [18]:
# Translate numeric topic ids into their labels; an id missing from my_topics would map to NaN.
npr['Topic label'] = npr['Topic'].map(my_topics)

In [19]:
# Show the frame with both the numeric topic and its label.
npr

Unnamed: 0,Article,Topic,Topic label
0,"In the Washington of 2016, even when the polic...",1,election
1,Donald Trump has used Twitter — his prefe...,1,election
2,Donald Trump is unabashedly praising Russian...,1,election
3,"Updated at 2:50 p. m. ET, Russian President Vl...",3,poli
4,"From photography, illustration and video, to d...",6,edu
...,...,...,...
11987,The number of law enforcement officers shot an...,3,poli
11988,"Trump is busy these days with victory tours,...",1,election
11989,It’s always interesting for the Goats and Soda...,0,health
11990,The election of Donald Trump was a surprise to...,4,election
