In [1]:
import pandas as pd
# Load the NPR article corpus; the article text lives in the 'Article' column.
# NOTE(review): the displayed frame has an 'Unnamed: 0' column, which looks like a
# row index that was saved into the CSV -- consider index_col=0 after confirming.
npr_articles = pd.read_csv('national-public-radio.csv')
# Preview the first five rows.
npr_articles.head()

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Build the TF-IDF vectorizer:
#   max_df=0.95          -> ignore terms found in more than 95% of the documents
#   min_df=2             -> ignore terms found in fewer than 2 documents
#   stop_words='english' -> drop scikit-learn's built-in English stop words
tfidf = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
# Learn the vocabulary from the article text and encode every article against it.
document_term_matrix = tfidf.fit_transform(npr_articles['Article'])
# Sparse matrix with one row per article and one column per retained term.
document_term_matrix

<11992x54777 sparse matrix of type '<class 'numpy.float64'>'
	with 3033388 stored elements in Compressed Sparse Row format>

In [6]:
import random

# Fetch the vocabulary once instead of once per loop iteration: every call
# rebuilds the complete term list.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer get_feature_names_out() when the installed version provides it.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out().tolist()
else:
    feature_names = tfidf.get_feature_names()

# Vocabulary size (number of columns in the document-term matrix).
print(len(feature_names))

# Print 10 randomly chosen vocabulary terms. Sampling from the list itself
# avoids hard-coding the vocabulary size as an index bound.
for _ in range(10):
    print(random.choice(feature_names))

54777
inserting
gunbattle
repercussion
pretoria
prefigured
albuquerque
modi
burps
catalogs
visually


In [5]:
from sklearn.decomposition import NMF
# Non-negative matrix factorization with 7 topics; random_state=42 pins the
# random seed so repeated runs use the same pseudo-random numbers.
nmf_model = NMF(n_components=7,random_state=42)
# Fit on the TF-IDF matrix. fit() is the last expression of the cell, so the
# fitted estimator's repr is shown as the cell output.
nmf_model.fit(document_term_matrix)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
    n_components=7, random_state=42, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

<h3>Top Words Per Topic</h3>

In [7]:
# components_ has one row per topic, so this matches n_components (7).
len(nmf_model.components_)

7

In [8]:
# Topic-term weight matrix: one row per topic, one column per vocabulary term.
nmf_model.components_

array([[0.00000000e+00, 2.49950821e-01, 0.00000000e+00, ...,
        1.70313822e-03, 2.37544362e-04, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 8.22048918e-02, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [0.00000000e+00, 3.12379960e-02, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [5.89723338e-03, 0.00000000e+00, 1.50186440e-03, ...,
        7.06428924e-04, 5.85500542e-04, 6.89536542e-04],
       [4.01763234e-03, 5.31643833e-02, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [9]:
# Each topic row carries one weight per vocabulary term (same as the vocabulary size).
len(nmf_model.components_[0])

54777

In [10]:
# Term weights for topic #0: one weight per vocabulary term.
single_topic = nmf_model.components_[0]
# argsort() returns the indices that would sort the weights in ascending order.
# Compute it once and derive everything from it, rather than hard-coding
# indices copied from a previous run's output.
sorted_term_indices = single_topic.argsort()
print(sorted_term_indices)
# Weight of the word least representative of this topic (first sorted index)
print(single_topic[sorted_term_indices[0]])
# Weight of the word most representative of this topic (last sorted index)
print(single_topic[sorted_term_indices[-1]])
# Indices of the top 10 words for this topic, in ascending order of weight:
print(sorted_term_indices[-10:])

[    0 27208 27206 ... 36283 54692 42993]
0.0
2.005055165418594
[14441 36310 53989 52615 47218 53152 19307 36283 54692 42993]


In [11]:
# Indices of the 10 highest-weighted terms for the topic, ascending by weight.
top_word_indices = single_topic.argsort()[-10:]

# Resolve the vocabulary once rather than on every loop iteration.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer get_feature_names_out() when the installed version provides it.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out().tolist()
else:
    feature_names = tfidf.get_feature_names()

# Print the term behind each index; the last one printed has the largest weight.
for index in top_word_indices:
    print(feature_names[index])

disease
percent
women
virus
study
water
food
people
zika
says


In [13]:
# Resolve the vocabulary once. The original looked it up for every word of every
# topic, rebuilding the full term list each time.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer get_feature_names_out() when the installed version provides it.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out().tolist()
else:
    feature_names = tfidf.get_feature_names()

# Maps topic index -> its 15 highest-weighted words, ascending by weight
# (the most representative word comes last).
topic_map = {}
for index, topic in enumerate(nmf_model.components_):
    print(f'The Top 15 Words for Topic #{index} :')
    topic_map[index] = [feature_names[i] for i in topic.argsort()[-15:]]
    print(topic_map[index])
    print('\n')

The Top 15 Words for Topic #0 :
['new', 'research', 'like', 'patients', 'health', 'disease', 'percent', 'women', 'virus', 'study', 'water', 'food', 'people', 'zika', 'says']


The Top 15 Words for Topic #1 :
['gop', 'pence', 'presidential', 'russia', 'administration', 'election', 'republican', 'obama', 'white', 'house', 'donald', 'campaign', 'said', 'president', 'trump']


The Top 15 Words for Topic #2 :
['senate', 'house', 'people', 'act', 'law', 'tax', 'plan', 'republicans', 'affordable', 'obamacare', 'coverage', 'medicaid', 'insurance', 'care', 'health']


The Top 15 Words for Topic #3 :
['officers', 'syria', 'security', 'department', 'law', 'isis', 'russia', 'government', 'state', 'attack', 'president', 'reports', 'court', 'said', 'police']


The Top 15 Words for Topic #4 :
['primary', 'cruz', 'election', 'democrats', 'percent', 'party', 'delegates', 'vote', 'state', 'democratic', 'hillary', 'campaign', 'voters', 'sanders', 'clinton']


The Top 15 Words for Topic #5 :
['love', 've'

<h3>Attaching Topic Labels to News Articles</h3>

In [14]:
# (number of articles, vocabulary size)
document_term_matrix.shape

(11992, 54777)

In [15]:
# Number of articles; expected to equal the row count of the document-term matrix.
len(npr_articles)

11992

In [16]:
# Project every article onto the learned topics: the result has one row per
# article and one column (weight) per topic.
topic_model_results = nmf_model.transform(document_term_matrix)

# Topic weights of the first article, pulled out once for the inspections below.
first_article_weights = topic_model_results[0]

print("Topic Model Result Dimentions :", topic_model_results.shape)
print("Topic Model Result for 1st Article :", first_article_weights)
print("Rounded Topic Model Result for 1st Article :", first_article_weights.round(2))

# Index of the largest weight, i.e. the topic the first article loads on most.
first_article_weights.argmax()

Topic Model Result Dimentions : (11992, 7)
Topic Model Result for 1st Article : [0.         0.12075603 0.00140297 0.05919954 0.01518909 0.
 0.        ]
Rounded Topic Model Result for 1st Article : [0.   0.12 0.   0.06 0.02 0.   0.  ]


1

<p>This means that our model thinks that the first article belongs to topic #1.</p>

<h3>Combining with Original Data</h3>

In [18]:
# For every article (row), the index of the topic with the largest weight.
topic_model_results.argmax(axis=1)

array([1, 1, 1, ..., 0, 4, 3])

In [19]:
# Label each article with its dominant topic: the column index of the largest
# weight in that article's row of the topic-weight matrix.
npr_articles['Topic'] = topic_model_results.argmax(axis=1)
# Look up each topic's word list with Series.map instead of a row-wise
# DataFrame.apply(axis=1); this avoids building a Series per row and calling a
# Python lambda on it just to read a single column.
npr_articles['Words For Topic'] = npr_articles['Topic'].map(topic_map)
# Preview the first ten labelled articles.
npr_articles.head(10)

Unnamed: 0,Article,Topic,Words For Topic
0,"In the Washington of 2016, even when the polic...",1,"[gop, pence, presidential, russia, administrat..."
1,Donald Trump has used Twitter — his prefe...,1,"[gop, pence, presidential, russia, administrat..."
2,Donald Trump is unabashedly praising Russian...,1,"[gop, pence, presidential, russia, administrat..."
3,"Updated at 2:50 p. m. ET, Russian President Vl...",3,"[officers, syria, security, department, law, i..."
4,"From photography, illustration and video, to d...",6,"[teacher, state, high, says, parents, devos, c..."
5,I did not want to join yoga class. I hated tho...,5,"[love, ve, don, album, way, time, song, life, ..."
6,With a who has publicly supported the debunk...,0,"[new, research, like, patients, health, diseas..."
7,"I was standing by the airport exit, debating w...",0,"[new, research, like, patients, health, diseas..."
8,"If movies were trying to be more realistic, pe...",0,"[new, research, like, patients, health, diseas..."
9,"Eighteen years ago, on New Year’s Eve, David F...",5,"[love, ve, don, album, way, time, song, life, ..."
