In [46]:
import pandas as pd
npr_articles = pd.read_csv('national-public-radio.csv')
npr_articles.head()

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
document_term_matrix = tfidf.fit_transform(npr_articles['Article'])
document_term_matrix

<11992x54777 sparse matrix of type '<class 'numpy.float64'>'
	with 3033388 stored elements in Compressed Sparse Row format>

In [48]:
print(len(tfidf.get_feature_names()))
import random
for i in range(10):
    random_word_id = random.randint(0,54776)
    print(tfidf.get_feature_names()[random_word_id])

54777
kolko
restricts
parts
improvised
luhrmann
skewed
chiu
attractive
develop
scar


In [49]:
from sklearn.decomposition import TruncatedSVD
svd_model = TruncatedSVD(n_components=7, algorithm='randomized', n_iter=100, random_state=122)
svd_model.fit(document_term_matrix)

TruncatedSVD(algorithm='randomized', n_components=7, n_iter=100,
             random_state=122, tol=0.0)

<h3>Top Words Per Topic</h3>

In [50]:
len(svd_model.components_)

7

In [51]:
svd_model.components_

array([[ 1.55900729e-03,  4.21839655e-02,  8.05207864e-05, ...,
         1.62269473e-04,  6.70255273e-05,  3.75122661e-05],
       [-1.07760156e-03, -2.86484973e-02, -1.55163733e-04, ...,
        -3.02197147e-04, -1.19388897e-04, -6.62555764e-05],
       [-1.28152898e-03,  4.33947840e-02, -3.89983617e-04, ...,
        -2.22494805e-04, -1.33187324e-04, -2.09939357e-04],
       ...,
       [ 7.99286368e-04,  1.81944822e-02, -2.67380312e-04, ...,
        -1.57625594e-04, -8.96626887e-05, -7.72754944e-05],
       [-7.55381962e-04,  3.72929928e-02, -7.12300961e-04, ...,
         1.21127103e-04, -2.57524925e-06, -3.18123685e-04],
       [ 2.24854776e-04,  3.60945327e-03, -3.53097321e-04, ...,
        -9.26260189e-05, -2.33065771e-06, -1.52896991e-04]])

In [52]:
len(svd_model.components_[0])

54777

In [53]:
single_topic = svd_model.components_[0]
# Returns the indices that would sort this array.
print(single_topic.argsort())
# Word least representative of this topic
print(single_topic[18302])
# Word most representative of this topic
print(single_topic[42993])
# Top 10 words for this topic:
print(single_topic.argsort()[-10:])

[10088 17188  5876 ... 42561 42993 50426]
0.00019185342627741267
0.20840703057955665
[22673 33390 26752 28659 38079  9767 36283 42561 42993 50426]


In [54]:
top_word_indices = single_topic.argsort()[-10:]
for index in top_word_indices:
    print(tfidf.get_feature_names()[index])

health
new
just
like
president
clinton
people
said
says
trump


In [55]:
topic_map={}
for index,topic in enumerate(svd_model.components_):
    print(f'The Top 15 Words for Topic #{index} :')
    topic_map[index]=[tfidf.get_feature_names()[i] for i in topic.argsort()[-15:]]
    print(topic_map[index])
    print('\n')

The Top 15 Words for Topic #0 :
['women', 'house', 'think', 'campaign', 'state', 'health', 'new', 'just', 'like', 'president', 'clinton', 'people', 'said', 'says', 'trump']


The Top 15 Words for Topic #1 :
['comey', 'democrats', 'republicans', 'hillary', 'voters', 'election', 'obama', 'said', 'donald', 'sanders', 'republican', 'president', 'campaign', 'clinton', 'trump']


The Top 15 Words for Topic #2 :
['act', 'zika', 'percent', 'plan', 'affordable', 'patients', 'tax', 'obamacare', 'federal', 'law', 'coverage', 'medicaid', 'insurance', 'care', 'health']


The Top 15 Words for Topic #3 :
['gop', 'vote', 'republican', 'republicans', 'cruz', 'democratic', 'hillary', 'delegates', 'insurance', 'percent', 'voters', 'care', 'health', 'sanders', 'clinton']


The Top 15 Words for Topic #4 :
['department', 'vote', 'attack', 'fbi', 'reports', 'hillary', 'officers', 'voters', 'democratic', 'court', 'said', 'state', 'police', 'sanders', 'clinton']


The Top 15 Words for Topic #5 :
['sanders', 'c

<h3>Attaching Topic Labels to News Articles</h3>

In [56]:
document_term_matrix.shape

(11992, 54777)

In [57]:
len(npr_articles)

11992

In [58]:
topic_model_results = svd_model.transform(document_term_matrix)
print("Topic Model Result Dimentions :",topic_model_results.shape)
print("Topic Model Result for 1st Article :",topic_model_results[0])
print("Rounded Topic Model Result for 1st Article :",topic_model_results[0].round(2))
topic_model_results[0].argmax()

Topic Model Result Dimentions : (11992, 7)
Topic Model Result for 1st Article : [ 0.34319845  0.29033854  0.00449203 -0.1665609  -0.06057904  0.00305527
 -0.06938709]
Rounded Topic Model Result for 1st Article : [ 0.34  0.29  0.   -0.17 -0.06  0.   -0.07]


0

<p>This means that our model thinks that the first article belongs to topic #0.</p>

<h3>Combining with Original Data</h3>

In [59]:
topic_model_results.argmax(axis=1)

array([0, 0, 1, ..., 0, 0, 0])

In [60]:
npr_articles['Topic'] = topic_model_results.argmax(axis=1)
npr_articles['Words For Topic'] = npr_articles.apply(lambda row: topic_map[row.Topic], axis = 1)
npr_articles.head(10)

Unnamed: 0,Article,Topic,Words For Topic
0,"In the Washington of 2016, even when the polic...",0,"[women, house, think, campaign, state, health,..."
1,Donald Trump has used Twitter — his prefe...,0,"[women, house, think, campaign, state, health,..."
2,Donald Trump is unabashedly praising Russian...,1,"[comey, democrats, republicans, hillary, voter..."
3,"Updated at 2:50 p. m. ET, Russian President Vl...",0,"[women, house, think, campaign, state, health,..."
4,"From photography, illustration and video, to d...",0,"[women, house, think, campaign, state, health,..."
5,I did not want to join yoga class. I hated tho...,0,"[women, house, think, campaign, state, health,..."
6,With a who has publicly supported the debunk...,0,"[women, house, think, campaign, state, health,..."
7,"I was standing by the airport exit, debating w...",0,"[women, house, think, campaign, state, health,..."
8,"If movies were trying to be more realistic, pe...",0,"[women, house, think, campaign, state, health,..."
9,"Eighteen years ago, on New Year’s Eve, David F...",0,"[women, house, think, campaign, state, health,..."
