In [1]:
import pandas as pd

In [2]:
# Load the scraped articles, asking pandas to parse the `date` column.
# NOTE(review): df.info() reports `date` as dtype object, so parse_dates did not
# produce datetime64 values here (possibly mixed formats/timezones) — confirm.
# NOTE(review): the `Unnamed: 0` column looks like a saved index; index_col=0
# would presumably absorb it — verify against how the CSV was written.
df = pd.read_csv('files/article_items.csv',parse_dates=['date'])

In [3]:
# Summarise column dtypes and non-null counts for the loaded articles.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1344 entries, 0 to 1343
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Unnamed: 0    1344 non-null   int64 
 1   idx           1344 non-null   int64 
 2   date          1344 non-null   object
 3   category      1338 non-null   object
 4   authors       1344 non-null   object
 5   title         1344 non-null   object
 6   article_text  1344 non-null   object
dtypes: int64(2), object(5)
memory usage: 73.6+ KB


# NNMF WORKFLOW

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
# TF-IDF vectoriser used to build the document-term matrix:
#   stop_words='english' -> drop scikit-learn's built-in English stop words
#   min_df=2             -> ignore terms found in fewer than 2 documents
#   max_df=0.80          -> ignore terms found in more than 80% of documents
tfidf = TfidfVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.80,
)

In [10]:
# Learn the vocabulary from the article bodies and build the TF-IDF
# document-term matrix (one row per article).
dtm = tfidf.fit_transform(df['article_text'])

### Construct vector space model for documents (after stopword filtering), resulting in a document-term matrix

In [11]:
from sklearn.decomposition import NMF

In [None]:
# Factorise the document-term matrix into 12 topics (components).
# random_state is pinned so the factorisation is reproducible across reruns.
nmf = NMF(n_components=12, random_state=18)
nmf.fit(dtm)

In [41]:
# Vocabulary size: the vectoriser keeps one entry per distinct term it retained.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement (requires scikit-learn >= 1.0).
len(tfidf.get_feature_names_out())

# nmf.components_
# an array of coefficients per word per topic

21866

### Interpret topics based on the coefficient values of the words per topic

In [42]:
# Print the 15 highest-coefficient terms for every topic.
# The vocabulary lookup is done once, outside the loop: the original rebuilt the
# full feature-name list for every single index. get_feature_names_out() replaces
# get_feature_names(), which was removed in scikit-learn 1.2.
feature_names = tfidf.get_feature_names_out()
for i, topic in enumerate(nmf.components_):
    # argsort is ascending, so the last 15 positions are the largest
    # coefficients, listed from weakest to strongest.
    top_term_indices = topic.argsort()[-15:]
    print(f'THE TOP 15 WORDS FOR TOPIC {i}:')
    # .tolist() keeps the printed form a plain Python list of strings.
    print(feature_names[top_term_indices].tolist())
    print('\n')

THE TOP 15 WORDS FOR TOPIC 0:
['cases', 'dr', 'new', 'virus', 'medical', 'coronavirus', 'hospitals', 'pandemic', 'hospital', 'patients', 'care', '19', 'covid', 'says', 'health']


THE TOP 15 WORDS FOR TOPIC 1:
['office', 'republican', 'election', 'vote', 'donald', 'pence', 'raskin', 'managers', 'senate', 'house', 'capitol', 'trial', 'president', 'impeachment', 'trump']


THE TOP 15 WORDS FOR TOPIC 2:
['getty', 'riot', 'rioters', 'images', 'guard', 'law', 'enforcement', 'department', 'fbi', 'officer', 'security', 'jan', 'officers', 'police', 'capitol']


THE TOP 15 WORDS FOR TOPIC 3:
['says', 'shot', 'shots', 'dose', 'johnson', 'moderna', 'health', 'vaccinated', 'pfizer', '19', 'vaccination', 'covid', 'doses', 'vaccines', 'vaccine']


THE TOP 15 WORDS FOR TOPIC 4:
['tax', 'economy', 'package', 'white', 'infrastructure', 'congress', 'republicans', 'relief', 'plan', 'administration', 'senate', 'democrats', 'house', 'president', 'biden']


THE TOP 15 WORDS FOR TOPIC 5:
['security', 'chines

### Assign to Docs

In [62]:
# Project every document onto the fitted topics (one row per document,
# one column per topic).
topic_weights = nmf.transform(dtm)

# Pick the dominant topic from the *unrounded* weights. Rounding first can turn
# two close weights into a tie, and argmax resolves ties to the lowest index.
# NOTE(review): a document whose weights are all zero still lands on topic 0.
df['topic_num'] = topic_weights.argmax(axis=1)

# Rounded copy kept only for readable display in the cells that follow.
topic_results = topic_weights.round(3)

In [57]:
# Sanity check: expect (number of documents, number of NMF components).
topic_results.shape

(1344, 12)

In [63]:
# Topic weights of the first document (values were rounded to 3 decimals
# when topic_results was built).
topic_results[0]

array([0.014, 0.   , 0.01 , 0.   , 0.   , 0.019, 0.005, 0.004, 0.001,
       0.031, 0.   , 0.   ])

In [44]:
# Trim stray whitespace around category labels.
# df.info() shows only 1338 of 1344 categories are non-null; the previous
# str(x).strip() turned those missing values into the literal string 'nan'.
# The vectorised .str accessor leaves missing values as NaN.
df['category'] = df['category'].str.strip()

In [45]:
# Preview the first rows: title, editorial category and assigned topic.
preview_columns = ['title', 'category', 'topic_num']
df[preview_columns].head()

Unnamed: 0,title,category,topic_num
0,The Army Is Expanding Allowed Hairstyles For W...,National,9
1,"Once Nearly Extinct, The Florida Panther Is Ma...",Environment,5
2,What The Rise Of Amazon Has To Do With The Ris...,Politics,11
3,The Youth Of Cuba's Tiny Jewish Minority,Politics,9
4,Melting Snow Usually Means Water For The West....,Environment,5


### Article Titles Grouped by Assigned NMF Topic

In [None]:
# For each assigned topic, collect its distinct article titles, ordered by how
# often each title occurs. This lists every title, so the output can be long.
titles_per_topic = df.groupby(['topic_num'])['title']
titles_per_topic.apply(
    lambda titles: list(titles.value_counts().index)
).to_dict()

In [48]:
# map the topic numbers to words/topic descriptors
# interpret topics based on the coefficient values of the words per topic