In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('files/article_items.csv',parse_dates=['date'])

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1344 entries, 0 to 1343
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Unnamed: 0    1344 non-null   int64 
 1   idx           1344 non-null   int64 
 2   date          1344 non-null   object
 3   category      1338 non-null   object
 4   authors       1344 non-null   object
 5   title         1344 non-null   object
 6   article_text  1344 non-null   object
dtypes: int64(2), object(5)
memory usage: 73.6+ KB


# NNMF WORKFLOW

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
tfidf = TfidfVectorizer(max_df = .80, min_df = 2, stop_words = 'english')

In [10]:
dtm = tfidf.fit_transform(df['article_text'])

### Construct vector space model for documents (after stopword filtering), resulting in a document-term matrix A 

In [11]:
from sklearn.decomposition import NMF

In [40]:
nmf = NMF(n_components=12, random_state=18)
nmf.fit(dtm)



NMF(n_components=12, random_state=18)

In [41]:
# Look at words/coefficients
len(tfidf.get_feature_names())
# holding an instance of every single word

# nfm.components_
# an array of coefficients per word per topic 

21866

### Interpret topics based off the coefficient values of the words per topic

In [42]:
# Grab 15 words with highest coefficients per topic
for i, topic in enumerate(nmf.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC {i}:')
    print([tfidf.get_feature_names()[index] for index in topic.argsort()[-15:]])
    print('\n')

THE TOP 15 WORDS FOR TOPIC 0:
['cases', 'dr', 'new', 'virus', 'medical', 'coronavirus', 'hospitals', 'pandemic', 'hospital', 'patients', 'care', '19', 'covid', 'says', 'health']


THE TOP 15 WORDS FOR TOPIC 1:
['office', 'republican', 'election', 'vote', 'donald', 'pence', 'raskin', 'managers', 'senate', 'house', 'capitol', 'trial', 'president', 'impeachment', 'trump']


THE TOP 15 WORDS FOR TOPIC 2:
['getty', 'riot', 'rioters', 'images', 'guard', 'law', 'enforcement', 'department', 'fbi', 'officer', 'security', 'jan', 'officers', 'police', 'capitol']


THE TOP 15 WORDS FOR TOPIC 3:
['says', 'shot', 'shots', 'dose', 'johnson', 'moderna', 'health', 'vaccinated', 'pfizer', '19', 'vaccination', 'covid', 'doses', 'vaccines', 'vaccine']


THE TOP 15 WORDS FOR TOPIC 4:
['tax', 'economy', 'package', 'white', 'infrastructure', 'congress', 'republicans', 'relief', 'plan', 'administration', 'senate', 'democrats', 'house', 'president', 'biden']


THE TOP 15 WORDS FOR TOPIC 5:
['security', 'chines

### Assign to Docs

In [62]:
topic_results = nmf.transform(dtm).round(3)
df['topic_num'] = topic_results.argmax(axis=1)

In [57]:
topic_results.shape

(1344, 12)

In [63]:
topic_results[0]

array([0.014, 0.   , 0.01 , 0.   , 0.   , 0.019, 0.005, 0.004, 0.001,
       0.031, 0.   , 0.   ])

In [44]:
df['category'] = df['category'].apply(lambda x: str(x).strip())

In [45]:
df[['title','category','topic_num']].head()

Unnamed: 0,title,category,topic_num
0,The Army Is Expanding Allowed Hairstyles For W...,National,9
1,"Once Nearly Extinct, The Florida Panther Is Ma...",Environment,5
2,What The Rise Of Amazon Has To Do With The Ris...,Politics,11
3,The Youth Of Cuba's Tiny Jewish Minority,Politics,9
4,Melting Snow Usually Means Water For The West....,Environment,5


In [46]:
df.groupby(['category'])['topic_num'].apply(lambda grp: list(grp.value_counts().index)).to_dict()

{'50 Years Of NPR': [9],
 'Africa': [0, 9],
 'America Reckons With Racial Injustice': [7, 2, 9, 5, 4],
 'Analysis': [4, 1, 2],
 'Animals': [9, 5, 0],
 'Architecture': [0, 5],
 'Arts & Life': [9],
 'Asia': [5, 2],
 'Author Interviews': [9, 4, 11],
 'Biden Transition Updates': [4, 9, 6, 1, 5],
 'Bill Of The Month': [0],
 'Book Reviews': [0],
 'Books': [9, 1],
 'Business': [11, 5, 9, 4, 0, 2, 10],
 'Capitol Insurrection Updates': [2],
 'Code Switch': [0],
 'Congress Weighs Action Against Trump: Live Updates': [1, 2],
 "Congress' Electoral College Tally: Live Updates": [1, 2],
 'Consider This from NPR': [9, 3],
 'Coronavirus By The Numbers': [0, 3],
 'Coronavirus Guide': [0, 9],
 'Coronavirus Live Updates': [9, 0, 11],
 'Coronavirus Updates': [0, 3, 9, 1, 2, 4, 5, 11],
 'Coronavirus, Illustrated': [0, 3],
 'Criminal Justice Collaborative': [2, 7, 0, 4],
 'Culture': [5],
 'Development': [5],
 'Discipline And Women In Prison': [4],
 'Economics': [0],
 'Economy': [4, 0, 5, 9],
 "Editors' Pick

### NNMF Topic as Corresponding to Assigned Category

In [47]:
df.groupby(['topic_num'])['category'].apply(lambda grp: list(grp.value_counts().index)).to_dict()

{0: ['The Coronavirus Crisis',
  'National',
  'Goats and Soda',
  'Treatments',
  'Shots - Health News',
  'Health',
  'Coronavirus Updates',
  'Investigations',
  'Public Health',
  'Policy-ish',
  'Business',
  'World',
  'Bill Of The Month',
  'Coronavirus Live Updates',
  'Politics',
  'Your Health',
  'Environment',
  'Health Inc.',
  'Health Care',
  'nan',
  'Economy',
  'Coronavirus, Illustrated',
  'Youth',
  'Economics',
  'Middle East',
  'Global Health',
  'Animals',
  'Book Reviews',
  'Coronavirus By The Numbers',
  'Criminal Justice Collaborative',
  'Take A Number',
  'Architecture',
  'Coronavirus Guide',
  'Africa',
  'National Security',
  'Code Switch',
  'Law'],
 1: ['Politics',
  'Trump Impeachment Trial: Live Updates',
  'Congress Weighs Action Against Trump: Live Updates',
  'National',
  'Media',
  'Insurrection At The Capitol: Live Updates',
  'Elections',
  'Law',
  'Analysis',
  'Biden Transition Updates',
  'House Impeachment Vote: Live Updates',
  'Corona

In [48]:
# map the topic numbers to words/topic descriptors
# nterpret topics based off the coefficent values of the words per topic

# Readability

In [49]:
import textstat

In [50]:
df['f_r_e'] = df['article_text'].apply(lambda x: textstat.flesch_reading_ease(x))

In [51]:
df['ari_grade_level'] = df['article_text'].apply(lambda x: textstat.automated_readability_index(x))

In [52]:
df[['title','article_text','topic_num','f_r_e','ari_grade_level']]

Unnamed: 0,title,article_text,topic_num,f_r_e,ari_grade_level
0,The Army Is Expanding Allowed Hairstyles For W...,Enlarge this image An Army photo shows a soldi...,9,69.01,11.2
1,"Once Nearly Extinct, The Florida Panther Is Ma...",Enlarge this image A male panther leaps over a...,5,66.27,12.3
2,What The Rise Of Amazon Has To Do With The Ris...,Enlarge this image Members of the Workers Asse...,11,63.43,12.9
3,The Youth Of Cuba's Tiny Jewish Minority,"Enlarge this image Adri Quiñones, center, lead...",9,58.82,13.2
4,Melting Snow Usually Means Water For The West....,Enlarge this image Colorado Snow Survey superv...,5,51.82,16.3
...,...,...,...,...,...
1339,President-Elect Biden Names Core Members Of Hi...,Enlarge this image President-elect Joe Biden h...,4,49.45,14.3
1340,Moderna's COVID-19 Vaccine Candidate Gets More...,Enlarge this image Moderna protocol files for ...,3,46.91,14.5
1341,"As Hospitals Fill With COVID-19 Patients, Medi...",Enlarge this image Tennessee Gov. Bill Lee (ri...,0,57.71,14.0
1342,"IRS Says Its Own Error Sent $1,200 Stimulus Ch...",Enlarge this image An error at the IRS caused ...,4,51.82,16.0


In [54]:
df.groupby(['topic_num'])['f_r_e'].mean()

topic_num
0     54.303269
1     51.942933
2     50.501759
3     51.028760
4     48.928742
5     50.338850
6     49.140392
7     55.394375
8     49.300435
9     58.325023
10    45.505938
11    49.446667
Name: f_r_e, dtype: float64