In [40]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
nlp = spacy.load('en_core_web_sm')

In [41]:
df = pd.read_csv('files/article_items.csv',parse_dates=['date'])

In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1344 entries, 0 to 1343
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Unnamed: 0    1344 non-null   int64 
 1   idx           1344 non-null   int64 
 2   date          1344 non-null   object
 3   category      1338 non-null   object
 4   authors       1344 non-null   object
 5   title         1344 non-null   object
 6   article_text  1344 non-null   object
dtypes: int64(2), object(5)
memory usage: 73.6+ KB


In [43]:
df['category'].nunique()

132

In [44]:
df['category'].unique()

array(['National', 'Environment', 'Politics', 'Environment ',
       'Music News', 'Middle East', 'World ', 'Education', 'Technology',
       'The Coronavirus Crisis', 'Goats and Soda ', 'Energy ', 'World',
       'Animals', 'Shots - Health News ', 'Infectious Disease',
       'Business', 'Criminal Justice Collaborative ', 'Your Health',
       'The Coronavirus Crisis ', 'Investigations',
       'America Reckons With Racial Injustice ',
       'Untangling Disinformation ', 'Law', 'Student Podcast Challenge ',
       'Public Health', 'National ', 'Race', 'Coronavirus Live Updates ',
       'Analysis', 'Technology ', 'Photography', 'Health',
       'Planet Money ', 'National Security', 'Treatments', 'Science ',
       'Law ', 'Elections', 'Coronavirus Updates',
       'America Reckons With Racial Injustice', 'Obituaries', 'Family',
       'Coronavirus Updates ', 'Politics ', 'Author Interviews', 'Sports',
       'Space', 'History', 'Science', 'Architecture', '50 Years Of NPR ',
       'E

In [46]:
df['article_text'][0]

'Enlarge this image An Army photo shows a soldier wearing a new approved ponytail hairstyle. U.S. Army hide caption toggle caption U.S. Army An Army photo shows a soldier wearing a new approved ponytail hairstyle. U.S. Army The Army is now allowing female soldiers to wear their hair in ponytails in all uniforms, in a change announced earlier this month. It expands on hair guidelines announced in January. For years, many women in the Army were required to keep their hair in a tight bun. The newest changes mean women can keep their hair either a bun, single ponytail, two braids or a single braid; locks, braids, twists or cornrows can come together in one or two braids or a ponytail; and braids or a ponytail can go as far down as the bottom of the shoulder blades. There are exceptions on the length of the ponytail or braid for women doing tactical or physical training, the Army says. In other changes this year, hair highlights are now allowed in natural colors, lipstick and nail polish al

# LDA WORKFLOW

In [49]:
from sklearn.feature_extraction.text import CountVectorizer

In [128]:
cv = CountVectorizer(max_df = .8, min_df = 2, stop_words = 'english')

In [129]:
dtm = cv.fit_transform(df['article_text'])

### LDA to learn topic representation:

In [52]:
from sklearn.decomposition import LatentDirichletAllocation

In [139]:
LDA = LatentDirichletAllocation(n_components=15,random_state=18)
LDA.fit(dtm)

LatentDirichletAllocation(n_components=15, random_state=18)

In [91]:
len(cv.get_feature_names())
# holding an instance of every single word

21868

In [None]:
# LDA.components_
# an array of probabilities per word per topic 

In [141]:
topic_one = LDA.components_[0]
topic_one.argsort()[-10:]
# sorts the values, and returns the index values of the sorted list
# aka show location of high probability 
# use index to get the word corresponding with that probability
top_twenty_words = topic_one.argsort()[-20:]
for index in top_twenty_words:
    print(cv.get_feature_names()[index])

disease
just
dr
news
virus
getty
images
vaccines
new
hospital
coronavirus
medical
patients
pandemic
care
vaccine
19
covid
says
health


### Interpret topics and assign to docs

In [142]:
# Grab top 15 words per topic
for i, topic in enumerate(LDA.components_,1):
    print(f'THE TOP 15 WORDS FOR TOPIC {i}:')
    print([cv.get_feature_names()[index] for index in topic.argsort()[-15:]])
    print('\n')

THE TOP 15 WORDS FOR TOPIC 1:
['getty', 'images', 'vaccines', 'new', 'hospital', 'coronavirus', 'medical', 'patients', 'pandemic', 'care', 'vaccine', '19', 'covid', 'says', 'health']


THE TOP 15 WORDS FOR TOPIC 2:
['joe', 'officials', 'residents', 'palestinians', 'water', 'palestinian', 'st', 'killed', 'ap', 'city', 'says', 'gaza', 'israeli', 'npr', 'israel']


THE TOP 15 WORDS FOR TOPIC 3:
['national', 'years', 'korea', 'kim', 'state', 'church', 'north', 'intelligence', 'president', 'npr', 'prison', 'federal', 'death', 'biden', 'saudi']


THE TOP 15 WORDS FOR TOPIC 4:
['american', 'black', 'case', 'new', 'rights', 'atlanta', 'anti', 'right', 'women', 'hate', 'court', 'law', 'state', 'says', 'asian']


THE TOP 15 WORDS FOR TOPIC 5:
['justice', 'law', 'force', 'trial', 'black', 'year', 'city', 'attorney', 'chauvin', 'death', 'department', 'officer', 'floyd', 'officers', 'police']


THE TOP 15 WORDS FOR TOPIC 6:
['company', 'years', 'black', 'npr', 'twitter', 'new', 'time', 'think', 'me

In [146]:
# Assign to docs
topic_results = LDA.transform(dtm).round(2)
df['topic_num'] = topic_results.argmax(axis=1)

In [147]:
df[['title','category','topic_num']]

Unnamed: 0,title,category,topic_num
0,The Army Is Expanding Allowed Hairstyles For W...,National,0
1,"Once Nearly Extinct, The Florida Panther Is Ma...",Environment,12
2,What The Rise Of Amazon Has To Do With The Ris...,Politics,11
3,The Youth Of Cuba's Tiny Jewish Minority,Politics,0
4,Melting Snow Usually Means Water For The West....,Environment,5
...,...,...,...
1339,President-Elect Biden Names Core Members Of Hi...,Biden Transition Updates,8
1340,Moderna's COVID-19 Vaccine Candidate Gets More...,The Coronavirus Crisis,10
1341,"As Hospitals Fill With COVID-19 Patients, Medi...",Coronavirus By The Numbers,0
1342,"IRS Says Its Own Error Sent $1,200 Stimulus Ch...",Investigations,8


In [None]:
# map the topic numbers to words

In [148]:
df['category'] = df['category'].apply(lambda x: str(x).strip())

In [137]:
df.groupby(['category'])['topic_num'].apply(lambda grp: list(grp.value_counts().index)).to_dict()

{'50 Years Of NPR': [3],
 'Africa': [0, 4],
 'America Reckons With Racial Injustice': [4, 2, 0, 3, 1],
 'Analysis': [1, 2],
 'Animals': [0, 3, 4],
 'Architecture': [3],
 'Arts & Life': [3],
 'Asia': [3, 4, 1, 2],
 'Author Interviews': [4, 0, 2],
 'Biden Transition Updates': [2, 0, 1],
 'Bill Of The Month': [0],
 'Book Reviews': [0],
 'Books': [2, 4],
 'Business': [3, 0, 2, 4],
 'Capitol Insurrection Updates': [1],
 'Code Switch': [0],
 'Congress Weighs Action Against Trump: Live Updates': [1],
 "Congress' Electoral College Tally: Live Updates": [1],
 'Consider This from NPR': [0, 3],
 'Coronavirus By The Numbers': [0],
 'Coronavirus Guide': [0],
 'Coronavirus Live Updates': [0, 3, 2, 4],
 'Coronavirus Updates': [0, 2, 3, 4],
 'Coronavirus, Illustrated': [0],
 'Criminal Justice Collaborative': [4, 0, 1],
 'Culture': [3],
 'Development': [0],
 'Discipline And Women In Prison': [2],
 'Economics': [0],
 'Economy': [0, 2, 3],
 "Editors' Picks": [2, 3],
 'Education': [0, 1, 4],
 'Elections':

In [138]:
df.groupby(['topic_num'])['category'].apply(lambda grp: list(grp.value_counts().index)).to_dict()

{0: ['The Coronavirus Crisis',
  'National',
  'Shots - Health News',
  'Goats and Soda',
  'Coronavirus Updates',
  'Public Health',
  'Politics',
  'Treatments',
  'Education',
  'Investigations',
  'Your Health',
  'Health',
  'Infectious Disease',
  'Economy',
  'Business',
  'Technology',
  'Policy-ish',
  'nan',
  'World',
  'Coronavirus Live Updates',
  'Environment',
  'Animals',
  'Bill Of The Month',
  'America Reckons With Racial Injustice',
  'Coronavirus, Illustrated',
  'Biden Transition Updates',
  'Science',
  'Obituaries',
  'Untangling Disinformation',
  'Author Interviews',
  'Life Kit',
  'Criminal Justice Collaborative',
  'Family',
  'Media',
  'Law',
  'Middle East',
  'Coronavirus Guide',
  'Health Care',
  'On Disabilities',
  'Health Inc.',
  'Coronavirus By The Numbers',
  'President Biden Takes Office',
  'Global Health',
  'How I Built This with Guy Raz',
  'Women & Girls',
  'National Security',
  'Consider This from NPR',
  'Economics',
  'Weather',
  'Ra

# Readability

In [149]:
import pickle
with open('../project-checkpoints/mvp_readability_df.pkl', 'wb') as f:
    pickle.dump(df[['title','article_text','topic_num']], f)

In [150]:
import textstat

In [151]:
df['f_r_e'] = df['article_text'].apply(lambda x: textstat.flesch_reading_ease(x))

In [152]:
df['ari_grade_level'] = df['article_text'].apply(lambda x: textstat.automated_readability_index(x))

In [153]:
df[['title','article_text','topic_num','f_r_e','ari_grade_level']]

Unnamed: 0,title,article_text,topic_num,f_r_e,ari_grade_level
0,The Army Is Expanding Allowed Hairstyles For W...,Enlarge this image An Army photo shows a soldi...,0,69.01,11.2
1,"Once Nearly Extinct, The Florida Panther Is Ma...",Enlarge this image A male panther leaps over a...,12,66.27,12.3
2,What The Rise Of Amazon Has To Do With The Ris...,Enlarge this image Members of the Workers Asse...,11,63.43,12.9
3,The Youth Of Cuba's Tiny Jewish Minority,"Enlarge this image Adri Quiñones, center, lead...",0,58.82,13.2
4,Melting Snow Usually Means Water For The West....,Enlarge this image Colorado Snow Survey superv...,5,51.82,16.3
...,...,...,...,...,...
1339,President-Elect Biden Names Core Members Of Hi...,Enlarge this image President-elect Joe Biden h...,8,49.45,14.3
1340,Moderna's COVID-19 Vaccine Candidate Gets More...,Enlarge this image Moderna protocol files for ...,10,46.91,14.5
1341,"As Hospitals Fill With COVID-19 Patients, Medi...",Enlarge this image Tennessee Gov. Bill Lee (ri...,0,57.71,14.0
1342,"IRS Says Its Own Error Sent $1,200 Stimulus Ch...",Enlarge this image An error at the IRS caused ...,8,51.82,16.0


In [154]:
df.groupby(['topic_num'])['f_r_e'].mean()

topic_num
0     53.422892
1     55.434242
2     47.983636
3     51.156604
4     53.066667
5     54.500874
6     45.724348
7     53.188485
8     47.892867
9     49.849368
10    50.893500
11    57.837784
12    52.806667
13    50.574412
14    54.221892
Name: f_r_e, dtype: float64

In [155]:
df['ari_grade_level'].describe()

count    1344.000000
mean       15.506845
std         2.981374
min         5.800000
25%        13.600000
50%        15.400000
75%        17.300000
max        42.100000
Name: ari_grade_level, dtype: float64

In [156]:
df['f_r_e'].describe()

count    1344.000000
mean       52.190796
std         9.518520
min        -4.250000
25%        45.900000
50%        52.430000
75%        58.020000
max        85.180000
Name: f_r_e, dtype: float64