In [641]:
import pandas as pd

import pickle

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

import nltk
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

import textstat

# Fetch the NLTK resources this notebook uses; packages already present are reported as up-to-date.
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
# stopwords.words('english') is read further down and requires this corpus.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/lescardone/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/lescardone/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/lescardone/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [642]:
# Load the article table from a path relative to the notebook's working directory.
df = pd.read_csv('files/article_items.csv')

In [643]:
# Remove the two columns that are not used in the analysis (presumably leftover index columns).
df = df.drop(columns=['Unnamed: 0', 'idx'])

In [644]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1344 entries, 0 to 1343
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   date          1344 non-null   object
 1   category      1338 non-null   object
 2   authors       1344 non-null   object
 3   title         1344 non-null   object
 4   article_text  1344 non-null   object
dtypes: object(5)
memory usage: 52.6+ KB


### Consolidate Category Labels

In [645]:
df['category'].nunique()

132

In [646]:
# Turn every label into a whitespace-trimmed string; a missing category becomes the text 'nan'.
df['category'] = df['category'].map(lambda label: str(label).strip())

In [647]:
df['category'].nunique()

106

In [648]:
# Lookup from raw category label -> consolidated category.
# Declared as {consolidated: [raw labels]} and inverted, so each group reads as one unit.
# The raw label 'nan' is the string form of a missing category.
d = {
    raw_label: consolidated
    for consolidated, raw_labels in {
        'Books': ['Books', 'Book Reviews', 'Author Interviews'],
        'Covid': ['Coronavirus By The Numbers', 'Coronavirus Guide',
                  'Coronavirus Live Updates', 'Coronavirus Updates',
                  'Coronavirus, Illustrated', 'The Coronavirus Crisis'],
        'Health': ['Global Health', 'Health Care', 'Health Inc.',
                   'Shots - Health News', 'Your Health'],
        'Econ': ['Economics', 'Economy', 'Planet Money', 'Your Money'],
        'Environment': ['Environment And Energy Collaborative'],
        'Government': ['Biden Transition Updates',
                       'Congress Weighs Action Against Trump: Live Updates',
                       "Congress' Electoral College Tally: Live Updates",
                       'Elections', 'President Biden Takes Office'],
        'Law': ["Live Updates: Trial Over George Floyd's Killing",
                'Trump Impeachment Trial: Live Updates'],
        'Other': ['50 Years Of NPR', 'Fall 2020 College Road Trip',
                  'How I Built This with Guy Raz', 'nan'],
        'Movies': ['Movie Interviews'],
        'Music': ['Music News'],
    }.items()
    for raw_label in raw_labels
}

In [649]:
# Collapse raw labels through the lookup `d`; labels with no entry in `d` are left unchanged.
df['category'] = df['category'].replace(d)

In [650]:
df['category'].nunique()

79

### Clean Article Text

#### Get rid of some artifacts from webscrape

In [651]:
def _strip_scrape_artifacts(text):
    """Remove the image-caption boilerplate phrases that the web scrape left in an article body."""
    # Applied in this exact order: the full Getty caption phrase goes first,
    # then the shorter fragments that may remain on their own.
    for artifact in ('Enlarge this image ',
                     ' via Getty Images hide caption toggle caption',
                     'via Getty Images ',
                     'hide caption toggle caption '):
        text = text.replace(artifact, '')
    return text

df['article_text'] = df['article_text'].apply(_strip_scrape_artifacts)

In [652]:
df['article_text']

0       An Army photo shows a soldier wearing a new ap...
1       A male panther leaps over a creek at Florida P...
2       Members of the Workers Assembly Against Racism...
3       Adri Quiñones, center, leads the prayer during...
4       Colorado Snow Survey supervisor Brian Domonkos...
                              ...                        
1339    President-elect Joe Biden has nominated labor ...
1340    Moderna protocol files for COVID-19 vaccinatio...
1341    Tennessee Gov. Bill Lee (right) tours a tempor...
1342    An error at the IRS caused thousands of non-Am...
1343    After a highly fraught but largely interferenc...
Name: article_text, Length: 1344, dtype: object

#### Remove repeat sentences/get sentence count per article

In [653]:
import nltk.data

In [654]:
# Load the English Punkt sentence tokenizer and split every (stripped) article into sentences.
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
df['sentence_toks'] = df['article_text'].map(lambda text: sent_detector.tokenize(text.strip()))

In [655]:
def remove_dup_sentences(sentence_toks):
    """Return the sentences with repeats removed, keeping each first occurrence in order.

    Parameters
    ----------
    sentence_toks : iterable of str
        Sentences of one article, in reading order.

    Returns
    -------
    list of str
        A new list holding each distinct sentence once, in order of first
        appearance. The input is not modified.
    """
    # dict keys keep insertion order, so they are exactly the first occurrences
    # in sequence; hashing also avoids rescanning a growing list per sentence.
    return list(dict.fromkeys(sentence_toks))

In [656]:
# Deduplicate the sentence list of every article.
df['sentence_toks'] = df['sentence_toks'].apply(remove_dup_sentences)

In [657]:
df['sentence_toks']

0       [An Army photo shows a soldier wearing a new a...
1       [A male panther leaps over a creek at Florida ...
2       [Members of the Workers Assembly Against Racis...
3       [Adri Quiñones, center, leads the prayer durin...
4       [Colorado Snow Survey supervisor Brian Domonko...
                              ...                        
1339    [President-elect Joe Biden has nominated labor...
1340    [Moderna protocol files for COVID-19 vaccinati...
1341    [Tennessee Gov., Bill Lee (right) tours a temp...
1342    [An error at the IRS caused thousands of non-A...
1343    [After a highly fraught but largely interferen...
Name: sentence_toks, Length: 1344, dtype: object

In [658]:
# Number of distinct sentences kept for each article.
df['num_sentences'] = df['sentence_toks'].apply(len)

In [659]:
# Rebuild each article's text from its deduplicated sentences, separated by single spaces.
df['article_text_set'] = df['sentence_toks'].apply(' '.join)

In [660]:
# Count of distinct tokens per article (case-sensitive; punctuation tokens included).
df['unique_words'] = df['article_text_set'].map(lambda text: len(set(nltk.word_tokenize(text))))

In [661]:
# Total token count per article, as produced by NLTK's word tokenizer.
df['word_count'] = df['article_text_set'].map(lambda text: len(nltk.word_tokenize(text)))

In [662]:
df['word_count'].mean()

1431.4650297619048

In [663]:
df['num_sentences'].mean()

59.023065476190474

#### Look at word frequency over entire corpus to modify stop_words list

In [664]:
from sklearn.feature_extraction.text import CountVectorizer

In [665]:
# Start from NLTK's English stop words and add corpus-specific filler terms
# (reporting verbs, numerals, photo-credit and embed residue, ...).
stop_words = stopwords.words('english')
stop_words.extend(['says','say','npr','just','don','19','going','years','day','images',
                   'ap','getty','know','000','did','week','according','ve','told','need',
                   'really','wednesday','seen','would','new','like','get','many','time',
                   'could','first','last','even','still','back','may','see','well','go',
                   'want','got','jan','already','likely','10','put','two','make','saw',
                   'co','made','lot','use','100','20','came','six','12','embed','embedded','es',
                   '529'])

In [666]:
cv = CountVectorizer(
    max_df=.8,              # ignore terms found in more than 80% of the articles
    min_df=3,               # ignore terms found in fewer than 3 articles
    stop_words=stop_words,
)

In [667]:
# Learn the vocabulary from the deduplicated article text and build the document-term count matrix.
cv_dtm = cv.fit_transform(df['article_text_set'])

In [668]:
# Corpus-wide occurrence total for every vocabulary term, least frequent first.
word_sums = cv_dtm.sum(axis=0)
words_freq = sorted(
    ((term, word_sums[0, col]) for term, col in cv.vocabulary_.items()),
    key=lambda pair: pair[1],
)

#### TF-IDF - Did not end up using

In [367]:
# from sklearn.feature_extraction.text import TfidfVectorizer

In [386]:
# tfidf = TfidfVectorizer(max_df = .80, min_df = 10, stop_words = stop_words)

In [387]:
# tfidf_dtm = tfidf.fit_transform(df['article_text_set'])

In [402]:
# LDA_tfidf = LatentDirichletAllocation(n_components=6,random_state=18)
# LDA_tfidf.fit(tfidf_dtm)

In [400]:
# Grab top 15 words per topic
# for i, topic in enumerate(LDA_tfidf.components_,1):
#    print(f'THE TOP 15 WORDS FOR TOPIC {i}:')
#    print([cv.get_feature_names()[index] for index in topic.argsort()[-15:]])
#    print('\n')

# LDA WORKFLOW

### LDA to learn topic representation

In [669]:
from sklearn.decomposition import LatentDirichletAllocation

In [670]:
# Fit an 18-topic LDA model on the document-term counts; the fixed random_state makes re-runs repeatable.
LDA_cv = LatentDirichletAllocation(n_components=18,random_state=18)
LDA_cv.fit(cv_dtm)

LatentDirichletAllocation(n_components=18, random_state=18)

In [392]:
# Vocabulary size: one entry for every distinct term the vectorizer kept.
# vocabulary_ maps term -> column index, so its length equals the feature count and
# works on every scikit-learn version (get_feature_names() was removed in 1.2).
len(cv.vocabulary_)

16872

### Interpret topics and assign to docs

#### COUNT VECTORIZE

In [401]:
# topic_one = LDA_cv.components_[0]
# topic_one.argsort()[-10:]
# sorts the values, and returns the index values of the sorted list
# aka show location of high probability 
# use index to get the word corresponding with that probability
# top_twenty_words = topic_one.argsort()[-20:]
# for index in top_twenty_words:
#    print(cv.get_feature_names()[index])

In [671]:
# Print the highest-weighted words of every topic (ascending, so the strongest word is last).
n_top_words = 20  # single source of truth for both the slice and the printed label

# Build the index -> term list once instead of once per printed word.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when present.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = [str(name) for name in cv.get_feature_names_out()]
else:
    feature_names = cv.get_feature_names()

for i, topic in enumerate(LDA_cv.components_):
    print(f'THE TOP {n_top_words} WORDS FOR TOPIC {i}:')
    print([feature_names[index] for index in topic.argsort()[-n_top_words:]])
    print('\n')

THE TOP 15 WORDS FOR TOPIC 0:
['free', 'voa', 'nelson', 'media', 'voice', 'medical', 'team', 'usagm', 'athletes', 'court', 'ncaa', 'women', 'trial', 'radio', 'former', 'sports', 'agency', 'chauvin', 'pack', 'floyd']


THE TOP 15 WORDS FOR TOPIC 1:
['data', 'year', 'employees', 'staff', 'home', 'work', 'states', 'public', 'guidance', 'covid', 'program', 'money', 'care', 'federal', 'coronavirus', 'state', 'cdc', 'pandemic', 'workers', 'health']


THE TOP 15 WORDS FOR TOPIC 2:
['home', 'work', 'social', 'think', 'learning', 'teachers', 'parents', 'high', 'student', 'education', 'person', 'children', 'kids', 'university', 'college', 'year', 'schools', 'pandemic', 'students', 'school']


THE TOP 15 WORDS FOR TOPIC 3:
['war', 'racist', 'crimes', 'church', 'germany', 'white', 'americans', 'andrew', 'austin', 'year', 'community', 'family', 'black', 'right', 'group', 'violence', 'atlanta', 'hate', 'american', 'asian']


THE TOP 15 WORDS FOR TOPIC 4:
['users', 'information', 'platform', 'misinfo

In [672]:
# Assign each article its dominant topic.
# argmax runs on the unrounded distribution: rounding first can turn near-equal
# probabilities into ties, which argmax then resolves by lowest topic index.
topic_dist = LDA_cv.transform(cv_dtm)
topic_results = topic_dist.round(2)  # 2-decimal copy kept for the reported probabilities
df['topic_num'] = topic_dist.argmax(axis=1)

In [673]:
# Store how strongly each article belongs to its dominant topic (row-wise maximum probability).
topic_prob_df = pd.DataFrame(topic_results)
topic_prob = topic_prob_df.max(axis='columns')
df['topic_probability'] = topic_prob

In [674]:
# Per topic, count the articles whose dominant-topic probability exceeds 0.95.
# (The earlier bare column selection was dropped: it was not the cell's last
# expression, so its result was computed and never displayed.)
df.loc[df['topic_probability'] > .95, 'topic_num'].value_counts()

5     32
7     21
2     11
12    11
15    10
10    10
16     6
9      4
11     4
3      3
6      3
8      3
1      2
13     2
14     2
0      1
4      1
17     1
Name: topic_num, dtype: int64

In [None]:
df['topic_num'].value_counts().sort_values()

In [676]:
# Human-readable name for every LDA topic number; list position == topic number.
# 'Zero' and 'Seventeen' look like placeholders for topics without an interpretation (TODO confirm).
topic_map = dict(enumerate([
    'Zero',
    'Paycheck Protection Program',
    'Learning & Education During The COVID Pandemic',
    'Racism & Hate Crimes',
    'Media & Misinformation',
    'Political Decision Making',
    'Science & Medicine',
    'COVID Vaccines',
    'Global News & International Relations',
    'Immigration & Foreign Policy',
    'Investigations & Trial Coverage',
    'Drugs/Opiods',
    'COVID Relief Bills',
    'Music & Culture',
    'Earth/Natural Resources',
    'Texas Power Outtage & Census Case',
    'Cultural Phenomena',
    'Seventeen',
]))

In [677]:
# Translate topic numbers into their names via topic_map; a number missing from the map would stay as-is.
df['topic_names'] = df['topic_num'].replace(topic_map)

In [678]:
df['topic_names'].value_counts()

Political Decision Making                         171
COVID Vaccines                                    169
COVID Relief Bills                                120
Science & Medicine                                112
Investigations & Trial Coverage                   103
Immigration & Foreign Policy                       90
Cultural Phenomena                                 73
Global News & International Relations              71
Learning & Education During The COVID Pandemic     69
Media & Misinformation                             55
Texas Power Outtage & Census Case                  52
Seventeen                                          48
Paycheck Protection Program                        44
Earth/Natural Resources                            43
Racism & Hate Crimes                               39
Drugs/Opiods                                       37
Zero                                               36
Music & Culture                                    12
Name: topic_names, dtype: in

In [679]:
# Independent copy of the articles whose dominant-topic probability is above 0.80.
df_most_representative = df.loc[df['topic_probability'] > .80].copy()

In [635]:
## Look at the article titles that are most representative of a given topic
# df_most_representative.groupby(['topic_num'])['title'].apply(lambda grp: list(grp.value_counts().index)).to_dict()

In [680]:
# Persist the topic-annotated frame as a pickle named 'lda_df' in the working directory.
with open('lda_df', 'wb') as pickle_file:
    pickle.dump(df, pickle_file)

# Readability

In [681]:
import textstat

In [682]:
# Flesch reading-ease score of each article's cleaned text (sentence repeats not removed).
df['flesch_reading'] = df['article_text'].apply(textstat.flesch_reading_ease)

In [683]:
# Automated Readability Index of each article's cleaned text (sentence repeats not removed).
df['ari_grade_level'] = df['article_text'].apply(textstat.automated_readability_index)

In [684]:
df[['title','article_text','topic_names','flesch_reading','ari_grade_level']]

Unnamed: 0,title,article_text,topic_names,flesch_reading,ari_grade_level
0,The Army Is Expanding Allowed Hairstyles For W...,An Army photo shows a soldier wearing a new ap...,Cultural Phenomena,68.30,11.4
1,"Once Nearly Extinct, The Florida Panther Is Ma...",A male panther leaps over a creek at Florida P...,Earth/Natural Resources,66.88,12.0
2,What The Rise Of Amazon Has To Do With The Ris...,Members of the Workers Assembly Against Racism...,Drugs/Opiods,63.63,12.9
3,The Youth Of Cuba's Tiny Jewish Minority,"Adri Quiñones, center, leads the prayer during...",Texas Power Outtage & Census Case,60.04,12.5
4,Melting Snow Usually Means Water For The West....,Colorado Snow Survey supervisor Brian Domonkos...,Earth/Natural Resources,52.12,16.1
...,...,...,...,...,...
1339,President-Elect Biden Names Core Members Of Hi...,President-elect Joe Biden has nominated labor ...,COVID Relief Bills,49.75,14.1
1340,Moderna's COVID-19 Vaccine Candidate Gets More...,Moderna protocol files for COVID-19 vaccinatio...,COVID Vaccines,47.32,14.3
1341,"As Hospitals Fill With COVID-19 Patients, Medi...",Tennessee Gov. Bill Lee (right) tours a tempor...,Science & Medicine,57.40,14.2
1342,"IRS Says Its Own Error Sent $1,200 Stimulus Ch...",An error at the IRS caused thousands of non-Am...,Cultural Phenomena,52.43,15.6


In [685]:
df['flesch_reading'].describe()

count    1344.000000
mean       52.655424
std         9.568339
min        -3.240000
25%        46.385000
50%        53.140000
75%        58.520000
max        85.180000
Name: flesch_reading, dtype: float64

#### Remove extreme articles

In [686]:
df[df['flesch_reading'] < 10]

Unnamed: 0,date,category,authors,title,article_text,sentence_toks,num_sentences,article_text_set,unique_words,word_count,topic_num,topic_probability,topic_names,flesch_reading,ari_grade_level
405,"April 1, 2021",Politics,"['Benjamin Swasey', 'claire oby']",By The Numbers: Biden's $2 Trillion Infrastruc...,President Biden's $2 trillion infrastructure p...,[President Biden's $2 trillion infrastructure ...,12,President Biden's $2 trillion infrastructure p...,282,602,12,0.87,COVID Relief Bills,-3.24,41.6
1026,"January 12, 2021",Government,['Barbara Sprunt'],READ: Text Of House Resolution Urging Pence To...,An American flag flies at half staff at the U....,[An American flag flies at half staff at the U...,17,An American flag flies at half staff at the U....,619,1624,5,0.87,Political Decision Making,8.1,31.4


In [687]:
# Drop the articles with a Flesch reading-ease score below 10 (the extreme rows inspected above).
# Selecting by the cutoff instead of hard-coded row labels keeps this cell correct
# if the input data changes, and lets it be re-run without raising KeyError.
df.drop(df[df['flesch_reading'] < 10].index, inplace=True)

In [688]:
df['flesch_reading'].describe()

count    1342.000000
mean       52.730276
std         9.374105
min        26.000000
25%        46.400000
50%        53.140000
75%        58.520000
max        85.180000
Name: flesch_reading, dtype: float64

In [689]:
df.groupby(['topic_names'])['flesch_reading'].mean().sort_values()

topic_names
Immigration & Foreign Policy                      47.235444
Media & Misinformation                            47.980182
Texas Power Outtage & Census Case                 48.465962
Zero                                              50.488333
COVID Relief Bills                                51.177311
Political Decision Making                         51.271824
Paycheck Protection Program                       51.372727
COVID Vaccines                                    52.318639
Investigations & Trial Coverage                   52.382427
Racism & Hate Crimes                              52.846154
Global News & International Relations             55.195493
Seventeen                                         55.277500
Earth/Natural Resources                           55.900930
Science & Medicine                                56.094375
Drugs/Opiods                                      56.428378
Cultural Phenomena                                56.556712
Music & Culture             

In [690]:
# Export the enriched frame without its index (the path suggests it feeds a Tableau workbook).
df.to_csv('../tableau/npr_df.csv',index=False)

In [691]:
# Reload the exported file. It was written with index=False, so rows are renumbered from 0.
# NOTE(review): list-valued columns such as sentence_toks come back as plain strings after a CSV round trip.
df = pd.read_csv('../tableau/npr_df.csv')

In [692]:
df.columns

Index(['date', 'category', 'authors', 'title', 'article_text', 'sentence_toks',
       'num_sentences', 'article_text_set', 'unique_words', 'word_count',
       'topic_num', 'topic_probability', 'topic_names', 'flesch_reading',
       'ari_grade_level'],
      dtype='object')

In [693]:
# Show the article(s) with the highest Flesch reading-ease score.
highest_score = df['flesch_reading'].max()
df[df['flesch_reading'] == highest_score]

Unnamed: 0,date,category,authors,title,article_text,sentence_toks,num_sentences,article_text_set,unique_words,word_count,topic_num,topic_probability,topic_names,flesch_reading,ari_grade_level
795,"February 10, 2021",Politics,['Brian Naylor'],"Read Trump's Jan. 6 Speech, A Key Part Of Impe...",Then-President Donald Trump speaks to supporte...,"[""Then-President Donald Trump speaks to suppor...",1041,Then-President Donald Trump speaks to supporte...,1876,13777,5,0.76,Political Decision Making,85.18,5.8
