# Headline Classifier and Cleanup

In [1]:
# Set up and dependencies
import os
import re
from pprint import pprint

import numpy as np
import pandas as pd


In [2]:
# Load source file and preview: read 'articles1.csv' into a DataFrame and
# show its leading rows to see what the columns look like.
headlines = pd.read_csv('articles1.csv')
headlines.head()

Unnamed: 0.1,Unnamed: 0,id,title,publication,author,date,year,month,url,content
0,0,17283,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,2016-12-31,2016.0,12.0,,WASHINGTON — Congressional Republicans have...
1,1,17284,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,2017-06-19,2017.0,6.0,,"After the bullet shells get counted, the blood..."
2,2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times,Margalit Fox,2017-01-06,2017.0,1.0,,"When Walt Disney’s “Bambi” opened in 1942, cri..."
3,3,17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",New York Times,William McDonald,2017-04-10,2017.0,4.0,,"Death may be the great equalizer, but it isn’t..."
4,4,17287,Kim Jong-un Says North Korea Is Preparing to T...,New York Times,Choe Sang-Hun,2017-01-02,2017.0,1.0,,"SEOUL, South Korea — North Korea’s leader, ..."


In [3]:
# List the column labels of the loaded frame.
headlines.columns

Index(['Unnamed: 0', 'id', 'title', 'publication', 'author', 'date', 'year',
       'month', 'url', 'content'],
      dtype='object')

In [4]:
# Keep only the headline text and its publisher. Selecting the columns to keep
# (rather than dropping the other eight in place) leaves the same two columns
# but makes the cell safe to re-run: an in-place drop raises KeyError the
# second time because the dropped columns no longer exist.
headlines = headlines[['title', 'publication']].copy()
headlines.head()

Unnamed: 0,title,publication
0,House Republicans Fret About Winning Their Hea...,New York Times
1,Rift Between Officers and Residents as Killing...,New York Times
2,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times
3,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",New York Times
4,Kim Jong-un Says North Korea Is Preparing to T...,New York Times


In [5]:
# Persist the trimmed frame; index=False leaves the row index out of the file.
# NOTE(review): the output name says 'article1' while the source file is
# 'articles1.csv' — confirm the spelling is intended.
headlines.to_csv('article1_stripped.csv', index=False)

In [6]:
# Examine headlines for potential issues: print the first 25 titles in full,
# since the table preview truncates them.
for raw_title in headlines['title'].head(25):
    print(raw_title)

House Republicans Fret About Winning Their Health Care Suit - The New York Times
Rift Between Officers and Residents as Killings Persist in South Bronx - The New York Times
Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial Bias, Dies at 106 - The New York Times
Among Deaths in 2016, a Heavy Toll in Pop Music - The New York Times
Kim Jong-un Says North Korea Is Preparing to Test Long-Range Missile - The New York Times
Sick With a Cold, Queen Elizabeth Misses New Year’s Service - The New York Times
Taiwan’s President Accuses China of Renewed Intimidation - The New York Times
After ‘The Biggest Loser,’ Their Bodies Fought to Regain Weight - The New York Times
First, a Mixtape. Then a Romance. - The New York Times
Calling on Angels While Enduring the Trials of Job - The New York Times
Weak Federal Powers Could Limit Trump’s Climate-Policy Rollback - The New York Times
Can Carbon Capture Technology Prosper Under Trump? - The New York Times
Mar-a-Lago, the Future Winter White House and Home of t

In [7]:
# Report the row count before and after discarding rows with a missing value.
print(headlines.shape[0])
headlines = headlines.dropna()
print(headlines.shape[0])

50000
50000


In [8]:
# Discard rows that exactly duplicate an earlier row, then report what is left.
headlines = headlines.drop_duplicates()
print(headlines.shape[0])

49920


In [9]:
# Remove rows whose title is the literal 'SIGN IN' (presumably a login-page
# artifact rather than a headline — confirm against the source data).
# .copy() detaches the result from the frame it was filtered from, so the
# later `headlines['headline'] = ...` assignment writes to an independent
# frame instead of triggering pandas' SettingWithCopyWarning.
headlines = headlines.loc[headlines['title'] != 'SIGN IN'].copy()
print(len(headlines))

49919


In [10]:
def titleToHeadline(row):
    """Derive a lower-cased headline from a row's 'title' value.

    The publisher tags ' - The New York Times' and ' - Breitbart' are removed
    wherever they occur, and 'U.S.' is rewritten to the placeholder
    'USAAbbrev' before the whole string is lower-cased.

    row: any object indexable by 'title' (e.g. a DataFrame row).
    Returns the transformed title as a str.
    """
    headline = str(row['title'])
    rewrites = (
        (' - The New York Times', ''),
        (' - Breitbart', ''),
        ('U.S.', 'USAAbbrev'),
    )
    for old, new in rewrites:
        headline = headline.replace(old, new)
    return headline.lower()

# Publications that get their own '<name>_headlines.csv' export, listed in the
# same order as the bias groups below. 'Huffington Post' is not listed here:
# its headlines are read from a separate file later in the notebook.
all_publishers = ['CNN',
                  'Vox',
                  'Atlantic',
                  'Business Insider',
                  'Buzzfeed News',
                  'Guardian',
                  'NPR',
                  'New York Times',
                  'Reuters',
                  'Talking Points Memo',
                  'Washington Post',
                  'New York Post',
                  'Daily Wire',
                  'Fox News',
                  'National Review',
                  'Breitbart']

# Publication groups behind the 'LL', 'LC', 'RC' and 'RR' labels assigned by
# addBias (presumably left, center-left, center-right, right — confirm).
ll = ['CNN', 'Vox']
lc = ['Atlantic', 'Business Insider', 'Buzzfeed News', 'Guardian', 'NPR',
      'New York Times', 'Reuters', 'Talking Points Memo', 'Washington Post']
rc = ['New York Post', 'WSJ']
# 'Breitbart' used to be absent from every group even though it is exported
# above, so addBias fell through to 'NA' for it. It is listed after the other
# rr publications in all_publishers, so it belongs to this group.
rr = ['Daily Wire', 'Fox News', 'National Review', 'Breitbart']


def addBias(row):
    """Return the bias label for a row's 'publication' value.

    The publication name is looked up, in order, in the module-level lists
    ll, lc, rc and rr; the first list that contains it decides the label
    ("LL", "LC", "RC" or "RR"). A name found in none of them yields "NA".
    """
    pub_name = str(row['publication'])
    labelled_groups = (("LL", ll), ("LC", lc), ("RC", rc), ("RR", rr))
    for label, members in labelled_groups:
        if pub_name in members:
            return label
    return "NA"

def simpleBias(row):
    """Return the first character of the row's 'bias' label (e.g. 'LL' -> 'L')."""
    bias_label = str(row['bias'])
    return bias_label[0]

def punctuationToText(row):
    """Spell out the punctuation in a row's 'headline' as word tokens.

    Each punctuation mark is replaced by a 'p...' token (',' -> 'pcomma',
    '?' -> 'pquestion', ...). Closing quotes, remaining apostrophes and ')'
    are dropped. The substitutions run in the order listed, which matters:
    opening quotes and possessives are tokenised before the leftover quote
    characters are removed.

    row: any object indexable by 'headline' (e.g. a DataFrame row).
    Returns the tokenised headline with words separated by single spaces and
    no leading or trailing whitespace.
    """
    substitutions = (
        (',',   ' pcomma'),
        ('.',   ' pperiod '),
        ('!',   ' pexclamation'),
        ('?',   ' pquestion'),
        (" '",  ' pquote '),
        (" ‘",  ' pquote '),
        ("'s ", ' ppossession '),
        ("’s ", ' ppossession '),
        ("'",   ''),
        ("’",   ''),
        ("(",   ' pparen '),
        (")",   ''),
        (':',   ' pcolon'),
        (';',   ' psemic'),
        ('-',   ' pdash '),
    )
    text = str(row['headline'])
    for mark, token in substitutions:
        text = text.replace(mark, token)
    # A single replace('  ', ' ') pass leaves a double space behind whenever
    # three or more spaces pile up (e.g. '. (' -> 'pperiod   pparen') and never
    # trims the trailing space after a final period. Splitting on whitespace
    # and re-joining collapses every run and strips both ends.
    return ' '.join(text.split())

In [11]:
# Derive the 'headline' column from each title, then print the first 25
# results in full to check the cleanup.
headlines['headline'] = headlines.apply(titleToHeadline, axis=1)
for cleaned_headline in headlines['headline'].head(25):
    print(cleaned_headline)

house republicans fret about winning their health care suit
rift between officers and residents as killings persist in south bronx
tyrus wong, ‘bambi’ artist thwarted by racial bias, dies at 106
among deaths in 2016, a heavy toll in pop music
kim jong-un says north korea is preparing to test long-range missile
sick with a cold, queen elizabeth misses new year’s service
taiwan’s president accuses china of renewed intimidation
after ‘the biggest loser,’ their bodies fought to regain weight
first, a mixtape. then a romance.
calling on angels while enduring the trials of job
weak federal powers could limit trump’s climate-policy rollback
can carbon capture technology prosper under trump?
mar-a-lago, the future winter white house and home of the calmer trump
how to form healthy habits in your 20s
turning your vacation photos into works of art
as second avenue subway opens, a train delay ends in (happy) tears
dylann roof himself rejects best defense against execution
modi’s cash ban brings p

In [12]:
# Persist the frame with its new 'headline' column; the row index is omitted.
# NOTE(review): stripClean('articles1.csv') writes 'articles1_clean.csv' (with
# an 's'), so this is a different file — confirm which one is wanted.
headlines.to_csv('article1_clean.csv', index=False)

### Function-Based Cleanup

In [13]:
def _cleanAndSave(headlines, filename):
    """Shared cleanup used by stripClean and noStripClean.

    Drops rows with missing values, exact duplicate rows and rows titled
    'SIGN IN', then adds the derived columns 'headline', 'bias', 'lean',
    'is_sarcastic' (always 0) and 'head_char'. The result is written next to
    the input as '<name>_clean<ext>' and a kept/total row summary is printed.

    headlines: frame already read from `filename`; needs 'title' and
        'publication' columns.
    filename: path the frame was read from; used for the output name and the
        printed summary.
    Returns the cleaned DataFrame.
    """
    startsize = len(headlines)
    headlines = headlines.dropna().drop_duplicates()
    # .copy() detaches the filtered rows from the source frame so the column
    # assignments below do not raise SettingWithCopyWarning.
    headlines = headlines.loc[headlines['title'] != 'SIGN IN'].copy()
    headlines['headline'] = headlines.apply(titleToHeadline, axis=1)
    headlines['bias'] = headlines.apply(addBias, axis=1)
    headlines['lean'] = headlines.apply(simpleBias, axis=1)
    headlines['is_sarcastic'] = 0
    headlines['head_char'] = headlines.apply(punctuationToText, axis=1)
    # splitext touches only the final extension, so names with extra dots
    # ('./data/a.b.csv') or no extension at all still get a distinct
    # '<name>_clean<ext>' output. Joining the dot-split parts with '_clean.'
    # mangled the former and returned the input path itself for the latter.
    root, ext = os.path.splitext(filename)
    headlines.to_csv(f'{root}_clean{ext}', index=False)
    print(f'{len(headlines)}/{startsize} rows in {filename} were kept.')
    return headlines


def stripClean(filename):
    """Clean a raw articles CSV after discarding its non-headline columns.

    Every column in the list below is dropped first (KeyError if one is
    missing), which must happen before the missing-value filter so that gaps
    in the discarded columns do not remove rows. The remaining frame then
    goes through the shared cleanup, which also writes '<name>_clean<ext>'.

    Returns the cleaned DataFrame.
    """
    headlines = pd.read_csv(filename).drop(
        columns=['Unnamed: 0', 'id', 'author', 'date', 'year', 'month', 'url', 'content'])
    return _cleanAndSave(headlines, filename)


def noStripClean(filename):
    """Clean a CSV as stripClean does, but without dropping any columns first.

    Because no columns are discarded, a missing value in any column of the
    file removes that row. Also writes '<name>_clean<ext>'.

    Returns the cleaned DataFrame.
    """
    return _cleanAndSave(pd.read_csv(filename), filename)

In [14]:
# Clean the three raw article files in order; each stripClean call also
# writes its own '_clean' CSV and prints how many rows were kept.
set1, set2, set3 = (stripClean(f'articles{part}.csv') for part in (1, 2, 3))

49919/50000 rows in articles1.csv were kept.
49664/49999 rows in articles2.csv were kept.
42553/42571 rows in articles3.csv were kept.


In [15]:
# Print the first 25 tokenised headlines in full to check the punctuation tokens.
for tokenised in set1['head_char'].head(25):
    print(tokenised)

house republicans fret about winning their health care suit
rift between officers and residents as killings persist in south bronx
tyrus wong pcomma pquote bambi artist thwarted by racial bias pcomma dies at 106
among deaths in 2016 pcomma a heavy toll in pop music
kim jong pdash un says north korea is preparing to test long pdash range missile
sick with a cold pcomma queen elizabeth misses new year ppossession service
taiwan ppossession president accuses china of renewed intimidation
after pquote the biggest loser pcomma their bodies fought to regain weight
first pcomma a mixtape pperiod then a romance pperiod 
calling on angels while enduring the trials of job
weak federal powers could limit trump ppossession climate pdash policy rollback
can carbon capture technology prosper under trump pquestion
mar pdash a pdash lago pcomma the future winter white house and home of the calmer trump
how to form healthy habits in your 20s
turning your vacation photos into works of art
as second aven

In [16]:
# Clean the Daily Wire file with noStripClean, which keeps all of the file's
# columns instead of dropping the article-metadata columns first.
set4 = noStripClean('daily_wire_small.csv')

1472/1477 rows in daily_wire_small.csv were kept.


In [17]:
# Preview the cleaned Daily Wire rows and their derived columns.
set4.head()

Unnamed: 0,title,publication,headline,bias,lean,is_sarcastic,head_char
0,Make the web’s best conservative commentary ev...,Daily Wire,make the web’s best conservative commentary ev...,RR,R,0,make the web ppossession best conservative com...
1,Bipartisan Support Builds For Israeli Sovereig...,Daily Wire,bipartisan support builds for israeli sovereig...,RR,R,0,bipartisan support builds for israeli sovereig...
2,Shapiro At 'National Review': A World Without ...,Daily Wire,shapiro at 'national review': a world without ...,RR,R,0,shapiro at pquote national review pcolon a wor...
3,"Limbaugh: As These Sex Scandals Hit Democrats,...",Daily Wire,"limbaugh: as these sex scandals hit democrats,...",RR,R,0,limbaugh pcolon as these sex scandals hit demo...
4,VA Gov. Northam: I Won't Resign And Be Branded...,Daily Wire,va gov. northam: i won't resign and be branded...,RR,R,0,va gov pperiod northam pcolon i wont resign an...


In [18]:
# Stack the four cleaned sets into one frame. Each set keeps its own row
# labels (no ignore_index), so index values can repeat across sets.
headlines = pd.concat([set1, set2, set3, set4])
print(len(headlines))
headlines.head(5)

143608


Unnamed: 0,title,publication,headline,bias,lean,is_sarcastic,head_char
0,House Republicans Fret About Winning Their Hea...,New York Times,house republicans fret about winning their hea...,LC,L,0,house republicans fret about winning their hea...
1,Rift Between Officers and Residents as Killing...,New York Times,rift between officers and residents as killing...,LC,L,0,rift between officers and residents as killing...
2,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times,"tyrus wong, ‘bambi’ artist thwarted by racial ...",LC,L,0,tyrus wong pcomma pquote bambi artist thwarted...
3,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",New York Times,"among deaths in 2016, a heavy toll in pop music",LC,L,0,among deaths in 2016 pcomma a heavy toll in po...
4,Kim Jong-un Says North Korea Is Preparing to T...,New York Times,kim jong-un says north korea is preparing to t...,LC,L,0,kim jong pdash un says north korea is preparin...


In [19]:
# List the column labels of the combined frame.
headlines.columns

Index(['title', 'publication', 'headline', 'bias', 'lean', 'is_sarcastic',
       'head_char'],
      dtype='object')

In [30]:
# Split the combined frame into one frame per publisher, writing each to
# '<publisher>_headlines.csv' and collecting them in all_publishers order.
sep_headlines = []
for publisher in all_publishers:
    from_publisher = headlines['publication'] == publisher
    selectset = headlines.loc[from_publisher]
    savename = f'{publisher}_headlines.csv'
    selectset.to_csv(savename, index=False)
    print(f'Saved {savename}!')
    sep_headlines.append(selectset)

Saved CNN_headlines.csv!
Saved Vox_headlines.csv!
Saved Atlantic_headlines.csv!
Saved Business Insider_headlines.csv!
Saved Buzzfeed News_headlines.csv!
Saved Guardian_headlines.csv!
Saved NPR_headlines.csv!
Saved New York Times_headlines.csv!
Saved Reuters_headlines.csv!
Saved Talking Points Memo_headlines.csv!
Saved Washington Post_headlines.csv!
Saved New York Post_headlines.csv!
Saved Daily Wire_headlines.csv!
Saved Fox News_headlines.csv!
Saved National Review_headlines.csv!
Saved Breitbart_headlines.csv!


In [31]:
# List the distinct publication names present in the combined frame.
headlines['publication'].unique()

array(['New York Times', 'Breitbart', 'CNN', 'Business Insider',
       'Atlantic', 'Fox News', 'Talking Points Memo', 'Buzzfeed News',
       'National Review', 'New York Post', 'Guardian', 'NPR', 'Reuters',
       'Vox', 'Washington Post', 'Daily Wire'], dtype=object)

In [32]:
# Print the leading rows of every per-publisher frame as plain text.
for df in sep_headlines:
    print(df.head())

                                                 title publication  \
31584      Istanbul attack: Dozens killed at nightclub         CNN   
31585     Alabama, Clemson back in national title game         CNN   
31586               New year celebrations ring in 2017         CNN   
31587  Trump says he has inside information on hacking         CNN   
31588            3 dead in Texas plane crash collision         CNN   

                                              headline bias lean  \
31584      istanbul attack: dozens killed at nightclub   LL    L   
31585     alabama, clemson back in national title game   LL    L   
31586               new year celebrations ring in 2017   LL    L   
31587  trump says he has inside information on hacking   LL    L   
31588            3 dead in texas plane crash collision   LL    L   

       is_sarcastic                                          head_char  
31584             0  istanbul attack pcolon dozens killed at nightclub  
31585             0  ala

In [33]:
# Load the Huffington Post headlines from their own file (it is not written
# by any cell shown here) and discard the 'Unnamed: 0' column, presumably a
# saved row index.
huffpo_df = pd.read_csv('Huffington Post_headlines.csv')
huffpo_df = huffpo_df.drop(columns='Unnamed: 0')
huffpo_df.head()

Unnamed: 0,title,publication,headline,bias,lean,is_sarcastic,head_char
0,former versace store clerk sues over secret 'b...,Huffington Post,former versace store clerk sues over secret 'b...,LL,L,0,former versace store clerk sues over secret pq...
1,the 'roseanne' revival catches up to our thorn...,Huffington Post,the 'roseanne' revival catches up to our thorn...,LL,L,0,the pquote roseanne revival catches up to our ...
2,j.k. rowling wishes snape happy birthday in th...,Huffington Post,j.k. rowling wishes snape happy birthday in th...,LL,L,0,j pperiod k pperiod rowling wishes snape happy...
3,advancing the world's women,Huffington Post,advancing the world's women,LL,L,0,advancing the world ppossession women
4,the fascinating case for eating lab-grown meat,Huffington Post,the fascinating case for eating lab-grown meat,LL,L,0,the fascinating case for eating lab pdash grow...


In [42]:
# Load The Onion headlines and set every row's bias label to 'NR'.
# errors='ignore' keeps this cell re-runnable: the frame is written back to
# the same file below without an index column, so on a second run
# 'Unnamed: 0' no longer exists and a strict drop would raise KeyError.
onion_df = pd.read_csv('The Onion_headlines.csv').drop(columns=['Unnamed: 0'], errors='ignore')
onion_df['bias'] = 'NR'
# NOTE(review): this overwrites the input file in place.
onion_df.to_csv('The Onion_headlines.csv', index=False)
onion_df.head()

Unnamed: 0,title,publication,headline,bias,lean,is_sarcastic,head_char
0,mom starting to fear son's web series closest ...,The Onion,mom starting to fear son's web series closest ...,NR,N,1,mom starting to fear son ppossession web serie...
1,"boehner just wants wife to listen, not come up...",The Onion,"boehner just wants wife to listen, not come up...",NR,N,1,boehner just wants wife to listen pcomma not c...
2,top snake handler leaves sinking huckabee camp...,The Onion,top snake handler leaves sinking huckabee camp...,NR,N,1,top snake handler leaves sinking huckabee camp...
3,nuclear bomb detonates during rehearsal for 's...,The Onion,nuclear bomb detonates during rehearsal for 's...,NR,N,1,nuclear bomb detonates during rehearsal for pq...
4,cosby lawyer asks why accusers didn't come for...,The Onion,cosby lawyer asks why accusers didn't come for...,NR,N,1,cosby lawyer asks why accusers didnt come forw...


In [41]:
# Clean the WSJ file (noStripClean also writes its own '_clean' CSV), then
# save the result again under the '<publisher>_headlines.csv' naming used for
# the other publishers.
wsj_df = noStripClean('wsj_data.csv')
wsj_df.to_csv('WSJ_headlines.csv', index=False)
wsj_df.head()

6501/6717 rows in wsj_data.csv were kept.


Unnamed: 0,title,publication,headline,bias,lean,is_sarcastic,head_char
0,Gulfport Energy CEO Steps Down After Review,WSJ,gulfport energy ceo steps down after review,RC,R,0,gulfport energy ceo steps down after review
1,Corrections & Amplifications,WSJ,corrections & amplifications,RC,R,0,corrections & amplifications
2,"In Pittsburgh, Preparing to Sit Shiva for 11",WSJ,"in pittsburgh, preparing to sit shiva for 11",RC,R,0,in pittsburgh pcomma preparing to sit shiva fo...
3,University System of Maryland Board Chairman R...,WSJ,university system of maryland board chairman r...,RC,R,0,university system of maryland board chairman r...
4,American World War II Base Will Be Rebuilt as ...,WSJ,american world war ii base will be rebuilt as ...,RC,R,0,american world war ii base will be rebuilt as ...


In [36]:
# Splice the separately loaded frames into the publisher-ordered list.
# NOTE(review): these calls mutate sep_headlines in place, so running this
# cell twice adds every frame twice; onion_df, wsj_df and huffpo_df must
# already be defined when it runs.
sep_headlines.append(onion_df)
# With The Onion now last, len - 5 is the slot just ahead of the final five
# frames; the unique() listing of the combined frame shows WSJ ending up
# between 'New York Post' and 'Daily Wire'. This offset depends on how many
# frames follow that slot.
sep_headlines.insert((len(sep_headlines)-5), wsj_df)
# Index 1 places Huffington Post directly after the first frame (CNN).
sep_headlines.insert(1, huffpo_df)

In [37]:
# Stack the per-publisher frames in list order into one frame; original row
# labels are kept, so the preview starts at the first frame's own index.
L2R_headlines = pd.concat(sep_headlines)
L2R_headlines.head()

Unnamed: 0,title,publication,headline,bias,lean,is_sarcastic,head_char
31584,Istanbul attack: Dozens killed at nightclub,CNN,istanbul attack: dozens killed at nightclub,LL,L,0,istanbul attack pcolon dozens killed at nightclub
31585,"Alabama, Clemson back in national title game",CNN,"alabama, clemson back in national title game",LL,L,0,alabama pcomma clemson back in national title ...
31586,New year celebrations ring in 2017,CNN,new year celebrations ring in 2017,LL,L,0,new year celebrations ring in 2017
31587,Trump says he has inside information on hacking,CNN,trump says he has inside information on hacking,LL,L,0,trump says he has inside information on hacking
31588,3 dead in Texas plane crash collision,CNN,3 dead in texas plane crash collision,LL,L,0,3 dead in texas plane crash collision


In [38]:
# Write the ordered, combined headlines to disk without the row index.
L2R_headlines.to_csv('combined_headlines.csv', index=False)

In [39]:
# List the publications in order of first appearance to check the final ordering.
L2R_headlines['publication'].unique()

array(['CNN', 'Huffington Post', 'Vox', 'Atlantic', 'Business Insider',
       'Buzzfeed News', 'Guardian', 'NPR', 'New York Times', 'Reuters',
       'Talking Points Memo', 'Washington Post', 'New York Post', 'WSJ',
       'Daily Wire', 'Fox News', 'National Review', 'Breitbart',
       'The Onion'], dtype=object)