# In this notebook, all I want to do it pull in the three .csv files, run the polarity analyzer over them sentence-by-sentence, then save the aggregated output into new .csv files that can be easily loaded and used for EDA, etc.

In [31]:
import pandas as pd
import numpy as np
import re

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [27]:
# Set up the analyzer
analyzer = SentimentIntensityAnalyzer()

# Bring in the first .csv
csv1 = pd.read_csv('~/Documents/big_data/articles1.csv')
csv1.head()

Unnamed: 0.1,Unnamed: 0,id,title,publication,author,date,year,month,url,content
0,0,17283,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,2016-12-31,2016.0,12.0,,WASHINGTON — Congressional Republicans have...
1,1,17284,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,2017-06-19,2017.0,6.0,,"After the bullet shells get counted, the blood..."
2,2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times,Margalit Fox,2017-01-06,2017.0,1.0,,"When Walt Disney’s “Bambi” opened in 1942, cri..."
3,3,17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",New York Times,William McDonald,2017-04-10,2017.0,4.0,,"Death may be the great equalizer, but it isn’t..."
4,4,17287,Kim Jong-un Says North Korea Is Preparing to T...,New York Times,Choe Sang-Hun,2017-01-02,2017.0,1.0,,"SEOUL, South Korea — North Korea’s leader, ..."


In [28]:
csv1.drop('Unnamed: 0', axis=1, inplace=True)
csv1.head()

Unnamed: 0,id,title,publication,author,date,year,month,url,content
0,17283,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,2016-12-31,2016.0,12.0,,WASHINGTON — Congressional Republicans have...
1,17284,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,2017-06-19,2017.0,6.0,,"After the bullet shells get counted, the blood..."
2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times,Margalit Fox,2017-01-06,2017.0,1.0,,"When Walt Disney’s “Bambi” opened in 1942, cri..."
3,17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",New York Times,William McDonald,2017-04-10,2017.0,4.0,,"Death may be the great equalizer, but it isn’t..."
4,17287,Kim Jong-un Says North Korea Is Preparing to T...,New York Times,Choe Sang-Hun,2017-01-02,2017.0,1.0,,"SEOUL, South Korea — North Korea’s leader, ..."


In [29]:
csv1.shape

(50000, 9)

In [33]:
total_sent = pd.DataFrame(columns = ['id', 'neg', 'neu', 'pos', 'compound'])
total_sent

Unnamed: 0,id,neg,neu,pos,compound


In [34]:
for num, article in zip(csv1['id'], csv1['content']):
    temp = re.split("(?<!Mr|Ms)(?<!Mrs)(?<![A-Z+])[.]", article)
    art_sent = pd.DataFrame(columns = ['neg', 'neu', 'pos', 'compound'])
    art_sent
    for i in range(len(temp)):
        sentence = temp[i]
        sent = analyzer.polarity_scores(sentence)
        art_sent.loc[i, 'neg'] = float(sent['neg'])
        art_sent.loc[i, 'neu'] = float(sent['neu'])
        art_sent.loc[i, 'pos'] = float(sent['pos'])
        art_sent.loc[i, 'compound'] = float(sent['compound'])
    temp_sent = pd.DataFrame({'id':num, 'neg':art_sent['neg'].mean(),
                             'neu':art_sent['neu'].mean(), 'pos':art_sent['pos'].mean(),
                             'compound':art_sent['compound'].sum()}, index=np.arange(1))
    total_sent = total_sent.append(temp_sent)

In [38]:
total_sent

Unnamed: 0,id,neg,neu,pos,compound
0,17283,0.083857,0.794857,0.085536,0.7504
0,17284,0.113749,0.842700,0.039067,-44.5612
0,17285,0.064865,0.859405,0.066676,4.5303
0,17286,0.083333,0.815387,0.087933,0.0790
0,17287,0.035037,0.854296,0.073667,2.6097
...,...,...,...,...,...
0,73465,0.038700,0.855067,0.089500,6.2009
0,73466,0.082097,0.790452,0.095226,3.8019
0,73467,0.042674,0.864908,0.077762,42.1114
0,73468,0.042070,0.906814,0.051116,0.6481


In [39]:
total_sent.to_csv('~/Documents/big_data/articles1_sent.csv')

In [40]:
# Bring in the second .csv
csv2 = pd.read_csv('~/Documents/big_data/articles2.csv')
csv2.head()

Unnamed: 0.1,Unnamed: 0,id,title,publication,author,date,year,month,url,content
0,53293,73471,Patriots Day Is Best When It Digs Past the Her...,Atlantic,David Sims,2017-01-11,2017.0,1.0,,"Patriots Day, Peter Berg’s new thriller that r..."
1,53294,73472,A Break in the Search for the Origin of Comple...,Atlantic,Ed Yong,2017-01-11,2017.0,1.0,,"In Norse mythology, humans and our world were ..."
2,53295,73474,Obama’s Ingenious Mention of Atticus Finch,Atlantic,Spencer Kornhaber,2017-01-11,2017.0,1.0,,“If our democracy is to work in this increasin...
3,53296,73475,"Donald Trump Meets, and Assails, the Press",Atlantic,David A. Graham,2017-01-11,2017.0,1.0,,Updated on January 11 at 5:05 p. m. In his fir...
4,53297,73476,Trump: ’I Think’ Hacking Was Russian,Atlantic,Kaveh Waddell,2017-01-11,2017.0,1.0,,Updated at 12:25 p. m. After months of equivoc...


In [41]:
csv2.drop('Unnamed: 0', axis=1, inplace=True)
csv2.head()

Unnamed: 0,id,title,publication,author,date,year,month,url,content
0,73471,Patriots Day Is Best When It Digs Past the Her...,Atlantic,David Sims,2017-01-11,2017.0,1.0,,"Patriots Day, Peter Berg’s new thriller that r..."
1,73472,A Break in the Search for the Origin of Comple...,Atlantic,Ed Yong,2017-01-11,2017.0,1.0,,"In Norse mythology, humans and our world were ..."
2,73474,Obama’s Ingenious Mention of Atticus Finch,Atlantic,Spencer Kornhaber,2017-01-11,2017.0,1.0,,“If our democracy is to work in this increasin...
3,73475,"Donald Trump Meets, and Assails, the Press",Atlantic,David A. Graham,2017-01-11,2017.0,1.0,,Updated on January 11 at 5:05 p. m. In his fir...
4,73476,Trump: ’I Think’ Hacking Was Russian,Atlantic,Kaveh Waddell,2017-01-11,2017.0,1.0,,Updated at 12:25 p. m. After months of equivoc...


In [42]:
csv2.shape

(49999, 9)

In [43]:
total_sent = pd.DataFrame(columns = ['id', 'neg', 'neu', 'pos', 'compound'])
total_sent

Unnamed: 0,id,neg,neu,pos,compound


In [44]:
for num, article in zip(csv2['id'], csv2['content']):
    temp = re.split("(?<!Mr|Ms)(?<!Mrs)(?<![A-Z+])[.]", article)
    art_sent = pd.DataFrame(columns = ['neg', 'neu', 'pos', 'compound'])
    art_sent
    for i in range(len(temp)):
        sentence = temp[i]
        sent = analyzer.polarity_scores(sentence)
        art_sent.loc[i, 'neg'] = float(sent['neg'])
        art_sent.loc[i, 'neu'] = float(sent['neu'])
        art_sent.loc[i, 'pos'] = float(sent['pos'])
        art_sent.loc[i, 'compound'] = float(sent['compound'])
    temp_sent = pd.DataFrame({'id':num, 'neg':art_sent['neg'].mean(),
                             'neu':art_sent['neu'].mean(), 'pos':art_sent['pos'].mean(),
                             'compound':art_sent['compound'].sum()}, index=np.arange(1))
    total_sent = total_sent.append(temp_sent)

In [45]:
total_sent.head()

Unnamed: 0,id,neg,neu,pos,compound
0,73471,0.108824,0.773059,0.088765,-1.8319
0,73472,0.034779,0.913597,0.051623,5.4129
0,73474,0.084594,0.752375,0.06925,-3.5558
0,73475,0.058312,0.836833,0.083979,3.6127
0,73476,0.035208,0.874417,0.04875,1.7098


In [46]:
total_sent.to_csv('~/Documents/big_data/articles2_sent.csv')

In [47]:
# Bring in the third .csv
csv3 = pd.read_csv('~/Documents/big_data/articles3.csv')
csv3.head()

Unnamed: 0.1,Unnamed: 0,id,title,publication,author,date,year,month,url,content
0,103459,151908,Alton Sterling’s son: ’Everyone needs to prote...,Guardian,Jessica Glenza,2016-07-13,2016.0,7.0,https://www.theguardian.com/us-news/2016/jul/1...,The son of a Louisiana man whose father was sh...
1,103460,151909,Shakespeare’s first four folios sell at auctio...,Guardian,,2016-05-25,2016.0,5.0,https://www.theguardian.com/culture/2016/may/2...,Copies of William Shakespeare’s first four boo...
2,103461,151910,My grandmother’s death saved me from a life of...,Guardian,Robert Pendry,2016-10-31,2016.0,10.0,https://www.theguardian.com/commentisfree/2016...,"Debt: $20, 000, Source: College, credit cards,..."
3,103462,151911,I feared my life lacked meaning. Cancer pushed...,Guardian,Bradford Frost,2016-11-26,2016.0,11.0,https://www.theguardian.com/commentisfree/2016...,"It was late. I was drunk, nearing my 35th birt..."
4,103463,151912,Texas man serving life sentence innocent of do...,Guardian,,2016-08-20,2016.0,8.0,https://www.theguardian.com/us-news/2016/aug/2...,A central Texas man serving a life sentence fo...


In [48]:
csv3.drop('Unnamed: 0', axis=1, inplace=True)
csv3.head()

Unnamed: 0,id,title,publication,author,date,year,month,url,content
0,151908,Alton Sterling’s son: ’Everyone needs to prote...,Guardian,Jessica Glenza,2016-07-13,2016.0,7.0,https://www.theguardian.com/us-news/2016/jul/1...,The son of a Louisiana man whose father was sh...
1,151909,Shakespeare’s first four folios sell at auctio...,Guardian,,2016-05-25,2016.0,5.0,https://www.theguardian.com/culture/2016/may/2...,Copies of William Shakespeare’s first four boo...
2,151910,My grandmother’s death saved me from a life of...,Guardian,Robert Pendry,2016-10-31,2016.0,10.0,https://www.theguardian.com/commentisfree/2016...,"Debt: $20, 000, Source: College, credit cards,..."
3,151911,I feared my life lacked meaning. Cancer pushed...,Guardian,Bradford Frost,2016-11-26,2016.0,11.0,https://www.theguardian.com/commentisfree/2016...,"It was late. I was drunk, nearing my 35th birt..."
4,151912,Texas man serving life sentence innocent of do...,Guardian,,2016-08-20,2016.0,8.0,https://www.theguardian.com/us-news/2016/aug/2...,A central Texas man serving a life sentence fo...


In [49]:
csv3.shape

(42571, 9)

In [50]:
total_sent = pd.DataFrame(columns = ['id', 'neg', 'neu', 'pos', 'compound'])
total_sent

Unnamed: 0,id,neg,neu,pos,compound


In [51]:
for num, article in zip(csv3['id'], csv3['content']):
    temp = re.split("(?<!Mr|Ms)(?<!Mrs)(?<![A-Z+])[.]", article)
    art_sent = pd.DataFrame(columns = ['neg', 'neu', 'pos', 'compound'])
    art_sent
    for i in range(len(temp)):
        sentence = temp[i]
        sent = analyzer.polarity_scores(sentence)
        art_sent.loc[i, 'neg'] = float(sent['neg'])
        art_sent.loc[i, 'neu'] = float(sent['neu'])
        art_sent.loc[i, 'pos'] = float(sent['pos'])
        art_sent.loc[i, 'compound'] = float(sent['compound'])
    temp_sent = pd.DataFrame({'id':num, 'neg':art_sent['neg'].mean(),
                             'neu':art_sent['neu'].mean(), 'pos':art_sent['pos'].mean(),
                             'compound':art_sent['compound'].sum()}, index=np.arange(1))
    total_sent = total_sent.append(temp_sent)

In [52]:
total_sent.head()

Unnamed: 0,id,neg,neu,pos,compound
0,151908,0.174167,0.672778,0.0975,-4.9132
0,151909,0.019643,0.928571,0.051786,0.6169
0,151910,0.079268,0.776732,0.119585,4.4083
0,151911,0.073787,0.798596,0.10634,1.6014
0,151912,0.13755,0.77205,0.0404,-5.5531


In [53]:
total_sent.to_csv('~/Documents/big_data/articles3_sent.csv')