# Generate Sentiment Data

Import libraries

In [29]:
import json
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer

  return f(*args, **kwds)
  return f(*args, **kwds)


## Get Data

In [3]:
with open('data/cnn_articles.json', 'r') as f:
    cnn_articles = json.load(f)

In [4]:
with open('data/guardian_articles.json', 'r') as f:
    guardian_articles = json.load(f)

## Sentiment Analysis

In [5]:
analyser = SentimentIntensityAnalyzer()

In [25]:
def generate_sentiment(articles):
    result = []
    for article in articles:
        
        # Generate Polarity Scores
        title_sentiment = analyser.polarity_scores(article['title'])
        full_sentiment = analyser.polarity_scores(article['full_article'])
        
        # Generate row of data
        row = {}
        row['publish_date'] = article['publish_date']
        row['title'] = article['title']
        row['title_sentiment_pos'] = title_sentiment['pos']
        row['title_sentiment_neu'] = title_sentiment['neu']
        row['title_sentiment_neg'] = title_sentiment['neg']
        row['title_sentiment_comp'] = title_sentiment['compound']
        row['full_sentiment_pos'] = full_sentiment['pos']
        row['full_sentiment_neu'] = full_sentiment['neu']
        row['full_sentiment_neg'] = full_sentiment['neg']
        row['full_sentiment_comp'] = full_sentiment['compound']
        
        # Add to results
        result.append(row)
        
    return result

In [26]:
cnn_sentiment = generate_sentiment(cnn_articles)

In [27]:
guardian_sentiment = generate_sentiment(guardian_articles)

## Load in a dataframe and pickle

In [34]:
cnn_df = pd.DataFrame(cnn_sentiment)
cnn_df['source'] = 'CNN'

In [35]:
guardian_df = pd.DataFrame(guardian_sentiment)
guardian_df['source'] = 'Guardian'

In [44]:
sentiment_df = pd.concat([cnn_df, guardian_df]).reset_index(drop=True)
sentiment_df['publish_date'] = pd.to_datetime(sentiment_df.publish_date)

In [46]:
sentiment_df.head()

Unnamed: 0,publish_date,title,title_sentiment_pos,title_sentiment_neu,title_sentiment_neg,title_sentiment_comp,full_sentiment_pos,full_sentiment_neu,full_sentiment_neg,full_sentiment_comp,source
0,2019-08-30 00:00:00,It'll take superpowers to unseat Boris Johnson...,0.141,0.859,0.0,0.3182,0.089,0.862,0.05,0.9909,CNN
1,2019-08-30 00:00:00,Portuguese woman interrupts Sky broadcast with...,0.0,0.753,0.247,-0.3182,0.053,0.899,0.048,0.5034,CNN
2,2019-08-29 00:00:00,Boris Johnson put the Queen in an absurdly awk...,0.0,0.849,0.151,-0.1531,0.05,0.907,0.044,0.4576,CNN
3,2019-08-29 00:00:00,"5 things to know for August 29: Dorian, citize...",0.0,1.0,0.0,0.0,0.054,0.869,0.077,-0.9614,CNN
4,2019-08-28 09:36:30+00:00,Boris Johnson asks Queen to suspend UK Parliam...,0.0,0.813,0.187,-0.3182,0.024,0.832,0.143,-0.9892,CNN


In [47]:
sentiment_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36343 entries, 0 to 36342
Data columns (total 11 columns):
publish_date            36343 non-null object
title                   36343 non-null object
title_sentiment_pos     36343 non-null float64
title_sentiment_neu     36343 non-null float64
title_sentiment_neg     36343 non-null float64
title_sentiment_comp    36343 non-null float64
full_sentiment_pos      36343 non-null float64
full_sentiment_neu      36343 non-null float64
full_sentiment_neg      36343 non-null float64
full_sentiment_comp     36343 non-null float64
source                  36343 non-null object
dtypes: float64(8), object(3)
memory usage: 2.6+ MB


On first glance, we can tell the following from our data:
* Both the title and full article sentiment are primarily neutral (~80%)
* The sentiment of the title has more variance than that of the full article however its the opposite for the compound variance.

In [48]:
sentiment_df.describe()

Unnamed: 0,title_sentiment_pos,title_sentiment_neu,title_sentiment_neg,title_sentiment_comp,full_sentiment_pos,full_sentiment_neu,full_sentiment_neg,full_sentiment_comp
count,36343.0,36343.0,36343.0,36343.0,36343.0,36343.0,36343.0,36343.0
mean,0.079851,0.812731,0.107421,-0.046296,0.097808,0.824778,0.076616,0.363296
std,0.122744,0.168914,0.140056,0.363943,0.032651,0.051967,0.034582,0.844047
min,0.0,0.185,0.0,-0.9393,0.0,0.0,0.0,-1.0
25%,0.0,0.693,0.0,-0.3182,0.077,0.796,0.053,-0.7717
50%,0.0,0.809,0.0,0.0,0.097,0.825,0.073,0.9499
75%,0.167,1.0,0.208,0.0772,0.117,0.855,0.096,0.9932
max,0.778,1.0,0.732,0.9274,0.336,1.0,0.623,1.0


In [49]:
sentiment_df.to_pickle('data/sentiment_df.pickle')

## END