### Setting up PySpark in standalone mode on Windows

In [2]:
# Importing libraries and setting the environment paths PySpark needs
import os
import sys

# Root of the local Spark installation.
# Raw string: in a plain literal "\s" is an invalid escape sequence that only
# works by accident (and warns on newer Python versions); r"..." keeps the
# backslash literally and yields exactly the same value.
sparkPath = r"C:\spark-2.0.2-bin-hadoop2.7"

os.environ['SPARK_HOME'] = sparkPath

# Locations appended to the module search path so that `import pyspark` and
# the bundled py4j can be resolved from the Spark installation.
_sparkSubPaths = (
    '/bin',
    '/python',
    '/python/pyspark',
    '/python/pyspark/lib',
    '/python/pyspark/lib/pyspark.zip',
    '/python/pyspark/lib/py4j-0.10.3-src.zip',
)

# NOTE(review): sys.path only influences Python imports, so adding the Java
# bin directory here does not help Spark locate Java; PATH / JAVA_HOME is
# presumably what was intended -- confirm. Kept to preserve the original setup.
_javaBinPath = "C:/Program Files (x86)/Java/jre1.8.0_111/bin"

for _entry in [sparkPath + _sub for _sub in _sparkSubPaths] + [_javaBinPath]:
    # Guard against duplicates so re-running this cell does not keep growing sys.path.
    if _entry not in sys.path:
        sys.path.append(_entry)

In [3]:
import pyspark

In [4]:
from pyspark import SparkContext

In [5]:
# getOrCreate() returns the SparkContext already running in this kernel if
# there is one, so re-running this cell no longer fails with
# "Cannot run multiple SparkContexts at once"; on a fresh kernel it creates
# a new context with the default configuration, as SparkContext() did.
sc = SparkContext.getOrCreate()

In [6]:
#!pip install vaderSentiment

### Importing VaderSentiment (a library for extracting sentiment from text that takes word order, context and punctuation into account)

In [7]:
import pandas as pd
import sys
# Keep a reference to the notebook's stdout so it can be restored after the
# imports below.
# NOTE(review): presumably one of these imports rebinds sys.stdout in some
# versions, which would silence cell output -- confirm whether this is still needed.
nb_stdout = sys.stdout
# Older function-style API, no longer imported:
#from vaderSentiment.vaderSentiment import sentiment as vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import time
import datetime
# Restore the stdout saved above.
sys.stdout = nb_stdout

### Defining a function to decompose sentiment into positive, negative and neutral parts as well as extract the overall sentiment.

In [8]:
def senti(para):
    """Average the VADER sentiment scores over the sentences of a paragraph.

    The paragraph is split into sentences, each sentence is scored once with
    the notebook-level ``analyzer`` (a ``SentimentIntensityAnalyzer``), and the
    per-sentence scores are averaged component by component.

    Parameters
    ----------
    para : str
        Text to analyse.

    Returns
    -------
    list
        ``[neg_mean, neu_mean, pos_mean, compound_mean]`` as numpy floats.
        If the tokenizer yields no sentences, each entry is ``nan`` (numpy's
        mean of an empty array).

    Notes
    -----
    * ``analyzer`` is looked up at call time, so it must be defined before
      this function is called (it is created in the cell after this one).
    * ``tokenize`` is not imported anywhere in this notebook.
      NOTE(review): presumably ``from nltk import tokenize`` is intended --
      confirm and add it to the import cell.
    """
    # Local import: numpy is not imported elsewhere in this notebook.
    import numpy as np

    sentences = tokenize.sent_tokenize(para)

    # Score every sentence exactly once and reuse the resulting dict for all
    # four components (previously each sentence was scored four times, through
    # a `vaderSentiment` function whose import is commented out).
    scores = [analyzer.polarity_scores(sentence) for sentence in sentences]

    return [
        np.array([score[component] for score in scores]).mean()
        for component in ('neg', 'neu', 'pos', 'compound')
    ]

In [9]:
# One shared VADER analyzer; its polarity_scores() is called on each record's
# title field inside the RDD map further down.
analyzer = SentimentIntensityAnalyzer()

### Loading the data as RDD

In [10]:
# Local file URI of the news corpus.
newsCorporaPath = "file:///C:/Users/Sahil Gupta/Google Drive/Fall/Big Data/Big Data Project/NewsAggregatorDataset/newsCorpora.csv"

# Read the file line by line, then split every line on tabs so that each
# record becomes a list of string fields.
rawLinesRDD = sc.textFile(newsCorporaPath)
originalRDD = rawLinesRDD.map(lambda row: row.split("\t"))

In [12]:
# Peek at the first parsed record to check that the tab split produced the
# expected eight fields.
originalRDD.first()

[u'1',
 u'Fed official says weak data caused by weather, should not slow taper',
 u'http://www.latimes.com/business/money/la-fi-mo-federal-reserve-plosser-stimulus-economy-20140310,0,1312750.story\\?track=rss',
 u'Los Angeles Times',
 u'b',
 u'ddUyU0VZz0BRneMioxUPQVP6sIxvM',
 u'www.latimes.com',
 u'1394470370698']

In [13]:
# Reduce every record to the five values used downstream:
#   (field 0, title, category code, VADER compound score of the title, date)
# Field 7 is treated as an epoch timestamp in milliseconds: it is divided by
# 1000 and formatted as a UTC calendar date (YYYY-MM-DD).
consideredRDD = originalRDD.map(
    lambda row: (
        row[0],
        row[1],
        row[4],
        analyzer.polarity_scores(row[1])['compound'],
        time.strftime('%Y-%m-%d', time.gmtime(int(row[7]) / 1000.0)),
    )
)

### Filtering only the business articles. The following chunk of code was run for each of the four categories.

In [12]:
# Keep only the records whose category code (third element) is 'b'.
businessRDD = consideredRDD.filter(
    lambda record: record[2] == 'b'
)

In [13]:
# Sanity check: the first surviving record should carry category code 'b'.
businessRDD.first()

(u'1',
 u'Fed official says weak data caused by weather, should not slow taper',
 u'b',
 -0.4404,
 '2014-03-10')

### Calculating the aggregated sentiment by dates

In [14]:
# Sum of compound scores per date: emit (date, score) pairs, then add up the
# scores that share a date.
aggrSentiRDD = (
    businessRDD
    .map(lambda record: (record[4], record[3]))
    .reduceByKey(lambda total, score: total + score)
)

In [15]:
# Number of articles per date: emit (date, 1) for every record, then add up
# the ones that share a date.
countSentiRDD = (
    businessRDD
    .map(lambda record: (record[4], 1))
    .reduceByKey(lambda running, one: running + one)
)

In [16]:
# Pair each date's score total with its article count; joined records have
# the shape (date, (score_sum, article_count)).
busineSentiRDD = aggrSentiRDD.join(countSentiRDD)

In [17]:
# Inspect one joined record: (date, (score_sum, article_count)).
busineSentiRDD.first()

('2014-03-11', (-14.769699999999977, 1117))

In [18]:
# Average compound score per date: score_sum / article_count.
busineSentiRDDFinal = busineSentiRDD.map(
    lambda entry: (entry[0], entry[1][0] / entry[1][1])
)

In [19]:
# Preview a few (date, average compound score) pairs; the RDD is not sorted
# by date at this point.
busineSentiRDDFinal.take(5)

[('2014-03-11', -0.013222649955237222),
 ('2014-03-18', -0.022615942028985387),
 ('2014-06-16', -0.1123684663536776),
 ('2014-04-17', 0.03348704177323086),
 ('2014-04-12', -0.025745797280593334)]

### Converting the RDD to a dataframe

In [20]:
from pyspark.sql import SQLContext
# SQL entry point built on the existing SparkContext; used below to turn the
# RDD of (date, average) pairs into a Spark DataFrame.
sqlContext = SQLContext(sc)

In [21]:
# No schema is passed, so the two columns get the default names _1 and _2
# (they are renamed after the conversion to pandas).
busineSentiDF = sqlContext.createDataFrame(busineSentiRDDFinal)

In [22]:
# Collect the per-date averages into pandas, give the columns readable names,
# and group by date so that 'Date' becomes the index of the result.
busineSentiDataFrame = (
    busineSentiDF
    .toPandas()
    .rename(columns={'_1': 'Date', '_2': 'Overall Sentiment'})
    .groupby('Date')
    .mean()
)

In [23]:
from IPython.display import display, HTML

In [24]:
# Render the date-indexed average sentiment table with IPython's rich display.
display(busineSentiDataFrame)

Unnamed: 0_level_0,Overall Sentiment
Date,Unnamed: 1_level_1
2014-03-10,-0.112655
2014-03-11,-0.013223
2014-03-13,-0.041790
2014-03-17,0.004762
2014-03-18,-0.022616
2014-03-19,-0.020367
2014-03-20,0.026327
2014-03-22,-0.144964
2014-03-23,0.051618
2014-03-24,0.016070


### Plotting the sentiment as a time series in Plotly

In [25]:
import os

import plotly
import plotly.plotly as py
import plotly.graph_objs as go

# Plotly credentials are read from the environment instead of being hardcoded
# in the notebook. Set PLOTLY_USERNAME and PLOTLY_API_KEY before starting the
# kernel. An API key that was previously committed in this cell should be
# treated as leaked and regenerated in the Plotly account settings.
plotlyUsername = os.environ.get('PLOTLY_USERNAME')
plotlyApiKey = os.environ.get('PLOTLY_API_KEY')
if plotlyUsername and plotlyApiKey:
    plotly.tools.set_credentials_file(username=plotlyUsername, api_key=plotlyApiKey)
# NOTE(review): when the variables are not set, plotly presumably falls back
# to a credentials file saved by an earlier set_credentials_file call -- confirm.

# Font shared by both axis titles; each axis receives its own copy.
axisTitleFont = dict(
    family='Courier New, monospace',
    size=18,
    color='#7f7f7f'
)

layout = go.Layout(
    title='Business Sentiment over Time',
    xaxis=dict(
        title='Date',
        titlefont=dict(axisTitleFont)
    ),
    yaxis=dict(
        title='Overall Sentiment',
        titlefont=dict(axisTitleFont)
    )
)

# One line trace: dates (the DataFrame index) against the average sentiment.
data = [go.Scatter(
    x=list(busineSentiDataFrame.index),
    y=list(busineSentiDataFrame['Overall Sentiment'].values),
    line=dict(color='darkgreen')
)]

fig = go.Figure(data=data, layout=layout)
# The filename is kept unchanged so the figure is saved under the same name
# in the Plotly account as before.
py.iplot(fig, filename='axes-booleans')