In [33]:
#All libraries used have been added here
import pandas as pd
import numpy as np
import re

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from bs4 import BeautifulSoup

from functools import reduce

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score
from sklearn.pipeline import Pipeline
from datetime import datetime

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

from nltk.corpus import stopwords
nltk.download('stopwords')
import csv




[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/joyceooi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# HEADLINE NEWS (SENTIMENT ANALYSIS) EQUINIX##


In [34]:
fb_headlines=pd.read_csv('./datasets/headlines/fb_headlines.csv')
google_headlines=pd.read_csv('./datasets/headlines/google_headlines.csv')
amazon_headlines=pd.read_csv('./datasets/headlines/amazon_headlines.csv')
equinix_headlines=pd.read_csv('./datasets/headlines/equinix_headlines.csv')

In [35]:
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)

In [36]:
equinix_headlines.head(100)

Unnamed: 0,headlines,date
0,BAML unveils its high conviction calls,01-02-19
1,MEDIA ALERT: Equinix to Speak at Upcoming Citi...,01-02-19
2,MEDIA ALERT: Equinix to Speak at Upcoming Citi...,01-03-18
3,Analyst Actions: Equinix Upgraded To Outperfor...,01-04-16
4,Equinix +0.9% as Stifel upgrades to Buy,01-05-16
5,Equinix Appoints Adaire Fox-Martin to Board of...,01-07-20
6,Equinix Invests $85 Million to Build Fourth Da...,01-07-19
7,Equinix invests $85M in Singapore data center,01-07-19
8,"Equinix CIO: Company about 'interconnection,' ...",01-07-16
9,This just in: REITs outperform,01-08-16


In [37]:
equinix_headlines.shape

(861, 2)

In [38]:
equinix_headlines.isnull().sum()

headlines    0
date         0
dtype: int64

In [39]:
equinix_headlines.columns = [x.lower() for x in equinix_headlines.columns]

In [40]:
equinix_headlines['date'] = pd.to_datetime(equinix_headlines['date'])

In [41]:
equinix_headlines.dtypes

headlines            object
date         datetime64[ns]
dtype: object

In [42]:
equinix_headlines.sort_values(by='date', ascending=True)

Unnamed: 0,headlines,date
46,Equinix buys cloud IT services firm,2014-01-20
783,"Equinix upsizes, prices debt offering",2014-11-18
853,Equnix takes out $1.5B credit facility,2014-12-18
858,Equinix to begin operating as a REIT on New Ye...,2014-12-23
61,Equinix reports Q4 earnings on February 19,2015-01-26
86,Data-center REIT bulls pick their favorites,2015-02-05
163,Notable earnings after Thursday’s close,2015-02-18
168,Equinix declares $1.69 dividend,2015-02-19
170,"Equinix turns positive after mixed results, li...",2015-02-20
220,Equinix +4.7%; BofA expects REIT index addition,2015-03-05


**Separate the Headlines Column for Preprocessing**

In [43]:
equinix_headlines2=equinix_headlines[['headlines']].copy()

In [44]:
equinix_headlines2

Unnamed: 0,headlines
0,BAML unveils its high conviction calls
1,MEDIA ALERT: Equinix to Speak at Upcoming Citi...
2,MEDIA ALERT: Equinix to Speak at Upcoming Citi...
3,Analyst Actions: Equinix Upgraded To Outperfor...
4,Equinix +0.9% as Stifel upgrades to Buy
5,Equinix Appoints Adaire Fox-Martin to Board of...
6,Equinix Invests $85 Million to Build Fourth Da...
7,Equinix invests $85M in Singapore data center
8,"Equinix CIO: Company about 'interconnection,' ..."
9,This just in: REITs outperform


### Preprocessing Using Tokenizing, Stop Words and Lemmatization/Stemming
---


In [45]:
equinix_headlines2.headline = [x.lower() for x in equinix_headlines.headlines]

  """Entry point for launching an IPython kernel.


In [46]:
tokenizer = RegexpTokenizer(r'\w+') #instantiating the tokenizer

for i in range(len(['headlines'])):#creating a function
    words = equinix_headlines2.iloc[i,0]#selecting by integer-location based indexing(where i is looping through each row and 1 refers to the index [1] position of column)
    content_tokens = tokenizer.tokenize(words.lower())#running the tokenizer and replacing with lowercase
    cleaned_string = ' '.join(content_tokens)#joining back the output with a space " " in between
    equinix_headlines2.iloc[i,0] = cleaned_string #renaming the output of the function

**We made the decision not to implement stopwords as this may dilute the sentiment analysis of the headlines (which are already quite brief)**

In [47]:
# for i in range(len(fb_headlines2['headline'])): #creating a function
#     words = fb_headlines2.iloc[i,0] #selecting by integer-location based indexing(where i is looping through each row and 1 refers to the index [1] position of column)
#     words = words.split() #splitting the words
#     meaningful_words = [w for w in words if not w in stopwords.words('english')]#assigning a variable to collect words which are not found in stop words.
#     print(meaningful_words)
#     cleaned_string = ' '.join(meaningful_words)#joining back the output with a space " " in between
#     fb_headlines2.iloc[i,0] = cleaned_string 

In [48]:
lemmatizer = WordNetLemmatizer() #instantiating the lemmatizer

for i in range(len(equinix_headlines2['headlines'])):#creating a function
    words = equinix_headlines2.iloc[i,0]
    words = words.lower() # changing to lowercase    
    words = re.sub(r'\&\w*;', '', words) # Remove HTML special entities (e.g. &amp;)        
    words = re.sub(r'\s\s+', ' ', words) # Remove whitespace (including new line characters)        
    words = re.sub(r'https?:\/\/.*\/\w*', '', words) # Remove hyperlinks 
    list_words = words.split()
    content_lem = [lemmatizer.lemmatize(i) for i in list_words]
    cleaned_string = ' '.join(content_lem)
    equinix_headlines2.iloc[i,0] = cleaned_string

In [49]:
for i in equinix_headlines2['headlines']:
    print(i)
    

baml unveils it high conviction call
medium alert: equinix to speak at upcoming citi 2019 tmt west conference
medium alert: equinix to speak at upcoming citi 2018 global tmt west conference
analyst actions: equinix upgraded to outperform at well fargo security
equinix +0.9% a stifel upgrade to buy
equinix appoints adaire fox-martin to board of director
equinix invests $85 million to build fourth data center in singapore
equinix invests $85m in singapore data center
equinix cio: company about 'interconnection,' moving beyond data center
this just in: reit outperform
analyst actions: berenberg bank initiate coverage on equinix with buy rating and $477 pt
equinix secures €1.0 billion of new term debt and reprices existing term loan
equinix completes us$175 million acquisition of three data center in mexico
bmo like hcp, ctre, dre in another volatile year for reit
equinix close $175m deal for mexico data center
s&p 500 movers: constellation brand slump 10%, lead losers; pg&e corp. add 8%
e

In [69]:
equinix_headlines2_copy=i
pd.DataFrame(equinix_headlines2)
equinix_headlines2

Unnamed: 0,headlines
0,baml unveils it high conviction call
1,medium alert: equinix to speak at upcoming cit...
2,medium alert: equinix to speak at upcoming cit...
3,analyst actions: equinix upgraded to outperfor...
4,equinix +0.9% a stifel upgrade to buy
5,equinix appoints adaire fox-martin to board of...
6,equinix invests $85 million to build fourth da...
7,equinix invests $85m in singapore data center
8,"equinix cio: company about 'interconnection,' ..."
9,this just in: reit outperform


In [70]:
equinix_headline_date=equinix_headlines[['date']]

In [71]:
equinix_headlines[['date']]

Unnamed: 0,date
0,2019-01-02
1,2019-01-02
2,2018-01-03
3,2016-01-04
4,2016-01-05
5,2020-01-07
6,2019-01-07
7,2019-01-07
8,2016-01-07
9,2016-01-08


In [72]:
equinix_headlines3= pd.concat([equinix_headline_date, equinix_headlines2], axis=1, join='inner')

In [73]:
equinix_headlines3.shape

(861, 2)

In [74]:
equinix_headlines3.set_index('date',inplace=True)

In [75]:
equinix_headlines3.sort_values(by='date', ascending=True, inplace=True)

In [76]:
equinix_headlines3.head()

Unnamed: 0_level_0,headlines
date,Unnamed: 1_level_1
2014-01-20,equinix buy cloud it service firm
2014-11-18,"equinix upsizes, price debt offering"
2014-12-18,equnix take out $1.5b credit facility
2014-12-23,equinix to begin operating a a reit on new year's
2015-01-26,equinix report q4 earnings on february 19


**The original dataset will be decomposed such that each headline contains a sentiment label, i.e. boolean whether the stock goes up or down.**

# SENTIMENT ANALYSIS USING VADAR (Valence Aware Dictionary and sEntiment Reasoner) (EQUINIX) 

Vadar  was constructed using human raters from Amazon Mechanical Turk. We regard it as a reliable lexicon to extract emotional or sentiment polarity

We note the limitations of Vadar for this project.Vadar is sensitive to both the polarity and the intensity of sentiments and while some may view it more suited to analyzing social media texts rather than factual headlines which tends to lack emotional intensity, emotions and acronyms, we find it sufficiently applicable to sentiment analysis in our project.

Vadar combines a dictionary of lexical features to valence scores with a set of five heuristics. 
We left in heuristics like punctuation but removed capitalization as this could inaccurately increase the intensity of positive and negative words which we felt might not be relevant for factual headlines.

The Compound score is a metric that calculates the sum of all the lexicon ratings which have been normalized between -1(most extreme negative) and +1 (most extreme positive).

positive sentiment : (compound score >= 0.05)
neutral sentiment : (compound score > -0.05) and (compound score < 0.05)
negative sentiment : (compound score <= -0.05)


Reference: http://comp.social.gatech.edu/papers/icwsm14.vader.hutto.pdf


In [77]:
!pip install vaderSentiment



In [78]:
nltk.download('vader_lexicon')



[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/joyceooi/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [79]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

new_words = {
    'launch': 2.0,
    'developing': 2.0, 
    'breach': -2.4,
    'rally': 2.0,
    'selloff': -2.0,
    'roll out': 2.0,
    'hurt by':-2.0,
    'notable earnings':2.0,
    'unveils':2.0,
    'reveals':2.0,
    'raised':2.0,
    'buy': 2.0,
    'sell':-2.0,
    'up':2.0,
    'ups':2.0,
    'down':-2.0,
    'dividend':2.0,
    'acquires':2.0,
    'expansion':2.0,
    'invests':2.0,
    }
    
analyzer.lexicon.update(new_words)
    
equinix_headlines3['compound'] = [analyzer.polarity_scores(v)['compound'] for v in equinix_headlines3['headlines']]
equinix_headlines3['neg'] = [analyzer.polarity_scores(v)['neg'] for v in equinix_headlines3['headlines']]
equinix_headlines3['neu'] = [analyzer.polarity_scores(v)['neu'] for v in equinix_headlines3['headlines']]
equinix_headlines3['pos'] = [analyzer.polarity_scores(v)['pos'] for v in equinix_headlines3['headlines']]

equinix_headlines3.head(1000)
    


Unnamed: 0_level_0,headlines,compound,neg,neu,pos
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2014-01-20,equinix buy cloud it service firm,0.4588,0.0,0.625,0.375
2014-11-18,"equinix upsizes, price debt offering",-0.3612,0.385,0.615,0.0
2014-12-18,equnix take out $1.5b credit facility,0.3818,0.0,0.658,0.342
2014-12-23,equinix to begin operating a a reit on new year's,0.0,0.0,1.0,0.0
2015-01-26,equinix report q4 earnings on february 19,0.0,0.0,1.0,0.0
2015-02-05,data-center reit bull pick their favorite,0.4588,0.0,0.625,0.375
2015-02-18,notable earnings after thursday’s close,0.0,0.0,1.0,0.0
2015-02-19,equinix declares $1.69 dividend,0.4588,0.0,0.5,0.5
2015-02-20,"equinix turn positive after mixed results, lig...",0.5574,0.0,0.66,0.34
2015-03-05,equinix +4.7%; bofa expects reit index addition,0.0,0.0,1.0,0.0


## Equinix Headlines Average Vadar Scores By Date

In [80]:
equinix_headline_vadar=equinix_headlines3.groupby('date').mean()

In [81]:
equinix_headline_vadar.head()

Unnamed: 0_level_0,compound,neg,neu,pos
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014-01-20,0.4588,0.0,0.625,0.375
2014-11-18,-0.3612,0.385,0.615,0.0
2014-12-18,0.3818,0.0,0.658,0.342
2014-12-23,0.0,0.0,1.0,0.0
2015-01-26,0.0,0.0,1.0,0.0


In [82]:
equinix_headline_vadar.to_csv('./datasets/headlines/equinix_headline_vadar.csv')