In [8]:
#All libraries used have been added here
import pandas as pd
import numpy as np
import re

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from bs4 import BeautifulSoup

from functools import reduce

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score
from sklearn.pipeline import Pipeline
from datetime import datetime

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

from nltk.corpus import stopwords
nltk.download('stopwords')
import csv




[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/joyceooi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# HEADLINE NEWS (SENTIMENT ANALYSIS) NETFLIX##


In [9]:
fb_headlines=pd.read_csv('./datasets/headlines/fb_headlines.csv')
google_headlines=pd.read_csv('./datasets/headlines/google_headlines.csv')
amazon_headlines=pd.read_csv('./datasets/headlines/amazon_headlines.csv')
equinix_headlines=pd.read_csv('./datasets/headlines/equinix_headlines.csv')
tsla_headlines=pd.read_csv('./datasets/headlines/tsla_headlines.csv')
apple_headlines=pd.read_csv('./datasets/headlines/apple_headlines.csv')
nflx_headlines=pd.read_csv('./datasets/headlines/netflix_headlines.csv')

In [10]:
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)

In [11]:
nflx_headlines.head(100)

Unnamed: 0,headlines,date
0,Netflix poaches CFO from Activision Blizzard -...,01-01-19
1,2015 leaders and laggards,01-01-16
2,Scorecard for the markets in 2018,01-01-19
3,How will the FAANGs perform in 2019?,01-01-19
4,Friends' bids farewell to Netflix,01-01-20
5,Netflix pulls episode critical of Saudi Arabia,01-02-19
6,Market Chatter: Netflix Reportedly Hires Activ...,01-02-19
7,OPTIONS: Large Tech Option Implied Volatility,01-02-19
8,Netflix -2% after SunTrust reels in high-flyin...,01-02-19
9,MTNewswires' Opening Bell Momentum Stocks: NFLX,01-02-19


In [12]:
nflx_headlines.shape

(10656, 2)

In [13]:
nflx_headlines.isnull().sum()

headlines       0
date         7104
dtype: int64

In [14]:
nflx_headlines.columns = [x.lower() for x in nflx_headlines.columns]

In [15]:
nflx_headlines['date'] = pd.to_datetime(nflx_headlines['date'])

In [16]:
nflx_headlines.dtypes

headlines            object
date         datetime64[ns]
dtype: object

In [17]:
nflx_headlines.sort_values(by='date', ascending=True)

Unnamed: 0,headlines,date
3250,Seismic changes coming for pay TV,2014-11-13
3288,"Netflix launching in Australia, New Zealand in...",2014-11-18
3289,Streaming services to be tracked by Nielsen,2014-11-18
3292,Netflix takeover speculation churned up,2014-11-19
3312,Netflix lands rights to Tina Fey show,2014-11-21
...,...,...
10651,11/18 21:41,NaT
10652,SA Breaking News,NaT
10653,11/18 18:06,NaT
10654,SA Breaking News,NaT


**Separate the Headlines Column for Preprocessing**

In [18]:
nflx_headlines2=nflx_headlines[['headlines']].copy()

In [19]:
nflx_headlines2

Unnamed: 0,headlines
0,Netflix poaches CFO from Activision Blizzard -...
1,2015 leaders and laggards
2,Scorecard for the markets in 2018
3,How will the FAANGs perform in 2019?
4,Friends' bids farewell to Netflix
...,...
10651,11/18 21:41
10652,SA Breaking News
10653,11/18 18:06
10654,SA Breaking News


### Preprocessing Using Tokenizing, Stop Words and Lemmatization/Stemming
---


In [20]:
nflx_headlines2.headline = [x.lower() for x in nflx_headlines.headlines]

  """Entry point for launching an IPython kernel.


In [21]:
tokenizer = RegexpTokenizer(r'\w+') #instantiating the tokenizer

for i in range(len(['headlines'])):#creating a function
    words = nflx_headlines2.iloc[i,0]#selecting by integer-location based indexing(where i is looping through each row and 1 refers to the index [1] position of column)
    content_tokens = tokenizer.tokenize(words.lower())#running the tokenizer and replacing with lowercase
    cleaned_string = ' '.join(content_tokens)#joining back the output with a space " " in between
    nflx_headlines2.iloc[i,0] = cleaned_string #renaming the output of the function

**We made the decision not to implement stopwords as this may dilute the sentiment analysis of the headlines (which are already quite brief)**

In [22]:
# for i in range(len(fb_headlines2['headline'])): #creating a function
#     words = fb_headlines2.iloc[i,0] #selecting by integer-location based indexing(where i is looping through each row and 1 refers to the index [1] position of column)
#     words = words.split() #splitting the words
#     meaningful_words = [w for w in words if not w in stopwords.words('english')]#assigning a variable to collect words which are not found in stop words.
#     print(meaningful_words)
#     cleaned_string = ' '.join(meaningful_words)#joining back the output with a space " " in between
#     fb_headlines2.iloc[i,0] = cleaned_string 

In [25]:
lemmatizer = WordNetLemmatizer() #instantiating the lemmatizer

for i in range(len(nflx_headlines2['headlines'])):#creating a function
    words = nflx_headlines2.iloc[i,0]
    words = words.lower() # changing to lowercase    
    words = re.sub(r'\&\w*;', '', words) # Remove HTML special entities (e.g. &amp;)        
    words = re.sub(r'\s\s+', ' ', words) # Remove whitespace (including new line characters)        
    words = re.sub(r'https?:\/\/.*\/\w*', '', words) # Remove hyperlinks 
    list_words = words.split()
    content_lem = [lemmatizer.lemmatize(i) for i in list_words]
    cleaned_string = ' '.join(content_lem)
    nflx_headlines2.iloc[i,0] = cleaned_string

In [26]:
for i in nflx_headlines2['headlines']:
    print(i)
    

netflix poaches cfo from activision blizzard reuters
2015 leader and laggard
scorecard for the market in 2018
how will the faangs perform in 2019?
friends' bid farewell to netflix
netflix pull episode critical of saudi arabia
market chatter: netflix reportedly hire activision blizzard's spencer neumann a cfo
options: large tech option implied volatility
netflix -2% after suntrust reel in high-flying price target
mtnewswires' opening bell momentum stocks: nflx
sector update: consumer
wall street set to open in the red on weak chinese data
netflix ride macquarie confidence higher
update: activision blizzard name dennis durkin cfo, succeeding spencer neumann
sector update: consumer stock retreat pre-market
thinking about buying stock in apple, biopharmx, china ceramic co., netflix or weatherford international?
thinking about investing in apple, alibaba, netflix, nvidia and tesla motor in the new year?
--analyst actions: macquarie upgrade netflix to outperform from neutral, raise pt to $22

netflix join hand with spark new zealand to offer first kiwi bundled deal
the zacks analyst blog highlights: disney, netflix, at&t, apple and amazon
netflix
5 coronavirus-proof stock to counter market meltdown
netflix win an oscar for documentary
for some company the impact of the coronavirus aren’t all bad
netflix content trend draw raymond james praise
dow jones pares loss after 950-point plunge; fang stock jumps, square surge
mkm's 'stay at home' stock built for virus time
netflix ceo talk about vr, ai and 100m sub
wsj: youtube view at over 1b hour per day
netflix, roku, video game stock seen safe from coronavirus impact
amd, apple, intel lead silicon valley stock sell-off a coronavirus fear in u.s. grow
will domino's, zoom video, netflix stock see bump in coronavirus stock market correction?
market chatter: india eye chunk of $100 billion in global tax from google, facebook, amazon, netflix
netflix stock fall 3%
how much are streamer losing from mooching?
the zacks analyst blog hig

deadline reminder: the law office of howard g. smith reminds investor of looming deadline in the class action lawsuit against netflix, inc.
shareholder alert: pomerantz law firm reminds shareholder with loss on their investment netflix, inc. of class action lawsuit and upcoming deadline – nflx
netflix would spend $20m/hour for right content
class action update for nflx and gnln: levi & korsinsky, llp reminds investor of class action on behalf of shareholder
netflix close to cash-flow positive - share firm
rosen, a globally recognized law firm, reminds netflix, inc. investor of important september 20th deadline in security class action first filed by the firm – nflx
netflix dip on macquarie downgrade
thinking about buying stock in apple, advanced micro devices, netflix, nike, or nvidia?
filing deadline--kuznicki law pllc announces class action on behalf of shareholder of rbgly, nflx, egbn and val
--analyst actions: macquarie lower netflix to underperform v neutral - share slide 2%
thr a

03/19 13:50
yahoo
03/19 11:58
yahoo
03/19 9:56
sa breaking news
03/19 9:49
yahoo
03/19 8:16
yahoo
03/19 8:00
yahoo
03/19 6:18
yahoo
03/19 2:00
yahoo
03/18 20:00
sa breaking news
03/18 18:43
yahoo
03/18 18:12
yahoo
03/18 17:57
yahoo
03/18 17:45
yahoo
03/18 15:46
yahoo
03/18 13:52
yahoo
03/18 12:58
yahoo
03/18 12:20
yahoo
03/18 12:13
yahoo
03/18 11:34
yahoo
03/18 10:56
sa breaking news
03/18 10:37
yahoo
03/18 7:57
yahoo
03/18 6:00
yahoo
03/18 5:33
yahoo
03/17 16:34
yahoo
03/17 16:22
yahoo
03/17 16:14
yahoo
03/17 16:02
yahoo
03/17 14:07
yahoo
03/17 12:53
yahoo
03/17 12:52
yahoo
03/17 11:46
sa breaking news
03/17 9:59
yahoo
03/17 9:00
yahoo
03/17 8:30
yahoo
03/16 18:39
yahoo
03/16 17:08
yahoo
03/16 17:02
yahoo
03/16 15:45
yahoo
03/16 13:27
yahoo
03/16 11:36
yahoo
03/16 11:30
yahoo
03/16 11:24
yahoo
03/16 10:24
yahoo
03/16 10:01
yahoo
03/16 10:00
pr newswire
03/16 9:31
yahoo
03/16 8:45
yahoo
03/16 5:32
yahoo
03/15 17:01
yahoo
03/14 13:28
yahoo
03/14 9:48
yahoo
03/13 16:15
sa breaking news
0

12/03 8:05
mt newswires
12/02 15:45
sa breaking news
12/02 13:44
mt newswires
12/02 13:17
mt newswires
11/29 4:42
mt newswires
11/27 16:24
sa breaking news
11/27 9:20
mt newswires
11/26 10:16
pr newswire
11/26 9:31
sa breaking news
11/26 8:23
sa breaking news
11/25 9:38
mt newswires
11/25 8:31
mt newswires
11/21 14:47
sa breaking news
11/21 14:04
mt newswires
11/21 13:53
mt newswires
11/21 13:49
sa breaking news
11/19 11:54
sa breaking news
11/19 11:02
mt newswires
11/18 16:43
mt newswires
11/18 13:07
mt newswires
11/15 15:10
sa breaking news
11/15 11:49
sa breaking news
11/15 8:39
mt newswires
11/14 16:37
mt newswires
11/14 14:25
sa breaking news
11/14 10:55
mt newswires
11/14 6:30
sa breaking news
11/14 4:44
sa breaking news
11/13 17:04
sa breaking news
11/13 12:22
sa breaking news
11/12 14:04
sa breaking news
11/12 10:19
sa breaking news
11/12 4:44
pr newswire
11/08 9:31
sa breaking news
11/07 11:22
sa breaking news
11/07 10:24
sa breaking news
11/07 10:09
sa breaking news
11/01 16:

01/02 14:45
mt newswires
01/02 14:34
mt newswires
01/02 13:19
mt newswires
01/02 12:45
mt newswires
01/02 12:34
sa breaking news
01/02 12:25
mt newswires
01/02 12:00
pr newswire
01/02 12:00
mt newswires
01/02 11:06
pr newswire
01/02 9:31
mt newswires
01/02 9:04
mt newswires
01/02 9:04
mt newswires
01/02 8:45
mt newswires
01/02 8:37
mt newswires
01/02 8:35
sa breaking news
01/02 7:48
mt newswires
01/02 5:42
mt newswires
01/02 4:29
sa breaking news
01/02 2:44
sa breaking news
01/01 5:32
sa breaking news
01/01 5:12
sa breaking news
01/01 3:34
sa breaking news
12/31 22:28
sa breaking news
12/31 13:56
mt newswires
12/31 12:06
mt newswires
12/31 10:26
mt newswires
12/31 9:13
sa breaking news
12/31 8:57
mt newswires
12/31 6:29
sa breaking news
12/28 16:22
mt newswires
12/28 12:59
mt newswires
12/28 12:35
mt newswires
12/28 9:09
mt newswires
12/28 5:45
sa breaking news
12/28 4:44
pr newswire
12/26 9:31
mt newswires
12/24 9:35
sa breaking news
12/24 8:38
mt newswires
12/21 16:17
mt newswires
12

11/29 16:05
mt newswires
11/29 12:47
pr newswire
11/29 9:31
globe newswire
11/28 8:05
sa breaking news
11/21 12:10
pr newswire
11/20 9:31
sa breaking news
11/16 3:05
sa breaking news
11/15 7:49
sa breaking news
11/10 12:05
pr newswire
11/10 9:31
sa breaking news
11/09 3:33
sa breaking news
11/02 15:48
sa breaking news
10/31 14:01
pr newswire
10/31 8:45
sa breaking news
10/31 5:46
mt newswires
10/31 4:31
pr newswire
10/27 9:31
pr newswire
10/26 21:17
mt newswires
10/24 5:25
sa breaking news
10/23 19:33
pr newswire
10/23 18:58
sa breaking news
10/23 12:22
mt newswires
10/23 9:03
pr newswire
10/23 8:00
sa breaking news
10/19 10:32
pr newswire
10/19 6:30
mt newswires
10/17 17:00
mt newswires
10/17 16:57
mt newswires
10/17 12:43
mt newswires
10/17 11:58
mt newswires
10/17 11:10
sa breaking news
10/17 9:11
mt newswires
10/17 8:30
mt newswires
10/17 5:07
mt newswires
10/17 4:39
mt newswires
10/16 17:56
mt newswires
10/16 17:54
mt newswires
10/16 17:41
mt newswires
10/16 17:41
mt newswires
10/

sa breaking news
01/06 13:05
pr newswire
01/06 12:50
mt newswires
01/06 12:15
mt newswires
01/05 14:03
mt newswires
01/05 10:48
sa breaking news
01/05 10:15
sa breaking news
01/04 15:30
mt newswires
01/04 12:07
mt newswires
01/04 8:31
sa breaking news
01/04 8:30
sa breaking news
01/04 7:59
sa breaking news
01/04 7:55
mt newswires
01/04 6:16
sa breaking news
01/01 4:53
sa breaking news
12/31 9:32
mt newswires
12/31 6:30
mt newswires
12/29 15:39
pr newswire
12/29 9:31
mt newswires
12/28 7:53
mt newswires
12/28 7:02
mt newswires
12/24 21:32
sa breaking news
12/24 8:34
mt newswires
12/24 8:26
mt newswires
12/24 8:25
mt newswires
12/24 8:04
mt newswires
12/23 14:18
mt newswires
12/22 18:18
sa breaking news
12/22 13:26
mt newswires
12/22 8:25
mt newswires
12/22 6:48
sa breaking news
12/21 16:12
pr newswire
12/21 16:01
sa breaking news
12/17 14:35
sa breaking news
12/16 7:52
pr newswire
12/15 16:01
pr newswire
12/15 9:31
mt newswires
12/11 22:14
sa breaking news
12/10 9:28
sa breaking news
12

In [27]:
nflx_headlines2_copy=i
pd.DataFrame(nflx_headlines2)
nflx_headlines2

Unnamed: 0,headlines
0,netflix poaches cfo from activision blizzard r...
1,2015 leader and laggard
2,scorecard for the market in 2018
3,how will the faangs perform in 2019?
4,friends' bid farewell to netflix
...,...
10651,11/18 21:41
10652,sa breaking news
10653,11/18 18:06
10654,sa breaking news


In [28]:
nflx_headline_date=nflx_headlines[['date']]

In [29]:
nflx_headlines[['date']]

Unnamed: 0,date
0,2019-01-01
1,2016-01-01
2,2019-01-01
3,2019-01-01
4,2020-01-01
...,...
10651,NaT
10652,NaT
10653,NaT
10654,NaT


In [30]:
nflx_headlines3= pd.concat([nflx_headline_date, nflx_headlines2], axis=1, join='inner')

In [31]:
nflx_headlines3.shape

(10656, 2)

In [32]:
nflx_headlines3.set_index('date',inplace=True)

In [33]:
nflx_headlines3.sort_values(by='date', ascending=True, inplace=True)

In [34]:
nflx_headlines3.head()

Unnamed: 0_level_0,headlines
date,Unnamed: 1_level_1
2014-11-13,seismic change coming for pay tv
2014-11-18,"netflix launching in australia, new zealand in..."
2014-11-18,streaming service to be tracked by nielsen
2014-11-19,netflix takeover speculation churned up
2014-11-21,netflix land right to tina fey show


**The original dataset will be decomposed such that each headline contains a sentiment label, i.e. boolean whether the stock goes up or down.**

# SENTIMENT ANALYSIS USING VADAR (Valence Aware Dictionary and sEntiment Reasoner) (NETFLIX) 

Vadar  was constructed using human raters from Amazon Mechanical Turk. We regard it as a reliable lexicon to extract emotional or sentiment polarity

We note the limitations of Vadar for this project.Vadar is sensitive to both the polarity and the intensity of sentiments and while some may view it more suited to analyzing social media texts rather than factual headlines which tends to lack emotional intensity, emotions and acronyms, we find it sufficiently applicable to sentiment analysis in our project.

Vadar combines a dictionary of lexical features to valence scores with a set of five heuristics. 
We left in heuristics like punctuation but removed capitalization as this could inaccurately increase the intensity of positive and negative words which we felt might not be relevant for factual headlines.

The Compound score is a metric that calculates the sum of all the lexicon ratings which have been normalized between -1(most extreme negative) and +1 (most extreme positive).

positive sentiment : (compound score >= 0.05)
neutral sentiment : (compound score > -0.05) and (compound score < 0.05)
negative sentiment : (compound score <= -0.05)


Reference: http://comp.social.gatech.edu/papers/icwsm14.vader.hutto.pdf


In [35]:
!pip install vaderSentiment



In [36]:
nltk.download('vader_lexicon')



[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/joyceooi/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [43]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

new_words = {
    'launch': 2.0,
    'developing': 2.0, 
    'breach': -2.4,
    'rally': 2.0,
    'selloff': -2.0,
    'roll out': 2.0,
    'hurt by':-2.0,
    'notable earnings':2.0,
    'unveils':2.0,
    'reveals':2.0,
    'raised':2.0,
    'buy': 2.0,
    'sell':-2.0,
    'up':2.0,
    'ups':2.0,
    'down':-2.0,
    'dividend':2.0,
    'acquires':2.0,
    'expansion':2.0,
    'invests':2.0,
    }
    
analyzer.lexicon.update(new_words)
    
nflx_headlines3['compound'] = [analyzer.polarity_scores(v)['compound'] for v in nflx_headlines3['headlines']]
nflx_headlines3['neg'] = [analyzer.polarity_scores(v)['neg'] for v in nflx_headlines3['headlines']]
nflx_headlines3['neu'] = [analyzer.polarity_scores(v)['neu'] for v in nflx_headlines3['headlines']]
nflx_headlines3['pos'] = [analyzer.polarity_scores(v)['pos'] for v in nflx_headlines3['headlines']]

nflx_headlines3.head(1000)
    


Unnamed: 0_level_0,headlines,compound,neg,neu,pos
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2014-11-13,seismic change coming for pay tv,-0.1027,0.219,0.781,0.0
2014-11-18,"netflix launching in australia, new zealand in...",0.0,0.0,1.0,0.0
2014-11-18,streaming service to be tracked by nielsen,0.0,0.0,1.0,0.0
2014-11-19,netflix takeover speculation churned up,0.4588,0.0,0.571,0.429
2014-11-21,netflix land right to tina fey show,0.0,0.0,1.0,0.0
2014-11-21,amazon reportedly working on ad-supported stre...,0.1779,0.0,0.779,0.221
2014-11-24,netflix sued in france over customer tems,0.0,0.0,1.0,0.0
2014-11-25,what did the billionaire do in q3?,0.0,0.0,1.0,0.0
2014-11-25,netflix -2.2% on stifel downgrade,0.0,0.0,1.0,0.0
2014-11-26,netflix sue yahoo cio mike kail for alleged ki...,0.0,0.0,1.0,0.0


## Netflix Headlines Average Vadar Scores By Date

In [44]:
nflx_headline_vadar=nflx_headlines3.groupby('date').mean()

In [45]:
nflx_headline_vadar.head()

Unnamed: 0_level_0,compound,neg,neu,pos
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014-11-13,-0.1027,0.219,0.781,0.0
2014-11-18,0.0,0.0,1.0,0.0
2014-11-19,0.4588,0.0,0.571,0.429
2014-11-21,0.08895,0.0,0.8895,0.1105
2014-11-24,0.0,0.0,1.0,0.0


In [46]:
nflx_headline_vadar.to_csv('./datasets/headlines/nflx_headline_vadar.csv')