In [1]:
#All libraries used have been added here
import pandas as pd
import numpy as np
import re

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from bs4 import BeautifulSoup

from functools import reduce

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score
from sklearn.pipeline import Pipeline
from datetime import datetime

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

from nltk.corpus import stopwords
nltk.download('stopwords')
import csv




[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/joyceooi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# HEADLINE NEWS (SENTIMENT ANALYSIS) FACEBOOK##


In [2]:
fb_headlines=pd.read_csv('./datasets/headlines/fb_headlines.csv')
google_headlines=pd.read_csv('./datasets/headlines/google_headlines.csv')
amazon_headlines=pd.read_csv('./datasets/headlines/amazon_headlines.csv')

In [3]:
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)

In [4]:
fb_headlines.head()

Unnamed: 0,Headline,Date
0,Internet stocks beat S&P 500 in 2019,12-31-2019
1,Equities at US Record Highs in 2019 Mark 'Pola...,12-31-2019
2,Sector Update: Solid Gains for Tech Stocks Mon...,12-31-2018
3,Sector Update: Tech Stocks Advance in Monday A...,12-31-2018
4,Analyst: Market needs FAANG to recover,12-31-2018


In [5]:
fb_headlines.shape

(5616, 2)

In [6]:
fb_headlines.isnull().sum()

Headline    0
Date        0
dtype: int64

In [7]:
fb_headlines.columns = [x.lower() for x in fb_headlines.columns]

In [8]:
fb_headlines['date'] = pd.to_datetime(fb_headlines['date'])

In [9]:
fb_headlines.dtypes

headline            object
date        datetime64[ns]
dtype: object

In [10]:
fb_headlines.sort_values(by='date', ascending=True)

Unnamed: 0,headline,date
425,"Facebook updates privacy policy, sets stage fo...",2014-11-13
385,Facebook developing website for professionals,2014-11-17
384,"LinkedIn, Jive, Salesforce lower following Fac...",2014-11-17
374,Facebook launches standalone Groups apps,2014-11-18
300,What did the billionaires do in Q3?,2014-11-25
...,...,...
2108,Thinking about trading options or stock in Ali...,2020-03-27
2111,10 Stocks To Buy With Low Debt And High Liquidity,2020-03-27
2070,Netflix Watches as Instagram Unites a Quaranti...,2020-03-28
2069,To the investors gobbling up stocks: The intra...,2020-03-28


**Separate the Headlines Column for Preprocessing**

In [11]:
fb_headlines2=fb_headlines[['headline']].copy()

In [12]:
fb_headlines2

Unnamed: 0,headline
0,Internet stocks beat S&P 500 in 2019
1,Equities at US Record Highs in 2019 Mark 'Pola...
2,Sector Update: Solid Gains for Tech Stocks Mon...
3,Sector Update: Tech Stocks Advance in Monday A...
4,Analyst: Market needs FAANG to recover
...,...
5611,Facebook to Announce First Quarter 2015 Results
5612,Facebook's Oculus buys two startups; Zuck open...
5613,Instagram tops 300M active users
5614,"Facebook overhauls search engine, brings Graph..."


### Preprocessing Using Tokenizing, Stop Words and Lemmatization/Stemming
---


In [13]:
fb_headlines2.headline = [x.lower() for x in fb_headlines.headline]

In [14]:
tokenizer = RegexpTokenizer(r'\w+') #instantiating the tokenizer

for i in range(len(['headline'])):#creating a function
    words = fb_headlines2.iloc[i,0]#selecting by integer-location based indexing(where i is looping through each row and 1 refers to the index [1] position of column)
    content_tokens = tokenizer.tokenize(words.lower())#running the tokenizer and replacing with lowercase
    cleaned_string = ' '.join(content_tokens)#joining back the output with a space " " in between
    fb_headlines2.iloc[i,0] = cleaned_string #renaming the output of the function

**We made the decision not to implement stopwords as this may dilute the sentiment analysis of the headlines (which are already quite brief)**

In [15]:
# for i in range(len(fb_headlines2['headline'])): #creating a function
#     words = fb_headlines2.iloc[i,0] #selecting by integer-location based indexing(where i is looping through each row and 1 refers to the index [1] position of column)
#     words = words.split() #splitting the words
#     meaningful_words = [w for w in words if not w in stopwords.words('english')]#assigning a variable to collect words which are not found in stop words.
#     print(meaningful_words)
#     cleaned_string = ' '.join(meaningful_words)#joining back the output with a space " " in between
#     fb_headlines2.iloc[i,0] = cleaned_string 

In [16]:
lemmatizer = WordNetLemmatizer() #instantiating the lemmatizer

for i in range(len(fb_headlines2['headline'])):#creating a function
    words = fb_headlines2.iloc[i,0]
    words = words.lower() # changing to lowercase    
    words = re.sub(r'\&\w*;', '', words) # Remove HTML special entities (e.g. &amp;)        
    words = re.sub(r'\s\s+', ' ', words) # Remove whitespace (including new line characters)        
    words = re.sub(r'https?:\/\/.*\/\w*', '', words) # Remove hyperlinks 
    list_words = words.split()
    content_lem = [lemmatizer.lemmatize(i) for i in list_words]
    cleaned_string = ' '.join(content_lem)
    fb_headlines2.iloc[i,0] = cleaned_string

In [17]:
for i in fb_headlines2['headline']:
    print(i)
    

internet stock beat s p 500 in 2019
equity at u record high in 2019 mark 'polar opposites' of beaten-down market a year, decade began, janney say
sector update: solid gain for tech stock monday not enough to reverse 15% sector drop this year
sector update: tech stock advance in monday afternoon trading
analyst: market need faang to recover
sector update: tech stock higher in pre-market trading monday
facebook's zuckerberg: company 'very different' than a year ago, altered to focus more on harm prevention
facebook add 1% in pre-market a government shutdown reportedly stall ftc probe
options: option implied volatility for social medium
market chatter: facebook relies on indian outsourcing to clean up it social medium platform
market chatter: facebook share slip, free basic sponsored-service suspended in egypt: reuters
u stock ease from record high a tech take a breather from strong 2019 run
facebook get new street-high target
market chatter: facebook reportedly get slapped with $1.6 mill

report: facebook stored user password unencrypted for year
facebook look to improve ai in wake of terror-attack video
mt newswires after-hours news mover
zuckerberg statement on cambridge analytica: 'i've been working to understand what happened, how to make sure it doesn't happen again'
facebook's zuckerberg: 'we have a responsibility to protect your data'
investor alert: brower piven encourages shareholder who have loss in excess of $100,000 from investment in facebook, inc. to contact brower piven before the lead plaintiff deadline in class action lawsuit
mid-day update: led by energy sector, stock turn positive into fomc announcement
facebook break decline, turning up 2.4% a analyst recap
shareholder alert - bronstein, gewirtz & grossman, llc notifies investor of class action against facebook, inc. (fb) & lead plaintiff deadline: may 21, 2018
wall street tip lower ahead of fed rate decision
sector update: technology stock mixed in pre-market; european union to unveil plan for a dig

facebook target uk growth with 1,000 hire this year
facebook launch facebook sport stadium
facebook launch sport platform; ad partner report strong q4 data
covered call alert for facebook, synaptics incorporated, abercrombie & fitch, inovio pharmaceutical and whiting petroleum released by investorsobserver
it’s time for netflix to consider ads, investor say
we're getting better at protecting elections: facebook's clegg
amazon executive challenge facebook’s clegg on user privacy
amazon twitch's viewership falls: googl, msft & fb gear up
digital tax in the spotlight this week
macron ha a plan to lure tech talent to france
facebook’s ‘failed’ libra cryptocurrency is no closer to release
digital giant in crosshairs of eu tax
analyst actions: facebook resumed at overweight by pacific crest with $150 pt
tech giant fall sharply a nasdaq tumble
increased acceptance of social networking service combined with higher gaming & social advertising budget likely to benefit global social platform
niel

chief operating officer sandberg sheryl exer. acquires $544.4k of fb on 3/3/16
facebook messenger top 800m active users; 2016 agenda discussed
growth in digital marketing popularity thanks to social medium
web giant cooperate on removal of extremist content
vice president fischer david b sell $4.5m of fb on 10/4/16-10/5/16
director koum jan sell $72.7m of fb on 10/4/16
facebook's oculus working toward mid-range vr
market chatter: facebook in talk with u official on possibly introducing high-speed internet app, free basics, to low-income, rural american
washington post: facebook pushing free basic in talk with u.s. government
instagram's story feature record 100m daily active viewer
investorsobserver release covered-call report for alibaba, conocophillips, facebook, huntington ingalls industry and suncor energy
facebook mark ten year of news feed
multi-billion dollar gig economy exploding with tech advancement fueling functionality & efficiency of latest user-friendly application
politi

In [18]:
fb_headlines2_copy=i
pd.DataFrame(fb_headlines2)
fb_headlines2

Unnamed: 0,headline
0,internet stock beat s p 500 in 2019
1,equity at u record high in 2019 mark 'polar op...
2,sector update: solid gain for tech stock monda...
3,sector update: tech stock advance in monday af...
4,analyst: market need faang to recover
...,...
5611,facebook to announce first quarter 2015 result
5612,facebook's oculus buy two startups; zuck open ...
5613,instagram top 300m active user
5614,"facebook overhaul search engine, brings graph ..."


In [19]:
fb_headline_date=fb_headlines[['date']]

In [20]:
fb_headlines[['date']]

Unnamed: 0,date
0,2019-12-31
1,2019-12-31
2,2018-12-31
3,2018-12-31
4,2018-12-31
...,...
5611,2015-04-01
5612,2014-12-11
5613,2014-12-10
5614,2014-12-08


In [21]:
 fb_headlines3= pd.concat([fb_headline_date, fb_headlines2], axis=1, join='inner')

In [22]:
 fb_headlines3.shape

(5616, 2)

In [23]:
fb_headlines3.set_index('date',inplace=True)

In [24]:
fb_headlines3.sort_values(by='date', ascending=True, inplace=True)

In [25]:
fb_headlines3.head()

Unnamed: 0_level_0,headline
date,Unnamed: 1_level_1
2014-11-13,"facebook update privacy policy, set stage for ..."
2014-11-17,facebook developing website for professional
2014-11-17,"linkedin, jive, salesforce lower following fac..."
2014-11-18,facebook launch standalone group apps
2014-11-25,what did the billionaire do in q3?


**The original dataset will be decomposed such that each headline contains a sentiment label, i.e. boolean whether the stock goes up or down.**

# SENTIMENT ANALYSIS USING VADAR (Valence Aware Dictionary and sEntiment Reasoner) (FACEBOOK) 

Vadar  was constructed using human raters from Amazon Mechanical Turk. We regard it as a reliable lexicon to extract emotional or sentiment polarity

We note the limitations of Vadar for this project.Vadar is sensitive to both the polarity and the intensity of sentiments and while some may view it more suited to analyzing social media texts rather than factual headlines which tends to lack emotional intensity, emotions and acronyms, we find it sufficiently applicable to sentiment analysis in our project.

Vadar combines a dictionary of lexical features to valence scores with a set of five heuristics. 
We left in heuristics like punctuation but removed capitalization as this could inaccurately increase the intensity of positive and negative words which we felt might not be relevant for factual headlines.

The Compound score is a metric that calculates the sum of all the lexicon ratings which have been normalized between -1(most extreme negative) and +1 (most extreme positive).

positive sentiment : (compound score >= 0.05)
neutral sentiment : (compound score > -0.05) and (compound score < 0.05)
negative sentiment : (compound score <= -0.05)


Reference: http://comp.social.gatech.edu/papers/icwsm14.vader.hutto.pdf


In [26]:
!pip install vaderSentiment



In [27]:
nltk.download('vader_lexicon')



[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/joyceooi/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [28]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

new_words = {
    'launch': 2.0,
    'developing': 2.0, 
    'breach': -2.4,
    'rally': 2.0,
    'selloff': -2.0,
    'roll out': 2.0,
    'hurt by':-2.0,
    'notable earnings':2.0,
    'unveils':2.0,
    'reveals':2.0,
    'raised':2.0,
    'buy': 2.0,
    'sell':-2.0,
    'up':2.0,
    'ups':2.0,
    'down':-2.0,
    'dividend':2.0,
    'acquires':2.0,
    'expansion':2.0,
    'invests':2.0,
    }
    
analyzer.lexicon.update(new_words)
    
fb_headlines3['compound'] = [analyzer.polarity_scores(v)['compound'] for v in fb_headlines3['headline']]
fb_headlines3['neg'] = [analyzer.polarity_scores(v)['neg'] for v in fb_headlines3['headline']]
fb_headlines3['neu'] = [analyzer.polarity_scores(v)['neu'] for v in fb_headlines3['headline']]
fb_headlines3['pos'] = [analyzer.polarity_scores(v)['pos'] for v in fb_headlines3['headline']]

fb_headlines3.head()
    


Unnamed: 0_level_0,headline,compound,neg,neu,pos
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2014-11-13,"facebook update privacy policy, set stage for ...",0.0,0.0,1.0,0.0
2014-11-17,facebook developing website for professional,0.4588,0.0,0.571,0.429
2014-11-17,"linkedin, jive, salesforce lower following fac...",-0.296,0.216,0.784,0.0
2014-11-18,facebook launch standalone group apps,0.4588,0.0,0.571,0.429
2014-11-25,what did the billionaire do in q3?,0.0,0.0,1.0,0.0


# Facebook Headlines Average Vadar Scores By Date

In [29]:
fb_headline_vadar=fb_headlines3.groupby('date').sum()

In [30]:
fb_headline_vadar.head()

Unnamed: 0_level_0,compound,neg,neu,pos
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014-11-13,0.0,0.0,1.0,0.0
2014-11-17,0.1628,0.216,1.355,0.429
2014-11-18,0.4588,0.0,0.571,0.429
2014-11-25,0.0,0.0,1.0,0.0
2014-11-26,0.4588,0.0,0.625,0.375


In [31]:
fb_headline_vadar.to_csv('./datasets/headlines/fb_headline_vadar.csv')