In [1]:
#All libraries used have been added here
import pandas as pd
import numpy as np
import re

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from bs4 import BeautifulSoup

from functools import reduce

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score
from sklearn.pipeline import Pipeline
from datetime import datetime

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

from nltk.corpus import stopwords
nltk.download('stopwords')
import csv




[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/joyceooi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# HEADLINE NEWS (SENTIMENT ANALYSIS) GOOGLE##


In [2]:
fb_headlines=pd.read_csv('./datasets/headlines/fb_headlines.csv')
google_headlines=pd.read_csv('./datasets/headlines/google_headlines.csv')
amazon_headlines=pd.read_csv('./datasets/headlines/amazon_headlines.csv')

In [3]:
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)

In [4]:
google_headlines.head(100)

Unnamed: 0,Headlines,Date
0,"ÔHonestly, we donÕt know what to expectÕ: Meg ...",04-05-2020
1,Zoom Video lurches from boom to backlash amid ...,04-05-2020
2,Are Lockdowns Working? Google Offers Location ...,04-05-2020
3,AmericaÕs housing market is showing the first ...,04-04-2020
4,Two. Trillion. Dollars? HereÕs where all that ...,04-04-2020
5,20 technology stocks with low debt to consider...,04-04-2020
6,"Zoom green screens, but not pants: Here are th...",04-04-2020
7,Everyone Is Worried About the Internet. So Far...,04-03-2020
8,Google and Facebook CanÕt Save the Advertising...,04-03-2020
9,New York CityÕs Economy Is in the Crucible of ...,04-03-2020


In [5]:
google_headlines.shape

(5736, 2)

In [6]:
google_headlines.isnull().sum()

Headlines    0
Date         0
dtype: int64

In [7]:
google_headlines.columns = [x.lower() for x in google_headlines.columns]

In [8]:
google_headlines['date'] = pd.to_datetime(google_headlines['date'])

In [9]:
google_headlines.dtypes

headlines            object
date         datetime64[ns]
dtype: object

In [10]:
google_headlines.sort_values(by='date', ascending=True)

Unnamed: 0,headlines,date
5735,Google's massive DoubleClick ad server goes down,2014-11-12
5734,YouTube paid music service to see test launch ...,2014-11-12
5733,Google now reportedly aiming for 2015 Glass la...,2014-11-14
5732,Report: Google Play to finally arrive in China,2014-11-19
5731,Yahoo/Bing displaces Google as Mozilla's U.S. ...,2014-11-19
...,...,...
4,Two. Trillion. Dollars? HereÕs where all that ...,2020-04-04
3,AmericaÕs housing market is showing the first ...,2020-04-04
2,Are Lockdowns Working? Google Offers Location ...,2020-04-05
1,Zoom Video lurches from boom to backlash amid ...,2020-04-05


**Separate the Headlines Column for Preprocessing**

In [11]:
google_headlines2=google_headlines[['headlines']].copy()

In [12]:
google_headlines2

Unnamed: 0,headlines
0,"ÔHonestly, we donÕt know what to expectÕ: Meg ..."
1,Zoom Video lurches from boom to backlash amid ...
2,Are Lockdowns Working? Google Offers Location ...
3,AmericaÕs housing market is showing the first ...
4,Two. Trillion. Dollars? HereÕs where all that ...
...,...
5731,Yahoo/Bing displaces Google as Mozilla's U.S. ...
5732,Report: Google Play to finally arrive in China
5733,Google now reportedly aiming for 2015 Glass la...
5734,YouTube paid music service to see test launch ...


### Preprocessing Using Tokenizing, Stop Words and Lemmatization/Stemming
---


In [13]:
google_headlines2.headline = [x.lower() for x in google_headlines.headlines]

  """Entry point for launching an IPython kernel.


In [14]:
tokenizer = RegexpTokenizer(r'\w+') #instantiating the tokenizer

for i in range(len(['headlines'])):#creating a function
    words = google_headlines2.iloc[i,0]#selecting by integer-location based indexing(where i is looping through each row and 1 refers to the index [1] position of column)
    content_tokens = tokenizer.tokenize(words.lower())#running the tokenizer and replacing with lowercase
    cleaned_string = ' '.join(content_tokens)#joining back the output with a space " " in between
    google_headlines2.iloc[i,0] = cleaned_string #renaming the output of the function

**We made the decision not to implement stopwords as this may dilute the sentiment analysis of the headlines (which are already quite brief)**

In [15]:
# for i in range(len(fb_headlines2['headline'])): #creating a function
#     words = fb_headlines2.iloc[i,0] #selecting by integer-location based indexing(where i is looping through each row and 1 refers to the index [1] position of column)
#     words = words.split() #splitting the words
#     meaningful_words = [w for w in words if not w in stopwords.words('english')]#assigning a variable to collect words which are not found in stop words.
#     print(meaningful_words)
#     cleaned_string = ' '.join(meaningful_words)#joining back the output with a space " " in between
#     fb_headlines2.iloc[i,0] = cleaned_string 

In [16]:
lemmatizer = WordNetLemmatizer() #instantiating the lemmatizer

for i in range(len(google_headlines2['headlines'])):#creating a function
    words = google_headlines2.iloc[i,0]
    words = words.lower() # changing to lowercase    
    words = re.sub(r'\&\w*;', '', words) # Remove HTML special entities (e.g. &amp;)        
    words = re.sub(r'\s\s+', ' ', words) # Remove whitespace (including new line characters)        
    words = re.sub(r'https?:\/\/.*\/\w*', '', words) # Remove hyperlinks 
    list_words = words.split()
    content_lem = [lemmatizer.lemmatize(i) for i in list_words]
    cleaned_string = ' '.join(content_lem)
    google_headlines2.iloc[i,0] = cleaned_string

In [17]:
for i in google_headlines2['headlines']:
    print(i)
    

ôhonestly we donõt know what to expectõ meg whitmanõs big money streaming startup quibi to launch in a new world
zoom video lurch from boom to backlash amid privacy issues, ôzoom bombingõ attack
are lockdown working? google offer location data to help pandemic fight
americaõs housing market is showing the first sign of trouble from the coronavirus pandemic
two. trillion. dollars? hereõs where all that coronavirus stimulus is going
20 technology stock with low debt to consider owning in a down market
zoom green screens, but not pants: here are the new work from home essential
everyone is worried about the internet. so far, itõs actually doing just fine.
google and facebook canõt save the advertising industry this time
new york cityõs economy is in the crucible of the crisis. the rest of the country is next.
apple acquires ai startup to better understand natural language
tech giant such a facebook and google continue hiring a startup conduct layoff
google join with u.k. researcher to tra

the newest android hit pixel phone
state ag launching google probe next week
youtube cut 100k hate speech video in q2
sector update: leading tech stock slip pre-market tuesday
toyota to offer google's android auto in new lexus rx, rxl model
market chatter: alphabet's google settle ftc case on youtube privacy for child
google contractor move to unionize
google to pay $150m-$250m in youtube settlement - politico
alphabet's video platform youtube to drop paywall for upcoming show
google researcher found mass iphone hack attempt
google reveals major iphone security flaw
sector update: tech heavyweight climb pre-market friday
wipro partner google cloud to accelerate digital transformation for global enterprise
sector update: tech giant remain higher a close approach
sector update: tech major trade higher pre-market thursday
huawei's new phone to launch without android os, google apps
market chatter: slack technology stock 'looks overpriced,' wsj column say
youtube kid coming to web
market c

serving the 2 billion unbanked: a new trillion dollar market
rpt: google to invest $550 mln in chinese e-commerce company jd.com
google to invest $550 mln in chinese e-commerce company jd.com
big google investment in jd.com
apple confirms hiring waymo senior engineer
market chatter: alphabet's google purchase more land in europe in push for more data center
google buy 173 acre in the netherlands for data center
citron research warns on netflix vulnerability
google to open 1st african ai center this year
google set to open it first ai center in africa
alphabet's google roll out feature to simplify college search process
cramer: at&t judge set 'blueprint' for comcast/media buying spree
self-driving car company face choice between active and passive sensor
google, facebook, twitter face new house panel hearing
google home now handle three task in a query; alexa get deeper dish integration
vietnam pass cyber law that could hurt facebook, google
axios: gv will contribute $50m to round in sc

juniper network name bikash koley a chief technology officer
judge tell alphabet to submit narrower set of gender-pay data
alphabet's google launch google glass enterprise for industrial use
alphabet's google preparing to sell google home in australia and germany
google officially launch enterprise version of google glass
google home launch coming to australia, germany
google launch hire for g suite user
google to offer researcher access to quantum computing cloud
google play music launch new release station
jpmorgan chase home lending team up with alphabet's google to track home buyers' click
google cloud platform add another region to take on aws
google launch backup and sync tool
book a spa or salon appointment through google
french government to appeal google tax court decision
sector update: tech stock help carry u.s. market to new high
google acquires artificial intelligence start-up halli lab
market chatter: zillow group drop 4% after amazon said to be preparing to offer real-es

president brin sergey sell $11.2m of goog on 2/8/16
u future remain positive; yellen see no preset course for hiking rates, condition warrant 'gradual' increase
alphabet share rise 2% pre-bell a traffic safety agency say computer may be 'driver' of car
google robot finally get a driver's license?
chief executive officer page lawrence e sell $11.5m of goog on 2/5/16
google ceo pichai get record $199m stock grant
u stock tumble with nasdaq at october 2014 low a tech, bank share retreat
chief executive officer page lawrence e sell $11.9m of goog on 2/4/16
chief executive officer page lawrence e sell $4.1m of goog on 2/3/16
director hennessy john l sell $601.7k of goog on 2/3/16
chief executive officer page lawrence e sell $10.0m of goog on 2/3/16
sector update: technology share finish mostly higher
sector update: technology share edge lower
google's search chief leaving; share sell off day after post-earnings gain
u stock continue slide a data weighs, yahoo! drag down tech share
midday up

In [18]:
google_headlines2_copy=i
pd.DataFrame(google_headlines2)
google_headlines2

Unnamed: 0,headlines
0,ôhonestly we donõt know what to expectõ meg wh...
1,zoom video lurch from boom to backlash amid pr...
2,are lockdown working? google offer location da...
3,americaõs housing market is showing the first ...
4,two. trillion. dollars? hereõs where all that ...
...,...
5731,yahoo/bing displaces google a mozilla's u.s. s...
5732,report: google play to finally arrive in china
5733,google now reportedly aiming for 2015 glass la...
5734,youtube paid music service to see test launch ...


In [19]:
google_headline_date=google_headlines[['date']]

In [20]:
google_headlines[['date']]

Unnamed: 0,date
0,2020-04-05
1,2020-04-05
2,2020-04-05
3,2020-04-04
4,2020-04-04
...,...
5731,2014-11-19
5732,2014-11-19
5733,2014-11-14
5734,2014-11-12


In [21]:
 google_headlines3= pd.concat([google_headline_date, google_headlines2], axis=1, join='inner')

In [22]:
 google_headlines3.shape

(5736, 2)

In [23]:
google_headlines3.set_index('date',inplace=True)

In [24]:
google_headlines3.sort_values(by='date', ascending=True, inplace=True)

In [25]:
google_headlines3.head()

Unnamed: 0_level_0,headlines
date,Unnamed: 1_level_1
2014-11-12,google's massive doubleclick ad server go down
2014-11-12,youtube paid music service to see test launch ...
2014-11-14,google now reportedly aiming for 2015 glass la...
2014-11-19,report: google play to finally arrive in china
2014-11-19,yahoo/bing displaces google a mozilla's u.s. s...


**The original dataset will be decomposed such that each headline contains a sentiment label, i.e. boolean whether the stock goes up or down.**

# SENTIMENT ANALYSIS USING VADAR (Valence Aware Dictionary and sEntiment Reasoner) (GOOGLE) 

Vadar  was constructed using human raters from Amazon Mechanical Turk. We regard it as a reliable lexicon to extract emotional or sentiment polarity

We note the limitations of Vadar for this project.Vadar is sensitive to both the polarity and the intensity of sentiments and while some may view it more suited to analyzing social media texts rather than factual headlines which tends to lack emotional intensity, emotions and acronyms, we find it sufficiently applicable to sentiment analysis in our project.

Vadar combines a dictionary of lexical features to valence scores with a set of five heuristics. 
We left in heuristics like punctuation but removed capitalization as this could inaccurately increase the intensity of positive and negative words which we felt might not be relevant for factual headlines.

The Compound score is a metric that calculates the sum of all the lexicon ratings which have been normalized between -1(most extreme negative) and +1 (most extreme positive).

positive sentiment : (compound score >= 0.05)
neutral sentiment : (compound score > -0.05) and (compound score < 0.05)
negative sentiment : (compound score <= -0.05)


Reference: http://comp.social.gatech.edu/papers/icwsm14.vader.hutto.pdf


In [26]:
!pip install vaderSentiment



In [27]:
nltk.download('vader_lexicon')



[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/joyceooi/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [28]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

new_words = {
    'launch': 2.0,
    'developing': 2.0, 
    'breach': -2.4,
    'rally': 2.0,
    'selloff': -2.0,
    'roll out': 2.0,
    'hurt by':-2.0,
    'notable earnings':2.0,
    'unveils':2.0,
    'reveals':2.0,
    'raised':2.0,
    'buy': 2.0,
    'sell':-2.0,
    'up':2.0,
    'ups':2.0,
    'down':-2.0,
    'dividend':2.0,
    'acquires':2.0,
    'expansion':2.0,
    'invests':2.0,
    }
    
analyzer.lexicon.update(new_words)
    
google_headlines3['compound'] = [analyzer.polarity_scores(v)['compound'] for v in google_headlines3['headlines']]
google_headlines3['neg'] = [analyzer.polarity_scores(v)['neg'] for v in google_headlines3['headlines']]
google_headlines3['neu'] = [analyzer.polarity_scores(v)['neu'] for v in google_headlines3['headlines']]
google_headlines3['pos'] = [analyzer.polarity_scores(v)['pos'] for v in google_headlines3['headlines']]

google_headlines3.head(1000)
    


Unnamed: 0_level_0,headlines,compound,neg,neu,pos
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2014-11-12,google's massive doubleclick ad server go down,-0.4588,0.333,0.667,0.0
2014-11-12,youtube paid music service to see test launch ...,0.4588,0.0,0.75,0.25
2014-11-14,google now reportedly aiming for 2015 glass la...,0.4588,0.0,0.7,0.3
2014-11-19,report: google play to finally arrive in china,0.34,0.0,0.745,0.255
2014-11-19,yahoo/bing displaces google a mozilla's u.s. s...,0.0,0.0,1.0,0.0
2014-11-21,"ft: eu parliament ""poised to call"" for google'...",0.25,0.0,0.8,0.2
2014-11-25,what did the billionaire do in q3?,0.0,0.0,1.0,0.0
2014-11-25,report: yahoo/microsoft intent on taking googl...,0.0,0.0,1.0,0.0
2014-11-27,eu lawmaker vote for google breakup,0.0,0.0,1.0,0.0
2014-12-01,intel push further in wearable with google-glass,0.0,0.0,1.0,0.0


## Google Headlines Average Vadar Scores By Date

In [29]:
google_headline_vadar=google_headlines3.groupby('date').sum()

In [30]:
google_headline_vadar.head()

Unnamed: 0_level_0,compound,neg,neu,pos
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014-11-12,0.0,0.333,1.417,0.25
2014-11-14,0.4588,0.0,0.7,0.3
2014-11-19,0.34,0.0,1.745,0.255
2014-11-21,0.25,0.0,0.8,0.2
2014-11-25,0.0,0.0,2.0,0.0


In [31]:
google_headline_vadar.to_csv('./datasets/headlines/google_headline_vadar.csv')