In [1]:
#All libraries used have been added here
import pandas as pd
import numpy as np
import re

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from bs4 import BeautifulSoup

from functools import reduce

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score
from sklearn.pipeline import Pipeline
from datetime import datetime

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

from nltk.corpus import stopwords
nltk.download('stopwords')
import csv




[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/joyceooi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# HEADLINE NEWS (SENTIMENT ANALYSIS): AMAZON


In [2]:
# Load the three per-company headline CSV files into separate DataFrames.
fb_headlines = pd.read_csv('./datasets/headlines/fb_headlines.csv')
google_headlines = pd.read_csv('./datasets/headlines/google_headlines.csv')
amazon_headlines = pd.read_csv('./datasets/headlines/amazon_headlines.csv')

In [3]:
# Allow up to 1000 columns and 1000 rows when a DataFrame is displayed.
pd.set_option(
    'display.max_columns', 1000,
    'display.max_rows', 1000,
)

In [4]:
# Preview the first 100 rows of the raw Amazon headlines.
amazon_headlines.head(100)

Unnamed: 0,headlines,date
0,2014 leaders and laggards,01-01-15
1,2015 leaders and laggards,01-01-16
2,How will the FAANGs perform in 2019?,01-01-19
3,Bezos loses $7.4B in Amazon's worst year since...,01-02-15
4,OPTIONS: Large Tech Option Implied Volatility,01-02-19
5,DXC's Luxoft Expands Collaboration With Micros...,01-02-20
6,Report: Amazon India to offer streaming services,01-02-15
7,CFRA: History Suggests Owning 'Barbell Portfol...,01-02-18
8,"Evercore lowers Amazon target, revenue estimates",01-02-19
9,"Check Point, Zscaler, Fortinet, Qualys, CyberA...",01-02-19


In [5]:
# (rows, columns) of the Amazon headlines frame.
amazon_headlines.shape

(8750, 2)

In [6]:
# Count missing values per column.
amazon_headlines.isna().sum()

headlines    0
date         0
dtype: int64

In [7]:
# Normalize the column labels to lowercase.
amazon_headlines.columns = amazon_headlines.columns.str.lower()

In [8]:
# Parse the date strings into datetimes. The raw values look like '01-02-15'
# (month-day-two-digit-year), which is ambiguous if left to format inference,
# so the format is stated explicitly; a row that does not match will raise
# instead of being silently mis-parsed.
amazon_headlines['date'] = pd.to_datetime(amazon_headlines['date'], format='%m-%d-%y')

In [9]:
# Inspect column dtypes (e.g. to check that 'date' is now a datetime column).
amazon_headlines.dtypes

headlines            object
date         datetime64[ns]
dtype: object

In [10]:
# Show the frame in chronological order; the sorted result is only displayed,
# it is not assigned back to amazon_headlines.
amazon_headlines.sort_values('date')

Unnamed: 0,headlines,date
7628,Amazon goes after Oracle; no cloud price cuts ...,2014-11-12
7638,Amazon: ARM chipmakers are no match for Intel,2014-11-13
7648,Wal-Mart tells U.S. stores to match online prices,2014-11-13
7664,"Amazon, Hachette end e-book pricing battle",2014-11-14
7683,Amazon +4.7% on strong volume,2014-11-14
...,...,...
3849,"‘Honestly, we don’t know what to expect’: Meg ...",2020-04-06
3832,Market Chatter: Amazon in Talks for Potential ...,2020-04-06
3830,Watch Now: Here's What's Moving Markets - Apri...,2020-04-06
3848,"The Zacks Analyst Blog Highlights: Amazon, Med...",2020-04-06


**Separate the Headlines Column for Preprocessing**

In [11]:
# Independent one-column copy holding only the headline text.
amazon_headlines2 = amazon_headlines.loc[:, ['headlines']].copy()

In [12]:
# Display the headlines-only frame.
amazon_headlines2

Unnamed: 0,headlines
0,2014 leaders and laggards
1,2015 leaders and laggards
2,How will the FAANGs perform in 2019?
3,Bezos loses $7.4B in Amazon's worst year since...
4,OPTIONS: Large Tech Option Implied Volatility
...,...
8745,Wall Street Set for Upbeat Session as 2018 Dra...
8746,Amazon targets rural India market
8747,Market Chatter: Holiday Toy Sales Disappoint W...
8748,Analyst: Market needs FAANG to recover


### Preprocessing Using Tokenizing, Stop Words and Lemmatization/Stemming
---


In [13]:
# Lowercase the headline text. The assignment goes through ['headlines'] so the
# existing column is actually updated; attribute-style assignment to a new name
# (e.g. `.headline = ...`) only sets a Python attribute on the DataFrame object,
# triggers a pandas warning, and leaves the data untouched.
amazon_headlines2['headlines'] = amazon_headlines2['headlines'].str.lower()

  """Entry point for launching an IPython kernel.


In [14]:
# Tokenizer that keeps runs of word characters (\w+); anything else, including
# punctuation, is dropped from text passed through it.
tokenizer = RegexpTokenizer(r'\w+')

# NOTE(review): len(['headlines']) is the length of a one-element list literal,
# i.e. 1, so this loop body runs only for i == 0 and only the first headline is
# tokenized. Iterating every row would need len(amazon_headlines2) instead, but
# that would strip punctuation from all headlines, which the VADER write-up
# further down says was deliberately kept -- confirm which behavior is intended
# before changing the range.
for i in range(len(['headlines'])):
    # Row i, column 0 (the 'headlines' column) selected by integer position.
    words = amazon_headlines2.iloc[i,0]
    # Lowercase the text, then split it into word tokens.
    content_tokens = tokenizer.tokenize(words.lower())
    # Re-join the tokens with single spaces.
    cleaned_string = ' '.join(content_tokens)
    # Write the cleaned text back into the same cell.
    amazon_headlines2.iloc[i,0] = cleaned_string

**We decided not to remove stop words, as doing so may dilute the sentiment analysis of the headlines (which are already quite brief).**

In [15]:
# for i in range(len(fb_headlines2['headline'])): #creating a function
#     words = fb_headlines2.iloc[i,0] #selecting by integer-location based indexing(where i is looping through each row and 1 refers to the index [1] position of column)
#     words = words.split() #splitting the words
#     meaningful_words = [w for w in words if not w in stopwords.words('english')]#assigning a variable to collect words which are not found in stop words.
#     print(meaningful_words)
#     cleaned_string = ' '.join(meaningful_words)#joining back the output with a space " " in between
#     fb_headlines2.iloc[i,0] = cleaned_string 

In [16]:
# WordNetLemmatizer needs the WordNet corpus; fetch it here (same pattern as the
# stopwords download in the imports cell) so a fresh environment can run this cell.
nltk.download('wordnet', quiet=True)

lemmatizer = WordNetLemmatizer()  # default lemmatization treats each token as a noun


def _lemmatize_headline(text):
    """Return `text` lowercased, stripped of HTML entities and URLs, and lemmatized.

    Steps:
      1. lowercase;
      2. remove HTML entities such as ``&amp;``;
      3. remove hyperlinks (each URL ends at the next whitespace, so a later
         '/' in the headline, e.g. in a date like 5/16/16, is not swallowed);
      4. split on whitespace, lemmatize each token, and re-join with single
         spaces (this also collapses repeated whitespace and newlines).

    NOTE(review): noun-mode lemmatization also rewrites short function words;
    the printed headlines show 'as' becoming 'a'. Confirm that is acceptable
    for the downstream sentiment scoring.
    """
    text = text.lower()
    text = re.sub(r'&\w*;', '', text)
    text = re.sub(r'https?://\S+', '', text)
    return ' '.join(lemmatizer.lemmatize(token) for token in text.split())


# Apply the cleaning to every headline in one pass instead of writing cell by
# cell through .iloc.
amazon_headlines2['headlines'] = amazon_headlines2['headlines'].map(_lemmatize_headline)

In [17]:
# Print every preprocessed headline, one per line.
# Note: the loop variable `i` stays bound to the last headline string after the
# loop finishes, so any later code that reads `i` sees that string.
for i in amazon_headlines2['headlines']:
    print(i)
    

2014 leader and laggard
2015 leader and laggard
how will the faangs perform in 2019?
bezos loses $7.4b in amazon's worst year since 2008
options: large tech option implied volatility
dxc's luxoft expands collaboration with microsoft on vehicle-related product
report: amazon india to offer streaming service
cfra: history suggests owning 'barbell portfolio' of last year's 10 best, 10 worst sub-industries
evercore lower amazon target, revenue estimate
check point, zscaler, fortinet, qualys, cyberark named a wedbush's favorite cybersecurity stock for 2019
retail jolt: amazon seen making a run at target
amazon’s best of prime 2017 reveals the year’s biggest trend —more than 5 billion item shipped with prime in 2017
thinking about trading option or stock in amazon.com, boeing, mastercard, microsoft, or verizon?
market chatter: target share rise a analyst predicts co may be acquired by amazon in 2018
amazon bull want gas station
amazon shipped 5b item to prime member last year
ambarella and a

microsoft join apple, hp in scrapping outlook on virus
uber drivers, pizza delivery worker get lift from eu’s vestager
cvc is in talk to join idreamsky in takeover of gaming firm leyou
retailer carrefour raise cost saving goal a 2019 core profit rise
moonlight win best picture oscar
amazon start delivery service down under
ice cream and thai curry are about to fly through irish sky
norway wealth fund earned a record $180 bln in 2019
pre-market briefing: u stock future flat; lse say deutsche boerse merger unlikely to get eu ok
market chatter: amazon, comcast, electronic art submit proposal for holding firm of gaming developer nexon
(re)tail spin: landlord struggle to lease retail space in mixed-use development
report: developing opportunity within amazon, prospect capital, alliance data, duke realty, stellus capital investment, and icon — future expectations, projection moving into 2019
home depot’s big-ticket item are a hit with shopper and analyst
grocery intrigue: sam's club partner 

kroger-ocado deal shake up online grocery
roku emphasizes growing viewership, improved inventory-mix, ad monetization during investor meeting
pharmacy operator under pressure on amazon threat
wireless tower stock riding high after amazon-dish report
amazon add flight to indian pay
audible’s next chapter in newark begin with grand opening of it innovation cathedral
fastly jump a cloud-computing company share open well above ipo price
market chatter: vp of alexa departs amazon, cnbc say
coupa software buy riskopy, inc.; launch open buy with amazon business
market chatter: gap ceo heard to say that he'd consider amazon, others a marketing partners: bloomberg
director ryder thomas o sell $2.8m of amzn on 5/16/16
grubhub close down 7.8% after news of amazon's nyc food delivery move
amazon web service announces availability of x1 instance for amazon ec2
apple's homepod rank fourth in smart speaker market
amazon rank #2 on linkedin’s annual top company list
amazon's share of smart speaker mar

sector update: tech stock fading in late trade
ford's self-driving car push could have major commercial implication
amazon.com's twitch unit buy curse
amazon's twitch acquires curse
options: option implied volatility increase for large tech
amazon introduces new original children’s album from grammy award-winning evanescence vocalist, amy lee—exclusively streaming on prime music
nyt delivers harsh expose on amazon's workplace culture; bezos responds
amazon open alexa toolkit for commercial developer
betterinvesting top 100 grew 13.3 percent annually for 5 year through july
tivo sink on report of amazon dvr plan
market chatter: tivo lower on report amazon's prepping digital dvr
amazon.com share pressured after walmart, alibaba result
amazon.com to open 13th u.k. fulfillment center in tilbury, essex
equity hedge fund see 7% return so far this year, helped by u.s., chinese tech giants, goldman sachs say
market chatter: amazon.com's twitch acquires video indexing platform, clipmine
vice pr

eu court say luxury brand can keep product off amazon
morgan stanley bearish on patterson and schein by amazon threat
amazon.com, inc. announces early participation result of exchange offer and consent solicitation for whole food market, inc. 5.200% note due 2025
uk retail sale growth slows in november a consumer save up for black friday
growth in digital marketing popularity thanks to social medium
richard armitage performs romeo and juliet: a novel for audible
evercore isi initiate amazon, alphabet, and facebook
the amazon prime video app is now available on apple tv in over 100 country
amazon.com trade offer a 6.05% return in 74 days, or find similar option trade on intercept pharmaceuticals, energous corporation, opko health, and 3d system
amazon prime video now on apple tv device
grocery store 2.0: place your bet
amazon invite user to help alexa
amazon site could join notorious market list - wsj
market chatter: facebook reportedly looking to lease 700,00 square foot of office spac

In [18]:
# Keep an independent snapshot of the preprocessed headlines. (Assigning the
# leftover loop variable `i` here would store a single headline string, not a
# copy of the frame.)
amazon_headlines2_copy = amazon_headlines2.copy()

# Display the preprocessed headlines.
amazon_headlines2

Unnamed: 0,headlines
0,2014 leader and laggard
1,2015 leader and laggard
2,how will the faangs perform in 2019?
3,bezos loses $7.4b in amazon's worst year since...
4,options: large tech option implied volatility
...,...
8745,wall street set for upbeat session a 2018 draw...
8746,amazon target rural india market
8747,market chatter: holiday toy sale disappoint wi...
8748,analyst: market need faang to recover


In [19]:
# One-column frame holding only the parsed dates.
amazon_headline_date = amazon_headlines.loc[:, ['date']]

In [20]:
# Display the date column on its own.
amazon_headlines.loc[:, ['date']]

Unnamed: 0,date
0,2015-01-01
1,2016-01-01
2,2019-01-01
3,2015-01-02
4,2019-01-02
...,...
8745,2018-12-31
8746,2018-12-31
8747,2018-12-31
8748,2018-12-31


In [21]:
# Put the dates and the preprocessed headlines side by side, keeping only row
# labels present in both frames.
amazon_headlines3 = pd.concat(
    [amazon_headline_date, amazon_headlines2],
    axis='columns',
    join='inner',
)

In [22]:
 # (rows, columns) of the combined date + headline frame.
 # NOTE(review): this line carries a stray leading space. IPython appears to
 # tolerate it (the cell produced output), but it would be an "unexpected
 # indent" error if the notebook were exported to a plain Python script.
 amazon_headlines3.shape

(8750, 2)

In [23]:
# Use the headline date as the row index (mutates amazon_headlines3 in place).
amazon_headlines3.set_index(keys='date', inplace=True)

In [24]:
# Order the rows chronologically by the 'date' index level (in place).
amazon_headlines3.sort_values('date', inplace=True)

In [25]:
# Peek at the five earliest rows.
amazon_headlines3.head(5)

Unnamed: 0_level_0,headlines
date,Unnamed: 1_level_1
2014-11-12,amazon go after oracle; no cloud price cut for...
2014-11-13,amazon: arm chipmakers are no match for intel
2014-11-13,wal-mart tell u.s. store to match online price
2014-11-14,"amazon, hachette end e-book pricing battle"
2014-11-14,amazon +4.7% on strong volume


**The original dataset will be decomposed such that each headline contains a sentiment label, i.e. boolean whether the stock goes up or down.**

# SENTIMENT ANALYSIS USING VADER (Valence Aware Dictionary and sEntiment Reasoner) (AMAZON)

VADER was constructed using human raters from Amazon Mechanical Turk. We regard it as a reliable lexicon for extracting emotional or sentiment polarity.

We note the limitations of VADER for this project. VADER is sensitive to both the polarity and the intensity of sentiment. While some may view it as better suited to analyzing social media text than factual headlines, which tend to lack emotional intensity, emotions and acronyms, we find it sufficiently applicable to the sentiment analysis in our project.

VADER combines a dictionary that maps lexical features to valence scores with a set of five heuristics.
We left in heuristics like punctuation but removed capitalization, as this could inaccurately increase the intensity of positive and negative words, which we felt might not be relevant for factual headlines.

The compound score is a metric that sums all the lexicon ratings and normalizes the result to between -1 (most extreme negative) and +1 (most extreme positive).

positive sentiment : (compound score >= 0.05)
neutral sentiment : (compound score > -0.05) and (compound score < 0.05)
negative sentiment : (compound score <= -0.05)


Reference: http://comp.social.gatech.edu/papers/icwsm14.vader.hutto.pdf


In [26]:
!pip install vaderSentiment



In [27]:
# Fetch NLTK's copy of the VADER lexicon.
# NOTE(review): the analyzer used in this notebook is imported from the
# vaderSentiment package rather than nltk.sentiment.vader, so this download
# may be unnecessary -- confirm before removing.
nltk.download('vader_lexicon')



[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/joyceooi/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [28]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

# Finance-specific additions/overrides for the VADER lexicon (term -> valence).
# NOTE(review): VADER looks words up one token at a time, so the multi-word keys
# below ('roll out', 'hurt by', 'notable earnings') are not expected to match
# any single token -- confirm against the vaderSentiment source and, if so,
# replace them with single-word entries.
new_words = {
    'launch': 2.0,
    'developing': 2.0,
    'breach': -2.4,
    'rally': 2.0,
    'selloff': -2.0,
    'roll out': 2.0,
    'hurt by': -2.0,
    'notable earnings': 2.0,
    'unveils': 2.0,
    'reveals': 2.0,
    'raised': 2.0,
    'buy': 2.0,
    'sell': -2.0,
    'up': 2.0,
    'ups': 2.0,
    'down': -2.0,
    'dividend': 2.0,
    'acquires': 2.0,
    'expansion': 2.0,
    'invests': 2.0,
}

analyzer.lexicon.update(new_words)

# Score every headline exactly once; polarity_scores() returns a dict holding
# all four values, so calling it separately per column repeats the same work.
headline_scores = [analyzer.polarity_scores(text) for text in amazon_headlines3['headlines']]

# Add one column per score, in the same order as before. Plain lists are
# assigned positionally, so the duplicated dates in the index are not a problem.
for score_name in ('compound', 'neg', 'neu', 'pos'):
    amazon_headlines3[score_name] = [scores[score_name] for scores in headline_scores]

amazon_headlines3.head(1000)
    


Unnamed: 0_level_0,headlines,compound,neg,neu,pos
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2014-11-12,amazon go after oracle; no cloud price cut for...,-0.3818,0.331,0.538,0.131
2014-11-13,amazon: arm chipmakers are no match for intel,-0.128,0.222,0.606,0.172
2014-11-13,wal-mart tell u.s. store to match online price,0.0,0.0,1.0,0.0
2014-11-14,"amazon, hachette end e-book pricing battle",-0.2263,0.313,0.482,0.205
2014-11-14,amazon +4.7% on strong volume,0.6124,0.0,0.375,0.625
2014-11-18,streaming service to be tracked by nielsen,0.0,0.0,1.0,0.0
2014-11-19,holiday online spending forecast to rise 16%,0.4019,0.0,0.69,0.31
2014-11-20,amazon reportedly prepping travel service; nyc...,0.1779,0.0,0.825,0.175
2014-11-20,"alibaba to launch international taobao, sell d...",0.0,0.25,0.5,0.25
2014-11-21,amazon reportedly working on ad-supported stre...,0.1779,0.0,0.779,0.221


## Amazon Headlines VADER Scores Summed By Date

In [29]:
# Total the four VADER scores per date. The score columns are selected
# explicitly so the text column is never aggregated: newer pandas versions no
# longer drop non-numeric columns from groupby sums and would concatenate the
# headline strings instead.
# NOTE(review): this is a per-day sum, not a mean, so neg/neu/pos can exceed 1
# on days with several headlines -- confirm that a sum is what is wanted.
amazon_headline_vadar = (
    amazon_headlines3
    .groupby('date')[['compound', 'neg', 'neu', 'pos']]
    .sum()
)

In [30]:
# Peek at the first five per-date aggregates.
amazon_headline_vadar.head(5)

Unnamed: 0_level_0,compound,neg,neu,pos
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014-11-12,-0.3818,0.331,0.538,0.131
2014-11-13,-0.128,0.222,1.606,0.172
2014-11-14,0.3861,0.313,0.857,0.83
2014-11-18,0.0,0.0,1.0,0.0
2014-11-19,0.4019,0.0,0.69,0.31


In [31]:
# Persist the per-date scores (the date index is written as the first column).
amazon_headline_vadar.to_csv(
    path_or_buf='./datasets/headlines/amazon_headline_vadar.csv',
)