In [5]:
#All libraries used have been added here
import pandas as pd
import numpy as np
import re

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from bs4 import BeautifulSoup

from functools import reduce

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score
from sklearn.pipeline import Pipeline
from datetime import datetime

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

from nltk.corpus import stopwords
nltk.download('stopwords')
import csv




[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/joyceooi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# HEADLINE NEWS (SENTIMENT ANALYSIS): APPLE


In [6]:
# Load the per-company headline CSVs, one DataFrame per company, in a single pass.
headline_csv_paths = [
    './datasets/headlines/fb_headlines.csv',
    './datasets/headlines/google_headlines.csv',
    './datasets/headlines/amazon_headlines.csv',
    './datasets/headlines/equinix_headlines.csv',
    './datasets/headlines/tsla_headlines.csv',
    './datasets/headlines/apple_headlines.csv',
]
(
    fb_headlines,
    google_headlines,
    amazon_headlines,
    equinix_headlines,
    tsla_headlines,
    apple_headlines,
) = map(pd.read_csv, headline_csv_paths)

In [7]:
# Raise pandas' display limits to 1000 columns and 1000 rows.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 1000)

In [8]:
# Preview the first 100 rows of the Apple headlines frame.
apple_headlines.head(100)

Unnamed: 0,headlines,date
0,Apple closes first negative year since 2008,01-01-16
1,How will the FAANGs perform in 2019?,01-01-19
2,2019 leaders and laggards,01-01-20
3,Imagination Technologies Inks Licensing Deal W...,01-02-20
4,OPTIONS: Large Tech Option Implied Volatility,01-02-19
5,Hong Kong Hang Seng Opens 2018 With 2% Surge; ...,01-02-18
6,RBC: Apple core holding despite headwinds,01-02-19
7,Sector Update: Tech,01-02-18
8,Apple More Than Just iPhones as Investors Look...,01-02-19
9,More rumors about Apple's Indian dreams,01-02-17


In [9]:
# (rows, columns) of the Apple headlines frame.
apple_headlines.shape

(9523, 2)

In [10]:
# Count missing values per column.
apple_headlines.isnull().sum()

headlines    0
date         0
dtype: int64

In [11]:
# Normalise the column labels to lowercase.
apple_headlines.columns = apple_headlines.columns.str.lower()

In [12]:
# Parse the 'date' strings into datetime values; no explicit format is passed,
# so pandas infers it.
# NOTE(review): the preview shows '01-02-20' parsing to 2020-01-02, i.e. the raw
# strings look month-day-year with a two-digit year. Consider passing
# format='%m-%d-%y' once that is confirmed for every row.
apple_headlines['date'] = pd.to_datetime(apple_headlines['date'])

In [13]:
# Check the column dtypes after the date conversion.
apple_headlines.dtypes

headlines            object
date         datetime64[ns]
dtype: object

In [14]:
# Display the frame ordered by date. The sorted result is not assigned, so
# apple_headlines itself keeps its original row order.
apple_headlines.sort_values(by='date', ascending=True)

Unnamed: 0,headlines,date
8462,"Alibaba open to working with PayPal, Apple",2014-11-12
8596,"Apple roundup: iPhone 6 supplies, UnionPay, Sa...",2014-11-17
8628,"Apple roundup: Watch, camera, iAd, hedge funds",2014-11-19
8651,Apple gains following target hikes,2014-11-20
8732,Apple breaks highs following new bullish notes,2014-11-24
...,...,...
4412,Apple Donating 1 Million Face Masks Per Week,2020-04-06
4411,Stocks - U.S. Futures Higher Amid Optimism Ove...,2020-04-06
4410,Apple producing 1M face shields per week,2020-04-06
4430,Thinking about trading options or stock in App...,2020-04-06


**Separate the Headlines Column for Preprocessing**

In [15]:
# Independent one-column working frame holding only the headline text.
apple_headlines2 = apple_headlines.loc[:, ['headlines']].copy()

In [16]:
# Display the headline-only working frame.
apple_headlines2

Unnamed: 0,headlines
0,Apple closes first negative year since 2008
1,How will the FAANGs perform in 2019?
2,2019 leaders and laggards
3,Imagination Technologies Inks Licensing Deal W...
4,OPTIONS: Large Tech Option Implied Volatility
...,...
9518,Hardware stocks underperformed in 2019
9519,Close Update: Wall Street Plagued by Cheap Oil...
9520,Wall Street Closes Year With Declines on Oil D...
9521,"Apple, Microsoft top Dow 2019 standings; Walgr..."


### Preprocessing Using Tokenizing, Stop Words and Lemmatization/Stemming
---


In [19]:
# Lowercase every headline in the working frame.
# Assign through ['headlines'] rather than attribute access: setting the
# non-existent attribute `.headline` neither creates nor updates a column, it only
# attaches a plain Python list to the DataFrame object (pandas emits a UserWarning),
# so the lowercased text was silently discarded.
apple_headlines2['headlines'] = apple_headlines['headlines'].str.lower()

  """Entry point for launching an IPython kernel.


In [20]:
# Tokenize on runs of word characters (\w+) and re-join with single spaces, which
# lowercases the text and drops punctuation.
# NOTE(review): len(['headlines']) is the length of a one-element list literal,
# i.e. 1, so the loop below runs only for i == 0 and rewrites just the first row.
# Looping over len(apple_headlines2['headlines']) instead would strip punctuation
# from every headline, which conflicts with the VADER notes further down stating
# that punctuation is deliberately kept. Confirm the intent before changing it.
tokenizer = RegexpTokenizer(r'\w+') #instantiating the tokenizer

for i in range(len(['headlines'])):#loop over row positions (currently a single iteration, see NOTE above)
    words = apple_headlines2.iloc[i,0]#integer-location indexing: row i, column position 0
    content_tokens = tokenizer.tokenize(words.lower())#lowercase, then split into word tokens
    cleaned_string = ' '.join(content_tokens)#joining back the output with a space " " in between
    apple_headlines2.iloc[i,0] = cleaned_string #write the cleaned text back into the same cell

**We decided not to remove stop words, as doing so may dilute the sentiment analysis of the headlines (which are already quite brief).**

In [21]:
# for i in range(len(fb_headlines2['headline'])): #creating a function
#     words = fb_headlines2.iloc[i,0] #selecting by integer-location based indexing(where i is looping through each row and 1 refers to the index [1] position of column)
#     words = words.split() #splitting the words
#     meaningful_words = [w for w in words if not w in stopwords.words('english')]#assigning a variable to collect words which are not found in stop words.
#     print(meaningful_words)
#     cleaned_string = ' '.join(meaningful_words)#joining back the output with a space " " in between
#     fb_headlines2.iloc[i,0] = cleaned_string 

In [23]:
lemmatizer = WordNetLemmatizer() #instantiating the lemmatizer
# NOTE(review): the visible cells only download the 'stopwords' and 'vader_lexicon'
# NLTK resources; WordNetLemmatizer presumably also needs the 'wordnet' corpus to be
# installed locally. Confirm on a fresh environment.

_HTML_ENTITY_RE = re.compile(r'\&\w*;')  # HTML special entities (e.g. &amp;)
_MULTISPACE_RE = re.compile(r'\s\s+')    # runs of whitespace (including new lines)
# Match a hyperlink up to the next whitespace character. The previous pattern,
# r'https?:\/\/.*\/\w*', used a greedy `.*`: it also deleted any ordinary text
# between the link and the last '/' in the headline, and it did not match links
# that have no path component.
_URL_RE = re.compile(r'https?://\S+')


def _clean_headline(text):
    """Return a normalised, lemmatized version of one headline.

    Steps, in order: lowercase; drop HTML entities; collapse repeated
    whitespace; drop hyperlinks; split on whitespace; lemmatize each token with
    the module-level ``lemmatizer``; re-join with single spaces.

    NOTE(review): ``lemmatize`` is called without a part-of-speech argument.
    The printed results show short function words being altered
    ("as" -> "a", "us" -> "u", "has" -> "ha"); consider excluding such tokens
    if that is not intended.
    """
    text = text.lower()
    text = _HTML_ENTITY_RE.sub('', text)
    text = _MULTISPACE_RE.sub(' ', text)
    text = _URL_RE.sub('', text)
    return ' '.join(lemmatizer.lemmatize(token) for token in text.split())


# Apply the cleaning to the whole column at once instead of writing cell by cell
# through .iloc inside a Python loop.
apple_headlines2['headlines'] = apple_headlines2['headlines'].apply(_clean_headline)

In [24]:
# Print every cleaned headline, one per line.
# NOTE(review): this emits one line per row of the frame; a sample such as
# apple_headlines2['headlines'].head() would keep the notebook output readable.
# The loop variable `i` stays bound to the last headline after the loop finishes.
for i in apple_headlines2['headlines']:
    print(i)
    

apple close first negative year since 2008
how will the faangs perform in 2019?
2019 leader and laggard
imagination technology ink licensing deal with apple
options: large tech option implied volatility
hong kong hang seng open 2018 with 2% surge; tech stock gain
rbc: apple core holding despite headwind
sector update: tech
apple more than just iphones a investor look at segmentation, streaming services, reallocation, rbc say
more rumor about apple's indian dream
apple instructs store to offer $29 battery regardless of diagnostic test
apple make new deal with imagination tech
sector update: technology stock tip higher ahead of first trading day of 2018
sector update: tech stock drop in wednesday's pre-bell trade
sector update: top tech stock rise during pre-market trading thursday
thinking about buying stock in apple, biopharmx, china ceramic co., netflix or weatherford international?
thinking about investing in apple, alibaba, netflix, nvidia and tesla motor in the new year?
market cha

sector update: technology stock lower in pre-market trading; snap climb following earnings beat
goldman sachs tout software and services, say semiconductor, tech hardware most exposed to coronavirus
u index continue rally and close higher thursday
apple target increased to $154 at canaccord genuity
goldman sachs: apple iphone x "not so super cycle" earns neutral rating
u stocks-wall st pull back from record high after four-day rally
sector update: financial stock extending advance with broader u market
buy these stock because the tech rally can continue to run, analyst say
federal reserve see potential of 'spill over' impact from coronavirus to economy worldwide
apple’s outlook clouded a coronavirus extends production delay
france fine apple €25m for slowed iphones
corning's project phire aim to head off sapphire threat
viacomcbs to launch a new streaming service
apple reorganization suggests modem chip - reuters
the best mutual fund put over $1 billion in apple, these 3 stock
sector u

goldman sachs expects zero gdp growth for u in q2; american airline performed worst in s&p 500 since outbreak
apple is latest tech giant to tell employee to stay home a bay area coronavirus case surge
doj holding tech forum to combat scam
new york case climb; south by southwest canceled: virus update
rpt-sxsw festival canceled amid coronavirus outbreak
turbulent market test u.s. retail investor a coronavirus fear rage
south by southwest is the latest conference canceled because of coronavirus
best long-term stock to buy and watch during coronavirus correction
u.s. stock end lower, but book weekly gains, a coronavirus case break above 100,000
market chatter: apple hit by "ransomware" malware after microsoft, but stock higher pre-bell; microsoft slip
u market set to open lower despite oil gain
sector update: tech major narrowly mixed pre-bell thursday
if every time ‘man of the year’ had been a woman, here’s whom the magazine would have picked
should you buy stock in apple, advanced micro

market chatter: hp unveiling "thinnest laptop" at luxury conference on tuesday
sector update: slim decline for tech stock wednesday
market chatter: apple developing touchless technology for iphone display
n.y. ha most death in a day; italy ha fewer: virus update
market chatter: apple iphone se sale underwhelm over the weekend: forbes
two. trillion. dollars? here’s where all that coronavirus stimulus is going
sector update: tech stock rocket higher in late trade
amazon planning airpods rival - bloomberg
market chatter: amazon.com plan earbuds for alexa access this year
pandemic data-sharing put new pressure on privacy protection
apple hire google's top ai expert
sector update: technology major climb pre-market friday
citi reiterates apple buy on capital return
sector update: tech
sector update: technology stock lower in pre-market trade
net element start offering applepay service support in russia
market chatter: apple might delay iphone 8 launch a supplier face technical issue
wsj: app

apple declares $0.77 dividend
apple top q3 expectations, guide q4 sale mostly above street view
apple +2.8% on q3 beats, upside revenue view
nasdaq lead u stock selloff with third straight day of loss
notable earnings after tuesday’s close
apple net cash total $102b; keeping mac pro in u.s.
mt newswires after-hours news mover
report: new apple tv set-top due in september, will have remote with touchpad
apple remove vpn apps in china
apple third-quarter result beat analysts' estimate a service revenue jump
big tech partner with healthcare giant
pre-market briefing: u equity future modestly positive a commodity help lift miner
analyst lift apple target after earnings
--analyst actions: wedbush raise price target on apple to $245 from $235, maintains outperform rating
wall street set for rebound ahead of rate decision
sector update: most tech heavyweight gain pre-market wednesday
wall street seen opening higher ahead of data, apple result
thinking about buying stock in apple, aurora canna

apple, fitbit, johnson & johnson selected for fda pilot health software program
market chatter: apple may double investment in japan display after harvest fund management's withdrawal
analyst actions: raymond james keep apple outperform, price target raised to $180
apple roundup: component cost won't hurt margins, apple tv now on amazon
close update: stock retreat a election debate anticipation weighs, financials lose ground
presidential debate anticipation pull u stock lower a bank retreat
dow add fourth day to losing streak a u market end mixed
options: option implied volatility into national day and golden week
sk hynix detail toshiba deal plan
--analyst actions: jp morgan initiate coverage on apple with overweight rating and $272 pt
jpmorgan start apple at overweight on service potential
market chatter: apple reportedly plan to debut film in theater before streaming
apple bringing original film to theater - wsj
wsj detail 3d sensor delay for iphone x
sector update: leading tech sto

wall street see another round of loss a trade tension resurface after arrest of huawei exec
sector update: tech stock drop pre-bell thursday
option-trading opportunity on apple, bank of america, general motors, nvidia, and energous corporation
ubs: iphone purchase intent at five-year low
ruling against samsung in apple patent case ordered reversed and remanded
apple supplier drop after largan warns of lower dec. revenue
credit suisse restates apple at outperform on iphone 8 prospect
market chatter: apple edge up - ceo say apple watch sale to consumer set record during 1st week of holiday shopping
feganscott law firm confirms phonegate: new fcc-accredited lab result show apple and samsung smartphone rf radiation level exceed federal limit
sector update: most top tech stock higher friday midday
cnbc: apple bid for violent israeli tv show
market chatter: apple loses patent infringement case against samsung in u.s. supreme court; case will be sent to lower court to determine appropriate co

In [25]:
# Keep an independent snapshot of the cleaned headlines.
# This used to assign `i`, the loop variable leaked from the print loop above
# (i.e. only the last headline string), instead of a copy of the frame; the
# pd.DataFrame(...) call that followed built a frame that was immediately discarded.
apple_headlines2_copy = apple_headlines2.copy()
apple_headlines2

Unnamed: 0,headlines
0,apple close first negative year since 2008
1,how will the faangs perform in 2019?
2,2019 leader and laggard
3,imagination technology ink licensing deal with...
4,options: large tech option implied volatility
...,...
9518,hardware stock underperformed in 2019
9519,close update: wall street plagued by cheap oil...
9520,wall street close year with decline on oil dro...
9521,"apple, microsoft top dow 2019 standings; walgr..."


In [26]:
# One-column frame holding the parsed dates, taken from the original frame.
apple_headline_date=apple_headlines[['date']]

In [27]:
# Display the date column on its own.
apple_headlines[['date']]

Unnamed: 0,date
0,2016-01-01
1,2019-01-01
2,2020-01-01
3,2020-01-02
4,2019-01-02
...,...
9518,2019-12-31
9519,2015-12-31
9520,2015-12-31
9521,2019-12-31


In [28]:
# Put the dates and the cleaned headlines side by side, keeping only the row
# labels present in both frames.
apple_headlines3 = pd.concat(
    objs=[apple_headline_date, apple_headlines2],
    join='inner',
    axis=1,
)

In [29]:
# (rows, columns) of the combined date + headline frame.
apple_headlines3.shape

(9523, 2)

In [30]:
# Use the date as the row index (rebinding the name instead of mutating in place).
apple_headlines3 = apple_headlines3.set_index('date')

In [31]:
# Order the rows chronologically; 'date' here refers to the index level of that name.
apple_headlines3 = apple_headlines3.sort_values(by='date', ascending=True)

In [32]:
# Preview the earliest headlines after sorting.
apple_headlines3.head()

Unnamed: 0_level_0,headlines
date,Unnamed: 1_level_1
2014-11-12,"alibaba open to working with paypal, apple"
2014-11-17,"apple roundup: iphone 6 supplies, unionpay, sa..."
2014-11-19,"apple roundup: watch, camera, iad, hedge fund"
2014-11-20,apple gain following target hike
2014-11-24,apple break high following new bullish note


**The original dataset will be decomposed such that each headline contains a sentiment label, i.e. boolean whether the stock goes up or down.**

# SENTIMENT ANALYSIS USING VADER (Valence Aware Dictionary and sEntiment Reasoner) (APPLE)

VADER was constructed using human raters from Amazon Mechanical Turk. We regard it as a reliable lexicon for extracting emotional or sentiment polarity.

We note the limitations of VADER for this project. VADER is sensitive to both the polarity and the intensity of sentiments. Some may view it as better suited to analyzing social media texts than factual headlines, which tend to lack emotional intensity, emotions and acronyms; nevertheless, we find it sufficiently applicable to the sentiment analysis in our project.

VADER combines a dictionary that maps lexical features to valence scores with a set of five heuristics.
We left in heuristics such as punctuation but removed capitalization, as it could inaccurately increase the intensity of positive and negative words, which we felt might not be relevant for factual headlines.

The compound score is a metric that sums all the lexicon ratings and normalizes the result to lie between -1 (most extreme negative) and +1 (most extreme positive).

positive sentiment : (compound score >= 0.05)
neutral sentiment : (compound score > -0.05) and (compound score < 0.05)
negative sentiment : (compound score <= -0.05)


Reference: http://comp.social.gatech.edu/papers/icwsm14.vader.hutto.pdf


In [33]:
!pip install vaderSentiment



In [34]:
# Download NLTK's 'vader_lexicon' resource.
# NOTE(review): the analyzer used below is imported from the separate
# `vaderSentiment` package rather than from nltk, so this download may be
# unnecessary. Confirm before removing.
nltk.download('vader_lexicon')



[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/joyceooi/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [35]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

# Finance-specific valence overrides merged into the analyzer's lexicon.
# NOTE(review): the multi-word keys ('roll out', 'hurt by', 'notable earnings')
# probably never match, since the lexicon appears to be looked up one
# whitespace-separated token at a time. Verify against the vaderSentiment source.
# NOTE(review): the headlines were lemmatized earlier, so inflected keys such as
# 'unveils' or 'acquires' may no longer appear in that form. Confirm.
new_words = {
    'launch': 2.0,
    'developing': 2.0, 
    'breach': -2.4,
    'rally': 2.0,
    'selloff': -2.0,
    'roll out': 2.0,
    'hurt by':-2.0,
    'notable earnings':2.0,
    'unveils':2.0,
    'reveals':2.0,
    'raised':2.0,
    'buy': 2.0,
    'sell':-2.0,
    'up':2.0,
    'ups':2.0,
    'down':-2.0,
    'dividend':2.0,
    'acquires':2.0,
    'expansion':2.0,
    'invests':2.0,
    }

analyzer.lexicon.update(new_words)

# Score every headline exactly once and fan the result out into four columns.
# The earlier version called polarity_scores() four times per headline, once for
# each output column.
headline_scores = [analyzer.polarity_scores(v) for v in apple_headlines3['headlines']]
for score_name in ('compound', 'neg', 'neu', 'pos'):
    apple_headlines3[score_name] = [scores[score_name] for scores in headline_scores]

# Show a short sample rather than dumping 1000 rows into the notebook output.
apple_headlines3.head(10)
    


Unnamed: 0_level_0,headlines,compound,neg,neu,pos
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2014-11-12,"alibaba open to working with paypal, apple",0.0,0.0,1.0,0.0
2014-11-17,"apple roundup: iphone 6 supplies, unionpay, sa...",0.0,0.0,1.0,0.0
2014-11-19,"apple roundup: watch, camera, iad, hedge fund",0.0,0.0,1.0,0.0
2014-11-20,apple gain following target hike,0.5267,0.0,0.541,0.459
2014-11-24,apple break high following new bullish note,0.0,0.0,1.0,0.0
2014-11-25,apple's market cap top $700b,0.2023,0.0,0.69,0.31
2014-11-27,swatch ceo say he's unfazed by apple watch,0.0,0.0,1.0,0.0
2014-12-01,apple -3.6% in volatile trading,0.0,0.0,1.0,0.0
2014-12-02,"apple lower following cautious pac crest, deut...",-0.3818,0.375,0.625,0.0
2014-12-02,apple head to trial over itunes update,0.0,0.0,1.0,0.0


## Apple Headlines Average VADER Scores By Date

In [36]:
# Average the four VADER scores for each date.
# The numeric columns are selected explicitly because the frame still contains the
# text column 'headlines': older pandas silently dropped it from mean(), while
# pandas >= 2.0 raises a TypeError when asked to average strings.
apple_headline_vadar = apple_headlines3.groupby('date')[['compound', 'neg', 'neu', 'pos']].mean()

In [37]:
# Preview the per-date average scores.
apple_headline_vadar.head()

Unnamed: 0_level_0,compound,neg,neu,pos
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014-11-12,0.0,0.0,1.0,0.0
2014-11-17,0.0,0.0,1.0,0.0
2014-11-19,0.0,0.0,1.0,0.0
2014-11-20,0.5267,0.0,0.541,0.459
2014-11-24,0.0,0.0,1.0,0.0


In [38]:
# Persist the per-date average scores next to the input headline files.
apple_headline_vadar.to_csv('./datasets/headlines/apple_headline_vadar.csv')