In [2]:
# Libraries used by the notebook.
# NOTE(review): SentimentIntensityAnalyzer (vaderSentiment) is imported later, in the
# cell that builds the analyzer, so this cell does not list every dependency.
import pandas as pd
import numpy as np
import re

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from bs4 import BeautifulSoup

from functools import reduce

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score
from sklearn.pipeline import Pipeline
from datetime import datetime

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
# Apply the 'fivethirtyeight' matplotlib style to all figures drawn afterwards.
plt.style.use('fivethirtyeight')

# Render inline figures at retina resolution.
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

# NOTE(review): duplicate of the `stopwords` import above; harmless but redundant.
from nltk.corpus import stopwords
# Fetch the NLTK stop-word corpus (no-op when it is already up to date).
nltk.download('stopwords')
import csv




[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/joyceooi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# HEADLINE NEWS (SENTIMENT ANALYSIS) TESLA


In [11]:

# Load the raw Tesla headlines (one row per headline with its date string).
headlines_csv_path = './datasets/headlines/tsla_headlines.csv'
tsla_headlines = pd.read_csv(headlines_csv_path)

In [12]:
# Let pandas display up to 1000 columns and 1000 rows before truncating output.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 1000)

In [13]:
# Preview the first 100 raw rows.
tsla_headlines.head(n=100)

Unnamed: 0,headlines,date
0,Tesla faces lawsuit claiming racism,01-01-20
1,Tesla will face lawsuit claiming racism,01-01-20
2,Tesla Model 3 Dutch Registrations Quadruple in...,01-02-20
3,First customers to receive China-made Model 3s,01-02-20
4,Market Chatter: Court Rejects Tesla's Motion t...,01-02-20
5,Traffic Safety Agency to Probe Fatal Tesla Cra...,01-02-20
6,Market Chatter: Tesla to Start Delivering Mode...,01-02-20
7,Canaccord Genuity fires off Street-high PT on ...,01-02-20
8,"Tesla Q4 2018 Vehicle Production & Deliveries,...",01-02-19
9,"Tesla plunges after deliveries update, price cut",01-02-19


In [14]:
# (rows, columns) of the raw headlines frame.
tsla_headlines.shape

(6349, 2)

In [15]:
# Number of missing values in each column.
tsla_headlines.isna().sum()

headlines    0
date         1
dtype: int64

In [16]:
# Normalise the column labels to lowercase.
tsla_headlines.columns = tsla_headlines.columns.str.lower()

In [17]:
# Convert the 'date' strings to datetime64 values.
# NOTE(review): no explicit format is passed, so pandas infers it; the sample rows
# look like MM-DD-YY -- confirm this holds for the whole file.
tsla_headlines = tsla_headlines.assign(date=pd.to_datetime(tsla_headlines['date']))

In [18]:
# Confirm that 'date' is now datetime64 and 'headlines' is still object (text).
tsla_headlines.dtypes

headlines            object
date         datetime64[ns]
dtype: object

In [19]:
# Display the headlines in chronological order (display only: the result is not stored).
tsla_headlines.sort_values('date')

Unnamed: 0,headlines,date
5842,Tesla Motors preps solar rooftop initiative,2014-11-13
5851,Report: Markdown time for some 2014 Model S ve...,2014-11-14
5904,Worries on the Model X delivery schedule nip a...,2014-11-19
5954,"Tesla, BMW in talks over possible collaboration",2014-11-24
5997,Automaker stocks react to oil price slide,2014-11-28
...,...,...
2996,Stocks - JPMorgan up in Premarket After Dimon'...,2020-04-06
2994,Market Chatter: Tesla Dismisses Contractors Fr...,2020-04-06
3010,"Auto Stock Roundup: TSLA Q1 Deliveries, KMX Ea...",2020-04-06
2995,Tesla Shows 'Car Parts-Based' Ventilator Proto...,2020-04-06


**Separate the Headlines Column for Preprocessing**

In [20]:
# Independent single-column frame holding only the headline text, for preprocessing.
tsla_headlines2 = tsla_headlines.loc[:, ['headlines']].copy()

In [21]:
# Display the headline-only frame before any preprocessing.
tsla_headlines2

Unnamed: 0,headlines
0,Tesla faces lawsuit claiming racism
1,Tesla will face lawsuit claiming racism
2,Tesla Model 3 Dutch Registrations Quadruple in...
3,First customers to receive China-made Model 3s
4,Market Chatter: Court Rejects Tesla's Motion t...
...,...
6344,Sector Update: Most Top Consumer Stocks Lower ...
6345,December U.S. auto sales forecast
6346,Tesla still with plenty of inventory?
6347,Market Chatter: Tesla Reportedly Left With Ove...


### Preprocessing Using Tokenizing, Stop Words and Lemmatization/Stemming
---


In [22]:
# Lowercase the headline text.
# The column has to be assigned with [] indexing: attribute-style assignment to a
# name that is not an existing column only sets a Python attribute on the
# DataFrame (pandas emits a UserWarning) and leaves the data unchanged.
tsla_headlines2['headlines'] = tsla_headlines2['headlines'].str.lower()

  """Entry point for launching an IPython kernel.


In [23]:
# Tokenizer that keeps only runs of word characters (\w+); punctuation is dropped.
tokenizer = RegexpTokenizer(r'\w+')

# NOTE(review): len(['headlines']) is the length of a one-element list, i.e. 1, so
# this loop body runs only for i == 0 and only the first headline is tokenized;
# every other row keeps its punctuation. Looping over
# range(len(tsla_headlines2['headlines'])) would tokenize every row, but would also
# strip punctuation from all headlines before scoring -- confirm which is intended.
for i in range(len(['headlines'])):
    words = tsla_headlines2.iloc[i,0]  # row i, column position 0
    content_tokens = tokenizer.tokenize(words.lower())  # lowercase, then split into word tokens
    cleaned_string = ' '.join(content_tokens)  # re-join the tokens with single spaces
    tsla_headlines2.iloc[i,0] = cleaned_string  # write the cleaned text back to the same cell

**We decided not to remove stop words, as doing so may dilute the sentiment analysis of the headlines (which are already quite brief).**

In [24]:
# for i in range(len(fb_headlines2['headline'])): #creating a function
#     words = fb_headlines2.iloc[i,0] #selecting by integer-location based indexing(where i is looping through each row and 1 refers to the index [1] position of column)
#     words = words.split() #splitting the words
#     meaningful_words = [w for w in words if not w in stopwords.words('english')]#assigning a variable to collect words which are not found in stop words.
#     print(meaningful_words)
#     cleaned_string = ' '.join(meaningful_words)#joining back the output with a space " " in between
#     fb_headlines2.iloc[i,0] = cleaned_string 

In [25]:
# NOTE: WordNetLemmatizer looks words up in the NLTK 'wordnet' corpus, which must be
# available locally (nltk.download('wordnet')) for lemmatize() to work.
lemmatizer = WordNetLemmatizer()


def clean_headline(text):
    """Normalise one headline and lemmatize each of its words.

    The steps run in this order: lowercase, strip HTML entities such as
    ``&amp;``, collapse runs of whitespace, strip hyperlinks, then lemmatize
    every whitespace-separated word with the module-level ``lemmatizer`` and
    re-join the words with single spaces.

    Args:
        text: the headline string to clean.

    Returns:
        The cleaned, lemmatized headline as a single string.
    """
    text = text.lower()
    text = re.sub(r'\&\w*;', '', text)  # remove HTML special entities (e.g. &amp;)
    text = re.sub(r'\s\s+', ' ', text)  # collapse repeated whitespace (incl. newlines)
    text = re.sub(r'https?:\/\/.*\/\w*', '', text)  # remove hyperlinks
    return ' '.join(lemmatizer.lemmatize(word) for word in text.split())


# One pass over the whole column; avoids a per-row .iloc write and keeps the row
# index and the per-word variable clearly separate.
tsla_headlines2['headlines'] = tsla_headlines2['headlines'].map(clean_headline)

In [26]:
# Print every cleaned headline, one per line, to inspect the preprocessing result.
# NOTE(review): this emits one output line per row of the frame; a small sample
# (.head() / .sample()) would keep the notebook output readable.
# After the loop finishes, `i` stays bound to the last headline string.
for i in tsla_headlines2['headlines']:
    print(i)
    

tesla face lawsuit claiming racism
tesla will face lawsuit claiming racism
tesla model 3 dutch registration quadruple in december
first customer to receive china-made model 3
market chatter: court reject tesla's motion to dismiss racial discrimination lawsuit
traffic safety agency to probe fatal tesla crash in california
market chatter: tesla to start delivering model 3 sedan to the public on jan.7
canaccord genuity fire off street-high pt on tesla
tesla q4 2018 vehicle production & deliveries, also announcing $2,000 price reduction in u
tesla plunge after delivery update, price cut
analyst actions: canaccord genuity lift tesla's price target to $515 from $375, reiterates buy rating
tsla and mt among premarket loser
thinking about investing in apple, alibaba, netflix, nvidia and tesla motor in the new year?
tesla cut u vehicle price by $2,000 a fourth-quarter production, delivery set company record
tesla q4 delivery miss expectations, lower price of all model to offset federal tax cred

tesla's capital raise called a smart move
jobless claims, cpi lower; tesla (tsla) raising stock offering
household inflation lower in january
auto stock roundup: tm beats, hfc misses, tsla issue recall & more
tesla plan $2 billion stock offering; musk and ellison to purchase new share
tesla bruise another hedge fund with bearish gmt facing loss
tesla's stock sale is so right but feel so wrong
tesla look to raise up to $2.31 billion in stock offering with musk, oracle's ellison set to invest
tesla discloses new sec investigation and doj inquiry
tesla plan $2 billion offering a elon musk seizes on stock surge
jim cramer: tesla's common stock offering a good thing in the long run
tesla is selling more stock, discloses subpoena from the sec
tesla's $2 billion capital raise is a 'wise insurance policy,' analyst say
tesla is highly challenged, hence the capital raise
tesla field a fresh sec inquiry after another probe end
tesla just solved it biggest problem by selling more stock
trader rene

tesla plan to halt u.s. car production over coronavirus
tesla suspends production at u.s. vehicle factory due to coronavirus
tesla to shut down fremont factory due to coronavirus, say it ha enough cash to survive
press digest- new york time business news - march 20
press digest- wall street journal - march 20
u.s. safety agency probe first self-driving death
tesla's china car registration fall 35% month-on-month in february
wedbush expects tesla to outperform
tesla discontinues 60kwh model s
coronavirus shuts down california, senate debate $1t stimulus, apple, tesla - 5 thing you must know friday
musk abandon attempt to keep tesla factory open
dow jones future signal coronavirus stock market rally amid stimulus hopes; tesla, boeing, amd, nvidia, amazon rise
why chile is the saudi arabia of the lithium industry
wall street set for two-day winning streak on stimulus high
this might be the best time to buy tesla stock ever
automaker look to air it out
the who say there are 7,000 total cor

mobileye, bmw, intel to develop self-driving vehicle by 2021
tesla hit model 3 mark a few hour late
dvd player found in the tesla car involved in crash, florida department of highway safety say
tesla delivers 11,507 vehicle in q2 of 2015
panasonic weighing further gigafactory investment
market chatter: tesla succeeds in hitting model 3 output goal but miss self-imposed deadline
tesla rally after model 3 production update
tesla delivery surge 52% to company record
sector update: consumer
tesla q2 2018 vehicle production and delivery
wall street see downbeat session a global trade uncertainty linger
tesla hit model 3 output goal of 5,000 car per week a musk aim for more
premarket gainer a of 9:05 am (07/02/2018)
latest tesla exit is an engineering exec
tesla up more than 5% on q2 increase in production, see model 3 production up to 6,000 per week by late aug; co reaffirms outlook
tesla update on production and reservation
equity slide in u a trade worries, oil decline weigh
sector update

market chatter: tesla motor little changed despite report of model s autopilot fatality in china
tesla on track to hit model 3 production guidance - electrek
update: tesla motor just higher, company say it's investigating fatal crash in china
baird a big believer in tesla's autopilot
market chatter: tesla on track for q3 model 3 production guidance with output outburst
shareholder alert: pomerantz law firm reminds shareholder with loss on their investment in tesla, inc. of class action lawsuit and upcoming deadline – tsla
tesla brand shareholder alert: claimsfiler reminds investor with loss in excess of $100,000 of lead plaintiff deadline in class action lawsuit against tesla, inc. - tsla
mobileye: tesla 'pushed the envelope on safety'
pre-market technical recap on auto maker stock -- tesla, fiat chrysler automobiles, ford motor, and general motor
research report on auto maker equity -- ford, general motors, fiat chrysler, and tesla
strengthening tech in the lithium-ion battery market


tesla bull ratchet down expectation
tesla motors, intel, domino's pizza, teck resource limited, and kite pharma and more offer option-trading opportunity that offer return of more than 20%
fitch: battery could be key disruptor to oil industry in "investor death spiral"
tesla motor post later schedule for model 3 delivery
solarcity offer $1,000 rebate to airbnb member for installing solar system
tesla motor up almost 2% although company delay model 3 delivery estimate to mid-2018 or later
tesla update on model 3 delivery schedule for new buyer
tesla add $100 non-refundable order fee, tweak company return policy
update: tesla motor up almost 2%, company say new model 3 delivery estimate to mid-2018 or later
electric hummer could be part of gm's move into ev trucks, suv - reuters
tsla loss notice: rosen law firm reminds tesla, inc. investor of important deadline in first filed class action – tsla
musk announces release of lower-cost, mid-range model 3
early premarket gainer include pypl, 

In [27]:
# Keep an independent snapshot of the cleaned headlines. A real .copy() is taken so
# that the snapshot is a DataFrame of all rows (not a single leftover string) and
# later changes to tsla_headlines2 do not alter it.
tsla_headlines2_copy = tsla_headlines2.copy()

# Display the cleaned frame.
tsla_headlines2

Unnamed: 0,headlines
0,tesla face lawsuit claiming racism
1,tesla will face lawsuit claiming racism
2,tesla model 3 dutch registration quadruple in ...
3,first customer to receive china-made model 3
4,market chatter: court reject tesla's motion to...
...,...
6344,sector update: most top consumer stock lower p...
6345,december u.s. auto sale forecast
6346,tesla still with plenty of inventory?
6347,market chatter: tesla reportedly left with ove...


In [28]:
# Single-column frame with the parsed date of every headline.
tsla_headline_date = tsla_headlines.loc[:, ['date']]

In [29]:
# Display the date column on its own (as a one-column DataFrame).
tsla_headlines[['date']]

Unnamed: 0,date
0,2020-01-01
1,2020-01-01
2,2020-01-02
3,2020-01-02
4,2020-01-02
...,...
6344,2019-12-31
6345,2018-12-31
6346,2018-12-31
6347,2018-12-31


In [30]:
# Put the dates and the cleaned headlines side by side, keeping only row labels
# present in both frames.
tsla_headlines3 = pd.concat(
    [tsla_headline_date, tsla_headlines2],
    axis='columns',
    join='inner',
)

In [31]:
# (rows, columns) after joining dates with cleaned headlines.
tsla_headlines3.shape

(6349, 2)

In [32]:
# Use the headline date as the row index.
tsla_headlines3 = tsla_headlines3.set_index('date')

In [33]:
# Order the rows chronologically; 'date' here refers to the index level of that name.
tsla_headlines3 = tsla_headlines3.sort_values(by='date')

In [34]:
# Show the last five rows of the date-sorted frame.
tsla_headlines3.tail(n=5)

Unnamed: 0_level_0,headlines
date,Unnamed: 1_level_1
2020-04-06,stock - jpmorgan up in premarket after dimon's...
2020-04-06,market chatter: tesla dismisses contractor fro...
2020-04-06,"auto stock roundup: tsla q1 deliveries, kmx ea..."
2020-04-06,tesla show 'car parts-based' ventilator protot...
NaT,yahoo


**The original dataset will be decomposed such that each headline contains a sentiment label, i.e. boolean whether the stock goes up or down.**

# SENTIMENT ANALYSIS USING VADER (Valence Aware Dictionary and sEntiment Reasoner) (TESLA)

VADER was constructed using human raters from Amazon Mechanical Turk. We regard it as a reliable lexicon for extracting emotional or sentiment polarity.

We note the limitations of VADER for this project. VADER is sensitive to both the polarity and the intensity of sentiment. While some may view it as better suited to analyzing social media text than factual headlines, which tend to lack emotional intensity, emotions and acronyms, we find it sufficiently applicable to the sentiment analysis in our project.

VADER combines a dictionary that maps lexical features to valence scores with a set of five heuristics.
We left in heuristics like punctuation but removed capitalization, as this could inaccurately increase the intensity of positive and negative words, which we felt might not be relevant for factual headlines.

The compound score is a metric that sums all the lexicon ratings and normalizes the result to lie between -1 (most extreme negative) and +1 (most extreme positive).

positive sentiment : (compound score >= 0.05)
neutral sentiment : (compound score > -0.05) and (compound score < 0.05)
negative sentiment : (compound score <= -0.05)


Reference: http://comp.social.gatech.edu/papers/icwsm14.vader.hutto.pdf


In [35]:
!pip install vaderSentiment



In [36]:
# Download NLTK's copy of the VADER lexicon.
# NOTE(review): the analyzer used in this notebook is imported from the separate
# vaderSentiment package rather than nltk.sentiment, so this download may be
# unnecessary -- confirm before removing.
nltk.download('vader_lexicon')



[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/joyceooi/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [37]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

# Finance-specific valence scores added to (or overriding) the stock VADER lexicon.
# NOTE(review): VADER appears to look words up one whitespace-separated token at a
# time, so the multi-word keys ('roll out', 'hurt by', 'notable earnings') probably
# never match -- confirm against the vaderSentiment source.
new_words = {
    'launch': 2.0,
    'developing': 2.0, 
    'breach': -2.4,
    'rally': 2.0,
    'selloff': -2.0,
    'roll out': 2.0,
    'hurt by':-2.0,
    'notable earnings':2.0,
    'unveils':2.0,
    'reveals':2.0,
    'raised':2.0,
    'buy': 2.0,
    'sell':-2.0,
    'up':2.0,
    'ups':2.0,
    'down':-2.0,
    'dividend':2.0,
    'acquires':2.0,
    'expansion':2.0,
    'invests':2.0,
    }

analyzer.lexicon.update(new_words)

# Score every headline exactly once and reuse the result for all four columns,
# instead of re-running the analyzer separately for each column.
headline_scores = [analyzer.polarity_scores(text) for text in tsla_headlines3['headlines']]

# Plain lists are assigned positionally, which is what we want here because the
# date index contains duplicate labels.
for score_name in ('compound', 'neg', 'neu', 'pos'):
    tsla_headlines3[score_name] = [scores[score_name] for scores in headline_scores]

tsla_headlines3.head(1000)
    


Unnamed: 0_level_0,headlines,compound,neg,neu,pos
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2014-11-13,tesla motor prep solar rooftop initiative,0.0,0.0,1.0,0.0
2014-11-14,report: markdown time for some 2014 model s ve...,0.0,0.0,1.0,0.0
2014-11-19,worry on the model x delivery schedule nip at ...,-0.4404,0.225,0.775,0.0
2014-11-24,"tesla, bmw in talk over possible collaboration",0.0,0.0,1.0,0.0
2014-11-28,automaker stock react to oil price slide,0.0,0.0,1.0,0.0
2014-11-28,report: bmw to pas on tesla motor investment,0.0,0.0,1.0,0.0
2014-12-01,down day for tesla motor amid new debate,-0.4588,0.3,0.7,0.0
2014-12-02,tesla motor lead owner satisfaction survey,0.4404,0.0,0.633,0.367
2014-12-03,study: nanoparticles could cut lithium-ion bat...,-0.2732,0.231,0.769,0.0
2014-12-08,$50k is the new $30k for automobile industry,0.0,0.0,1.0,0.0


## Tesla Headlines Summed VADER Scores By Date

In [38]:
# Per-date totals of the four VADER score columns.
# The score columns are selected explicitly so the text column 'headlines' is never
# aggregated: newer pandas versions no longer silently drop non-numeric columns in
# groupby().sum() and would concatenate the headline strings instead.
# NOTE(review): this adds the per-headline scores up, so days with several headlines
# can fall outside [-1, 1]; use .mean() if a per-date average is wanted -- confirm.
tsla_headline_vadar = tsla_headlines3.groupby('date')[['compound', 'neg', 'neu', 'pos']].sum()

In [39]:
# Show the first five per-date score rows.
tsla_headline_vadar.head(n=5)

Unnamed: 0_level_0,compound,neg,neu,pos
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014-11-13,0.0,0.0,1.0,0.0
2014-11-14,0.0,0.0,1.0,0.0
2014-11-19,-0.4404,0.225,0.775,0.0
2014-11-24,0.0,0.0,1.0,0.0
2014-11-28,0.0,0.0,2.0,0.0


In [40]:
# Persist the per-date VADER scores (the date index is written as the first column).
vadar_scores_csv = './datasets/headlines/tsla_headline_vadar.csv'
tsla_headline_vadar.to_csv(vadar_scores_csv)