In [36]:
#All libraries used have been added here
import pandas as pd
import numpy as np
import re

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from bs4 import BeautifulSoup

from functools import reduce

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score
from sklearn.pipeline import Pipeline
from datetime import datetime

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

from nltk.corpus import stopwords
nltk.download('stopwords')
import csv




[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/joyceooi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# HEADLINE NEWS (SENTIMENT ANALYSIS): CLOUDERA


In [37]:
# Load one headlines CSV per company. The names on the left are bound in the
# same order as the paths on the right (note: nflx_headlines <- netflix_headlines.csv).
(
    fb_headlines,
    google_headlines,
    amazon_headlines,
    equinix_headlines,
    tsla_headlines,
    apple_headlines,
    nflx_headlines,
    cloudera_headlines,
) = map(pd.read_csv, (
    './datasets/headlines/fb_headlines.csv',
    './datasets/headlines/google_headlines.csv',
    './datasets/headlines/amazon_headlines.csv',
    './datasets/headlines/equinix_headlines.csv',
    './datasets/headlines/tsla_headlines.csv',
    './datasets/headlines/apple_headlines.csv',
    './datasets/headlines/netflix_headlines.csv',
    './datasets/headlines/cloudera_headlines.csv',
))

In [38]:
# Let pandas render up to 1000 columns and 1000 rows before truncating output.
pd.options.display.max_columns = 1000
pd.options.display.max_rows = 1000

In [39]:
# Preview the first 100 rows of the raw Cloudera headlines frame.
cloudera_headlines.head(100)

Unnamed: 0,headlines,date
0,Cloudera and Hortonworks Complete Planned Merger,01-03-19
1,Cloudera-Hortonworks merger closes,01-03-19
2,Financialinsiders.com: 'Roller Coaster Week' M...,01-04-19
3,Cloudera to Participate in Upcoming Financial ...,01-07-20
4,Analyst Actions: DA Davidson Lifts Cloudera's ...,01-08-20
5,Northland starts Cloudera at 30% upside,01-09-19
6,Analyst Actions: Northland Initiates Cloudera ...,01-09-19
7,Analyst Actions: Citigroup Upgrades Cloudera t...,01-10-18
8,"Cloudera +3.7% on analyst upgrades, price targ...",01-10-18
9,Analyst Actions: Cloudera Gets Upgrade at Mizu...,01-10-18


In [40]:
# (rows, columns) of the raw headlines frame.
cloudera_headlines.shape

(574, 2)

In [41]:
# Count missing values in each column.
cloudera_headlines.isnull().sum()

headlines    0
date         0
dtype: int64

In [42]:
# Normalise every column label to lower case.
cloudera_headlines.columns = list(map(str.lower, cloudera_headlines.columns))

In [43]:
# The raw dates are month-day-two-digit-year strings (e.g. '01-03-19' is 3 Jan 2019).
# Parse them with an explicit format so the result does not depend on pandas'
# format inference, and so a row in any other layout fails loudly instead of
# being silently mis-parsed.
cloudera_headlines['date'] = pd.to_datetime(cloudera_headlines['date'], format='%m-%d-%y')

In [44]:
# Inspect the column dtypes (the date column should now be datetime64).
cloudera_headlines.dtypes

headlines            object
date         datetime64[ns]
dtype: object

In [45]:
# Display the headlines in chronological order. The sorted frame is not assigned,
# so cloudera_headlines itself keeps its original row order.
cloudera_headlines.sort_values(by='date', ascending=True)

Unnamed: 0,headlines,date
160,Cloudera Announces Pricing of Initial Public O...,2017-04-27
161,Cloudera Prices IPO of 15 Mln Shares at $15 Ea...,2017-04-28
162,Cloudera jumps 25% in public debut,2017-04-28
163,Stock Move: Cloudera Shares Rise 25% Above IPO...,2017-04-28
164,Sector Update: Tech Stocks Counter Balance Dow...,2017-04-28
165,Equities End Weaker After US GDP But April Bri...,2017-04-28
166,Cloudera Announces General Availability of Dat...,2017-05-01
167,Cloudera Named to CRN's Big Data 100 List for ...,2017-05-02
169,Cloudera Announces Closing of Initial Public O...,2017-05-03
168,Good Guys Win: Apache Spot (Incubating) Team C...,2017-05-03


**Separate the Headlines Column for Preprocessing**

In [46]:
# Independent one-column frame holding only the headline text, so preprocessing
# does not touch cloudera_headlines.
cloudera_headlines2 = cloudera_headlines.loc[:, ['headlines']].copy()

In [47]:
# Display the headline-only frame before any preprocessing.
cloudera_headlines2

Unnamed: 0,headlines
0,Cloudera and Hortonworks Complete Planned Merger
1,Cloudera-Hortonworks merger closes
2,Financialinsiders.com: 'Roller Coaster Week' M...
3,Cloudera to Participate in Upcoming Financial ...
4,Analyst Actions: DA Davidson Lifts Cloudera's ...
5,Northland starts Cloudera at 30% upside
6,Analyst Actions: Northland Initiates Cloudera ...
7,Analyst Actions: Citigroup Upgrades Cloudera t...
8,"Cloudera +3.7% on analyst upgrades, price targ..."
9,Analyst Actions: Cloudera Gets Upgrade at Mizu...


### Preprocessing Using Tokenizing, Stop Words and Lemmatization/Stemming
---


In [48]:
# Lower-case the headline text. Assign through the column key: attribute-style
# assignment to a name that is not an existing column (`.headline`) only sets a
# Python attribute on the DataFrame object and leaves the data untouched.
cloudera_headlines2['headlines'] = cloudera_headlines['headlines'].str.lower()

  """Entry point for launching an IPython kernel.


In [49]:
# Tokenizer that keeps runs of word characters (\w+); anything else, including
# punctuation, is dropped when the tokens are re-joined below.
tokenizer = RegexpTokenizer(r'\w+') #instantiating the tokenizer

# NOTE(review): len(['headlines']) is the length of a one-element list literal,
# i.e. 1, so this loop body runs only for i == 0 and only the FIRST headline is
# tokenized. Every other row keeps its punctuation. Looping over all rows
# (e.g. range(len(cloudera_headlines2))) would strip punctuation from every
# headline and change the sentiment scores computed later; the notebook text says
# punctuation is meant to be left in, so confirm the intent before changing
# the bound (or drop this cell).
for i in range(len(['headlines'])):#creating a function
    words = cloudera_headlines2.iloc[i,0]# cell at row i, column position 0 (the headline text)
    content_tokens = tokenizer.tokenize(words.lower())# lower-case, then split into word tokens
    cleaned_string = ' '.join(content_tokens)# re-join the tokens with single spaces
    cloudera_headlines2.iloc[i,0] = cleaned_string # write the cleaned text back into the same cell

**We decided not to remove stop words, as doing so may dilute the sentiment analysis of the headlines (which are already quite brief).**

In [50]:
# for i in range(len(fb_headlines2['headline'])): #creating a function
#     words = fb_headlines2.iloc[i,0] #selecting by integer-location based indexing(where i is looping through each row and 1 refers to the index [1] position of column)
#     words = words.split() #splitting the words
#     meaningful_words = [w for w in words if not w in stopwords.words('english')]#assigning a variable to collect words which are not found in stop words.
#     print(meaningful_words)
#     cleaned_string = ' '.join(meaningful_words)#joining back the output with a space " " in between
#     fb_headlines2.iloc[i,0] = cleaned_string 

In [51]:
lemmatizer = WordNetLemmatizer()  # one shared lemmatizer for every headline


def _lemmatize_headline(text):
    """Lower-case a headline, strip HTML entities and links, and lemmatize each word."""
    text = text.lower()
    text = re.sub(r'\&\w*;', '', text)               # HTML special entities (e.g. &amp;)
    text = re.sub(r'\s\s+', ' ', text)               # collapse whitespace runs (incl. newlines)
    text = re.sub(r'https?:\/\/.*\/\w*', '', text)   # hyperlinks
    # Split on whitespace, lemmatize token by token, re-join with single spaces.
    return ' '.join(lemmatizer.lemmatize(token) for token in text.split())


# Replace each headline with its cleaned, lemmatized form.
cloudera_headlines2['headlines'] = [
    _lemmatize_headline(headline) for headline in cloudera_headlines2['headlines']
]

In [52]:
# Print every preprocessed headline, one per line, for a visual check.
# Note: after the loop finishes, `i` stays bound to the last headline string.
for i in cloudera_headlines2['headlines']:
    print(i)
    

cloudera and hortonworks complete planned merger
cloudera-hortonworks merger close
financialinsiders.com: 'roller coaster week' market news recap ending january 4th, 2019
cloudera to participate in upcoming financial conference
analyst actions: da davidson lift cloudera's price target to $14 from $13, reiterates buy rating
northland start cloudera at 30% upside
analyst actions: northland initiate cloudera at outperform
analyst actions: citigroup upgrade cloudera to buy from neutral
cloudera +3.7% on analyst upgrades, price target increase
analyst actions: cloudera get upgrade at mizuho security from neutral to buy with $21 price target
update: analyst actions: mizuho upgrade cloudera to buy; pt set at $21
how did cloudera, inc. (cldr) compare against top hedge fund stock in 2019?
needham initiate cloudera at buy
analyst actions: needham iniitates cloudera at buy; pt set at $22
cloudera appoints robert bearden president and chief executive officer
cloudera appoints hortonworks founder a

In [53]:
# Keep a snapshot of the preprocessed headlines. Take a real copy of the frame
# rather than binding whatever value a previous cell's loop variable happened to
# hold, so this cell does not depend on leftover kernel state.
cloudera_headlines2_copy = cloudera_headlines2.copy()

# Display the preprocessed headlines.
cloudera_headlines2

Unnamed: 0,headlines
0,cloudera and hortonworks complete planned merger
1,cloudera-hortonworks merger close
2,financialinsiders.com: 'roller coaster week' m...
3,cloudera to participate in upcoming financial ...
4,analyst actions: da davidson lift cloudera's p...
5,northland start cloudera at 30% upside
6,analyst actions: northland initiate cloudera a...
7,analyst actions: citigroup upgrade cloudera to...
8,"cloudera +3.7% on analyst upgrades, price targ..."
9,analyst actions: cloudera get upgrade at mizuh...


In [54]:
# One-column frame with the parsed dates, to be re-joined with the cleaned headlines.
cloudera_headline_date = cloudera_headlines.loc[:, ['date']]

In [55]:
# Display the date column as a one-column frame.
cloudera_headlines[['date']]

Unnamed: 0,date
0,2019-01-03
1,2019-01-03
2,2019-01-04
3,2020-01-07
4,2020-01-08
5,2019-01-09
6,2019-01-09
7,2018-01-10
8,2018-01-10
9,2018-01-10


In [56]:
# Put the dates and the cleaned headlines side by side, matching rows on the
# shared index and keeping only index labels present in both frames.
cloudera_headlines3 = pd.concat(
    objs=[cloudera_headline_date, cloudera_headlines2],
    axis='columns',
    join='inner',
)

In [57]:
# (rows, columns) of the combined date + headline frame.
cloudera_headlines3.shape

(574, 2)

In [58]:
# Move 'date' out of the columns and use it as the row index.
cloudera_headlines3 = cloudera_headlines3.set_index('date')

In [59]:
# Order the rows chronologically; 'date' is resolved against the index name here.
cloudera_headlines3 = cloudera_headlines3.sort_values('date')

In [60]:
# Show the first five (earliest) rows.
cloudera_headlines3.head()

Unnamed: 0_level_0,headlines
date,Unnamed: 1_level_1
2017-04-27,cloudera announces pricing of initial public o...
2017-04-28,cloudera price ipo of 15 mln share at $15 each...
2017-04-28,cloudera jump 25% in public debut
2017-04-28,stock move: cloudera share rise 25% above ipo ...
2017-04-28,sector update: tech stock counter balance down...


**The original dataset will be decomposed such that each headline contains a sentiment label, i.e. boolean whether the stock goes up or down.**

# SENTIMENT ANALYSIS USING VADER (Valence Aware Dictionary and sEntiment Reasoner) (CLOUDERA)

VADER was constructed using human raters from Amazon Mechanical Turk. We regard it as a reliable lexicon for extracting emotional or sentiment polarity.

We note the limitations of VADER for this project. VADER is sensitive to both the polarity and the intensity of sentiments, and while some may view it as better suited to analyzing social media texts than factual headlines, which tend to lack emotional intensity, emotions and acronyms, we find it sufficiently applicable to sentiment analysis in our project.

VADER combines a dictionary that maps lexical features to valence scores with a set of five heuristics.
We left in heuristics like punctuation but removed capitalization, as this could inaccurately increase the intensity of positive and negative words, which we felt might not be relevant for factual headlines.

The Compound score is a metric that calculates the sum of all the lexicon ratings which have been normalized between -1(most extreme negative) and +1 (most extreme positive).

- positive sentiment: compound score >= 0.05
- neutral sentiment: compound score > -0.05 and compound score < 0.05
- negative sentiment: compound score <= -0.05


Reference: http://comp.social.gatech.edu/papers/icwsm14.vader.hutto.pdf


In [61]:
!pip install vaderSentiment



In [62]:
# Fetch NLTK's copy of the VADER lexicon.
# NOTE(review): the scoring cell imports SentimentIntensityAnalyzer from the
# `vaderSentiment` package, not from nltk, so this download is presumably unused
# in this section -- confirm before removing.
nltk.download('vader_lexicon')



[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/joyceooi/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [67]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

# Finance-specific valence overrides/additions layered on top of the stock lexicon.
# NOTE(review): the multi-word keys ('roll out', 'hurt by', 'notable earnings')
# probably never match, because VADER appears to look up one whitespace-separated
# token at a time -- confirm against the vaderSentiment source.
new_words = {
    'launch': 2.0,
    'developing': 2.0, 
    'breach': -2.4,
    'rally': 2.0,
    'selloff': -2.0,
    'roll out': 2.0,
    'hurt by':-2.0,
    'notable earnings':2.0,
    'unveils':2.0,
    'reveals':2.0,
    'raised':2.0,
    'buy': 2.0,
    'sell':-2.0,
    'up':2.0,
    'ups':2.0,
    'down':-2.0,
    'dividend':2.0,
    'acquires':2.0,
    'expansion':2.0,
    'invests':2.0,
    }

analyzer.lexicon.update(new_words)

# Score every headline exactly once (polarity_scores returns all four values in
# one dict), then fan the results out into one column per score. Plain lists are
# assigned so values go in positionally -- the date index contains duplicates.
headline_scores = [analyzer.polarity_scores(v) for v in cloudera_headlines3['headlines']]
for score_name in ('compound', 'neg', 'neu', 'pos'):
    cloudera_headlines3[score_name] = [scores[score_name] for scores in headline_scores]

cloudera_headlines3.head(1000)
    


Unnamed: 0_level_0,headlines,compound,neg,neu,pos
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-04-27,cloudera announces pricing of initial public o...,0.0,0.0,1.0,0.0
2017-04-28,cloudera price ipo of 15 mln share at $15 each...,0.296,0.0,0.891,0.109
2017-04-28,cloudera jump 25% in public debut,0.0,0.0,1.0,0.0
2017-04-28,stock move: cloudera share rise 25% above ipo ...,0.296,0.0,0.833,0.167
2017-04-28,sector update: tech stock counter balance down...,-0.1027,0.149,0.851,0.0
2017-04-28,equity end weaker after u gdp but april brings...,0.5647,0.125,0.579,0.296
2017-05-01,cloudera announces general availability of dat...,0.0,0.0,1.0,0.0
2017-05-02,cloudera named to crn's big data 100 list for ...,0.0,0.0,1.0,0.0
2017-05-03,cloudera announces closing of initial public o...,0.296,0.0,0.885,0.115
2017-05-03,good guy win: apache spot (incubating) team co...,0.8807,0.0,0.595,0.405


## Cloudera Headlines Average VADER Scores By Date

In [68]:
# Average the sentiment scores of all headlines published on the same date.
# numeric_only=True makes the exclusion of the text 'headlines' column explicit;
# recent pandas versions no longer drop non-numeric columns silently and would
# raise when asked for the mean of strings.
cloudera_headline_vadar = cloudera_headlines3.groupby('date').mean(numeric_only=True)

In [69]:
# Show the first five per-date average scores.
cloudera_headline_vadar.head()

Unnamed: 0_level_0,compound,neg,neu,pos
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-04-27,0.0,0.0,1.0,0.0
2017-04-28,0.2108,0.0548,0.8308,0.1144
2017-05-01,0.0,0.0,1.0,0.0
2017-05-02,0.0,0.0,1.0,0.0
2017-05-03,0.58835,0.0,0.74,0.26


In [70]:
# Persist the per-date average scores (the date index is written as the first CSV column).
cloudera_headline_vadar.to_csv('./datasets/headlines/cloudera_headline_vadar.csv')