In [17]:
#All libraries used have been added here
import pandas as pd
import numpy as np
import re

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from bs4 import BeautifulSoup

from functools import reduce

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score
from sklearn.pipeline import Pipeline
from datetime import datetime

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')

%config InlineBackend.figure_format = 'retina'
%matplotlib inline

from nltk.corpus import stopwords
nltk.download('stopwords')
import csv




[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/joyceooi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# HEADLINE NEWS (SENTIMENT ANALYSIS) ADOBE##


In [18]:
fb_headlines=pd.read_csv('./datasets/headlines/fb_headlines.csv')
google_headlines=pd.read_csv('./datasets/headlines/google_headlines.csv')
amazon_headlines=pd.read_csv('./datasets/headlines/amazon_headlines.csv')
equinix_headlines=pd.read_csv('./datasets/headlines/equinix_headlines.csv')
tsla_headlines=pd.read_csv('./datasets/headlines/tsla_headlines.csv')
apple_headlines=pd.read_csv('./datasets/headlines/apple_headlines.csv')
nflx_headlines=pd.read_csv('./datasets/headlines/netflix_headlines.csv')
cloudera_headlines=pd.read_csv('./datasets/headlines/cloudera_headlines.csv')
adobe_headlines=pd.read_csv('./datasets/headlines/adobe_headlines.csv')

In [19]:
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)

In [20]:
adobe_headlines.head(100)

Unnamed: 0,headlines,date
0,CFRA: History Suggests Owning 'Barbell Portfol...,01-02-18
1,S&P 500 Expected by Goldman Sachs to Climb 7% ...,01-05-18
2,Adobe to Host Conference Call to Discuss TubeM...,01-05-17
3,BMO Capital Markets rates Adobe Systems Overwe...,01-05-17
4,--Analyst Actions: BMO Capital Initiates Cover...,01-05-17
5,Pacific Crest Securities selects grouping of c...,01-05-17
6,Chairman of the Board Geschke Charles M Sells ...,01-05-17
7,Chief Executive Officer Narayen Shantanu Exerc...,01-05-16
8,OPTIONS: Option Implied Volatility for Semicon...,01-07-20
9,Analyst Actions: Adobe Systems Upgraded To Buy...,01-07-19


In [21]:
adobe_headlines.shape

(1355, 2)

In [22]:
adobe_headlines.isnull().sum()

headlines    0
date         0
dtype: int64

In [23]:
adobe_headlines.columns = [x.lower() for x in adobe_headlines.columns]

In [24]:
adobe_headlines['date'] = pd.to_datetime(adobe_headlines['date'])

In [25]:
adobe_headlines.dtypes

headlines            object
date         datetime64[ns]
dtype: object

In [26]:
adobe_headlines.sort_values(by='date', ascending=True)

Unnamed: 0,headlines,date
1243,After Hour Gainers / Losers,2014-12-11
1239,Adobe buying photo marketplace Fotolia for $800M,2014-12-11
1238,"Adobe Systems beats by $0.06, beats on revenue",2014-12-11
1240,More on Adobe: Creative Cloud subs top 3.4M,2014-12-11
1252,Premarket Gainers / Losers,2014-12-12
...,...,...
500,3 Earnings Reports to Watch Next Week,2020-04-03
507,"Traffic at Walmart, Costco and Target falls fo...",2020-04-05
511,Zacks Investment Ideas feature highlights: Twi...,2020-04-06
512,"No Baseball, but My 30-30 Club Continues",2020-04-06


**Separate the Headlines Column for Preprocessing**

In [27]:
adobe_headlines2=cloudera_headlines[['headlines']].copy()

In [28]:
adobe_headlines2

Unnamed: 0,headlines
0,Cloudera and Hortonworks Complete Planned Merger
1,Cloudera-Hortonworks merger closes
2,Financialinsiders.com: 'Roller Coaster Week' M...
3,Cloudera to Participate in Upcoming Financial ...
4,Analyst Actions: DA Davidson Lifts Cloudera's ...
5,Northland starts Cloudera at 30% upside
6,Analyst Actions: Northland Initiates Cloudera ...
7,Analyst Actions: Citigroup Upgrades Cloudera t...
8,"Cloudera +3.7% on analyst upgrades, price targ..."
9,Analyst Actions: Cloudera Gets Upgrade at Mizu...


### Preprocessing Using Tokenizing, Stop Words and Lemmatization/Stemming
---


In [29]:
adobe_headlines2.headline = [x.lower() for x in adobe_headlines.headlines]

  """Entry point for launching an IPython kernel.


In [30]:
tokenizer = RegexpTokenizer(r'\w+') #instantiating the tokenizer

for i in range(len(['headlines'])):#creating a function
    words = adobe_headlines2.iloc[i,0]#selecting by integer-location based indexing
    #(where i is looping through each row and 1 refers to the index [1] position of column)
    content_tokens = tokenizer.tokenize(words.lower())#running the tokenizer and replacing with lowercase
    cleaned_string = ' '.join(content_tokens)#joining back the output with a space " " in between
    adobe_headlines2.iloc[i,0] = cleaned_string #renaming the output of the function

**We made the decision not to implement stopwords as this may dilute the sentiment analysis of the headlines (which are already quite brief)**

In [31]:
# for i in range(len(fb_headlines2['headline'])): #creating a function
#     words = fb_headlines2.iloc[i,0] #selecting by integer-location based indexing(where i is looping through each row and 1 refers to the index [1] position of column)
#     words = words.split() #splitting the words
#     meaningful_words = [w for w in words if not w in stopwords.words('english')]#assigning a variable to collect words which are not found in stop words.
#     print(meaningful_words)
#     cleaned_string = ' '.join(meaningful_words)#joining back the output with a space " " in between
#     fb_headlines2.iloc[i,0] = cleaned_string 

In [34]:
lemmatizer = WordNetLemmatizer() #instantiating the lemmatizer

for i in range(len(adobe_headlines2['headlines'])):#creating a function
    words = adobe_headlines2.iloc[i,0]
    words = words.lower() # changing to lowercase    
    words = re.sub(r'\&\w*;', '', words) # Remove HTML special entities (e.g. &amp;)        
    words = re.sub(r'\s\s+', ' ', words) # Remove whitespace (including new line characters)        
    words = re.sub(r'https?:\/\/.*\/\w*', '', words) # Remove hyperlinks 
    list_words = words.split()
    content_lem = [lemmatizer.lemmatize(i) for i in list_words]
    cleaned_string = ' '.join(content_lem)
    adobe_headlines2.iloc[i,0] = cleaned_string

In [35]:
for i in adobe_headlines2['headlines']:
    print(i)
    

cloudera and hortonworks complete planned merger
cloudera-hortonworks merger close
financialinsiders.com: 'roller coaster week' market news recap ending january 4th, 2019
cloudera to participate in upcoming financial conference
analyst actions: da davidson lift cloudera's price target to $14 from $13, reiterates buy rating
northland start cloudera at 30% upside
analyst actions: northland initiate cloudera at outperform
analyst actions: citigroup upgrade cloudera to buy from neutral
cloudera +3.7% on analyst upgrades, price target increase
analyst actions: cloudera get upgrade at mizuho security from neutral to buy with $21 price target
update: analyst actions: mizuho upgrade cloudera to buy; pt set at $21
how did cloudera, inc. (cldr) compare against top hedge fund stock in 2019?
needham initiate cloudera at buy
analyst actions: needham iniitates cloudera at buy; pt set at $22
cloudera appoints robert bearden president and chief executive officer
cloudera appoints hortonworks founder a

In [36]:
adobe_headlines2_copy=i
pd.DataFrame(adobe_headlines2)
adobe_headlines2

Unnamed: 0,headlines
0,cloudera and hortonworks complete planned merger
1,cloudera-hortonworks merger close
2,financialinsiders.com: 'roller coaster week' m...
3,cloudera to participate in upcoming financial ...
4,analyst actions: da davidson lift cloudera's p...
5,northland start cloudera at 30% upside
6,analyst actions: northland initiate cloudera a...
7,analyst actions: citigroup upgrade cloudera to...
8,"cloudera +3.7% on analyst upgrades, price targ..."
9,analyst actions: cloudera get upgrade at mizuh...


In [37]:
adobe_headline_date=adobe_headlines[['date']]

In [38]:
adobe_headlines[['date']]

Unnamed: 0,date
0,2018-01-02
1,2018-01-05
2,2017-01-05
3,2017-01-05
4,2017-01-05
...,...
1350,2018-12-26
1351,2018-12-28
1352,2018-12-28
1353,2017-12-29


In [39]:
adobe_headlines3= pd.concat([adobe_headline_date, adobe_headlines2], axis=1, join='inner')

In [40]:
adobe_headlines3.shape

(574, 2)

In [41]:
adobe_headlines3.set_index('date',inplace=True)

In [42]:
adobe_headlines3.sort_values(by='date', ascending=True, inplace=True)

In [43]:
adobe_headlines3.head()

Unnamed: 0_level_0,headlines
date,Unnamed: 1_level_1
2015-01-14,cloudera introduces the industry's first machi...
2015-03-16,"apha, ntap, ftnt and flr among notable after h..."
2015-03-17,"research report identifies cloudera, snap, soc..."
2015-03-17,leading pharmaceutical and healthcare organiza...
2015-03-17,cloudera to participate in upcoming financial ...


**The original dataset will be decomposed such that each headline contains a sentiment label, i.e. boolean whether the stock goes up or down.**

# SENTIMENT ANALYSIS USING VADAR (Valence Aware Dictionary and sEntiment Reasoner) (CLOUDERA) 

Vadar  was constructed using human raters from Amazon Mechanical Turk. We regard it as a reliable lexicon to extract emotional or sentiment polarity

We note the limitations of Vadar for this project.Vadar is sensitive to both the polarity and the intensity of sentiments and while some may view it more suited to analyzing social media texts rather than factual headlines which tends to lack emotional intensity, emotions and acronyms, we find it sufficiently applicable to sentiment analysis in our project.

Vadar combines a dictionary of lexical features to valence scores with a set of five heuristics. 
We left in heuristics like punctuation but removed capitalization as this could inaccurately increase the intensity of positive and negative words which we felt might not be relevant for factual headlines.

The Compound score is a metric that calculates the sum of all the lexicon ratings which have been normalized between -1(most extreme negative) and +1 (most extreme positive).

positive sentiment : (compound score >= 0.05)
neutral sentiment : (compound score > -0.05) and (compound score < 0.05)
negative sentiment : (compound score <= -0.05)


Reference: http://comp.social.gatech.edu/papers/icwsm14.vader.hutto.pdf


In [44]:
!pip install vaderSentiment



In [45]:
nltk.download('vader_lexicon')



[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/joyceooi/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [46]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

new_words = {
    'launch': 2.0,
    'developing': 2.0, 
    'breach': -2.4,
    'rally': 2.0,
    'selloff': -2.0,
    'roll out': 2.0,
    'hurt by':-2.0,
    'notable earnings':2.0,
    'unveils':2.0,
    'reveals':2.0,
    'raised':2.0,
    'buy': 2.0,
    'sell':-2.0,
    'up':2.0,
    'ups':2.0,
    'down':-2.0,
    'dividend':2.0,
    'acquires':2.0,
    'expansion':2.0,
    'invests':2.0,
    }
    
analyzer.lexicon.update(new_words)
    
adobe_headlines3['compound'] = [analyzer.polarity_scores(v)['compound'] for v in adobe_headlines3['headlines']]
adobe_headlines3['neg'] = [analyzer.polarity_scores(v)['neg'] for v in adobe_headlines3['headlines']]
adobe_headlines3['neu'] = [analyzer.polarity_scores(v)['neu'] for v in adobe_headlines3['headlines']]
adobe_headlines3['pos'] = [analyzer.polarity_scores(v)['pos'] for v in adobe_headlines3['headlines']]

adobe_headlines3.head(1000)
    


Unnamed: 0_level_0,headlines,compound,neg,neu,pos
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2015-01-14,cloudera introduces the industry's first machi...,0.34,0.0,0.87,0.13
2015-03-16,"apha, ntap, ftnt and flr among notable after h...",0.0,0.0,1.0,0.0
2015-03-17,"research report identifies cloudera, snap, soc...",0.2732,0.0,0.909,0.091
2015-03-17,leading pharmaceutical and healthcare organiza...,0.0,0.0,1.0,0.0
2015-03-17,cloudera to participate in upcoming financial ...,0.0,0.0,1.0,0.0
2015-03-17,"intel add position in switch, carbon black, an...",0.0,0.0,1.0,0.0
2015-03-18,"earnings reaction history: cloudera inc, 87.5%...",0.0,0.0,1.0,0.0
2015-03-30,analyst actions: stifel nicolaus lift cloudera...,0.4588,0.0,0.824,0.176
2015-04-02,cloudera unveils vision for industrialization ...,0.6124,0.0,0.643,0.357
2015-04-07,cloudera announces enhanced partner program,0.0,0.0,1.0,0.0


## Cloudera Headlines Average Vadar Scores By Date

In [47]:
adobe_headline_vadar=adobe_headlines3.groupby('date').mean()

In [48]:
adobe_headline_vadar.head()

Unnamed: 0_level_0,compound,neg,neu,pos
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-01-14,0.34,0.0,0.87,0.13
2015-03-16,0.0,0.0,1.0,0.0
2015-03-17,0.0683,0.0,0.97725,0.02275
2015-03-18,0.0,0.0,1.0,0.0
2015-03-30,0.4588,0.0,0.824,0.176


In [49]:
adobe_headline_vadar.to_csv('./datasets/headlines/adobe_headline_vadar.csv')