# Sentiment Analysis

## Get the closing prices of Regeneron

In [1]:
import os
import requests
import pandas as pd
from dotenv import load_dotenv
import alpaca_trade_api as tradeapi
%matplotlib inline

# Load .env environment variables
load_dotenv()

# Set Alpaca API key and secret
alpaca_api_key = os.getenv("ALPACA_API_KEY")
alpaca_secret_key = os.getenv("ALPACA_SECRET_KEY")

# Create the Alpaca API object
alpaca = tradeapi.REST(
    alpaca_api_key,
    alpaca_secret_key,
    api_version="v2")

# Format current date as ISO format
start_date = pd.Timestamp("2014-11-06", tz="America/New_York").isoformat()
end_date = pd.Timestamp("2021-01-16", tz="America/New_York").isoformat()

# Set the tickers
ticker = "REGN"

# Set timeframe to one day ('1D') for the Alpaca API
timeframe = "1D"

# Get current closing prices
df_closing_prices = alpaca.get_barset(
    ticker,
    timeframe,
    start = start_date,
    end = end_date
).df

# Dissolve multiindex and fetch the closing prices 
df_closing_prices = df_closing_prices.droplevel(0, axis=1)[['close']]

# Drop the time component of the date
df_closing_prices.index = df_closing_prices.index.date

# Display sample data
df_closing_prices.head()


Bad key "text.kerning_factor" on line 4 in
C:\Users\kn_na\anaconda3\envs\x\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test_patch.mplstyle.
You probably need to get an updated matplotlibrc file from
http://github.com/matplotlib/matplotlib/blob/master/matplotlibrc.template
or from the matplotlib source distribution


Unnamed: 0,close
2014-11-06,376.97
2014-11-07,377.62
2014-11-10,385.96
2014-11-11,398.41
2014-11-12,401.05


In [2]:
df_closing_prices['return'] = df_closing_prices['close'].pct_change()
df_closing_prices.dropna(inplace=True)
df_closing_prices.head()

Unnamed: 0,close,return
2014-11-07,377.62,0.001724
2014-11-10,385.96,0.022086
2014-11-11,398.41,0.032257
2014-11-12,401.05,0.006626
2014-11-13,402.64,0.003965


## Load the data of google research results

In [3]:
import pandas as pd
from pathlib import Path
import calendar
from sklearn.preprocessing import LabelEncoder

file_path = Path("../companies_tweet_data/regeneron.csv")
news_df = pd.read_csv(file_path, parse_dates=[['date','time']], infer_datetime_format=True, usecols=['date',"tweet",'time'])

# Delete the index label and sort in ascending order
news_df.set_index('date_time', inplace=True)
news_df.index.name = None
news_df.sort_index(axis=0, inplace=True)

# Display sample data
print(len(news_df))
display(news_df.head())

4006


Unnamed: 0,tweet
2014-11-06 09:19:43,Regeneron is officially tweeting! We look forw...
2014-11-06 10:07:15,"Thanks @bradloncar, and thanks for the follow!"
2014-11-06 10:20:19,We were named top employer by @sciencemagazine...
2014-11-06 11:03:07,Thanks @SanofiUS for the warm welcome to Twitter!
2014-11-06 12:31:21,We're running our #Thanksgiving Basket food dr...


In [4]:
# Drop news published after 4pm since it does not affect the closing price of that day
news_df = news_df.between_time('00:00:00','23:59:59', include_end=False)

# Drop time in the index labels
news_df.index = news_df.index.date

# Display sample data
print(len(news_df))
display(news_df.head())

4006


Unnamed: 0,tweet
2014-11-06,Regeneron is officially tweeting! We look forw...
2014-11-06,"Thanks @bradloncar, and thanks for the follow!"
2014-11-06,We were named top employer by @sciencemagazine...
2014-11-06,Thanks @SanofiUS for the warm welcome to Twitter!
2014-11-06,We're running our #Thanksgiving Basket food dr...


## Sentiment Analysis

In [5]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Download/Update the VADER Lexicon
nltk.download('vader_lexicon')

# Initialize the VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\kn_na\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [6]:
# Create the Facebook Libra sentiment scores DataFrame
sentiments = []

for title in news_df["tweet"]:
    try:
        sentiment = analyzer.polarity_scores(title) # get sentiment score
        compound = sentiment["compound"]
        pos = sentiment["pos"]
        neu = sentiment["neu"]
        neg = sentiment["neg"]
        
        sentiments.append({"compound": compound,
                                 "positive": pos,
                                 "negative": neg,
                                 "neutral": neu
                                })
        
    except AttributeError:
        pass
    
# Create DataFrame
sentiments_df = pd.DataFrame(sentiments, index=news_df.index)

# Reorder DataFrame columns
news_df = news_df.join(sentiments_df)

news_df.head()

Unnamed: 0,tweet,compound,negative,neutral,positive
2014-11-06,Regeneron is officially tweeting! We look forw...,0.4753,0.0,0.846,0.154
2014-11-06,Regeneron is officially tweeting! We look forw...,0.7263,0.0,0.451,0.549
2014-11-06,Regeneron is officially tweeting! We look forw...,0.5562,0.0,0.797,0.203
2014-11-06,Regeneron is officially tweeting! We look forw...,0.7959,0.0,0.382,0.618
2014-11-06,Regeneron is officially tweeting! We look forw...,0.0,0.0,1.0,0.0


In [7]:
# sentiment_score_df = pd.DataFrame(index=news_df.index)
# sentiment_score_df.head()

In [8]:
# calculate the average of compound scores for a day with more than 1 article
import numpy as np
# sentiment_score_df = news_df.groupby(level=0)[['compound']].agg([('avg_sentiment',np.mean), ('article_counts',count)])
sentiment_score_df = news_df.groupby(level=0)[['compound']].count()
sentiment_score_df["avg_sentiments"] = news_df.groupby(level=0)[['compound']].mean()
sentiment_score_df.rename(columns= {'compound':"article_counts"}, inplace=True)
sentiment_score_df.head()

Unnamed: 0,article_counts,avg_sentiments
2014-11-06,64,0.5456
2014-11-07,81,0.175889
2014-11-10,9,0.0
2014-11-11,16,0.37005
2014-11-12,25,0.02542


## Concatenate the DFs of closing prices and sentiments

In [9]:
# concatenate
df = pd.concat([df_closing_prices, sentiment_score_df], axis=1, join='inner')

df.head()

Unnamed: 0,close,return,article_counts,avg_sentiments
2014-11-07,377.62,0.001724,81,0.175889
2014-11-10,385.96,0.022086,9,0.0
2014-11-11,398.41,0.032257,16,0.37005
2014-11-12,401.05,0.006626,25,0.02542
2014-11-13,402.64,0.003965,16,0.023725


## Prepare Training and Testing data

In [10]:
df["class"] = df["return"].apply(lambda x: 1 if x >= 0.01 else (0 if -0.01<x<0.01 else -1))

df.head(15)

Unnamed: 0,close,return,article_counts,avg_sentiments,class
2014-11-07,377.62,0.001724,81,0.175889,0
2014-11-10,385.96,0.022086,9,0.0,1
2014-11-11,398.41,0.032257,16,0.37005,1
2014-11-12,401.05,0.006626,25,0.02542,0
2014-11-13,402.64,0.003965,16,0.023725,0
2014-11-14,395.22,-0.018428,25,0.21092,-1
2014-11-17,399.65,0.011209,64,-0.0933,1
2014-11-18,413.23,0.03398,81,0.104289,1
2014-11-19,415.25,0.004888,36,0.279883,0
2014-11-20,406.0,-0.022276,36,0.4239,-1


In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = \
            train_test_split(df[['article_counts','avg_sentiments']], df['class'], 
                             test_size=0.2,
                             random_state=1,
                             stratify=df['class']
                            )

X_train = X_train.values.reshape(-1,2)
X_test = X_test.values.reshape(-1,2)

## Choosing and fitting models

In [12]:
# Compare the five models and choose the best one
# Logistic regression
from sklearn.linear_model import LogisticRegression
algorithm1 = LogisticRegression(random_state=1)

# Support vector machine
from sklearn.svm import SVC
algorithm2 = SVC(kernel='rbf', random_state=1)

# Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier
algorithm3 = DecisionTreeClassifier(random_state=1)

# Random Forest
from sklearn.ensemble import RandomForestClassifier
algorithm4 = RandomForestClassifier(random_state=1)

# XGBoost
from xgboost import XGBClassifier
algorithm5 = XGBClassifier(random_state=1)

# Create a list of the five model instances
algorithms = [algorithm1, algorithm2, algorithm3, algorithm4, algorithm5]

In [13]:
# Compare the performance of the five models
for algorithm in algorithms:
    algorithm.fit(X_train, y_train)
    score = algorithm.score(X_test, y_test)
    name = algorithm.__class__.__name__
    
    print(f'{name} score: {score:.4f}')

LogisticRegression score: 0.4195
SVC score: 0.4195
DecisionTreeClassifier score: 0.3371
RandomForestClassifier score: 0.3858




XGBClassifier score: 0.3408


## Cross Validation

In [14]:
# Create a list of the five model instances
from sklearn.linear_model import LogisticRegression
algorithm1 = LogisticRegression(random_state=1)

from sklearn.svm import SVC
algorithm2 = SVC(kernel='rbf', gamma="scale", C=1, random_state=1)

from sklearn.tree import DecisionTreeClassifier
algorithm3 = DecisionTreeClassifier(random_state=1)

from sklearn.ensemble import RandomForestClassifier
algorithm4 = RandomForestClassifier(random_state=1)

from xgboost import XGBClassifier
algorithm5 = XGBClassifier(random_state=1)

algorithms = [algorithm1, algorithm2, algorithm3, algorithm4, algorithm5]

In [15]:
# Compare the performance of the five models

# Create a balanced set of samples, create a StratifiedKFold instance
from sklearn.model_selection import StratifiedKFold
stratifiedkfold = StratifiedKFold(n_splits=3)

from sklearn.model_selection import cross_val_score

for algorithm in algorithms:
    # Conduct cross validation for each one of the five models
    scores = cross_val_score(algorithm, X_train, y_train, cv=stratifiedkfold)
    score = scores.mean()
    name = algorithm.__class__.__name__
    print(f'{name} average score: {score:.4f} / each score: {scores}')

LogisticRegression average score: 0.4180 / each score: [0.42134831 0.41853933 0.41408451]
SVC average score: 0.4208 / each score: [0.42134831 0.42134831 0.41971831]
DecisionTreeClassifier average score: 0.3796 / each score: [0.38202247 0.38202247 0.37464789]
RandomForestClassifier average score: 0.3730 / each score: [0.36516854 0.38483146 0.36901408]




XGBClassifier average score: 0.3674 / each score: [0.38483146 0.36797753 0.34929577]


## Grid Search

In [16]:
# Optimize the parameters
params = {'C':[1, 10, 100, 1000, 10000], 'gamma':[1, 0.1, 0.01, 0.001, 0.0001, 0.00001]}
algorithm = SVC(random_state=1)

from sklearn.model_selection import StratifiedKFold
stratifiedkfold = StratifiedKFold(n_splits=3)

from sklearn.model_selection import GridSearchCV
gs = GridSearchCV(algorithm, params, cv=stratifiedkfold)
gs.fit(X_train, y_train)

# Based on the best parameters, predict y values from test data
best = gs.best_estimator_
best_pred = best.predict(X_test)
print(best)

SVC(C=10, gamma=0.1, random_state=1)


In [17]:
# Accuracy
score = best.score(X_test, y_test)
print(f'score: {score:.4f}')

# Confusion matrix
from sklearn.metrics import confusion_matrix
print('confusion matrix')
print(confusion_matrix(y_test, best_pred))

score: 0.3820
confusion matrix
[[ 7 74  0]
 [17 95  0]
 [ 9 65  0]]
