# Sentiment Analysis

## Get the closing prices of pfizer

In [1]:
import os
import requests
import pandas as pd
from dotenv import load_dotenv
import alpaca_trade_api as tradeapi
%matplotlib inline

# Load .env environment variables
load_dotenv()

# Set Alpaca API key and secret
alpaca_api_key = os.getenv("ALPACA_API_KEY")
alpaca_secret_key = os.getenv("ALPACA_SECRET_KEY")

# Create the Alpaca API object
alpaca = tradeapi.REST(
    alpaca_api_key,
    alpaca_secret_key,
    api_version="v2")

# Format current date as ISO format
start_date = pd.Timestamp("2009-07-15", tz="America/New_York").isoformat()
end_date = pd.Timestamp("2021-01-16", tz="America/New_York").isoformat()

# Set the tickers
ticker = "PFE"

# Set timeframe to one day ('1D') for the Alpaca API
timeframe = "1D"

# Get current closing prices
df_closing_prices = alpaca.get_barset(
    ticker,
    timeframe,
    start = start_date,
    end = end_date
).df

# Dissolve multiindex and fetch the closing prices 
df_closing_prices = df_closing_prices.droplevel(0, axis=1)[['close']]

# Drop the time component of the date
df_closing_prices.index = df_closing_prices.index.date

# Display sample data
df_closing_prices.head()


Bad key "text.kerning_factor" on line 4 in
C:\Users\kn_na\anaconda3\envs\x\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test_patch.mplstyle.
You probably need to get an updated matplotlibrc file from
http://github.com/matplotlib/matplotlib/blob/master/matplotlibrc.template
or from the matplotlib source distribution


Unnamed: 0,close
2009-07-15,15.0
2009-07-16,15.06
2009-07-17,14.94
2009-07-20,15.24
2009-07-21,15.7


In [2]:
df_closing_prices['return'] = df_closing_prices['close'].pct_change()
df_closing_prices.dropna(inplace=True)
df_closing_prices.head()

Unnamed: 0,close,return
2009-07-16,15.06,0.004
2009-07-17,14.94,-0.007968
2009-07-20,15.24,0.02008
2009-07-21,15.7,0.030184
2009-07-22,15.88,0.011465


## Load the data of google research results

In [3]:
import pandas as pd
from pathlib import Path
import calendar
from sklearn.preprocessing import LabelEncoder

file_path = Path("../companies_tweet_data/pfizer.csv")
news_df = pd.read_csv(file_path, parse_dates=[['date','time']], infer_datetime_format=True, usecols=['date',"tweet",'time'])

# Delete the index label and sort in ascending order
news_df.set_index('date_time', inplace=True)
news_df.index.name = None
news_df.sort_index(axis=0, inplace=True)

# Display sample data
print(len(news_df))
display(news_df.head())

9434


Unnamed: 0,tweet
2009-07-15 17:01:49,July 22 11am EDT Pfizer’s 2Q09 Earnings confer...
2009-07-16 11:54:32,Pfizer MAINTAIN: Medicine Assistance Program. ...
2009-07-17 13:46:55,Pfe: European Commission approved company’s pe...
2009-07-20 15:15:29,Pfe: Wyeth Announces Stockholder Approval of P...
2009-07-20 15:19:09,Pfe: Pfizer is a committed partner in the inte...


In [4]:
# Drop news published after 4pm since it does not affect the closing price of that day
news_df = news_df.between_time('00:00:00','23:59:59', include_end=False)

# Drop time in the index labels
news_df.index = news_df.index.date

# Display sample data
print(len(news_df))
display(news_df.head())

9434


Unnamed: 0,tweet
2009-07-15,July 22 11am EDT Pfizer’s 2Q09 Earnings confer...
2009-07-16,Pfizer MAINTAIN: Medicine Assistance Program. ...
2009-07-17,Pfe: European Commission approved company’s pe...
2009-07-20,Pfe: Wyeth Announces Stockholder Approval of P...
2009-07-20,Pfe: Pfizer is a committed partner in the inte...


## Sentiment Analysis

In [5]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Download/Update the VADER Lexicon
nltk.download('vader_lexicon')

# Initialize the VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\kn_na\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [6]:
# Create the Facebook Libra sentiment scores DataFrame
pfizer_sentiments = []

for title in news_df["tweet"]:
    try:
        sentiment = analyzer.polarity_scores(title) # get sentiment score
        compound = sentiment["compound"]
        pos = sentiment["pos"]
        neu = sentiment["neu"]
        neg = sentiment["neg"]
        
        pfizer_sentiments.append({"compound": compound,
                                 "positive": pos,
                                 "negative": neg,
                                 "neutral": neu
                                })
        
    except AttributeError:
        pass
    
# Create DataFrame
sentiments_df = pd.DataFrame(pfizer_sentiments, index=news_df.index)

# Reorder DataFrame columns
news_df = news_df.join(sentiments_df)

news_df.head()

Unnamed: 0,tweet,compound,negative,neutral,positive
2009-07-15,July 22 11am EDT Pfizer’s 2Q09 Earnings confer...,0.0,0.0,1.0,0.0
2009-07-16,Pfizer MAINTAIN: Medicine Assistance Program. ...,0.4215,0.0,0.823,0.177
2009-07-17,Pfe: European Commission approved company’s pe...,0.4215,0.0,0.763,0.237
2009-07-20,Pfe: Wyeth Announces Stockholder Approval of P...,0.4767,0.0,0.795,0.205
2009-07-20,Pfe: Wyeth Announces Stockholder Approval of P...,0.2732,0.0,0.877,0.123


In [7]:
# sentiment_score_df = pd.DataFrame(index=news_df.index)
# sentiment_score_df.head()

In [8]:
# calculate the average of compound scores for a day with more than 1 article
import numpy as np
# sentiment_score_df = news_df.groupby(level=0)[['compound']].agg([('avg_sentiment',np.mean), ('article_counts',count)])
sentiment_score_df = news_df.groupby(level=0)[['compound']].count()
sentiment_score_df["avg_sentiments"] = news_df.groupby(level=0)[['compound']].mean()
sentiment_score_df.rename(columns= {'compound':"article_counts"}, inplace=True)
sentiment_score_df.head()

Unnamed: 0,article_counts,avg_sentiments
2009-07-15,1,0.0
2009-07-16,1,0.4215
2009-07-17,1,0.4215
2009-07-20,4,0.37495
2009-07-21,25,0.31674


## Concatenate the DFs of closing prices and sentiments

In [9]:
# concatenate
pfizer_df = pd.concat([df_closing_prices, sentiment_score_df], axis=1, join='inner')

pfizer_df.head()

Unnamed: 0,close,return,article_counts,avg_sentiments
2009-07-16,15.06,0.004,1,0.4215
2009-07-17,14.94,-0.007968,1,0.4215
2009-07-20,15.24,0.02008,4,0.37495
2009-07-21,15.7,0.030184,25,0.31674
2009-07-22,15.88,0.011465,4,0.0


## Prepare Training and Testing data

In [10]:
pfizer_df["class"] = pfizer_df["return"].apply(lambda x: 1 if x >= 0.01 else (0 if -0.01<x<0.01 else -1))

pfizer_df.head(15)

Unnamed: 0,close,return,article_counts,avg_sentiments,class
2009-07-16,15.06,0.004,1,0.4215,0
2009-07-17,14.94,-0.007968,1,0.4215,0
2009-07-20,15.24,0.02008,4,0.37495,1
2009-07-21,15.7,0.030184,25,0.31674,1
2009-07-22,15.88,0.011465,4,0.0,1
2009-07-23,16.15,0.017003,4,0.39225,1
2009-07-29,15.76,-0.017456,9,0.1597,-1
2009-07-30,15.91,0.009518,4,0.0129,0
2009-08-03,16.03,0.007542,1,0.0772,0
2009-08-06,15.79,-0.005041,4,0.1974,0


In [11]:
pfizer_df['class'].value_counts()

 0    1606
 1     416
-1     364
Name: class, dtype: int64

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = \
            train_test_split(pfizer_df[['article_counts','avg_sentiments']], pfizer_df['class'], 
                             test_size=0.2,
                             random_state=1,
                             stratify=pfizer_df['class']
                            )

X_train = X_train.values.reshape(-1,2)
X_test = X_test.values.reshape(-1,2)

## Choosing and fitting models

In [13]:
# Compare the five models and choose the best one
# Logistic regression
from sklearn.linear_model import LogisticRegression
algorithm1 = LogisticRegression(random_state=1)

# Support vector machine
from sklearn.svm import SVC
algorithm2 = SVC(kernel='rbf', random_state=1)

# Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier
algorithm3 = DecisionTreeClassifier(random_state=1)

# Random Forest
from sklearn.ensemble import RandomForestClassifier
algorithm4 = RandomForestClassifier(random_state=1)

# XGBoost
from xgboost import XGBClassifier
algorithm5 = XGBClassifier(random_state=1)

# Create a list of the five model instances
algorithms = [algorithm1, algorithm2, algorithm3, algorithm4, algorithm5]

In [14]:
# Compare the performance of the five models
for algorithm in algorithms:
    algorithm.fit(X_train, y_train)
    score = algorithm.score(X_test, y_test)
    name = algorithm.__class__.__name__
    
    print(f'{name} score: {score:.4f}')

LogisticRegression score: 0.6736
SVC score: 0.6736
DecisionTreeClassifier score: 0.5230
RandomForestClassifier score: 0.5418




XGBClassifier score: 0.6004


## Cross Validation

In [15]:
# Create a list of the five model instances
from sklearn.linear_model import LogisticRegression
algorithm1 = LogisticRegression(random_state=1)

from sklearn.svm import SVC
algorithm2 = SVC(kernel='rbf', gamma="scale", C=1, random_state=1)

from sklearn.tree import DecisionTreeClassifier
algorithm3 = DecisionTreeClassifier(random_state=1)

from sklearn.ensemble import RandomForestClassifier
algorithm4 = RandomForestClassifier(random_state=1)

from xgboost import XGBClassifier
algorithm5 = XGBClassifier(random_state=1)

algorithms = [algorithm1, algorithm2, algorithm3, algorithm4, algorithm5]

In [16]:
# Compare the performance of the five models

# Create a balanced set of samples, create a StratifiedKFold instance
from sklearn.model_selection import StratifiedKFold
stratifiedkfold = StratifiedKFold(n_splits=3)

from sklearn.model_selection import cross_val_score

for algorithm in algorithms:
    # Conduct cross validation for each one of the five models
    scores = cross_val_score(algorithm, X_train, y_train, cv=stratifiedkfold)
    score = scores.mean()
    name = algorithm.__class__.__name__
    print(f'{name} average score: {score:.4f} / each score: {scores}')

LogisticRegression average score: 0.6724 / each score: [0.67295597 0.67138365 0.67295597]
SVC average score: 0.6724 / each score: [0.67295597 0.67138365 0.67295597]
DecisionTreeClassifier average score: 0.5419 / each score: [0.56289308 0.54245283 0.52044025]
RandomForestClassifier average score: 0.5535 / each score: [0.56603774 0.5408805  0.55345912]




XGBClassifier average score: 0.6116 / each score: [0.62106918 0.60377358 0.61006289]


## Grid Search

In [17]:
# Optimize the parameters
params = {'C':[1, 10, 100, 1000, 10000], 'gamma':[1, 0.1, 0.01, 0.001, 0.0001, 0.00001]}
algorithm = SVC(random_state=1)

from sklearn.model_selection import StratifiedKFold
stratifiedkfold = StratifiedKFold(n_splits=3)

from sklearn.model_selection import GridSearchCV
gs = GridSearchCV(algorithm, params, cv=stratifiedkfold)
gs.fit(X_train, y_train)

# Based on the best parameters, predict y values from test data
best = gs.best_estimator_
best_pred = best.predict(X_test)
print(best)

SVC(C=1, gamma=0.0001, random_state=1)


In [18]:
# Accuracy
score = best.score(X_test, y_test)
print(f'score: {score:.4f}')

# Confusion matrix
from sklearn.metrics import confusion_matrix
print('confusion matrix')
print(confusion_matrix(y_test, best_pred))

score: 0.6736
confusion matrix
[[  0  73   0]
 [  0 322   0]
 [  0  83   0]]
