# Sentiment Analysis

## Get the closing prices of Tesla

In [25]:
import os
import requests
import pandas as pd
from dotenv import load_dotenv
import alpaca_trade_api as tradeapi
%matplotlib inline

# Load .env environment variables
load_dotenv()

# Set Alpaca API key and secret
alpaca_api_key = os.getenv("ALPACA_API_KEY")
alpaca_secret_key = os.getenv("ALPACA_SECRET_KEY")

# Create the Alpaca API object
alpaca = tradeapi.REST(
    alpaca_api_key,
    alpaca_secret_key,
    api_version="v2")

# Format current date as ISO format
start_date = pd.Timestamp("2009-09-21", tz="America/New_York").isoformat()
end_date = pd.Timestamp("2021-01-15", tz="America/New_York").isoformat()

# Set the tickers
ticker = "MSFT"

# Set timeframe to one day ('1D') for the Alpaca API
timeframe = "1D"

# Get current closing prices
df_closing_prices = alpaca.get_barset(
    ticker,
    timeframe,
    start = start_date,
    end = end_date
).df

# Dissolve multiindex and fetch the closing prices 
df_closing_prices = df_closing_prices.droplevel(0, axis=1)[['close']]

# Drop the time component of the date
df_closing_prices.index = df_closing_prices.index.date

# Display sample data
df_closing_prices.head()

Unnamed: 0,close
2009-09-21,25.3
2009-09-22,25.77
2009-09-23,25.71
2009-09-24,25.93
2009-09-25,25.57


In [26]:
df_closing_prices['return'] = df_closing_prices['close'].pct_change()
df_closing_prices.dropna(inplace=True)
df_closing_prices.head()

Unnamed: 0,close,return
2009-09-22,25.77,0.018577
2009-09-23,25.71,-0.002328
2009-09-24,25.93,0.008557
2009-09-25,25.57,-0.013884
2009-09-28,25.88,0.012124


## Load the data of google research results

In [27]:
import pandas as pd
from pathlib import Path
import calendar
from sklearn.preprocessing import LabelEncoder

file_path = Path("../companies_tweet_data/microsoft.csv")
news_df = pd.read_csv(file_path, parse_dates=[['date','time']], infer_datetime_format=True, usecols=['date',"tweet",'time'])

# Delete the index label and sort in ascending order
news_df.set_index('date_time', inplace=True)
news_df.index.name = None
news_df.sort_index(axis=0, inplace=True)

# Display sample data
print(len(news_df))
display(news_df.head())

27082


Unnamed: 0,tweet
2009-09-21 13:19:33,Hi Twittersphere! This is the official page fo...
2009-09-21 13:19:33,Hi Twittersphere! This is the official page fo...
2009-09-21 14:40:40,Wanna get get game smart? Check out our tips h...
2009-09-21 14:40:40,Wanna get get game smart? Check out our tips h...
2009-09-23 14:33:20,Want to see friends and family in HD? Now you ...


In [28]:
# Drop news published after 4pm since it does not affect the closing price of that day
news_df = news_df.between_time('00:00:00','23:59:59', include_end=False)

# Drop time in the index labels
news_df.index = news_df.index.date

# Display sample data
print(len(news_df))
display(news_df.head())

27082


Unnamed: 0,tweet
2009-09-21,Hi Twittersphere! This is the official page fo...
2009-09-21,Hi Twittersphere! This is the official page fo...
2009-09-21,Wanna get get game smart? Check out our tips h...
2009-09-21,Wanna get get game smart? Check out our tips h...
2009-09-23,Want to see friends and family in HD? Now you ...


## Sentiment Analysis

In [29]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Download/Update the VADER Lexicon
nltk.download('vader_lexicon')

# Initialize the VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\kn_na\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [30]:
# Create the Facebook Libra sentiment scores DataFrame
tweet_sentiments = []

for title in news_df["tweet"]:
    try:
        sentiment = analyzer.polarity_scores(title) # get sentiment score
        compound = sentiment["compound"]
        pos = sentiment["pos"]
        neu = sentiment["neu"]
        neg = sentiment["neg"]
        
        tweet_sentiments.append({"compound": compound,
                                 "positive": pos,
                                 "negative": neg,
                                 "neutral": neu
                                })
        
    except AttributeError:
        pass
    
# Create DataFrame
sentiments_df = pd.DataFrame(tweet_sentiments, index=news_df.index)

# Reorder DataFrame columns
news_df = news_df.join(sentiments_df)

news_df.head()

Unnamed: 0,tweet,compound,positive,negative,neutral
2009-09-21,Hi Twittersphere! This is the official page fo...,0.0,0.0,0.0,1.0
2009-09-21,Hi Twittersphere! This is the official page fo...,0.0,0.0,0.0,1.0
2009-09-21,Hi Twittersphere! This is the official page fo...,0.4019,0.213,0.0,0.787
2009-09-21,Hi Twittersphere! This is the official page fo...,0.4019,0.213,0.0,0.787
2009-09-21,Hi Twittersphere! This is the official page fo...,0.0,0.0,0.0,1.0


In [31]:
# sentiment_score_df = pd.DataFrame(index=news_df.index)
# sentiment_score_df.head()

In [32]:
# calculate the average of compound scores for a day with more than 1 article
import numpy as np
# sentiment_score_df = news_df.groupby(level=0)[['compound']].agg([('avg_sentiment',np.mean), ('article_counts',count)])
sentiment_score_df = news_df.groupby(level=0)[['compound']].count()
sentiment_score_df["avg_sentiments"] = news_df.groupby(level=0)[['compound']].mean()
#sentiment_score_df.rename(columns= {'compound':"article_counts"}, inplace=True)
sentiment_score_df.drop(columns='compound', inplace=True)
sentiment_score_df.head(10)

Unnamed: 0,avg_sentiments
2009-09-21,0.20095
2009-09-23,0.26335
2009-09-24,0.100475
2009-09-25,0.286433
2009-09-28,-0.5267
2009-09-29,0.271283
2009-09-30,0.365929
2009-10-01,0.3582
2009-10-02,0.55905
2009-10-05,0.11435


In [33]:
joined_df = pd.concat([df_closing_prices, sentiment_score_df], axis=1, join="inner")
joined_df.head()

Unnamed: 0,close,return,avg_sentiments
2009-09-23,25.71,-0.002328,0.26335
2009-09-24,25.93,0.008557,0.100475
2009-09-25,25.57,-0.013884,0.286433
2009-09-28,25.88,0.012124,-0.5267
2009-09-29,25.76,-0.004637,0.271283


In [44]:
#joined_df["class"] = joined_df["return"].apply(lambda x: 1 if x >= 0.01 else (0 if -0.01<x<0.01 else -1))
joined_df["class"] = joined_df["return"].apply(lambda x: 1 if x >= 0.01 else (0 if -0.01<x<0.01 else -1))

joined_df.head()

Unnamed: 0,close,return,avg_sentiments,class
2009-09-23,25.71,-0.002328,0.26335,0
2009-09-24,25.93,0.008557,0.100475,0
2009-09-25,25.57,-0.013884,0.286433,-1
2009-09-28,25.88,0.012124,-0.5267,1
2009-09-29,25.76,-0.004637,0.271283,0


In [45]:
X=joined_df[['avg_sentiments']].values.reshape(-1,1)
y=joined_df[['class']].values.ravel()
print(X.shape)
print(y.shape)

(2526, 1)
(2526,)


In [46]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

X_scaled = StandardScaler().fit_transform(X)
#print(X_scaled[:1])
# pipe = Pipeline([('scaler', StandardScaler()), ('pca',decomposition.PCA(n_components=2))])
# iris_X_scaled = pipe.fit_transform(iris_X)

## Concatenate the DFs of closing prices and sentiments

In [47]:
# # concatenate
# tesla_df = pd.concat([df_closing_prices, sentiment_score_df], axis=1, join='inner')

# tesla_df.head()

## Prepare Training and Testing data

In [48]:
# tesla_df["class"] = tesla_df["return"].apply(lambda x: 1 if x >= 0.01 else (0 if -0.01<x<0.01 else -1))

# tesla_df.head(15)

In [49]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = \
            train_test_split(X, y, 
                             test_size=0.2,
                             random_state=1,
                             stratify=y
                            )

print(X_train.shape)
print(y_train.shape)

(2020, 1)
(2020,)


## Choosing and fitting models

In [50]:
# Compare the five models and choose the best one
# Logistic regression
from sklearn.linear_model import LogisticRegression
algorithm1 = LogisticRegression(random_state=1)

# Support vector machine
from sklearn.svm import SVC
algorithm2 = SVC(kernel='rbf', random_state=1)

# Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier
algorithm3 = DecisionTreeClassifier(random_state=1)

# Random Forest
from sklearn.ensemble import RandomForestClassifier
algorithm4 = RandomForestClassifier(random_state=1)

# XGBoost
from xgboost import XGBClassifier
algorithm5 = XGBClassifier(random_state=1)

# Create a list of the five model instances
algorithms = [algorithm1, algorithm2, algorithm3, algorithm4, algorithm5]

In [51]:
# Compare the performance of the five models
for algorithm in algorithms:
    algorithm.fit(X_train, y_train)
    score = algorithm.score(X_test, y_test)
    name = algorithm.__class__.__name__
    
    print(f'{name} score: {score:.4f}')

LogisticRegression score: 0.5988
SVC score: 0.5988
DecisionTreeClassifier score: 0.4763
RandomForestClassifier score: 0.4743




XGBClassifier score: 0.5593


## Cross Validation

In [52]:
# Create a list of the five model instances
from sklearn.linear_model import LogisticRegression
algorithm1 = LogisticRegression(random_state=1)

from sklearn.svm import SVC
algorithm2 = SVC(kernel='rbf', gamma="scale", C=1, random_state=1)

from sklearn.tree import DecisionTreeClassifier
algorithm3 = DecisionTreeClassifier(random_state=1)

from sklearn.ensemble import RandomForestClassifier
algorithm4 = RandomForestClassifier(random_state=1)

from xgboost import XGBClassifier
algorithm5 = XGBClassifier(random_state=1)

algorithms = [algorithm1, algorithm2, algorithm3, algorithm4, algorithm5]

In [53]:
# Compare the performance of the five models

# Create a balanced set of samples, create a StratifiedKFold instance
from sklearn.model_selection import StratifiedKFold
stratifiedkfold = StratifiedKFold(n_splits=3)

from sklearn.model_selection import cross_val_score

for algorithm in algorithms:
    # Conduct cross validation for each one of the five models
    scores = cross_val_score(algorithm, X_train, y_train, cv=stratifiedkfold)
    score = scores.mean()
    name = algorithm.__class__.__name__
    print(f'{name} average score: {score:.4f} / each score: {scores}')

LogisticRegression average score: 0.5990 / each score: [0.59792285 0.60029718 0.59881129]
SVC average score: 0.5990 / each score: [0.59792285 0.60029718 0.59881129]
DecisionTreeClassifier average score: 0.4480 / each score: [0.47626113 0.41753343 0.45022288]
RandomForestClassifier average score: 0.4525 / each score: [0.46884273 0.42347697 0.46508172]




XGBClassifier average score: 0.5134 / each score: [0.51928783 0.50371471 0.51708767]


## Grid Search

In [54]:
# Optimize the parameters
params = {'C':[1, 10, 100, 1000, 10000], 'gamma':[1, 0.1, 0.01, 0.001, 0.0001, 0.00001]}
algorithm = SVC(random_state=1)

from sklearn.model_selection import StratifiedKFold
stratifiedkfold = StratifiedKFold(n_splits=3)

from sklearn.model_selection import GridSearchCV
gs = GridSearchCV(algorithm, params, cv=stratifiedkfold)
gs.fit(X_train, y_train)

# Based on the best parameters, predict y values from test data
best = gs.best_estimator_
best_pred = best.predict(X_test)
print(best)

SVC(C=1, gamma=1, random_state=1)


In [55]:
# Accuracy
score = best.score(X_test, y_test)
print(f'score: {score:.4f}')

# Confusion matrix
from sklearn.metrics import confusion_matrix
print('confusion matrix')
print(confusion_matrix(y_test, best_pred))

score: 0.5988
confusion matrix
[[  0  92   0]
 [  0 303   0]
 [  0 111   0]]
