# Sentiment Analysis

## Get the closing prices of Tesla

In [1]:
import os
import requests
import pandas as pd
from dotenv import load_dotenv
import alpaca_trade_api as tradeapi
%matplotlib inline

# Pull the Alpaca credentials from the environment (populated from a local .env
# file) so that no key material is written into the notebook itself.
load_dotenv()
alpaca_api_key = os.getenv("ALPACA_API_KEY")
alpaca_secret_key = os.getenv("ALPACA_SECRET_KEY")

# REST client bound to version 2 of the Alpaca API
alpaca = tradeapi.REST(alpaca_api_key, alpaca_secret_key, api_version="v2")

# Query parameters: one symbol, daily bars, and a fixed date window expressed as
# timezone-aware ISO-8601 strings (New York time).
ticker = "TSLA"
timeframe = "1D"
start_date = pd.Timestamp("2020-12-16", tz="America/New_York").isoformat()
end_date = pd.Timestamp("2021-01-15", tz="America/New_York").isoformat()

# Download the daily bars for the window as a DataFrame
df_closing_prices = alpaca.get_barset(
    ticker, timeframe, start=start_date, end=end_date
).df

# The bars come back with two column levels; drop the outer one
# (presumably the ticker symbol - confirm) and keep only the closing price.
df_closing_prices = df_closing_prices.droplevel(0, axis=1)[["close"]]

# Replace the timestamp index with plain calendar dates so it can later be
# matched against the per-day news sentiment.
df_closing_prices.index = df_closing_prices.index.date

# Preview the first rows
df_closing_prices.head()

Unnamed: 0,close
2020-12-16,622.67
2020-12-17,655.725
2020-12-18,664.99
2020-12-21,649.78
2020-12-22,640.09


In [2]:
# Daily simple return: fractional change of each close versus the previous row
df_closing_prices['return'] = df_closing_prices['close'].pct_change()

# The first row has no previous close, so its return is NaN - discard it
df_closing_prices = df_closing_prices.dropna()

df_closing_prices.head()

Unnamed: 0,close,return
2020-12-17,655.725,0.053086
2020-12-18,664.99,0.014129
2020-12-21,649.78,-0.022873
2020-12-22,640.09,-0.014913
2020-12-23,645.98,0.009202


## Load the Google search results for Tesla news

In [3]:
import pandas as pd
from pathlib import Path
import calendar
from sklearn.preprocessing import LabelEncoder

# Read only the timestamp (`sortkey`) and headline (`title`) columns; the
# timestamp is parsed as a datetime and used as the index.
file_path = Path("tsla_news.csv")
news_df = pd.read_csv(
    file_path,
    index_col='sortkey',
    parse_dates=True,
    infer_datetime_format=True,
    usecols=['sortkey', "title"],
)

# Remove the "sortkey" index label, then order the articles chronologically
news_df.index.name = None
news_df = news_df.sort_index(axis=0)

# Show the number of articles and a preview of the earliest ones
print(len(news_df))
display(news_df.head())

100


Unnamed: 0,title
2020-12-17 08:00:00,"Tesla’s 1,000% Stock Price Explosion Isn’t Abo..."
2020-12-18 08:00:00,The Quest to Replicate Tesla’s Success Keeps E...
2020-12-19 08:00:00,"Tesla, Profitable at Last, Bulls Its Way Into ..."
2020-12-20 08:00:00,Tesla's rise made 2020 the year the U.S. auto ...
2020-12-21 08:00:00,China provides 'heart and lungs' for Tesla's S...


In [4]:
# Keep only articles published before 16:00: anything later cannot have
# influenced that day's closing price.
# NOTE(review): this assumes the `sortkey` timestamps are expressed in the
# exchange's local (New York) time - confirm against the CSV.
#
# A mask on the index hour replaces
# `between_time('00:00:00', '15:59:59', include_end=False)` for two reasons:
#   * the exclusive 15:59:59 end bound dropped articles stamped during the
#     final second before 16:00, contradicting the stated 4pm cut-off;
#   * the `include_end` keyword no longer exists in pandas >= 2.0, whereas
#     `DatetimeIndex.hour` works on every pandas version.
news_df = news_df.loc[news_df.index.hour < 16]

# Reduce the index to calendar dates so articles can be grouped per day and
# aligned with the daily closing prices.
news_df.index = news_df.index.date

# Show how many articles remain and preview them
print(len(news_df))
display(news_df.head())

57


Unnamed: 0,title
2020-12-17,"Tesla’s 1,000% Stock Price Explosion Isn’t Abo..."
2020-12-18,The Quest to Replicate Tesla’s Success Keeps E...
2020-12-19,"Tesla, Profitable at Last, Bulls Its Way Into ..."
2020-12-20,Tesla's rise made 2020 the year the U.S. auto ...
2020-12-21,China provides 'heart and lungs' for Tesla's S...


## Sentiment Analysis

In [5]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Fetch the VADER lexicon if it is missing or outdated; the analyzer below
# cannot be constructed without it.
nltk.download("vader_lexicon")

# Single analyzer instance, reused to score every headline
analyzer = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\kn_na\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [6]:
# Score every Tesla headline with VADER: one record per article, in the same
# order as the rows of `news_df`.
tesla_sentiments = []

for title in news_df["title"]:
    if isinstance(title, str):
        sentiment = analyzer.polarity_scores(title)  # get sentiment score
    else:
        # A missing / non-text title (e.g. NaN from an empty CSV field) cannot
        # be scored. Record NaN scores instead of skipping the row: skipping
        # would make this list shorter than `news_df.index` and break the
        # DataFrame construction below. NaN is ignored by a later `.mean()`.
        sentiment = {key: float("nan") for key in ("compound", "pos", "neu", "neg")}

    tesla_sentiments.append({"compound": sentiment["compound"],
                             "positive": sentiment["pos"],
                             "negative": sentiment["neg"],
                             "neutral": sentiment["neu"]
                            })

# Create DataFrame (explicit columns keep the schema stable even when there
# are no articles at all)
sentiments_df = pd.DataFrame(
    tesla_sentiments,
    index=news_df.index,
    columns=["compound", "positive", "negative", "neutral"],
)

# Attach the scores to the headlines *by position*.
# `news_df.join(sentiments_df)` must not be used here: the date index contains
# duplicates (several articles per day), so an index join is many-to-many and
# pairs every title of a day with every score of that day, producing extra
# rows with mismatched title/score combinations.
news_df = news_df.assign(
    **{column: sentiments_df[column].values for column in sentiments_df.columns}
)

news_df.head()

Unnamed: 0,title,compound,positive,negative,neutral
2020-12-17,"Tesla’s 1,000% Stock Price Explosion Isn’t Abo...",0.0,0.0,0.0,1.0
2020-12-18,The Quest to Replicate Tesla’s Success Keeps E...,0.743,0.412,0.0,0.588
2020-12-19,"Tesla, Profitable at Last, Bulls Its Way Into ...",0.4404,0.172,0.0,0.828
2020-12-20,Tesla's rise made 2020 the year the U.S. auto ...,0.0,0.0,0.0,1.0
2020-12-21,China provides 'heart and lungs' for Tesla's S...,0.0,0.0,0.0,1.0


In [7]:
# Collapse to one row per calendar date: when a day has several articles,
# its sentiment is the mean of their compound scores.
sentiment_score_df = news_df[['compound']].groupby(level=0).mean()

sentiment_score_df.head()

Unnamed: 0,compound
2020-12-17,0.0
2020-12-18,0.743
2020-12-19,0.4404
2020-12-20,0.0
2020-12-21,0.0


## Combine the closing-price and sentiment DataFrames

In [8]:
# Place prices/returns and daily sentiment side by side. The inner join keeps
# only the dates present in both frames (days with a price bar and with news).
tesla_df = pd.concat(
    [df_closing_prices, sentiment_score_df],
    join='inner',
    axis=1,
)

tesla_df.head()

Unnamed: 0,close,return,compound
2020-12-17,655.725,0.053086,0.0
2020-12-18,664.99,0.014129,0.743
2020-12-21,649.78,-0.022873,0.0
2020-12-22,640.09,-0.014913,0.1806
2020-12-24,661.66,0.024273,0.0


## Prepare Training and Testing data

In [9]:
# Turn the daily return into a 3-way label:
#    1 -> return of +1% or more
#    0 -> return strictly inside the (-1%, +1%) band
#   -1 -> everything else (i.e. -1% or lower)
tesla_df["class"] = tesla_df["return"].apply(
    lambda daily_return: 1 if daily_return >= 0.01
    else (-1 if not (-0.01 < daily_return < 0.01) else 0)
)

tesla_df.head(15)

Unnamed: 0,close,return,compound,class
2020-12-17,655.725,0.053086,0.0,1
2020-12-18,664.99,0.014129,0.743,1
2020-12-21,649.78,-0.022873,0.0,-1
2020-12-22,640.09,-0.014913,0.1806,-1
2020-12-24,661.66,0.024273,0.0,1
2020-12-28,663.77,0.003189,-0.01865,0
2020-12-29,665.98,0.003329,0.5719,0
2020-12-31,705.21,0.015114,0.23125,1
2021-01-04,729.75,0.034798,0.0,1
2021-01-05,735.055,0.00727,0.1909,0


In [10]:
from sklearn.model_selection import train_test_split

# 80/20 split of the single feature (daily compound score) against the class
# label. Stratifying on the label keeps the class proportions similar in both
# parts; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    tesla_df['compound'],
    tesla_df['class'],
    test_size=0.2,
    random_state=1,
    stratify=tesla_df['class'],
)

# The feature is one-dimensional; reshape it into a single-column 2-D array,
# which is the input shape the estimators below are given.
X_train, X_test = (split.values.reshape(-1, 1) for split in (X_train, X_test))

## Choosing and fitting models

In [11]:
# Build the five candidate classifiers to be compared.
# Each one is created with random_state=1 so that repeated runs give the same
# results.
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

algorithm1 = LogisticRegression(random_state=1)        # logistic regression
algorithm2 = SVC(kernel='rbf', random_state=1)         # support vector machine (RBF kernel)
algorithm3 = DecisionTreeClassifier(random_state=1)    # decision tree
algorithm4 = RandomForestClassifier(random_state=1)    # random forest
algorithm5 = XGBClassifier(random_state=1)             # XGBoost

# Collected in one list so they can be trained and scored in a single loop
algorithms = [algorithm1, algorithm2, algorithm3, algorithm4, algorithm5]

In [12]:
# Train every candidate on the training split and report its accuracy on the
# held-out test split.
for model in algorithms:
    model_name = type(model).__name__
    model.fit(X_train, y_train)
    accuracy = model.score(X_test, y_test)

    print(f'{model_name} score: {accuracy:.4f}')

LogisticRegression score: 0.5000
SVC score: 0.5000
DecisionTreeClassifier score: 0.5000
RandomForestClassifier score: 0.5000
XGBClassifier score: 0.5000




## Cross Validation

In [13]:
# Re-create the five classifiers as fresh, unfitted instances for the
# cross-validation run (the SVC spells out gamma="scale" and C=1 explicitly).
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

algorithm1 = LogisticRegression(random_state=1)
algorithm2 = SVC(kernel='rbf', gamma="scale", C=1, random_state=1)
algorithm3 = DecisionTreeClassifier(random_state=1)
algorithm4 = RandomForestClassifier(random_state=1)
algorithm5 = XGBClassifier(random_state=1)

# Same ordering as before: logistic regression, SVM, tree, forest, XGBoost
algorithms = [algorithm1, algorithm2, algorithm3, algorithm4, algorithm5]

In [14]:
# Compare the five models with 3-fold stratified cross-validation on the
# training split.
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Stratified folds keep the class proportions similar in every fold
stratifiedkfold = StratifiedKFold(n_splits=3)

for model in algorithms:
    model_name = type(model).__name__
    # One accuracy value per fold, plus their mean
    fold_scores = cross_val_score(model, X_train, y_train, cv=stratifiedkfold)
    mean_score = fold_scores.mean()
    print(f'{model_name} average score: {mean_score:.4f} / each score: {fold_scores}')

LogisticRegression average score: 0.5000 / each score: [0.5 0.5 0.5]
SVC average score: 0.5000 / each score: [0.5 0.5 0.5]
DecisionTreeClassifier average score: 0.4167 / each score: [0.25 0.5  0.5 ]
RandomForestClassifier average score: 0.4167 / each score: [0.25 0.5  0.5 ]




XGBClassifier average score: 0.4167 / each score: [0.25 0.5  0.5 ]


## Grid Search

In [15]:
# Tune the SVC hyper-parameters with an exhaustive grid search.
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Candidate values: every (C, gamma) combination below is evaluated
params = {
    'C': [1, 10, 100, 1000, 10000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001, 0.00001],
}
algorithm = SVC(random_state=1)

# Each combination is scored with 3-fold stratified cross-validation
stratifiedkfold = StratifiedKFold(n_splits=3)
gs = GridSearchCV(estimator=algorithm, param_grid=params, cv=stratifiedkfold)
gs.fit(X_train, y_train)

# Take the estimator with the best cross-validated parameters and use it to
# predict the labels of the held-out test data
best = gs.best_estimator_
best_pred = best.predict(X_test)
print(best)

SVC(C=1, gamma=1, random_state=1)


In [16]:
from sklearn.metrics import confusion_matrix

# Accuracy of the tuned model on the held-out test split
score = best.score(X_test, y_test)
print(f'score: {score:.4f}')

# Confusion matrix of true labels (rows) against predicted labels (columns)
print('confusion matrix')
print(confusion_matrix(y_test, best_pred))

score: 0.5000
confusion matrix
[[0 0 1]
 [0 0 1]
 [0 0 2]]
