In [31]:
# Initial imports
import os
from pathlib import Path
import pandas as pd
from datetime import datetime, timedelta
from dotenv import load_dotenv
import alpaca_trade_api as tradeapi
import numpy as np

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

get_ipython().run_line_magic("matplotlib", "inline")


In [2]:
nltk.download("vader_lexicon")
analyzer = SentimentIntensityAnalyzer()


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\14694\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [3]:
# Load .env enviroment variables
load_dotenv()


# Set Alpaca API key and secret
alpaca_api_key = os.getenv('ALPACA_API_KEY')
alpaca_secret_key = os.getenv('ALPACA_SECRET_KEY')

api = tradeapi.REST(alpaca_api_key, alpaca_secret_key, api_version='v2')

In [4]:
def stock_info_grab(ticker):
    """
    Takes ticker symbol and returns DataFrame with Date, Close, and Pct Change columns.
    """
    # Set timeframe to '1D'
    timeframe = "1D"

    # Set current date and the date from one month ago using the ISO format
    current_date = pd.Timestamp("2020-11-09", tz="America/New_York").isoformat()
    past_date = pd.Timestamp("2016-08-27", tz="America/New_York").isoformat()

    df = api.get_barset(
        ticker,
        timeframe,
        limit=None,
        start=past_date,
        end=current_date,
        after=None,
        until=None,
    ).df
    df = df.droplevel(axis=1, level=0)
    df.index = df.index.date
    df['pct change'] = df['close'].pct_change()
    df['pct change'].dropna
    df = df.reset_index()
    df = df.drop(columns=['open', 'high', 'low', 'volume'])
    df = df.rename(columns={'index':'Date'})
    df = df.set_index('Date')
    return df

In [5]:
aapl_stock_info = stock_info_grab("AAPL")
amzn_stock_info = stock_info_grab("AMZN")
tsla_stock_info = stock_info_grab("TSLA")
spy_stock_info = stock_info_grab("SPY")
amzn_stock_info

Unnamed: 0_level_0,close,pct change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2016-08-29,771.29,
2016-08-30,767.58,-0.004810
2016-08-31,769.16,0.002058
2016-09-01,770.64,0.001924
2016-09-02,772.45,0.002349
...,...,...
2020-11-03,3046.25,0.013903
2020-11-04,3241.16,0.063984
2020-11-05,3322.00,0.024942
2020-11-06,3310.41,-0.003489


In [6]:
aapl_file = Path('Resources/AAPL_HEADLINES.csv')
amzn_file = Path('Resources/AMZN_HEADLINES.csv')
spy_file = Path('Resources/SPY_HEADLINES.csv')
tsla_file = Path('Resources/TSLA_HEADLINES.csv')

aapl_headlines_df = pd.read_csv(aapl_file)
amzn_headlines_df = pd.read_csv(amzn_file)
spy_headlines_df = pd.read_csv(spy_file)
tsla_headlines_df = pd.read_csv(tsla_file)

#aapl_headlines['Date'] = pd.to_datetime(aapl_headlines['Date']).dt.strftime('%Y-%m-%d')
#aapl_headlines = aapl_headlines.set_index('Date')
amzn_headlines_df

Unnamed: 0,Headline,Date
0,Here are Wall Street’s 20 favorite value stocks,"Nov. 10, 2020 at 12:53 p.m. ET"
1,"Amazon ‘illegally distorted competition,’ the ...","Nov. 14, 2020 at 3:33 a.m. ET"
2,Mary Barra on General Motors’ All-Electric Future,"Nov. 13, 2020 at 8:39 p.m. ET"
3,Target Is Booming During the Pandemic. Why the...,"Nov. 13, 2020 at 8:38 p.m. ET"
4,These 5 Small Stocks Could Benefit From an Eco...,"Nov. 13, 2020 at 8:00 p.m. ET"
...,...,...
19631,Elsa to Barbie: Let it Go,"Nov. 25, 2014 at 12:36 p.m. ET"
19632,Tony Robbins doesn’t quite master the game of ...,"Nov. 25, 2014 at 11:23 a.m. ET"
19633,How banks are looking more like tech companies,"Nov. 25, 2014 at 9:24 a.m. ET"
19634,"Is the S&P’s future healthy after all, and a ‘...","Nov. 25, 2014 at 9:04 a.m. ET"


In [7]:
def get_sentiment(score):
    """
    Calculates the sentiment based on the compound score.
    """
    result = 0  # Neutral by default
    if score >= 0.05:  # Positive
        result = 1
    elif score <= -0.05:  # Negative
        result = -1

    return result


In [8]:
def create_sentiment_df(df):
    """
    Takes headlines DataFrame & creates DataFrame with Sentiment columns.
    Splits Date & Time, creates Time column and moves Date to Index.
    """
    title_sent = {
        "compound": [],
        "positive": [],
        "neutral": [],
        "negative": [],
        "sentiment": [],
    }

    for index, row in df.iterrows():
        try:
            # Sentiment scoring with VADER
            title_sentiment = analyzer.polarity_scores(row["Headline"])
            title_sent["compound"].append(title_sentiment["compound"])
            title_sent["positive"].append(title_sentiment["pos"])
            title_sent["neutral"].append(title_sentiment["neu"])
            title_sent["negative"].append(title_sentiment["neg"])
            title_sent["sentiment"].append(get_sentiment(title_sentiment["compound"]))
        except AttributeError:
            pass

    title_sent_df = pd.DataFrame(title_sent)
    #title_sent_df.head()

    headline_sentiment_df = df.join(title_sent_df)
    headline_sentiment_df.dropna()
    headline_sentiment_df['Date'] = headline_sentiment_df['Date'].str.replace('at','-')
    headline_sentiment_df['Date'] = headline_sentiment_df['Date'].str.split('-').str[0]
    headline_sentiment_df = headline_sentiment_df.reindex(columns=['Date', 'Headline', 'compound', 'positive', 'neutral', 'negative', 'sentiment'])
    headline_sentiment_df['Date'] = pd.to_datetime(headline_sentiment_df['Date'])
    headline_sentiment_df.set_index('Date')
    return headline_sentiment_df

In [9]:
#issue with amzn_headlines --- need to fix
aapl_headlines = create_sentiment_df(aapl_headlines_df)
#amzn_headlines = create_sentiment_df(amzn_headlines_df)
tsla_headlines = create_sentiment_df(tsla_headlines_df)
spy_headlines = create_sentiment_df(spy_headlines_df)


In [25]:
# find average sentiment score by date

aapl_scores = aapl_headlines.groupby('Date').mean().sort_values(by='Date')
#amzn_scores = amzn_headlines.groupby(['Date']).mean().sort_values(by='Date')
tsla_scores = tsla_headlines.groupby(['Date']).mean().sort_values(by='Date')
spy_scores = spy_headlines.groupby(['Date']).mean().sort_values(by='Date')
aapl_scores

Unnamed: 0_level_0,compound,positive,neutral,negative,sentiment
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2016-03-19,0.836000,0.530000,0.470000,0.000000,1.000000
2016-08-27,0.038600,0.063000,0.937000,0.000000,0.500000
2016-08-28,0.440400,0.209000,0.791000,0.000000,1.000000
2016-08-29,0.067100,0.102000,0.842286,0.055714,0.000000
2016-08-30,-0.015205,0.061591,0.883455,0.054955,-0.090909
...,...,...,...,...,...
2020-11-04,-0.038410,0.078900,0.800900,0.120300,-0.300000
2020-11-05,0.304967,0.202333,0.747333,0.050333,0.333333
2020-11-06,-0.099333,0.054833,0.845500,0.099500,-0.500000
2020-11-07,-0.153100,0.000000,0.882000,0.118000,-1.000000


In [26]:
# TO DO: drop compund col on all scores
aapl_scores = aapl_scores.drop(columns='compound')
#amzn_scores = amzn_scores.drop(columns='compound')
tsla_scores = tsla_scores.drop(columns='compound')
spy_scores = spy_scores.drop(columns='compound')

In [27]:
tsla_scores

Unnamed: 0_level_0,positive,neutral,negative,sentiment
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2005-02-02,0.000000,1.000000,0.000000,0.000000
2005-02-09,0.000000,1.000000,0.000000,0.000000
2005-02-16,0.000000,1.000000,0.000000,0.000000
2005-02-23,0.000000,0.694000,0.306000,-1.000000
2005-03-16,0.000000,1.000000,0.000000,0.000000
...,...,...,...,...
2020-11-04,0.140500,0.805500,0.054000,0.166667
2020-11-05,0.052500,0.873125,0.074375,0.000000
2020-11-06,0.098400,0.862800,0.038800,0.200000
2020-11-07,0.000000,1.000000,0.000000,0.000000


In [28]:
# sent scores distribution across each df poss use histogram, calc meanstd, or percentiles 
aapl_complete = pd.concat([aapl_scores,aapl_stock_info], join='outer', axis=1).dropna()
#amzn_complete = pd.concat([amzn_scores,amzn_stock_info], join='outer', axis=1).dropna()
tsla_complete = pd.concat([tsla_scores,tsla_stock_info], join='outer', axis=1).dropna()
spy_complete = pd.concat([spy_scores,spy_stock_info], join='outer', axis=1).dropna()
aapl_complete

Unnamed: 0_level_0,positive,neutral,negative,sentiment,close,pct change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-08-30,0.061591,0.883455,0.054955,-0.090909,105.990,-0.007770
2016-08-31,0.070400,0.818600,0.111000,-0.200000,106.110,0.001132
2016-09-01,0.069625,0.897625,0.032750,0.125000,106.730,0.005843
2016-09-02,0.063143,0.845429,0.091429,-0.285714,107.730,0.009369
2016-09-06,0.131750,0.804500,0.063750,0.250000,107.700,-0.000278
...,...,...,...,...,...,...
2020-11-03,0.119000,0.842000,0.038833,0.500000,110.375,0.014756
2020-11-04,0.078900,0.800900,0.120300,-0.300000,114.940,0.041359
2020-11-05,0.202333,0.747333,0.050333,0.333333,118.990,0.035236
2020-11-06,0.054833,0.845500,0.099500,-0.500000,118.685,-0.002563


In [29]:
# TO DO: shift aapl_complete['pct change'] one day on all dfs
# TO DO: dropna() on all df['predicted pct change'] cols 
aapl_complete['predicted pct change'] = aapl_complete['pct change'].shift(periods=-1)
#amzn_complete['predicted pct change'] = amzn_complete['pct change'].shift(periods=-1)
tsla_complete['predicted pct change'] = tsla_complete['pct change'].shift(periods=-1)
spy_complete['predicted pct change'] = spy_complete['pct change'].shift(periods=-1)


In [30]:
aapl_complete = aapl_complete.dropna()
#amzn_complete = amzn_complete.dropna()
tsla_complete = tsla_complete.dropna()
spy_complete = spy_complete.dropna()


In [36]:
def get_sentiment(df):
    """
    Calculates the sentiment based on the compound score.
    """
    result = [
        (df['sentiment'] >= 0.10),
        (df['sentiment'] > -0.10) & (df['sentiment'] < 0.10),
        (df['sentiment'] <= -0.10)
    ]
    
    values = ['sell', 'hold', 'buy']
    
    df['buy/hold/sell'] = np.select(result, values)
    
    return df

In [39]:
aapl_complete_sentiment = get_sentiment(aapl_complete)
#amzn_complete_sentiment = get_sentiment(amzn_complete)
tsla_complete_sentiment = get_sentiment(tsla_complete)
spy_complete_sentiment = get_sentiment(spy_complete)
aapl_complete_sentiment

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['buy/hold/sell'] = np.select(result, values)


Unnamed: 0_level_0,positive,neutral,negative,sentiment,close,pct change,predicted pct change,buy/hold/sell
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2016-08-30,0.061591,0.883455,0.054955,-0.090909,105.990,-0.007770,0.001132,hold
2016-08-31,0.070400,0.818600,0.111000,-0.200000,106.110,0.001132,0.005843,buy
2016-09-01,0.069625,0.897625,0.032750,0.125000,106.730,0.005843,0.009369,sell
2016-09-02,0.063143,0.845429,0.091429,-0.285714,107.730,0.009369,-0.000278,buy
2016-09-06,0.131750,0.804500,0.063750,0.250000,107.700,-0.000278,0.006221,sell
...,...,...,...,...,...,...,...,...
2020-11-02,0.038600,0.896400,0.065000,-0.200000,108.770,-0.001194,0.014756,buy
2020-11-03,0.119000,0.842000,0.038833,0.500000,110.375,0.014756,0.041359,sell
2020-11-04,0.078900,0.800900,0.120300,-0.300000,114.940,0.041359,0.035236,buy
2020-11-05,0.202333,0.747333,0.050333,0.333333,118.990,0.035236,-0.002563,sell


In [45]:
# Define features data
y = aapl_complete_sentiment['buy/hold/sell'].values
y = y.reshape(-1, 1)

X = aapl_complete_sentiment.drop(columns="buy/hold/sell")
#X = X.reshape(-1, 1)

y[:5]

array([['hold'],
       ['buy'],
       ['sell'],
       ['buy'],
       ['sell']], dtype=object)

In [48]:
# Create training & testing datasets 
# random_state=1
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

X_train.shape

(788, 7)

In [49]:
# Instantiate a linear SVM model
from sklearn.svm import SVC

classifier = SVC(kernel='linear')
classifier

SVC(kernel='linear')

In [50]:
classifier.fit(X_train, y_train)

  return f(**kwargs)


SVC(kernel='linear')

In [51]:
# Score the accuracy
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.9911167512690355
Testing Data Score: 0.9809885931558935


In [53]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, predictions)

array([[ 70,   4,   0],
       [  0,  56,   1],
       [  0,   0, 132]], dtype=int64)

In [54]:
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

         buy       1.00      0.95      0.97        74
        hold       0.93      0.98      0.96        57
        sell       0.99      1.00      1.00       132

    accuracy                           0.98       263
   macro avg       0.98      0.98      0.98       263
weighted avg       0.98      0.98      0.98       263

