In [7]:
# Initial imports
import os
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from dotenv import load_dotenv
import alpaca_trade_api as tradeapi

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import tensorflow as tf
get_ipython().run_line_magic("matplotlib", "inline")
%matplotlib inline


Bad key "text.kerning_factor" on line 4 in
C:\Users\annmi\anaconda3\envs\pyvizenv\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test_patch.mplstyle.
You probably need to get an updated matplotlibrc file from
http://github.com/matplotlib/matplotlib/blob/master/matplotlibrc.template
or from the matplotlib source distribution


In [8]:
# Make sure the VADER lexicon is available locally, then build the analyzer
# that create_sentiment_df uses to score each headline.
nltk.download("vader_lexicon")
analyzer = SentimentIntensityAnalyzer()


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\annmi\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [9]:
# Read the local .env file into the process environment
load_dotenv()

# Alpaca credentials; each lookup yields None when the variable is not set
alpaca_api_key = os.environ.get('ALPACA_API_KEY')
alpaca_secret_key = os.environ.get('ALPACA_SECRET_KEY')

# REST client used by stock_info_grab
api = tradeapi.REST(alpaca_api_key, alpaca_secret_key, api_version='v2')

In [10]:
def stock_info_grab(ticker, start="2016-08-27", end="2020-11-09"):
    """
    Download daily bars for ``ticker`` and return closing prices with daily returns.

    Parameters
    ----------
    ticker : str
        Symbol passed to ``api.get_barset``.
    start, end : str, optional
        Calendar dates bounding the request; both are localized to
        America/New_York before being sent. The defaults reproduce the fixed
        window this notebook was originally written against.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``Date`` (``datetime.date`` values) with two columns:
        ``close`` and ``pct change`` (close-to-close return). The first row's
        ``pct change`` is NaN because it has no previous close; it is left in
        place for the caller to drop.
    """
    # Daily bars
    timeframe = "1D"

    # The API expects ISO-8601 timestamps
    start_iso = pd.Timestamp(start, tz="America/New_York").isoformat()
    end_iso = pd.Timestamp(end, tz="America/New_York").isoformat()

    bars = api.get_barset(
        ticker,
        timeframe,
        limit=None,
        start=start_iso,
        end=end_iso,
        after=None,
        until=None,
    ).df

    # Columns arrive as (symbol, field); only one symbol was requested,
    # so the symbol level carries no information.
    bars = bars.droplevel(axis=1, level=0)

    # Keep the calendar date only
    bars.index = bars.index.date

    bars['pct change'] = bars['close'].pct_change()
    bars = bars.drop(columns=['open', 'high', 'low', 'volume'])
    return bars.rename_axis('Date')

In [11]:
# Download price history for each symbol, in this order
aapl_stock_info, amzn_stock_info, tsla_stock_info, spy_stock_info = map(
    stock_info_grab, ("AAPL", "AMZN", "TSLA", "SPY")
)
aapl_stock_info

Unnamed: 0_level_0,close,pct change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2016-08-29,106.8200,
2016-08-30,105.9900,-0.007770
2016-08-31,106.1100,0.001132
2016-09-01,106.7300,0.005843
2016-09-02,107.7300,0.009369
2016-09-06,107.7000,-0.000278
2016-09-07,108.3700,0.006221
2016-09-08,105.5100,-0.026391
2016-09-09,103.1400,-0.022462
2016-09-12,105.4400,0.022300


In [12]:
# Scraped headline files, one per symbol; each is loaded right after its path.
# AMZN headlines are not loaded in this notebook.
aapl_file = Path('Resources/AAPL_HEADLINES.csv')
aapl_headlines_df = pd.read_csv(aapl_file)

spy_file = Path('Resources/SPY_HEADLINES.csv')
spy_headlines_df = pd.read_csv(spy_file)

tsla_file = Path('Resources/TSLA_HEADLINES.csv')
tsla_headlines_df = pd.read_csv(tsla_file)

aapl_headlines_df

Unnamed: 0,Headline,Date
0,"Apple Inc. stock falls Monday, underperforms m...","Nov. 9, 2020 at 4:30 p.m. ET"
1,Big Tech Stocks Are Lagging Today. Why They’ll...,"Nov. 9, 2020 at 1:45 p.m. ET"
2,"As Apple releases its new line of Macs, the bi...","Nov. 9, 2020 at 1:18 p.m. ET"
3,"In the Midst of Election Uncertainty, Younger ...","Nov. 6, 2020 at 9:21 p.m. ET"
4,Berkshire Buybacks Hit Record $9 Billion in Th...,"Nov. 7, 2020 at 8:49 a.m. ET"
5,This single-country stock picker has beaten th...,"Nov. 3, 2020 at 7:12 a.m. ET"
6,"Apple Inc. stock falls Friday, underperforms m...","Nov. 6, 2020 at 4:30 p.m. ET"
7,T-Mobile Stock Is at a Record High After Earni...,"Nov. 6, 2020 at 2:16 p.m. ET"
8,Dow's 25-point fall led by losses in UnitedHea...,"Nov. 6, 2020 at 10:53 a.m. ET"
9,"Dow falls 110 points on losses for Apple Inc.,...","Nov. 6, 2020 at 9:45 a.m. ET"


In [13]:
def get_sentiment(score):
    """
    Map a compound score to a sentiment label.

    Returns 1 when the score is at least 0.05, -1 when it is at most -0.05,
    and 0 (neutral) for anything else.
    """
    if score >= 0.05:
        return 1
    if score <= -0.05:
        return -1
    return 0


In [14]:
def create_sentiment_df(df):
    """
    Score every headline with VADER and return the headlines with their scores.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``Headline`` column and a ``Date`` column of strings such as
        ``"Nov. 9, 2020 at 4:30 p.m. ET"``. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``Date`` (parsed to datetime, time of day removed),
        ``Headline``, ``compound``, ``positive``, ``neutral``, ``negative``
        and ``sentiment`` (-1, 0 or 1 from ``get_sentiment``). ``Date`` stays
        a regular column and the row labels of ``df`` are kept as the index.
        Rows whose headline is not a string cannot be scored and are left out.
    """
    score_columns = ["compound", "positive", "neutral", "negative", "sentiment"]
    scores = {name: [] for name in score_columns}
    missing = float("nan")

    for headline in df["Headline"]:
        if isinstance(headline, str):
            polarity = analyzer.polarity_scores(headline)
            values = (
                polarity["compound"],
                polarity["pos"],
                polarity["neu"],
                polarity["neg"],
                get_sentiment(polarity["compound"]),
            )
        else:
            # Record a placeholder instead of skipping the row: skipping would
            # shift every later score onto the wrong headline.
            values = (missing,) * len(score_columns)
        for name, value in zip(score_columns, values):
            scores[name].append(value)

    return (
        # Lists are attached positionally, so scores line up row for row
        df.assign(**scores)
        # Remove the rows that could not be scored
        .dropna(subset=["compound"])
        # Keep the text before " at " (the calendar date) and parse it
        .assign(
            Date=lambda frame: pd.to_datetime(
                frame["Date"].str.split(" at ", n=1).str[0].str.strip()
            )
        )
        .reindex(columns=["Date", "Headline"] + score_columns)
        # Placeholders force a float column; restore integer labels
        .astype({"sentiment": int})
    )

In [15]:
# Score each loaded headline set (AMZN headlines are not loaded, so AMZN is skipped)
aapl_headlines, tsla_headlines, spy_headlines = map(
    create_sentiment_df,
    (aapl_headlines_df, tsla_headlines_df, spy_headlines_df),
)
aapl_headlines

Unnamed: 0,Date,Headline,compound,positive,neutral,negative,sentiment
0,2020-11-09,"Apple Inc. stock falls Monday, underperforms m...",0.0000,0.000,1.000,0.000,0
1,2020-11-09,Big Tech Stocks Are Lagging Today. Why They’ll...,-0.0772,0.121,0.738,0.141,-1
2,2020-11-09,"As Apple releases its new line of Macs, the bi...",0.4767,0.193,0.807,0.000,1
3,2020-11-06,"In the Midst of Election Uncertainty, Younger ...",-0.3400,0.000,0.806,0.194,-1
4,2020-11-07,Berkshire Buybacks Hit Record $9 Billion in Th...,-0.1531,0.000,0.882,0.118,-1
5,2020-11-03,This single-country stock picker has beaten th...,-0.0258,0.115,0.766,0.119,0
6,2020-11-06,"Apple Inc. stock falls Friday, underperforms m...",0.0000,0.000,1.000,0.000,0
7,2020-11-06,T-Mobile Stock Is at a Record High After Earni...,0.0000,0.000,1.000,0.000,0
8,2020-11-06,Dow's 25-point fall led by losses in UnitedHea...,-0.1280,0.171,0.620,0.209,-1
9,2020-11-06,"Dow falls 110 points on losses for Apple Inc.,...",-0.1280,0.158,0.647,0.194,-1


In [16]:
# Average the sentiment scores per calendar date, oldest date first.
# numeric_only=True excludes the text 'Headline' column explicitly; without it
# pandas >= 2.0 raises when asked to average a string column.
aapl_scores = aapl_headlines.groupby('Date').mean(numeric_only=True).sort_index()
#amzn_scores = amzn_headlines.groupby('Date').mean(numeric_only=True).sort_index()
tsla_scores = tsla_headlines.groupby('Date').mean(numeric_only=True).sort_index()
spy_scores = spy_headlines.groupby('Date').mean(numeric_only=True).sort_index()

In [17]:
# Preview the first rows of the per-date AAPL averages
aapl_scores.head()

Unnamed: 0_level_0,compound,positive,neutral,negative,sentiment
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2016-03-19,0.836,0.53,0.47,0.0,1.0
2016-08-27,0.0386,0.063,0.937,0.0,0.5
2016-08-28,0.4404,0.209,0.791,0.0,1.0
2016-08-29,0.0671,0.102,0.842286,0.055714,0.0
2016-08-30,-0.015205,0.061591,0.883455,0.054955,-0.090909


In [18]:
# Keep the four columns used as model features; the averaged 'compound' column is dropped
aapl_scores = aapl_scores.loc[:, ['positive', 'neutral', 'negative', 'sentiment']]
tsla_scores = tsla_scores.loc[:, ['positive', 'neutral', 'negative', 'sentiment']]
spy_scores = spy_scores.loc[:, ['positive', 'neutral', 'negative', 'sentiment']]
aapl_scores

Unnamed: 0_level_0,positive,neutral,negative,sentiment
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-03-19,0.530000,0.470000,0.000000,1.000000
2016-08-27,0.063000,0.937000,0.000000,0.500000
2016-08-28,0.209000,0.791000,0.000000,1.000000
2016-08-29,0.102000,0.842286,0.055714,0.000000
2016-08-30,0.061591,0.883455,0.054955,-0.090909
2016-08-31,0.070400,0.818600,0.111000,-0.200000
2016-09-01,0.069625,0.897625,0.032750,0.125000
2016-09-02,0.063143,0.845429,0.091429,-0.285714
2016-09-03,0.000000,1.000000,0.000000,0.000000
2016-09-05,0.086500,0.755500,0.158000,0.000000


In [19]:
# Put the per-date sentiment averages next to the price data on the shared date
# index, then drop every date that has a missing value in either frame.
# NOTE(review): stock frames are indexed by datetime.date objects (see
# stock_info_grab) while the score frames are grouped on a datetime64 'Date'
# column — confirm the two index types align under the pandas version in use.
# TODO: inspect the sentiment score distribution of each frame (histogram, mean/std, or percentiles)
aapl_complete = pd.concat([aapl_scores,aapl_stock_info], join='outer', axis=1).dropna()
#amzn_complete = pd.concat([amzn_scores,amzn_stock_info], join='outer', axis=1).dropna()
tsla_complete = pd.concat([tsla_scores,tsla_stock_info], join='outer', axis=1).dropna()
spy_complete = pd.concat([spy_scores,spy_stock_info], join='outer', axis=1).dropna()
aapl_complete

Unnamed: 0_level_0,positive,neutral,negative,sentiment,close,pct change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-08-30,0.061591,0.883455,0.054955,-0.090909,105.9900,-0.007770
2016-08-31,0.070400,0.818600,0.111000,-0.200000,106.1100,0.001132
2016-09-01,0.069625,0.897625,0.032750,0.125000,106.7300,0.005843
2016-09-02,0.063143,0.845429,0.091429,-0.285714,107.7300,0.009369
2016-09-06,0.131750,0.804500,0.063750,0.250000,107.7000,-0.000278
2016-09-07,0.096000,0.877000,0.027000,0.250000,108.3700,0.006221
2016-09-08,0.069714,0.862357,0.067929,0.000000,105.5100,-0.026391
2016-09-09,0.049500,0.872750,0.077750,-0.250000,103.1400,-0.022462
2016-09-12,0.113000,0.887000,0.000000,0.333333,105.4400,0.022300
2016-09-13,0.089818,0.888455,0.021818,0.272727,108.0200,0.024469


In [20]:
# Prediction target: each row receives the 'pct change' of the row that follows it,
# so the last row of every frame has no target (NaN).
# NOTE(review): rows without headlines were dropped earlier, so "the row that follows"
# is not necessarily the next trading day — confirm this is intended.
# TODO: drop the rows whose 'predicted pct change' is NaN
# AMZN is skipped because amzn_complete is not built above.
for _frame in (aapl_complete, tsla_complete, spy_complete):
    _frame['predicted pct change'] = _frame['pct change'].shift(periods=-1)
del _frame


In [21]:
# Modelling frame for AAPL. The last row has no next-day return (NaN after the
# shift), so it is removed here; dropna also returns a new frame, which keeps
# later edits to df from touching aapl_complete.
df = aapl_complete.dropna(subset=['predicted pct change'])

In [22]:
# Display the modelling frame
df

Unnamed: 0_level_0,positive,neutral,negative,sentiment,close,pct change,predicted pct change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2016-08-30,0.061591,0.883455,0.054955,-0.090909,105.9900,-0.007770,0.001132
2016-08-31,0.070400,0.818600,0.111000,-0.200000,106.1100,0.001132,0.005843
2016-09-01,0.069625,0.897625,0.032750,0.125000,106.7300,0.005843,0.009369
2016-09-02,0.063143,0.845429,0.091429,-0.285714,107.7300,0.009369,-0.000278
2016-09-06,0.131750,0.804500,0.063750,0.250000,107.7000,-0.000278,0.006221
2016-09-07,0.096000,0.877000,0.027000,0.250000,108.3700,0.006221,-0.026391
2016-09-08,0.069714,0.862357,0.067929,0.000000,105.5100,-0.026391,-0.022462
2016-09-09,0.049500,0.872750,0.077750,-0.250000,103.1400,-0.022462,0.022300
2016-09-12,0.113000,0.887000,0.000000,0.333333,105.4400,0.022300,0.024469
2016-09-13,0.089818,0.888455,0.021818,0.272727,108.0200,0.024469,0.034716


# Neural Networks

In [23]:
import numpy as np
import pandas as pd
import hvplot.pandas

In [24]:
from numpy.random import seed
from tensorflow import random

# Fix the NumPy and TensorFlow seeds so repeated runs are reproducible.
# Comment these two calls out to run several experiments and see how much
# the model varies between runs.
seed(1)
random.set_seed(2)

In [25]:
# # This function accepts the column number for the features (X) and the target (y)
# # It chunks the data up with a rolling window of Xt-n to predict Xt
# # It returns a numpy array of X and y
# def window_data(df, window, feature_col_number, target_col_number):
#     X = []
#     y = []
#     for i in range(len(df) - window - 1):
#         features = df.iloc[i:(i + window), feature_col_number]
#         target = df.iloc[(i + window), target_col_number]
#         X.append(features)
#         y.append(target)
#     return np.array(X), np.array(y).reshape(-1, 1)

In [26]:
# # Predict Closing Prices
# window_size = 5

# # Column index 0 is the 'feature'
# # Column index 1 is the predictions
# feature_column = 0
# target_column = 1
# X, y = window_data(df, window_size, feature_column, target_column)

In [27]:
# Feature matrix: the four per-date sentiment aggregates, one row per date in df
X = df[['positive', 'neutral', 'negative', 'sentiment']].values
X[:5]

array([[ 0.06159091,  0.88345455,  0.05495455, -0.09090909],
       [ 0.0704    ,  0.8186    ,  0.111     , -0.2       ],
       [ 0.069625  ,  0.897625  ,  0.03275   ,  0.125     ],
       [ 0.06314286,  0.84542857,  0.09142857, -0.28571429],
       [ 0.13175   ,  0.8045    ,  0.06375   ,  0.25      ]])

In [28]:
# Target: the shifted return column as a single-column 2-D array
y = df["predicted pct change"].values.reshape(-1, 1)
y[:5]

array([[ 0.00113218],
       [ 0.00584299],
       [ 0.00936944],
       [-0.00027847],
       [ 0.00622098]])

In [29]:
# Chronological hold-out: the first 70% of the rows train the model,
# the remaining rows are kept for testing.
split = int(len(X) * 0.7)

X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]

In [30]:
from sklearn.preprocessing import MinMaxScaler

# Scale features and target using ranges learned from the training rows only,
# so nothing about the test rows leaks into training. Test values can therefore
# fall slightly outside [0, 1].
# NOTE: this cell overwrites X_train / X_test / y_train / y_test; re-run the
# split cell before running it again.

# Feature scaler
feature_scaler = MinMaxScaler()
feature_scaler.fit(X_train)
X_train = feature_scaler.transform(X_train)
X_test = feature_scaler.transform(X_test)

# Target scaler, kept separate so the feature scaling is not overwritten.
# It keeps the name `scaler`, which is what this cell left bound to the
# target-fitted scaler before.
scaler = MinMaxScaler()
scaler.fit(y_train)
y_train = scaler.transform(y_train)
y_test = scaler.transform(y_test)

In [31]:
# Add a trailing axis of length 1: (samples, columns) -> (samples, columns, 1).
# With this layout each feature column is fed to the LSTM as one time step
# carrying a single indicator.
X_train = X_train.reshape(*X_train.shape[:2], 1)
X_test = X_test.reshape(*X_test.shape[:2], 1)

In [33]:
#Imports
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

In [34]:
# Stacked LSTM regressor: three recurrent layers, each followed by dropout,
# feeding a single output unit.
# - Every LSTM except the last returns full sequences, because the LSTM that
#   follows it needs one input per time step.
# - Dropout after each LSTM helps limit overfitting.
# - input_shape is (time steps, indicators per step); the batch axis is implicit.
number_units = 10
dropout_fraction = 0.2

model = Sequential([
    # Layer 1
    LSTM(
        units=number_units,
        return_sequences=True,
        input_shape=(X_train.shape[1], 1),
    ),
    Dropout(dropout_fraction),
    # Layer 2
    LSTM(units=number_units, return_sequences=True),
    Dropout(dropout_fraction),
    # Layer 3
    LSTM(units=number_units),
    Dropout(dropout_fraction),
    # Output layer
    Dense(1),
])

In [39]:
# Compile for regression: minimise mean squared error with Adam and report
# mean absolute error alongside it. Classification accuracy is not tracked
# because the target is a continuous return.
model.compile(optimizer="adam", loss="mean_squared_error", metrics=["mean_absolute_error"])

In [40]:
# Print the layer-by-layer summary (output shapes and parameter counts)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 4, 10)             480       
_________________________________________________________________
dropout (Dropout)            (None, 4, 10)             0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 4, 10)             840       
_________________________________________________________________
dropout_1 (Dropout)          (None, 4, 10)             0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 10)                840       
_________________________________________________________________
dropout_2 (Dropout)          (None, 10)                0         
_________________________________________________________________
dense (Dense)                (None, 1)                 1

In [41]:
# Train for 75 epochs in batches of 20 rows.
# Shuffling is disabled so the rows are presented in their original order.
model.fit(
    X_train,
    y_train,
    batch_size=20,
    epochs=75,
    shuffle=False,
    verbose=1,
)

Epoch 1/75
Epoch 2/75
Epoch 3/75
Epoch 4/75
Epoch 5/75
Epoch 6/75
Epoch 7/75
Epoch 8/75
Epoch 9/75
Epoch 10/75
Epoch 11/75
Epoch 12/75
Epoch 13/75
Epoch 14/75
Epoch 15/75
Epoch 16/75
Epoch 17/75
Epoch 18/75
Epoch 19/75
Epoch 20/75
Epoch 21/75
Epoch 22/75
Epoch 23/75
Epoch 24/75
Epoch 25/75
Epoch 26/75
Epoch 27/75
Epoch 28/75
Epoch 29/75
Epoch 30/75
Epoch 31/75
Epoch 32/75
Epoch 33/75
Epoch 34/75
Epoch 35/75
Epoch 36/75
Epoch 37/75
Epoch 38/75
Epoch 39/75
Epoch 40/75
Epoch 41/75
Epoch 42/75
Epoch 43/75
Epoch 44/75
Epoch 45/75
Epoch 46/75
Epoch 47/75
Epoch 48/75
Epoch 49/75
Epoch 50/75
Epoch 51/75
Epoch 52/75
Epoch 53/75
Epoch 54/75
Epoch 55/75
Epoch 56/75
Epoch 57/75
Epoch 58/75
Epoch 59/75
Epoch 60/75
Epoch 61/75
Epoch 62/75
Epoch 63/75
Epoch 64/75
Epoch 65/75
Epoch 66/75
Epoch 67/75
Epoch 68/75
Epoch 69/75
Epoch 70/75
Epoch 71/75
Epoch 72/75
Epoch 73/75
Epoch 74/75
Epoch 75/75


<tensorflow.python.keras.callbacks.History at 0x21307019088>