In [14]:
# Import dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import psycopg2
from config import PGHOST, PGDATABASE, PGUSER, PGPASSWORD

from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense, LSTM

In [15]:
# Define a function to connect to AWS database instance

def connect():
    
    # Set up a connection to the postgres server.
    conn_string = "host="+PGHOST +" port="+ "5432" +" dbname="+PGDATABASE +" user=" + PGUSER \
                  +" password="+ PGPASSWORD
    
    conn = psycopg2.connect(conn_string)
    print("Connected!")

    # Create a cursor object
    cursor = conn.cursor()
    
    return conn, cursor

In [16]:
conn, cursor = connect()

Connected!


In [17]:
# Get the data, read SQL query into data frame
gg_data = pd.read_sql_query("SELECT * FROM google", con=conn)
gg_df = pd.DataFrame(gg_data)
gg_df.head()

Unnamed: 0,Date,open_price,high_price,low_price,close_price,adj_close,volume
0,2004-08-19,2.502503,2.604104,2.401401,2.511011,2.511011,893181924
1,2004-08-20,2.527778,2.72973,2.515015,2.71046,2.71046,456686856
2,2004-08-23,2.771522,2.83984,2.728979,2.737738,2.737738,365122512
3,2004-08-24,2.783784,2.792793,2.591842,2.624374,2.624374,304946748
4,2004-08-25,2.626627,2.702703,2.5996,2.652653,2.652653,183772044


In [18]:
gg_df = gg_df.drop(['adj_close','volume'], axis=1)

In [19]:
gg_df = gg_df[['Date','open_price', 'high_price', 'low_price', 'close_price']]

gg_df.head()

Unnamed: 0,Date,open_price,high_price,low_price,close_price
0,2004-08-19,2.502503,2.604104,2.401401,2.511011
1,2004-08-20,2.527778,2.72973,2.515015,2.71046
2,2004-08-23,2.771522,2.83984,2.728979,2.737738
3,2004-08-24,2.783784,2.792793,2.591842,2.624374
4,2004-08-25,2.626627,2.702703,2.5996,2.652653


In [7]:
# Create a date series for plotting purpose
date_train = pd.to_datetime(gg_df['Date'])

date_train.head()

0   2004-08-19
1   2004-08-20
2   2004-08-23
3   2004-08-24
4   2004-08-25
Name: Date, dtype: datetime64[ns]

In [13]:
# Select "open, high, low, volume" as input features

columns = list(gg_df)[1:6]

print(columns)

['open_price', 'high_price', 'low_price', 'volume', 'close_price']


In [9]:
# New dataframe with only training data - 5 columns

gg_training_df = gg_df[columns].astype(float)

gg_training_df.head()

Unnamed: 0,open_price,high_price,low_price,volume
0,2.502503,2.604104,2.401401,893181924.0
1,2.527778,2.72973,2.515015,456686856.0
2,2.771522,2.83984,2.728979,365122512.0
3,2.783784,2.792793,2.591842,304946748.0
4,2.626627,2.702703,2.5996,183772044.0


In [10]:
# LSTM uses sigmoid and tanh that are sensitive to magnitude so values need to be normalized
# scaled the traing dataset

scaler = MinMaxScaler(feature_range=(0,1))
scaler = scaler.fit(gg_training_df) # .values.reshape(-1,1))
scaled_training_data = scaler.transform(gg_training_df)

In [11]:
print(scaled_training_data.shape)

(4553, 4)


In [12]:
# LSTM require to reshape the input data into (n_samples, timesteps, n_features). 
# This project we choose n_features=4 and timesteps = 30 (past days data used for training). 
# Create empty X_train and y_train list 
X_train = []
y_train = []

training_data_len = round(len(scaled_training_data)*0.8)
# Define future days we want to predict and look-back days used for prediction

look_back = 30
num_future_days = 1

for i in range(look_back, training_data_len-num_future_days+1):
    X_train.append(scaled_training_data[i-look_back:i, 0:scaled_training_data.shape[1]])
    y_train.append(scaled_training_data[i+num_future_days-1:i+num_future_days, 4])

IndexError: index 4 is out of bounds for axis 1 with size 4

In [None]:
X_train, y_train = np.array(X_train), np.array(y_train)

In [None]:
print(X_train.shape)
print(y_train.shape)

In [None]:
# Build the LSTM model

model = Sequential()
model.add(LSTM(units=64, activation='relu', input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=True))
model.add(LSTM(units=32, activation='relu', return_sequences=False))
# model.add(Dropout(0.2))
model.add(Dense(units=1))

model.compile(optimizer='adam', loss='mean_squared_error')
model.summary()

In [None]:
# Fit the model

history = model.fit(X_train, y_train, batch_size =32, epochs=20, verbose=1)

In [None]:
import seaborn as sns
from pandas.tseries.holiday import USFederalHolidayCalendar
from pandas.tseries.offsets import CustomBusinessDay
us_bd = CustomBusinessDay(calendar=USFederalHolidayCalendar())
#Remember that we can only predict one day in future as our model needs 5 variables
#as inputs for prediction. We only have all 5 variables until the last day in our dataset.
n_past = 16
n_days_for_prediction=15  #let us predict past 15 days

predict_period_dates = pd.date_range(list(date_train)[-n_past], periods=n_days_for_prediction, freq=us_bd).tolist()
print(predict_period_dates)

#Make prediction
prediction = model.predict(X_train[-n_days_for_prediction:]) #shape = (n, 1) where n is the n_days_for_prediction

#Perform inverse transformation to rescale back to original range
#Since we used 5 variables for transform, the inverse expects same dimensions
#Therefore, let us copy our values 5 times and discard them after inverse transform
prediction_copies = np.repeat(prediction,gg_training_df.shape[1], axis=-1)
y_pred_future = scaler.inverse_transform(prediction_copies)[:,0]


# Convert timestamp to date
forecast_dates = []
for time_i in predict_period_dates:
    forecast_dates.append(time_i.date())
    
df_forecast = pd.DataFrame({'Date':np.array(forecast_dates), 'Close':y_pred_future})
df_forecast['Date']=pd.to_datetime(df_forecast['Date'])


original = gg_df[['Date', 'close_price']]
original['Date']=pd.to_datetime(original['Date'])
original = original.loc[original['Date'] >= '2020-5-1']

sns.lineplot(original['Date'], original['close_price'])
sns.lineplot(df_forecast['Date'], df_forecast['close_price'])

In [None]:
# Test the model
# Create the prediction date list for purpost of plotting
predict_period_dates = pd.date_range(list(date_train)[-16], periods=15, freq='1d').tolist()

print(predict_period_dates)

In [None]:
# Make prediction
prediction = model.predict(X_train[-15:])


In [None]:
print(prediction)

In [None]:
prediction_copies = np.repeat(prediction, scaled_training_data.shape[1], axis=-1)
print(prediction_copies)

In [None]:
y_pred = scaler.inverse_transform(prediction_copies)[:,0]
print(y_pred)

In [None]:
# plot test predictions

plt.figure(figsize=(12, 8))
plt.plot(y_pred, color='red', label='Predicted')
plt.plot(gg_training_df['close_price'], color='green', label='Actual')
plt.xlabel('Date')
plt.ylabel('Stock Price')

plt.title("Stock Price predicted by LSTM Model")
plt.grid()
plt.legend()
plt.show()