In [1]:
import pandas as pd
import io
import requests
import numpy as np
import collections
from sklearn import metrics
from sklearn import svm, datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt

from tensorflow.keras import optimizers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Embedding, LSTM
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint

In [3]:
dt = pd.read_csv('C:/Users/Owner/Documents/Sac State/CSC215_P2_Stock_Price.csv')
dt['Close_y'] = dt['Close']
split = int(0.7 * len(dt))
df_train = dt[:split]
df_test = dt[split:len(dt)]

print("Training set has {} records.".format(len(df_train)))
print("Test set has {} records.".format(len(df_test)))


Training set has 3074 records.
Test set has 1318 records.


Since we want to ensure time-based relationships between observations, we split the data in a sequential way. The first 70% of observations are in the training set, with the remaining in our test.

In [4]:
close_train = df_train['Close']
close_test = df_test['Close']

We then save a pre-normalized copy of the Close values. This will be used as our dependent variable in models.

In [5]:
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()
        
    if sd is None:
        sd = df[name].std()
        
    df[name] = (df[name] - mean) / sd

normal_list = ['Open', 'High', 'Low', 'Volume', 'Close'];

for element in normal_list:
    encode_numeric_zscore(df_train, element)
    encode_numeric_zscore(df_test, element)    



params_train = df_train[['Open', 'High', 'Low', 'Volume', 'Close']].values.tolist()
params_test = df_test[['Open', 'High', 'Low', 'Volume', 'Close',]].values.tolist()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Next we normalize the values, and drop the 'date' column from the dataframe. The way we did this caused a warning, however this does not impact our final program.

In [8]:
def to_sequences(seq_size, data):
    x = []
    y = []

    for i in range(len(data)-SEQUENCE_SIZE-1):
        #print(i)
        window = data[i:(i+SEQUENCE_SIZE)]
        after_window = data[i+SEQUENCE_SIZE]
        window = [x for x in window]
        #print("{} - {}".format(window,after_window))
        x.append(window)
        y.append(after_window)
        
    return np.array(x),np.array(y)

SEQUENCE_SIZE = 7
x_train,y_train = to_sequences(SEQUENCE_SIZE,params_train)
obs_train = close_train[SEQUENCE_SIZE:len(close_train)].values.tolist()
obs_train.pop()
obs_train = np.asarray(obs_train)

x_test,y_test = to_sequences(SEQUENCE_SIZE,params_test)
obs_test = close_test[SEQUENCE_SIZE:len(close_test)].values.tolist()
obs_test.pop()
obs_test = np.asarray(obs_test)

print("Shape of x_train: {}".format(x_train.shape))
print("Shape of x_test: {}".format(x_test.shape))
print("Shape of y_train: {}".format(obs_train.shape))
print("Shape of y_test: {}".format(obs_test.shape))



Shape of x_train: (3066, 7, 5)
Shape of x_test: (1310, 7, 5)
Shape of y_train: (3066,)
Shape of y_test: (1310,)


Here we transform our data in sequence data. Now, each record is a sequence of 7 days of data, each with 5 relevant pieces of information on that day's market behavior. Additionally, there is one dependent variable for each of these 7x5 structures which holds the close price of the stock on the seventh day.

In [13]:
myDict = dict()
##activationType = ['relu', 'sigmoid', 'tanh']
optimizerType = ['adam', 'sgd']
iteration = 0

for opt in optimizerType:
    checkpointer = ModelCheckpoint(filepath="C:/Users/Owner/Documents/Sac State/csc215/proj2/best_weights.hdf5", verbose=0, save_best_only=True) # save best model        
    for i in range(2):
        print(i)        
        # Build network
        model = Sequential()
        model.add(LSTM(64, dropout=0.1, recurrent_dropout=0.1, input_shape=(SEQUENCE_SIZE, 5)))
        model.add(Dense(32))
        model.add(Dense(1))
        model.compile(loss='mean_squared_error', optimizer='adam')        
        monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=1, mode='auto')        
        model.fit(x_train,obs_train,validation_data=(x_test,obs_test), callbacks=[monitor],verbose=2, epochs=100)  


    print('Training finished...Loading the best model')  
    print()
    model.load_weights("C:/Users/Owner/Documents/Sac State/csc215/proj2/best_weights.hdf5") # load weights from best model
    myDict.update({iteration : (opt, model)})
    iteration += 1




for ele in myDict.values():
    print('Analyzing model with optimizer {}'.format(ele[0]))
    model = ele[1]
    pred = model.predict(x_test)
    score = np.sqrt(metrics.mean_squared_error(pred,obs_test))
    print("Score (RMSE): {}".format(score))

0
Train on 3066 samples, validate on 1310 samples
Epoch 1/100
3066/3066 - 13s - loss: 0.0795 - val_loss: 0.0209
Epoch 2/100
3066/3066 - 13s - loss: 0.0223 - val_loss: 0.0171
Epoch 3/100
3066/3066 - 13s - loss: 0.0177 - val_loss: 0.0135
Epoch 4/100
3066/3066 - 14s - loss: 0.0158 - val_loss: 0.0120
Epoch 5/100
3066/3066 - 15s - loss: 0.0139 - val_loss: 0.0108
Epoch 6/100
3066/3066 - 9s - loss: 0.0130 - val_loss: 0.0118
Epoch 7/100
3066/3066 - 9s - loss: 0.0113 - val_loss: 0.0129
Epoch 8/100
3066/3066 - 10s - loss: 0.0101 - val_loss: 0.0100
Epoch 9/100
3066/3066 - 10s - loss: 0.0086 - val_loss: 0.0099
Epoch 10/100
3066/3066 - 14s - loss: 0.0083 - val_loss: 0.0097
Epoch 11/100
3066/3066 - 12s - loss: 0.0076 - val_loss: 0.0165
Epoch 12/100
3066/3066 - 12s - loss: 0.0070 - val_loss: 0.0084
Epoch 13/100
3066/3066 - 9s - loss: 0.0062 - val_loss: 0.0075
Epoch 14/100
3066/3066 - 10s - loss: 0.0055 - val_loss: 0.0086
Epoch 15/100
3066/3066 - 7s - loss: 0.0048 - val_loss: 0.0106
Epoch 16/100
3066/