In [0]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from pyspark.sql import SQLContext
from pyspark import SparkContext
from pyspark.mllib.evaluation import RegressionMetrics

In [0]:
%sh
# Download the stock CSV into the shell's working directory (-O keeps the
# remote file name, infy_stock.csv).
# -f makes curl exit non-zero on an HTTP error instead of silently saving the
# server's error page under the CSV's name.
curl -f -O https://s3.amazonaws.com/cs6350.0u1/Project/infy_stock.csv

In [0]:
# Load the downloaded CSV from the driver's local filesystem as a Spark
# DataFrame: the first row is treated as the header and column types are inferred.
csvReader = (
  sqlContext.read
  .format("csv")
  .option("header", "true")
  .option("inferSchema", "true")
)
inputData = csvReader.load("file:/databricks/driver/infy_stock.csv")

In [0]:
# Discard the Date, Symbol and Series columns; every remaining column is kept.
df = inputData.drop('Date', 'Symbol', 'Series')

In [0]:
# Tunable settings for the whole notebook -- edit here, then re-run the cells below.
trainSplit = 0.9      # fraction of rows assigned to the training set
timeSteps = 20        # number of consecutive rows in each input sequence
lstmUnits = 100       # units in the LSTM layer
dropoutRate = 0.4     # rate passed to the Dropout layer
# Optimizer choices: SGD, RMSprop, Adam, Adadelta, Adagrad, Adamax, Nadam, Ftrl
modelOptimizer = 'Adam'
# Loss choices: mean_squared_error, poisson, mean_squared_logarithmic_error
modelLoss = 'mean_squared_error'
numEpochs = 100       # training epochs

In [0]:
# Count the rows once and reuse the result: each df.count() call is a separate
# Spark action over the data.
totalRows = df.count()
trainSize = int(totalRows * trainSplit)  # int() truncates, so any remainder goes to the test set
testSize = totalRows - trainSize
print(trainSize, testSize)

In [0]:
# Split by row position: the first trainSize rows train the model, the rest test it.
# DataFrame.subtract() is a set difference (EXCEPT DISTINCT): it removes
# duplicate rows and gives no guarantee about row order, which would scramble
# the sequences later built from the test set. Slicing the collected rows keeps
# every remaining row in its original position.
# NOTE(review): assumes the CSV rows are already in time order -- confirm.
allRows = df.collect()
train = sqlContext.createDataFrame(allRows[:trainSize], df.schema)
test = sqlContext.createDataFrame(allRows[trainSize:], df.schema)

In [0]:
# Convert both Spark splits to pandas DataFrames for the scaling and
# windowing steps that follow.
train, test = train.toPandas(), test.toPandas()

In [0]:
# Scale every feature column to [0, 1] using statistics learned from the
# TRAINING split only. The test split is transformed with the already-fitted
# scalers: re-fitting on test data would leak test statistics and put the two
# splits on different scales. Test values outside the training range may
# therefore fall slightly outside [0, 1].
scaler = MinMaxScaler(feature_range=(0,1))
scaleColumns = ['Prev Close', 'Open', 'High', 'Low', 'Last', 'VWAP', 'Volume', 'Turnover','Trades' ,'Deliverable Volume', '%Deliverble']
predictColumn = ['Close']
train[scaleColumns] = scaler.fit_transform(train.loc[:, scaleColumns])
test[scaleColumns] = scaler.transform(test.loc[:, scaleColumns])

# The target column gets its own scaler so predictions can be mapped back to
# prices with closeScaler.inverse_transform(). It must stay fitted on the
# training data; re-fitting it on test would make the inverse transform of
# training predictions use the wrong min/max.
closeScaler = MinMaxScaler(feature_range=(0,1))
train[predictColumn] = closeScaler.fit_transform(train.loc[:, predictColumn])
test[predictColumn] = closeScaler.transform(test.loc[:, predictColumn])

In [0]:
def generateSeqs(input, target, steps):
  """Cut a table into overlapping windows paired with the value that follows each.

  Args:
    input: object supporting len() and positional .iloc slicing (the callers
      pass a pandas DataFrame); each window is the `.values` of `steps`
      consecutive rows.
    target: object supporting positional .iloc lookup (the callers pass a
      pandas Series); supplies the label for each window.
    steps: number of consecutive rows per window.

  Returns:
    A pair of NumPy arrays (windows, labels). Window i covers rows
    i .. i + steps - 1 of `input`, and its label is target.iloc[i + steps].
    There are len(input) - steps windows; when that is zero or negative both
    arrays are empty.
  """
  windowCount = len(input) - steps
  # Each window is a block of `steps` consecutive rows.
  windows = [input.iloc[start:start + steps].values for start in range(windowCount)]
  # The label is the target value at the row immediately after the window.
  labels = [target.iloc[start + steps] for start in range(windowCount)]
  return np.array(windows), np.array(labels)
  

In [0]:
# Window each split separately. Every column of the frame (Close included) is
# an input feature, and the label is the Close value right after each window.
inputTrain, outputTrain = generateSeqs(input=train, target=train['Close'], steps=timeSteps)
inputTest, outputTest = generateSeqs(input=test, target=test['Close'], steps=timeSteps)

print(inputTrain.shape, outputTrain.shape)

In [0]:
# Network: bidirectional LSTM -> dropout -> one-unit Dense output.
# The LSTM's input shape is (rows per window, features per row), read from the
# training windows.
windowShape = (inputTrain.shape[1], inputTrain.shape[2])
recurrentLayer = keras.layers.Bidirectional(
  keras.layers.LSTM(units=lstmUnits, input_shape=windowShape)
)
model = keras.Sequential([
  recurrentLayer,
  keras.layers.Dropout(rate=dropoutRate),
  keras.layers.Dense(units=1),
])
model.compile(optimizer=modelOptimizer, loss=modelLoss)

In [0]:
# Train on the windowed training data. Shuffling is disabled so samples are
# fed in their original order; 10% of the training samples are reserved for
# validation.
fitOptions = dict(epochs=numEpochs, validation_split=0.1, shuffle=False)
model.fit(inputTrain, outputTrain, **fitOptions)

In [0]:
# Training split: undo the Close scaling on both the model output and the
# labels, flatten to plain Python lists, and pair them as (prediction, actual).
trainPredScaled = model.predict(inputTrain)
trainActualScaled = outputTrain.reshape(-1,1)  # column vector, as inverse_transform expects 2-D input
predictTrain = closeScaler.inverse_transform(trainPredScaled).ravel().tolist()
actualTrain = closeScaler.inverse_transform(trainActualScaled).ravel().tolist()
trainPairs = list(zip(predictTrain, actualTrain))
trainPredictionsAndActual = sc.parallelize(trainPairs)

# Test split: same steps.
testPredScaled = model.predict(inputTest)
testActualScaled = outputTest.reshape(-1,1)
predictTest = closeScaler.inverse_transform(testPredScaled).ravel().tolist()
actualTest = closeScaler.inverse_transform(testActualScaled).ravel().tolist()
testPairs = list(zip(predictTest, actualTest))
testPredictionsAndActual = sc.parallelize(testPairs)

In [0]:
# Root-mean-squared error on each split, computed by Spark's RegressionMetrics
# from the (prediction, actual) pair RDDs.
trainMetrics = RegressionMetrics(trainPredictionsAndActual)
trainRmse = trainMetrics.rootMeanSquaredError
print("Training RMSE = %s" % trainRmse)

testMetrics = RegressionMetrics(testPredictionsAndActual)
testRmse = testMetrics.rootMeanSquaredError
print("Test RMSE = %s" % testRmse)