#LetsGrowMore Virtual Internship Program

#Data Science

#Task-2:- Stock Market Prediction And Forecasting Using Stacked LSTM

#Author:- Kushal Adhyaru

Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Importing Dataset

In [2]:
# Load the NSE-TATAGLOBAL price CSV straight from GitHub into a DataFrame.
# Needs network access; every later cell works on `data`.
url="https://raw.githubusercontent.com/mwitiderrick/stockprice/master/NSE-TATAGLOBAL.csv"
data=pd.read_csv(url)

Displaying the first 5 rows of the dataset

In [3]:
# Peek at the top of the file; the output shows the rows are ordered newest-first
# (2018-09-28 is row 0).
data.head(5)


Unnamed: 0,Date,Open,High,Low,Last,Close,Total Trade Quantity,Turnover (Lacs)
0,2018-09-28,234.05,235.95,230.2,233.5,233.75,3069914,7162.35
1,2018-09-27,234.55,236.8,231.1,233.8,233.25,5082859,11859.95
2,2018-09-26,240.0,240.0,232.5,235.0,234.25,2240909,5248.6
3,2018-09-25,233.3,236.75,232.0,236.25,236.1,2349368,5503.9
4,2018-09-24,233.55,239.2,230.75,234.0,233.3,3423509,7999.55


Displaying the last 5 rows of the dataset

In [4]:
# The bottom of the file holds the oldest records (the last row is 2010-07-21).
data.tail(5)

Unnamed: 0,Date,Open,High,Low,Last,Close,Total Trade Quantity,Turnover (Lacs)
2030,2010-07-27,117.6,119.5,112.0,118.8,118.65,586100,694.98
2031,2010-07-26,120.1,121.0,117.1,117.1,117.6,658440,780.01
2032,2010-07-23,121.8,121.95,120.25,120.35,120.65,281312,340.31
2033,2010-07-22,120.3,122.0,120.25,120.75,120.9,293312,355.17
2034,2010-07-21,122.1,123.0,121.05,121.1,121.55,658666,803.56


Checking the number of rows and columns in the dataset

In [5]:
# (number of rows, number of columns)
data.shape

(2035, 8)

Providing info about the Dataset

In [6]:
# Column dtypes and non-null counts; note that Date is read as a plain object
# (string) column, not as a datetime.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2035 entries, 0 to 2034
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Date                  2035 non-null   object 
 1   Open                  2035 non-null   float64
 2   High                  2035 non-null   float64
 3   Low                   2035 non-null   float64
 4   Last                  2035 non-null   float64
 5   Close                 2035 non-null   float64
 6   Total Trade Quantity  2035 non-null   int64  
 7   Turnover (Lacs)       2035 non-null   float64
dtypes: float64(6), int64(1), object(1)
memory usage: 127.3+ KB


Describing the Data

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

Unnamed: 0,Open,High,Low,Last,Close,Total Trade Quantity,Turnover (Lacs)
count,2035.0,2035.0,2035.0,2035.0,2035.0,2035.0,2035.0
mean,149.713735,151.992826,147.293931,149.474251,149.45027,2335681.0,3899.980565
std,48.664509,49.413109,47.931958,48.73257,48.71204,2091778.0,4570.767877
min,81.1,82.8,80.0,81.0,80.95,39610.0,37.04
25%,120.025,122.1,118.3,120.075,120.05,1146444.0,1427.46
50%,141.5,143.4,139.6,141.1,141.25,1783456.0,2512.03
75%,157.175,159.4,155.15,156.925,156.9,2813594.0,4539.015
max,327.7,328.75,321.65,325.95,325.75,29191020.0,55755.08


Checking for null values if any

In [8]:
# Number of missing values in each column.
data.isnull().sum()

Date                    0
Open                    0
High                    0
Low                     0
Last                    0
Close                   0
Total Trade Quantity    0
Turnover (Lacs)         0
dtype: int64

# Exploratory Data Analysis

Pair Plot:- A pair plot is a simple way to visualize the relationship between every pair of variables. It produces a matrix of scatter plots, one for each pair of numeric columns, so the whole dataset can be examined at a glance.

In [None]:
# 'Turnover (Lacs)' is a continuous column. Passing it directly as `hue` gives
# one hue level per distinct value (about two thousand here), which makes the
# plot extremely slow and the legend unreadable. Bin it into quartiles so the
# colouring still conveys low-to-high turnover with only four levels.
pairplot_data = data.assign(
    **{
        "Turnover quartile": pd.qcut(
            data["Turnover (Lacs)"], q=4, labels=["Q1", "Q2", "Q3", "Q4"]
        )
    }
)
sns.pairplot(pairplot_data, hue="Turnover quartile", palette="rocket")
plt.show()

Pyplot:- Pyplot is a state-based interface to matplotlib. It provides a MATLAB-like way of plotting and is mainly intended for interactive plots and simple cases of programmatic plot generation.

Plotting the Close value graph using pyplot

In [None]:
# Plot the closing price against real dates. The series is kept under a name
# that matches its content (it was previously stored as `df_high`), and the
# axes are labelled so the figure can be read on its own.
df_close = data["Close"]

fig, ax = plt.subplots(figsize=(15, 6))
ax.plot(pd.to_datetime(data["Date"]), df_close, c="g")
ax.set(title="Close price", xlabel="Date", ylabel="Close")
plt.show()

Plotting the High value graph using pyplot

In [None]:
# The CSV is ordered newest-first. A forecasting model must see the past
# before the future, so put the rows in chronological order before extracting
# the series that every later cell (scaling, split, windowing) builds on.
data_chrono = (
    data.assign(Date=pd.to_datetime(data["Date"]))
    .sort_values("Date")
    .reset_index(drop=True)
)
df_high = data_chrono["High"]

fig, ax = plt.subplots(figsize=(15, 6))
ax.plot(data_chrono["Date"], df_high, c="g")
ax.set(title="High price", xlabel="Date", ylabel="High")
plt.show()

Since LSTMs are sensitive to the scale of the data, we apply a MinMax scaler.

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale the High prices into [0, 1] as a single-feature column vector.
# NOTE: `df_high` is rebound here from the price series to an (n, 1) array of
# scaled values, so re-running this cell re-fits the scaler on already-scaled
# data and `inverse_transform` will no longer return prices.
# NOTE(review): the scaler is fitted on the full series, before the train/test
# split that follows - confirm this look-ahead is acceptable.
high_column = np.asarray(df_high).reshape(-1, 1)
scaler = MinMaxScaler(feature_range=(0, 1))
df_high = scaler.fit_transform(high_column)

Train-Test Split

In [None]:
# Positional split without shuffling: the first 75% of the rows become the
# training set and the remaining rows the test set.
n_rows = len(df_high)
training_size = int(n_rows * 0.75)
test_size = n_rows - training_size
train_data = df_high[:training_size]
test_data = df_high[training_size:]

Data Preprocessing

In [None]:
def create_dataset(dataset, time_step = 1):
    """Turn a single-column series into supervised (window, next value) pairs.

    Parameters
    ----------
    dataset : 2-D array
        Only column 0 is read.
    time_step : int
        Number of consecutive values in each input window.

    Returns
    -------
    (X, y) : tuple of np.ndarray
        ``X[k]`` is ``dataset[k:k + time_step, 0]`` and ``y[k]`` is
        ``dataset[k + time_step, 0]``. ``len(dataset) - time_step - 1`` pairs
        are produced, so the final row of ``dataset`` is never used as a
        target. Both arrays are empty when that count is not positive.
    """
    n_windows = len(dataset) - time_step - 1
    windows = [dataset[start:start + time_step, 0] for start in range(n_windows)]
    targets = [dataset[start + time_step, 0] for start in range(n_windows)]
    return np.array(windows), np.array(targets)

In [None]:
# Each sample is a window of `time_step` consecutive scaled values; the target
# is the value that immediately follows the window.
time_step = 100
x_train, y_train = create_dataset(dataset=train_data, time_step=time_step)
x_test, y_test = create_dataset(dataset=test_data, time_step=time_step)

# LSTM

LSTM networks are well-suited to classifying, processing and making predictions based on time series data, since there can be lags of unknown duration between important events in a time series.

In [None]:
# Add a trailing feature axis of size 1, giving
# (samples, window length, 1) arrays for the LSTM layers.
x_train = np.reshape(x_train, (x_train.shape[0], x_train.shape[1], 1))
x_test = np.reshape(x_test, (x_test.shape[0], x_test.shape[1], 1))

In [None]:
# Print each shape on its own line. Joining the two print() calls with a comma
# built a tuple, so the cell also echoed a stray `(None, None)` as its output.
print(x_train.shape)
print(y_train.shape)

Importing the required modules for the stacked LSTM

In [None]:
import math
from sklearn.metrics import mean_squared_error
import tensorflow as tf
# Use the public `tensorflow.keras` API. `tensorflow.python.*` is a private
# namespace with no stability guarantee across TensorFlow releases.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM

In [None]:
# Stacked LSTM: three 50-unit LSTM layers followed by a single-unit Dense
# output that predicts the next scaled value.
# The input window length comes from `time_step`, so the model stays in sync
# with the windows built by create_dataset instead of relying on a second
# hard-coded 100.
model = Sequential()
model.add(LSTM(50, return_sequences = True, input_shape = (time_step, 1)))
model.add(LSTM(50, return_sequences = True))  # still emits a sequence for the next LSTM
model.add(LSTM(50))                           # last LSTM returns only its final output
model.add(Dense(1))
model.compile(loss = 'mean_squared_error', optimizer = 'adam')

In [None]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

In [None]:
# Train for 4 epochs in mini-batches of 64. The held-out test windows are
# passed as validation data, so their loss is reported after every epoch.
model.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    epochs=4,
    batch_size=64,
    verbose=1,
)

Let's predict and check the performance metrics


In [None]:
# Run the fitted model on both splits; at this point the predictions are still
# in the scaler's [0, 1] space.
train_predict = model.predict(x_train)
test_predict = model.predict(x_test)

Transform back to original form

In [None]:
# Map the scaled predictions back to the original price scale.
# NOTE: both names are overwritten in place, so running this cell twice applies
# the inverse transform twice - re-run the prediction cell first.
train_predict = scaler.inverse_transform(train_predict)
test_predict = scaler.inverse_transform(test_predict)

Calculating the RMSE performance metric

Train Data RMSE


In [None]:
# `train_predict` has already been mapped back to prices, but `y_train` is
# still in the scaler's [0, 1] space. Bring the targets back to the price scale
# so the RMSE compares like with like.
y_train_prices = scaler.inverse_transform(y_train.reshape(-1, 1))
math.sqrt(mean_squared_error(y_train_prices, train_predict))

Test Data RMSE


In [None]:
# `test_predict` has already been mapped back to prices, but `y_test` is still
# in the scaler's [0, 1] space. Bring the targets back to the price scale so
# the RMSE compares like with like.
y_test_prices = scaler.inverse_transform(y_test.reshape(-1, 1))
math.sqrt(mean_squared_error(y_test_prices, test_predict))

# Data Visualization

Line Chart:- A line chart is a graphical representation of an asset's historical price action that connects a series of data points with a continuous line. This is the most basic type of chart used in finance, and it typically only depicts a security's closing prices over time.

In [None]:
# Align the predictions with the rows of the full series for plotting.
# The offset is taken from `time_step` so it cannot drift from the window
# length that was actually used to build the samples.
look_back = time_step

# Train predictions: the first target is the value right after the first
# window, i.e. row `look_back`. Rows without a prediction stay NaN and are
# not drawn.
trainPredictPlot = np.full_like(df_high, np.nan)
trainPredictPlot[look_back:len(train_predict) + look_back, :] = train_predict

# Test predictions: create_dataset yields len(split) - look_back - 1 samples,
# so the training split spans len(train_predict) + look_back + 1 rows; the
# first test target sits another `look_back` rows later, and the final row of
# the series never receives a prediction.
testPredictPlot = np.full_like(df_high, np.nan)
testPredictPlot[len(train_predict) + (look_back * 2) + 1:len(df_high) - 1, :] = test_predict

In [None]:
# Overlay the model's predictions on the actual series. Each line carries its
# own label and a real legend is drawn, so the key no longer depends on the
# default colour cycle and the two prediction lines are named for what they are.
fig, ax = plt.subplots(figsize=(10, 6))

ax.plot(scaler.inverse_transform(df_high), label="Complete Data")
ax.plot(trainPredictPlot, label="Train Predictions")
ax.plot(testPredictPlot, label="Test Predictions")
ax.set(title="Stacked LSTM: actual vs. predicted High price",
       xlabel="Row index", ylabel="High")
ax.legend()
plt.show()

#Thank You