# Author - Deepak Kothari
## Stocks Closing Price Prediction - Linear Regression Model (Machine Learning)

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw price history; the path is relative to the notebook's
# working directory.
dataframe = pd.read_csv("timeseriesdata.csv")

In [3]:
# Order the rows by the 'Date' column, ascending (oldest first); the original
# index labels are kept.
dataframe = dataframe.sort_values(by='Date')

In [4]:
# Dimensions of the raw frame as (rows, columns).
dataframe.shape

(1235, 8)

In [5]:
# Preview the earliest records after sorting.
dataframe.head()

Unnamed: 0,Date,Open,High,Low,Last,Close,Total Trade Quantity,Turnover (Lacs)
1234,2013-10-08,157.0,157.8,155.2,155.8,155.8,1720413.0,2688.94
1233,2013-10-09,155.7,158.2,154.15,155.3,155.55,2049580.0,3204.49
1232,2013-10-10,156.0,160.8,155.85,160.3,160.15,3124853.0,4978.8
1231,2013-10-11,161.15,163.45,159.0,159.8,160.05,1880046.0,3030.76
1230,2013-10-14,160.85,161.45,157.7,159.3,159.45,1281419.0,2039.09


In [6]:
# Preview the most recent records.
dataframe.tail()

Unnamed: 0,Date,Open,High,Low,Last,Close,Total Trade Quantity,Turnover (Lacs)
4,2018-10-01,234.55,234.6,221.05,230.3,230.9,1534749.0,3486.05
3,2018-10-03,230.0,237.5,225.75,226.45,227.6,1708590.0,3960.27
2,2018-10-04,223.5,227.8,216.15,217.25,218.2,1728786.0,3815.79
1,2018-10-05,217.0,218.6,205.9,210.25,209.2,3519515.0,7407.06
0,2018-10-08,208.0,222.25,206.85,216.0,215.15,4642146.0,10062.83


In [7]:
# Build calendar features from the trading date. The new frame keeps the same
# index labels as `dataframe`, so it can be joined back on the index.
date_df = pd.DataFrame({'Date': pd.to_datetime(dataframe['Date'])})

# Vectorised `.dt` accessors are used instead of per-row Python lambdas.
date_df['weekday'] = date_df['Date'].dt.weekday  # Monday=0 ... Sunday=6
date_df['year'] = date_df['Date'].dt.year
date_df['month'] = date_df['Date'].dt.month
date_df['day'] = date_df['Date'].dt.day
date_df['quarter'] = date_df['Date'].dt.quarter
# Quarters 1-2 -> first half (1); quarters 3-4 -> second half (2).
date_df['halfYear'] = (date_df['quarter'] > 2).astype(int) + 1

In [8]:
# Dimensions of the calendar-feature frame: the date plus six derived columns.
date_df.shape

(1235, 7)

In [9]:
# Preview the derived calendar features.
date_df.head()

Unnamed: 0,Date,weekday,year,month,day,quarter,halfYear
1234,2013-10-08,1,2013,10,8,4,2
1233,2013-10-09,2,2013,10,9,4,2
1232,2013-10-10,3,2013,10,10,4,2
1231,2013-10-11,4,2013,10,11,4,2
1230,2013-10-14,0,2013,10,14,4,2


In [10]:
# Re-attach the calendar features to the price columns by row label. Both
# frames carry a 'Date' column, so pandas suffixes them as Date_x / Date_y.
data = pd.merge(
    date_df,
    dataframe,
    left_index=True,
    right_index=True,
    how="inner",
)

In [11]:
# Dimensions of the merged frame as (rows, columns).
data.shape

(1235, 15)

In [12]:
# Preview the merged frame, including the duplicated Date_x / Date_y columns.
data.head()

Unnamed: 0,Date_x,weekday,year,month,day,quarter,halfYear,Date_y,Open,High,Low,Last,Close,Total Trade Quantity,Turnover (Lacs)
1234,2013-10-08,1,2013,10,8,4,2,2013-10-08,157.0,157.8,155.2,155.8,155.8,1720413.0,2688.94
1233,2013-10-09,2,2013,10,9,4,2,2013-10-09,155.7,158.2,154.15,155.3,155.55,2049580.0,3204.49
1232,2013-10-10,3,2013,10,10,4,2,2013-10-10,156.0,160.8,155.85,160.3,160.15,3124853.0,4978.8
1231,2013-10-11,4,2013,10,11,4,2,2013-10-11,161.15,163.45,159.0,159.8,160.05,1880046.0,3030.76
1230,2013-10-14,0,2013,10,14,4,2,2013-10-14,160.85,161.45,157.7,159.3,159.45,1281419.0,2039.09


In [13]:
# Remove the two duplicated date columns produced by the merge.
data = data.drop(columns=['Date_x', 'Date_y'])

In [14]:
# Ordinal-encode the calendar year: the earliest year becomes 0, the next 1, ...
year_columns = np.sort(data['year'].unique())

# Integer codes, one per distinct year
D = list(range(len(year_columns)))

# Lookup table: year -> integer code
year_columns_dict = {year: code for year, code in zip(year_columns, D)}

# Replace each year in `data` with its integer code
data['year'] = data['year'].map(year_columns_dict)

In [15]:
# Add rolling means of the closing price over several trailing windows
# (window length in rows):
#   5  - weekly
#   22 - monthly
#   50 - Quarterly
# NOTE(review): each window includes the current row's Close, which is also
# the prediction target later in the notebook - confirm this is intended.
ma_day = [5, 22, 50]

for ma in ma_day:
    column_name = "MA-{}-Days".format(ma)
    data[column_name] = data['Close'].rolling(window=ma).mean()

In [16]:
# Dimensions of `data` at this point in the notebook.
data.shape

(1235, 16)

In [17]:
# Preview the first rows; the rolling-mean columns are NaN until a window is full.
data.head()

Unnamed: 0,weekday,year,month,day,quarter,halfYear,Open,High,Low,Last,Close,Total Trade Quantity,Turnover (Lacs),MA-5-Days,MA-22-Days,MA-50-Days
1234,1,0,10,8,4,2,157.0,157.8,155.2,155.8,155.8,1720413.0,2688.94,,,
1233,2,0,10,9,4,2,155.7,158.2,154.15,155.3,155.55,2049580.0,3204.49,,,
1232,3,0,10,10,4,2,156.0,160.8,155.85,160.3,160.15,3124853.0,4978.8,,,
1231,4,0,10,11,4,2,161.15,163.45,159.0,159.8,160.05,1880046.0,3030.76,,,
1230,0,0,10,14,4,2,160.85,161.45,157.7,159.3,159.45,1281419.0,2039.09,158.2,,


In [18]:
# Count missing values per column.
data.isnull().sum()

weekday                  0
year                     0
month                    0
day                      0
quarter                  0
halfYear                 0
Open                     0
High                     0
Low                      0
Last                     0
Close                    0
Total Trade Quantity     0
Turnover (Lacs)          0
MA-5-Days                4
MA-22-Days              21
MA-50-Days              49
dtype: int64

In [19]:
# Back-fill the leading NaNs left by the rolling windows with the first
# available average. `DataFrame.bfill` is used because the `method=` argument
# of `fillna` is deprecated in pandas 2.x.
data = data.bfill()

In [20]:
# Current column names as a plain Python list.
cols = data.columns.tolist()

In [21]:
# Desired column order, grouped by kind.
newCols = (
    # calendar features
    ['weekday', 'day', 'month', 'quarter', 'halfYear', 'year']
    # price fields
    + ['Open', 'High', 'Low', 'Last']
    # rolling means of Close
    + ['MA-5-Days', 'MA-22-Days', 'MA-50-Days']
    # volume and turnover
    + ['Total Trade Quantity', 'Turnover (Lacs)']
    # Close goes last
    + ['Close']
)

In [22]:
# Columns present in the frame but absent from the new ordering (expected: none).
set(cols).difference(newCols)

set()

In [23]:
# Reorder the columns to follow `newCols`.
data = data.loc[:, newCols]

In [24]:
# Preview the first rows in the new column order.
data.head()

Unnamed: 0,weekday,day,month,quarter,halfYear,year,Open,High,Low,Last,MA-5-Days,MA-22-Days,MA-50-Days,Total Trade Quantity,Turnover (Lacs),Close
1234,1,8,10,4,2,0,157.0,157.8,155.2,155.8,158.2,162.186364,153.681,1720413.0,2688.94,155.8
1233,2,9,10,4,2,0,155.7,158.2,154.15,155.3,158.2,162.186364,153.681,2049580.0,3204.49,155.55
1232,3,10,10,4,2,0,156.0,160.8,155.85,160.3,158.2,162.186364,153.681,3124853.0,4978.8,160.15
1231,4,11,10,4,2,0,161.15,163.45,159.0,159.8,158.2,162.186364,153.681,1880046.0,3030.76,160.05
1230,0,14,10,4,2,0,160.85,161.45,157.7,159.3,158.2,162.186364,153.681,1281419.0,2039.09,159.45


In [25]:
# Preview the last rows in the new column order.
data.tail()

Unnamed: 0,weekday,day,month,quarter,halfYear,year,Open,High,Low,Last,MA-5-Days,MA-22-Days,MA-50-Days,Total Trade Quantity,Turnover (Lacs),Close
4,0,1,10,4,2,5,234.55,234.6,221.05,230.3,233.65,230.406818,234.866,1534749.0,3486.05,230.9
3,2,3,10,4,2,5,230.0,237.5,225.75,226.45,231.95,230.090909,234.645,1708590.0,3960.27,227.6
2,3,4,10,4,2,5,223.5,227.8,216.15,217.25,228.74,229.281818,234.347,1728786.0,3815.79,218.2
1,4,5,10,4,2,5,217.0,218.6,205.9,210.25,223.93,228.140909,233.759,3519515.0,7407.06,209.2
0,0,8,10,4,2,5,208.0,222.25,206.85,216.0,220.21,227.161364,233.196,4642146.0,10062.83,215.15


In [26]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn import metrics

In [27]:
# Take the frame's values as a 2-D NumPy array and show its shape.
dataset = data.values
dataset.shape

(1235, 16)

In [28]:
# Rescale every column (features and target alike) into the [0, 1] range.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split is made.
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(dataset)
dataset = scaler.transform(dataset)

In [29]:
# Separate predictors from the target: every column except the last is a
# feature, and the last column (Close) is the value to predict.
X = dataset[:, :-1].astype(float)
Y = dataset[:, -1]

In [30]:
# Hold out 30% of the rows for testing; random_state fixes the split.
# NOTE(review): train_test_split shuffles rows by default, so the test rows are
# not a later time period than the training rows - confirm that is acceptable
# for this date-ordered data.
trainX, testX, trainY, testY = train_test_split(X, Y, test_size=0.3, random_state=0)


In [31]:
# Sanity-check the shapes of the four split arrays.
print(trainX.shape, trainY.shape, testX.shape, testY.shape)

(864, 15) (864,) (371, 15) (371,)


In [32]:
# Create a linear regression model and fit it on the training split.
regressor = LinearRegression()  
regressor.fit(trainX, trainY) # fit the model to the training data

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [33]:
# Predict the (scaled) target for the held-out rows.
predictions = regressor.predict(testX)

In [34]:
# Undo the min-max scaling for the predictions. The scaler was fitted on the
# full-width array, so the predicted target is placed next to the test
# features to rebuild that width before inverting.
set_pred = np.column_stack((testX, predictions))
pred = scaler.inverse_transform(set_pred)
y_pred = pred[:, -1]

In [35]:
# Undo the min-max scaling for the true target values, using the same
# "features + target" layout the scaler was fitted on.
set_actual = np.column_stack((testX, testY))
actual = scaler.inverse_transform(set_actual)
y_true = actual[:, -1]

In [36]:
# Report RMSE on the scaled target and on the target after inverting the scaling.
rmse_scaled = np.sqrt(metrics.mean_squared_error(testY, predictions))
rmse_original = np.sqrt(metrics.mean_squared_error(y_true, y_pred))

print('Root Mean Squared Error (Transformed Data) :', rmse_scaled)
print('Root Mean Squared Error (Original Data) :', rmse_original)

Root Mean Squared Error (Transformed Data) : 0.0018841727437420093
Root Mean Squared Error (Original Data) : 0.42035893912884187
