In [1]:
import pandas as pd
from pandas import DataFrame
from datetime import datetime
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

In [2]:
# Load the price-history CSV into a DataFrame and convert its 'Date' column
# from text to pandas datetimes so it can be compared and sorted chronologically.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
csv_path = r'C:\Users\user\Desktop\Data Scientist Course\Predicting the stock market\sphist.csv'
df = pd.read_csv(csv_path)
df['Date'] = pd.to_datetime(df['Date'])

In [3]:
# Put the rows in chronological order (oldest first). The rolling-window
# features computed later rely on this ordering.
df = df.sort_values(by="Date")

In [4]:
# Feature engineering: for each raw column, add the mean over a 5-row rolling
# window (window includes the current row), then add the calendar year.
for source_col in ('Open', 'High', 'Low', 'Volume'):
    df[f'5 Days {source_col}'] = df[source_col].rolling(window=5).mean()
df['Year'] = df['Date'].map(lambda stamp: stamp.year)

In [5]:
# One-hot encode the day of the week (0 = Monday) and append the indicator
# columns to df. The indicator columns are labelled with the integer weekday codes.
weekday_codes = df['Date'].apply(lambda stamp: stamp.weekday())
dow_df = pd.get_dummies(weekday_codes)
df = pd.concat([df, dow_df], axis=1)

In [6]:
#Because 'rolling' method above include the current date to compute, the current date include future knowledge, which means that the prediction will look not good in real world , so we shift all the values forward one day.
df['5 Days Open'] = df['5 Days Open'].shift(1)
df['5 Days High'] = df['5 Days High'].shift(1)
df['5 Days Low'] = df['5 Days Low'].shift(1)
df['5 Days Volume'] = df['5 Days Volume'].shift(1)

In [7]:
# Keep only rows dated 1951-01-03 or later.
df = df[df['Date'] >= datetime(year=1951, month=1, day=3)]
# dropna() returns a new DataFrame rather than modifying df in place, so the
# result must be assigned back for rows containing NaN to actually be removed.
df = df.dropna(axis=0)

#Split dataset into train set and test set.
# Rows before 2013-01-01 are used for training; rows on/after it for testing.
split_date = datetime(year=2013, month=1, day=1)
train_df = df[df['Date'] < split_date]
test_df = df[df['Date'] >= split_date]

In [8]:
# Columns fed to the model: the four shifted 5-day rolling means, the year,
# and the integer-labelled weekday indicator columns 0..4.
train_colums = ['5 Days Open', '5 Days Volume', '5 Days High', '5 Days Low', 'Year'] + list(range(5))

In [9]:
# Perform linear regression: fit on the training rows, predict 'Close' for the test rows.
# The selected feature columns mix string labels with integer weekday labels.
# Recent scikit-learn versions refuse DataFrames whose column labels are not all
# strings, so cast the labels to str on the frames handed to the model
# (train_df / test_df themselves are left untouched).
X_train = train_df[train_colums].rename(columns=str)
X_test = test_df[train_colums].rename(columns=str)

lr = LinearRegression()
lr.fit(X_train, train_df['Close'])
prediction = lr.predict(X_test)

In [10]:
# Work on a copy of the test rows so test_df itself is not modified, and attach
# the model's predictions alongside the actual closing prices.
test_df_copy = test_df.assign(prediction_close=prediction)

# Absolute error per row, and that error expressed as "percent accuracy"
# relative to the actual close: (1 - |error| / close) * 100.
abs_error = (test_df_copy['Close'] - test_df_copy['prediction_close']).abs()
test_df_copy['difference'] = abs_error
test_df_copy['accuracy(%)'] = (1 - abs_error / test_df_copy['Close']) * 100

In [13]:
# Error metrics on the test set: mean squared error and its square root (RMSE).
mse = mean_squared_error(test_df['Close'], prediction)
rmse = np.sqrt(mse)

print('mse:',mse)
print('rmse:',rmse)

# Show the first ten test rows: actual vs. predicted close and the per-row error columns.
report_columns = ['Date','Close','prediction_close','difference', 'accuracy(%)']
print(test_df_copy[report_columns].head(10))

mse: 378.9356384494125
rmse: 19.466269248354
          Date        Close  prediction_close  difference  accuracy(%)
738 2013-01-02  1462.420044       1409.290878   53.129166    96.367038
737 2013-01-03  1459.369995       1429.899615   29.470380    97.980609
736 2013-01-04  1466.469971       1438.455036   28.014935    98.089635
735 2013-01-07  1461.890015       1456.820228    5.069787    99.653203
734 2013-01-08  1457.150024       1469.353529   12.203505    99.162509
733 2013-01-09  1461.020020       1465.772722    4.752702    99.674700
732 2013-01-10  1472.119995       1456.688956   15.431039    98.951781
731 2013-01-11  1472.050049       1463.666946    8.383103    99.430515
730 2013-01-14  1470.680054       1460.167481   10.512573    99.285190
729 2013-01-15  1472.339966       1462.878450    9.461516    99.357382
