# Predicting the Price of the S&P500 Index with Linear Regression

In [1]:
import pandas as pd
from pandas import DataFrame
from datetime import datetime
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error,mean_squared_error
import numpy as np
from IPython.display import display


In [2]:
# Read the historical price table from disk and preview its first five rows.
df = pd.read_csv('sphist.csv')
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close
0,2015-12-07,2090.419922,2090.419922,2066.780029,2077.070068,4043820000.0,2077.070068
1,2015-12-04,2051.23999,2093.840088,2051.23999,2091.689941,4214910000.0,2091.689941
2,2015-12-03,2080.709961,2085.0,2042.349976,2049.620117,4306490000.0,2049.620117
3,2015-12-02,2101.709961,2104.27002,2077.110107,2079.51001,3950640000.0,2079.51001
4,2015-12-01,2082.929932,2103.370117,2082.929932,2102.629883,3712120000.0,2102.629883


In [3]:
# Parse the 'Date' column into pandas datetimes so rows can be compared
# against datetime objects and sorted chronologically later on.
parsed_dates = pd.to_datetime(df['Date'])
df['Date'] = parsed_dates

In [4]:
# Check the converted column: the dtype should now be a datetime type.
df['Date'].head(5)

0   2015-12-07
1   2015-12-04
2   2015-12-03
3   2015-12-02
4   2015-12-01
Name: Date, dtype: datetime64[ns]

In [5]:
# Order rows oldest-first (sort_values is ascending by default) so that
# rolling windows computed later look backwards in time.
df = df.sort_values(by='Date')

In [6]:
# Preview the first five rows after sorting.
df.head(5)

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close
16589,1950-01-03,16.66,16.66,16.66,16.66,1260000.0,16.66
16588,1950-01-04,16.85,16.85,16.85,16.85,1890000.0,16.85
16587,1950-01-05,16.93,16.93,16.93,16.93,2550000.0,16.93
16586,1950-01-06,16.98,16.98,16.98,16.98,2010000.0,16.98
16585,1950-01-09,17.08,17.08,17.08,17.08,2520000.0,17.08


In [7]:
# Add features that will be useful for the machine learning step:
# a 5-row rolling mean of each price/volume column, plus the calendar year.
for source_col in ('Open', 'High', 'Low', 'Volume'):
    feature_name = '5_day_' + source_col.lower()
    df[feature_name] = df[source_col].rolling(window=5).mean()
df['Year'] = df['Date'].apply(lambda ts: ts.year)

In [8]:
# One-hot encode the day of the week (weekday() numbers Monday as 0) and
# append the indicator columns to the frame; the raw weekday number itself
# is not kept as a column.
dow_df = pd.get_dummies(df['Date'].apply(lambda ts: ts.weekday()))
df = pd.concat([df, dow_df], axis=1)

In [9]:
# Preview the frame with the new feature columns (the first rolling means are NaN).
df.head(5)

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close,5_day_open,5_day_high,5_day_low,5_day_volume,Year,0,1,2,3,4
16589,1950-01-03,16.66,16.66,16.66,16.66,1260000.0,16.66,,,,,1950,0,1,0,0,0
16588,1950-01-04,16.85,16.85,16.85,16.85,1890000.0,16.85,,,,,1950,0,0,1,0,0
16587,1950-01-05,16.93,16.93,16.93,16.93,2550000.0,16.93,,,,,1950,0,0,0,1,0
16586,1950-01-06,16.98,16.98,16.98,16.98,2010000.0,16.98,,,,,1950,0,0,0,0,1
16585,1950-01-09,17.08,17.08,17.08,17.08,2520000.0,17.08,16.9,16.9,16.9,2046000.0,1950,1,0,0,0,0


In [11]:
# Lag every 5-day average by one row. rolling() includes the current row in
# its window, so an unshifted average would use information from the very day
# whose close is being predicted.
#
# The lagged series is rebuilt from the raw column instead of shifting the
# existing feature column in place (x = x.shift(1)). The in-place form is not
# idempotent: every re-run of the cell would push the features back one more
# day. Rebuilding gives the same result on the first run and on any re-run.
for raw_col in ('Open', 'High', 'Low', 'Volume'):
    df['5_day_' + raw_col.lower()] = df[raw_col].rolling(window=5).mean().shift(1)

In [14]:
# Keep only rows dated 1951-01-03 or later.
df = df[df['Date'] >= datetime(year=1951, month=1, day=3)]
# dropna() returns a new frame rather than modifying df, so the result must
# be assigned back; calling it without assignment drops nothing.
df = df.dropna(axis=0)

# Split dataset into train and test set: rows before 2013-01-01 are used for
# training, rows on or after that date for testing.
split_date = datetime(year=2013, month=1, day=1)
train_df = df[df['Date'] < split_date]
test_df = df[df['Date'] >= split_date]

In [15]:
# Predictor columns: the four lagged 5-day averages, the year, and the five
# weekday indicator columns, which are labelled with the integers 0-4.
train_columns = ['5_day_open', '5_day_volume', '5_day_high', '5_day_low', 'Year'] + list(range(5))

In [16]:
# Fit a linear regression of 'Close' on the feature columns using the
# training rows, then predict the close for every test row.
# fit() returns the fitted estimator, so it can be chained onto the constructor.
lr = LinearRegression().fit(train_df[train_columns], train_df['Close'])
predictions = lr.predict(test_df[train_columns])

In [17]:
# Work on a copy so the prediction columns do not touch test_df itself.
test_df_copy = test_df.copy()
test_df_copy['predicted_close_price'] = predictions

# Absolute prediction error, computed once and reused for both columns.
abs_error = abs(test_df_copy['Close'] - test_df_copy['predicted_close_price'])
test_df_copy['Difference'] = abs_error
# Accuracy(%) = (1 - absolute error relative to the actual close) * 100.
test_df_copy["Accuracy(%)"] = (1 - (abs_error / test_df_copy['Close'])) * 100

In [19]:
# Error metrics on the test set: mean squared error and its square root.
mse = mean_squared_error(test_df['Close'], predictions)
rmse = np.sqrt(mse)

# Reduce the comparison table to the columns worth reporting.
report_columns = ['Date', 'Close', 'predicted_close_price', 'Difference', "Accuracy(%)"]
test_df_copy = DataFrame(test_df_copy[report_columns])

print("mse:", mse)
print('rmse:', rmse)
print("-" * 60)
print('The S&P500 Index')
# Show the first 20 test rows in chronological order.
display(test_df_copy.sort_values(by=['Date'], ascending=True).head(20))

mse: 378.93563844926695
rmse: 19.466269248350258
------------------------------------------------------------
The S&P500 Index


Unnamed: 0,Date,Close,predicted_close_price,Difference,Accuracy(%)
738,2013-01-02,1462.420044,1409.290878,53.129166,96.367038
737,2013-01-03,1459.369995,1429.899615,29.47038,97.980609
736,2013-01-04,1466.469971,1438.455036,28.014935,98.089635
735,2013-01-07,1461.890015,1456.820228,5.069787,99.653203
734,2013-01-08,1457.150024,1469.353529,12.203505,99.162509
733,2013-01-09,1461.02002,1465.772722,4.752702,99.6747
732,2013-01-10,1472.119995,1456.688956,15.431039,98.951781
731,2013-01-11,1472.050049,1463.666946,8.383103,99.430515
730,2013-01-14,1470.680054,1460.167481,10.512573,99.28519
729,2013-01-15,1472.339966,1462.87845,9.461516,99.357382
