# Setup:
Import the modules, make sure Matplotlib plots figures inline, and prepare a function to save the figures.

In [None]:
# Common imports
import numpy as np
import os
import pandas as pd
import sklearn 
import datetime as dt

# For plotting
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

#For preprocessing and modelling data
from sklearn import preprocessing
from sklearn import neighbors
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

#validating model
from sklearn.metrics import (mean_squared_error, 
                             r2_score)

# Output location for saved figures: <project root>/images/<chapter id>
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "end_to_end_project"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
# Create the folder up front; exist_ok makes re-running this cell harmless
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Write the current matplotlib figure to IMAGES_PATH.

    Parameters
    ----------
    fig_id : str
        File name without extension.
    tight_layout : bool
        When True, call plt.tight_layout() before saving.
    fig_extension : str
        File extension, also passed to savefig as the output format.
    resolution : int
        Passed to savefig as dpi.
    """
    file_name = ".".join((fig_id, fig_extension))
    target = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, format=fig_extension, dpi=resolution)

# Get the data from CSV
Read the file and check the first 5 rows.

In [None]:
# Load the price history from the CSV (path is relative to the working directory)
df = pd.read_csv('SP500_WSJMarkets_Oct2020to2022.csv')
# Preview the first five rows (head() defaults to n=5)
df.head()

### Preprocessing data:
-Formatting column "Date" 

In [None]:
# Parse the "Date" column (e.g. "Sep 12, 2022") into datetime values (shown as 2022-09-12)
df = df.assign(Date=pd.to_datetime(df["Date"]))

Two columns are added:
<br>-Difference between the highest and lowest prices, indicating the price variance during the day
<br>-Whether a day was a turning point; this indicates a possible resistance level, a concept commonly used in trading

In [None]:
df2 = df.copy()

# Intraday range: difference between columns High and Low
df2["Hi_Low_Difference"] = df2["High"] - df2["Low"]

# Closing prices of the neighbouring rows.
# NOTE(review): treating the next row as the day before assumes the CSV is ordered
# newest-first - confirm against the file.
close = df2["Close"]
close_day_before = close.shift(-1)
close_day_after = close.shift(1)

higher_than_day_before = close > close_day_before
higher_than_day_after = close > close_day_after

# A turning point needs both neighbours. The first and last rows have only one, and a
# comparison against NaN is always False, so without this mask an edge row could be
# flagged as a turning point by accident.
has_both_neighbours = close_day_before.notna() & close_day_after.notna()

# 1 when the close is above both neighbours or above neither of them, else 0
df2["Turning_Point"] = np.where(
    has_both_neighbours & (higher_than_day_before == higher_than_day_after), 1, 0
)

In [None]:
# Preview the first five rows, including the engineered columns (head() defaults to n=5)
df2.head()

Now we have a dataset df2 with the extra columns "Hi_Low_Difference" and "Turning_Point", which we use in the prediction process. In the column Turning_Point, a value of 1 means that the price was at a turning point on that day.

# Viewing data
Next we'll see how the data points are distributed in the columns Hi_Low_Difference and Turning_Point.

In [None]:
# One figure with two stacked axes (2 rows, 1 column)
fig, axes = plt.subplots(2, 1, figsize=(20, 10))
ax_hist, ax_scatter = axes

# Closing prices of only those rows flagged as turning points
TP_data = df2.loc[df2['Turning_Point'] == 1, 'Close']

ax_hist.hist(TP_data, bins=30)
ax_hist.set_title('Turning points', size=15)
ax_hist.set_ylabel("# times prices has turned", size=15)
ax_hist.set_xlabel("Close prices USD", size=15)

ax_scatter.scatter(df2['Close'], df2['Hi_Low_Difference'])
ax_scatter.set_title('Difference between highest and lowest price', size=15)
ax_scatter.set_ylabel("Difference of the highest and lowest price of the day", size=15)
ax_scatter.set_xlabel("Close prices USD", size=15)

# Save BEFORE showing: once plt.show() has rendered the figure with the inline backend,
# a later savefig writes a blank image. save_fig also applies tight_layout.
save_fig("plots")
plt.show()

From the plots we can see that the majority of the turning points and the smallest variability of prices during a day are below 3000 USD, which makes sense because the data is from the past five years and therefore lower prices will always be overrepresented in historical data. Because the price of the index is now at around 3600 USD, we are more interested in the values closer to that. Based on the histogram, there is a weak indication of resistance levels between approximately 3000 and 3400 USD, and the scatter plot highlights prices of approximately 3400 USD and 3200 USD. These price levels will be taken into account in the results section.

## Processing data for training

In [None]:
# Build the modelling dataset as an independent copy of the columns needed from df2.
# A single vectorised selection replaces the element-wise loop with chained assignment
# (data['Date'][i] = ...), which triggers SettingWithCopy warnings, stops working under
# pandas Copy-on-Write and leaves every column with object dtype.
model_columns = ['Date', 'Close', 'Open', 'High', 'Low', 'Turning_Point', 'Hi_Low_Difference']
data = df2[model_columns].copy()

# Make sure the data is sorted by date (oldest first). Sorting on the column itself does
# not depend on the order in which the CSV happened to list the rows.
data = data.sort_values('Date')

# Setting numerical date (proleptic Gregorian ordinal) as index for Linear Regression
data.index = data['Date'].map(dt.datetime.toordinal)

In [None]:
# Split into train, validation and test datasets 60/20/20 %.
# train_test_split is already imported in the setup cell, so it is not re-imported here.

# First hold out 20 % of all rows as the test set ...
train_val, test = train_test_split(data, test_size=0.2, shuffle=True, random_state=42)

# ... then take 25 % of the remaining 80 % (= 20 % of the total) for validation
train, val = train_test_split(train_val, test_size=0.25, shuffle=True, random_state=42)

## Linear Regression

### Training the model and calculating errors

In [None]:
# Training data
# The Date column is removed before fitting (the original note says it causes an error)
train_No_Date = train.drop(columns='Date')

# Target is the closing price; every remaining column is a feature
y_train = train_No_Date['Close']
X_train = train_No_Date.drop(columns='Close')

# Fit the linear model, then predict on the same training rows
model = LinearRegression().fit(X_train, y_train)
predictions_train = model.predict(X_train)

In [None]:
# Validation data
# Same preparation as for the training split: remove Date, then separate target and features
val_No_Date = val.drop(columns='Date')

y_val = val_No_Date['Close']
X_val = val_No_Date.drop(columns='Close')

# Predict with the already fitted linear model
predictions_val = model.predict(X_val)

In [None]:
# Test data
# Same preparation as for the training split: remove Date, then separate target and features
test_No_Date = test.drop(columns='Date')

y_test = test_No_Date['Close']
X_test = test_No_Date.drop(columns='Close')

# Predict with the already fitted linear model
predictions_test = model.predict(X_test)

In [None]:
from tabulate import tabulate

# Calculating errors.
# Both metrics take (y_true, y_pred). MSE is symmetric, so the values are unchanged, but
# keeping the documented order matches the r2_score calls and stays correct if the
# metric is ever swapped for an asymmetric one.
mse_train_lr = mean_squared_error(y_train, predictions_train)
r2_train_lr = r2_score(y_train, predictions_train)
mse_val_lr = mean_squared_error(y_val, predictions_val)
r2_val_lr = r2_score(y_val, predictions_val)
mse_test_lr = mean_squared_error(y_test, predictions_test)
r2_test_lr = r2_score(y_test, predictions_test)

# One row per dataset; every column gets an explicit header
errors = [
    ['Training', mse_train_lr, r2_train_lr],
    ['Validation', mse_val_lr, r2_val_lr],
    ['Test', mse_test_lr, r2_test_lr],
]
print(tabulate(errors, headers=["Dataset", "Mean squared error", "R2 Score"]))

In [None]:
# Plotting results: one figure per dataset, actual values (left) next to predictions (right).
# Each entry: (dates, actual closes, predicted closes, title of the prediction panel,
#              file id to save under, or None to only display the figure)
lr_plots = [
    (train['Date'], y_train, predictions_train,
     'Linear regression prediction over training set', None),
    (val['Date'], y_val, predictions_val,
     'Linear regression prediction over validation set', None),
    (test['Date'], y_test, predictions_test,
     'Linear regression prediction over test set', "predictions_LR"),
]

for dates, actual, predicted, prediction_title, fig_id in lr_plots:
    fig, (ax_actual, ax_predicted) = plt.subplots(1, 2, figsize=(20, 6))

    ax_actual.scatter(dates, actual, c='black')
    ax_actual.set_title('Actual S&P500 Index')

    ax_predicted.scatter(dates, predicted, c='red')
    ax_predicted.set_title(prediction_title)

    for ax in (ax_actual, ax_predicted):
        ax.set_xlabel('Years')
        ax.set_ylabel('S&P500 Index')

    # Save BEFORE showing: once plt.show() has rendered the figure with the inline
    # backend, a later savefig writes a blank image.
    if fig_id is not None:
        save_fig(fig_id)
    plt.show()

## K Nearest Neighbour

### Training the model and calculating errors

In [None]:
# Using grid search with 5-fold cross-validation to find the best number of neighbours
params = {'n_neighbors': [2, 3, 4, 5, 6, 7, 8, 9]}
knn = neighbors.KNeighborsRegressor()
model_KNN = GridSearchCV(knn, params, cv=5)

# Fit on the training data; predictions below use the best estimator found
model_KNN.fit(X_train, y_train)

# Print the chosen number of neighbours. A bare expression in the middle of a cell is
# not displayed, so it has to be printed explicitly.
print("Best parameters:", model_KNN.best_params_)

# Predictions for each dataset
predictions_train_knn = model_KNN.predict(X_train)
predictions_val_knn = model_KNN.predict(X_val)
predictions_test_knn = model_KNN.predict(X_test)

# Calculating errors; both metrics take (y_true, y_pred)
mse_train_knn = mean_squared_error(y_train, predictions_train_knn)
r2_train_knn = r2_score(y_train, predictions_train_knn)

mse_val_knn = mean_squared_error(y_val, predictions_val_knn)
r2_val_knn = r2_score(y_val, predictions_val_knn)

mse_test_knn = mean_squared_error(y_test, predictions_test_knn)
r2_test_knn = r2_score(y_test, predictions_test_knn)

# One row per dataset; every column gets an explicit header
errors = [
    ['Training', mse_train_knn, r2_train_knn],
    ['Validation', mse_val_knn, r2_val_knn],
    ['Test', mse_test_knn, r2_test_knn],
]
print(tabulate(errors, headers=["Dataset", "Mean squared error", "R2 Score"]))

In [None]:
# Plotting results: one figure per dataset, actual values (left) next to predictions (right).
# Each entry: (dates, actual closes, predicted closes, title of the prediction panel,
#              file id to save under, or None to only display the figure)
knn_plots = [
    (train['Date'], y_train, predictions_train_knn,
     'kNN Prediction over training set', None),
    (val['Date'], y_val, predictions_val_knn,
     'kNN Prediction over validation set', None),
    (test['Date'], y_test, predictions_test_knn,
     'kNN Prediction over test set', "predictions_kNN"),
]

for dates, actual, predicted, prediction_title, fig_id in knn_plots:
    fig, (ax_actual, ax_predicted) = plt.subplots(1, 2, figsize=(20, 6))

    ax_actual.scatter(dates, actual, c='black')
    ax_actual.set_title('Actual S&P500 Index')

    ax_predicted.scatter(dates, predicted, c='red')
    ax_predicted.set_title(prediction_title)

    for ax in (ax_actual, ax_predicted):
        ax.set_xlabel('Years')
        ax.set_ylabel('S&P500 Index')

    # Save BEFORE showing: once plt.show() has rendered the figure with the inline
    # backend, a later savefig writes a blank image.
    if fig_id is not None:
        save_fig(fig_id)
    plt.show()