<a href="https://colab.research.google.com/github/kappandrew2/DataPreProcessing/blob/main/MarketResearch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Model Purpose

Utilize historical value and time attributes to predict the next day's gain or loss value

!Dataset Notes: The dataset for this data solution must come from the following web sit and contain a large historical sample of data. For example:

Begin Date = 12/01/2007 (Be mindful that the last 35 periods (in this case, days) will get chopped off of the bottom of the dataset during data preprocessing)

End Date = Today's current value (to be run an hour before market close)

Ticker = SPY

Train Set = all data except last 60 periods (rows)

Prediction Set = all data from -90 periods (days) to current

https://www.wsj.com/market-data/quotes/index/SPX/historical-prices


In [359]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import datetime
from sklearn.model_selection import train_test_split
from datetime import date, datetime, timedelta

#Connect to drive and import data set

Using google drive

Importing historical prices for ticker "SPY"

In [360]:
#Create CSV from data export
#https://www.wsj.com/market-data/quotes/index/SPX/historical-prices

from google.colab import drive
drive.mount('/content/drive', force_remount=True)

dataset = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/HistoricalPricesSPY.csv')

print(dataset)

Mounted at /content/drive
          Date     Open      High     Low   Close     Volume
0     07/29/21  439.815  441.8000  439.81  440.65   46716900
1     07/28/21  439.680  440.3000  437.31  438.83   52472359
2     07/27/21  439.910  439.9400  435.99  439.01   67397133
3     07/26/21  439.310  441.0300  439.26  441.02   43719191
4     07/23/21  437.520  440.3000  436.79  439.94   63766641
...        ...      ...       ...     ...     ...        ...
3433  12/07/07  151.420  151.5000  150.55  150.91  148951391
3434  12/06/07  148.630  151.2100  148.57  150.94  154487203
3435  12/05/07  147.930  149.2000  147.83  148.81  170813406
3436  12/04/07  146.660  147.5409  146.31  146.36  136528609
3437  12/03/07  148.190  148.4500  147.29  147.68  145852797

[3438 rows x 6 columns]


#Modifiy dataset Content and Headers

Remove contents not required for this exercise

Renaming columns to remove leading white space

In [361]:
dataset = dataset.drop([' Open', ' High', ' Low', ' Volume'], axis = 1)

dataset.rename({' Close': 'Close'}, axis=1, inplace = True)


#Dataset information validation

Validate date frame, column contents and data types

In [362]:
dataset['Date'] = pd.to_datetime(dataset['Date'])

dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3438 entries, 0 to 3437
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Date    3438 non-null   datetime64[ns]
 1   Close   3438 non-null   float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 53.8 KB


#Change indext to date (for troublshooting)

Moving date to the index assists in visually validating processes are working correctly

!Note: This should be "off" except when troublshooting

In [363]:
#dataset['Date_Index'] = dataset['Date']
#dataset.set_index('Date_Index', inplace=True)

dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3438 entries, 0 to 3437
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Date    3438 non-null   datetime64[ns]
 1   Close   3438 non-null   float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 53.8 KB


#Create time attributes

Time attributes will change date from a continous variable into discrete (a numeric categorical value)

In [364]:
dataset['DOW'] = dataset['Date'].dt.dayofweek
dataset['DOY'] = dataset['Date'].dt.dayofyear
dataset['Week'] = dataset['Date'].dt.week
dataset['Month'] = dataset['Date'].dt.month
dataset['Quarter'] = dataset['Date'].dt.quarter

dataset.dtypes

  This is separate from the ipykernel package so we can avoid doing imports until


Date       datetime64[ns]
Close             float64
DOW                 int64
DOY                 int64
Week                int64
Month               int64
Quarter             int64
dtype: object

#Create skip-day gain loss values (and dependant variable #1)

!Note: gain-loss-0 will ultimately end up being the dependant variable but also an independant variable (we will create a new column later and shift it down a row)

1) Calculate the first day's gain loss by subtracting day -1 from day 0

2) Calculate the second day's gain loss by subtracting day -2 from day 0

3) Calculate the third day's gain loss by subtracting day -n from day 0

!Note: This should be turned into a loop using i=n where n = the rows to be processed (now many previous rows)




In [365]:
dataset['gain_loss-0'] = dataset['Close'].diff(-1)
dataset['gain_loss-1'] = dataset['Close'].diff(-2)
#dataset['gain_loss-1'] = dataset['gain_loss-1'].shift(periods=-1, fill_value=0) #Removed these to experiment 
#with switching around the dependant variable rather than the independant variable
dataset['gain_loss-2'] = dataset['Close'].diff(-3) 
#dataset['gain_loss-2'] = dataset['gain_loss-2'].shift(periods=-1, fill_value=0)
dataset['gain_loss-3'] = dataset['Close'].diff(-4) 
#dataset['gain_loss-3'] = dataset['gain_loss-3'].shift(periods=-1, fill_value=0)
dataset['gain_loss-4'] = dataset['Close'].diff(-5) 
#dataset['gain_loss-4'] = dataset['gain_loss-4'].shift(periods=-1, fill_value=0)

print(dataset)
#dataset.dtypes

           Date   Close  DOW  ...  gain_loss-2  gain_loss-3  gain_loss-4
0    2021-07-29  440.65    3  ...        -0.37         0.71         5.19
1    2021-07-28  438.83    2  ...        -1.11         3.37         4.28
2    2021-07-27  439.01    1  ...         3.55         4.46         7.95
3    2021-07-26  441.02    0  ...         6.47         9.96        16.05
4    2021-07-23  439.94    4  ...         8.88        14.97         8.60
...         ...     ...  ...  ...          ...          ...          ...
3433 2007-12-07  150.91    4  ...         4.55         3.23          NaN
3434 2007-12-06  150.94    3  ...         3.26          NaN          NaN
3435 2007-12-05  148.81    2  ...          NaN          NaN          NaN
3436 2007-12-04  146.36    1  ...          NaN          NaN          NaN
3437 2007-12-03  147.68    0  ...          NaN          NaN          NaN

[3438 rows x 12 columns]


#Create binary version of skip-day gain loss values (and dependant variable #2)

!Note: gain-loss-0b will ultimately end up being the dependant variable but also an independant variable (we will create a new column later and shift it down a row)

This process changes all gain loss continuous variables into a binary-descrete (dichotomous) variables

!Note - This process should be converted into the previous process when that process is converted into a loop

In [366]:
dataset['gain_loss-0b'] = np.where(dataset['gain_loss-0'] > 0, 1, 0)
dataset['gain_loss-1b'] = np.where(dataset['gain_loss-1'] > 0, 1, 0)
dataset['gain_loss-2b'] = np.where(dataset['gain_loss-2'] > 0, 1, 0)
dataset['gain_loss-3b'] = np.where(dataset['gain_loss-3'] > 0, 1, 0)
dataset['gain_loss-4b'] = np.where(dataset['gain_loss-4'] > 0, 1, 0)

dataset.dtypes

Date            datetime64[ns]
Close                  float64
DOW                      int64
DOY                      int64
Week                     int64
Month                    int64
Quarter                  int64
gain_loss-0            float64
gain_loss-1            float64
gain_loss-2            float64
gain_loss-3            float64
gain_loss-4            float64
gain_loss-0b             int64
gain_loss-1b             int64
gain_loss-2b             int64
gain_loss-3b             int64
gain_loss-4b             int64
dtype: object

#Aggregate the binary skip-day gain loss values

This creates a true categorical value from the binary descrete values.

The theory is that, having binary values for each period (sparce matrix) and an aggregate (categorical), the values will work together to increase the value of this data

!Note = This process should be indluded in the loop mentioned in notes from the above process (future modifications to the data pre-processing procedures)

In [367]:
dataset['gain_loss-total_b'] = dataset['gain_loss-0b'] + dataset['gain_loss-1b'] + dataset['gain_loss-2b'] + dataset['gain_loss-3b'] + dataset['gain_loss-4b']

dataset.head(-1)

Unnamed: 0,Date,Close,DOW,DOY,Week,Month,Quarter,gain_loss-0,gain_loss-1,gain_loss-2,gain_loss-3,gain_loss-4,gain_loss-0b,gain_loss-1b,gain_loss-2b,gain_loss-3b,gain_loss-4b,gain_loss-total_b
0,2021-07-29,440.65,3,210,30,7,3,1.82,1.64,-0.37,0.71,5.19,1,1,0,1,1,4
1,2021-07-28,438.83,2,209,30,7,3,-0.18,-2.19,-1.11,3.37,4.28,0,0,0,1,1,2
2,2021-07-27,439.01,1,208,30,7,3,-2.01,-0.93,3.55,4.46,7.95,0,0,1,1,1,3
3,2021-07-26,441.02,0,207,30,7,3,1.08,5.56,6.47,9.96,16.05,1,1,1,1,1,5
4,2021-07-23,439.94,4,204,29,7,3,4.48,5.39,8.88,14.97,8.60,1,1,1,1,1,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3432,2007-12-10,152.08,0,344,50,12,4,1.17,1.14,3.27,5.72,4.40,1,1,1,1,1,5
3433,2007-12-07,150.91,4,341,49,12,4,-0.03,2.10,4.55,3.23,,0,1,1,1,0,3
3434,2007-12-06,150.94,3,340,49,12,4,2.13,4.58,3.26,,,1,1,1,0,0,3
3435,2007-12-05,148.81,2,339,49,12,4,2.45,1.13,,,,1,1,0,0,0,2


#Create daily gain loss and denormalize values

1) Calculate the first day's gain loss by subtracting day -1 from day 0

2) Calculate the second day's gain loss by subtracting day -2 from day -1

3) Calculate the third day's gain loss by subtracting day -n from day -n+1

This process creates a new column and removes the top rows in accordance with the desired "lookback" period - shift over 1 and lift by 1, shift over 2 and lift by 2, shift over n and lift by n

!Note: This should be turned into a loop using i=n where n = the rows to be processed (now many previous rows)


In [368]:
dataset['prior_day-0'] = dataset['gain_loss-0']
#dataset['prior_day-1'] = dataset['prior_day-1'].shift(periods=-1, fill_value=0)#Removed this to experiment 
#with switching around the dependant variable rather than the independant variable
dataset['prior_day-1'] = dataset['gain_loss-0']
dataset['prior_day-1'] = dataset['prior_day-1'].shift(periods=-1, fill_value=0)
dataset['prior_day-2'] = dataset['gain_loss-0']
dataset['prior_day-2'] = dataset['prior_day-2'].shift(periods=-2, fill_value=0)
dataset['prior_day-3'] = dataset['gain_loss-0']
dataset['prior_day-3'] = dataset['prior_day-3'].shift(periods=-3, fill_value=0)
dataset['prior_day-4'] = dataset['gain_loss-0']
dataset['prior_day-4'] = dataset['prior_day-4'].shift(periods=-4, fill_value=0)
dataset.head()

Unnamed: 0,Date,Close,DOW,DOY,Week,Month,Quarter,gain_loss-0,gain_loss-1,gain_loss-2,gain_loss-3,gain_loss-4,gain_loss-0b,gain_loss-1b,gain_loss-2b,gain_loss-3b,gain_loss-4b,gain_loss-total_b,prior_day-0,prior_day-1,prior_day-2,prior_day-3,prior_day-4
0,2021-07-29,440.65,3,210,30,7,3,1.82,1.64,-0.37,0.71,5.19,1,1,0,1,1,4,1.82,-0.18,-2.01,1.08,4.48
1,2021-07-28,438.83,2,209,30,7,3,-0.18,-2.19,-1.11,3.37,4.28,0,0,0,1,1,2,-0.18,-2.01,1.08,4.48,0.91
2,2021-07-27,439.01,1,208,30,7,3,-2.01,-0.93,3.55,4.46,7.95,0,0,1,1,1,3,-2.01,1.08,4.48,0.91,3.49
3,2021-07-26,441.02,0,207,30,7,3,1.08,5.56,6.47,9.96,16.05,1,1,1,1,1,5,1.08,4.48,0.91,3.49,6.09
4,2021-07-23,439.94,4,204,29,7,3,4.48,5.39,8.88,14.97,8.6,1,1,1,1,1,5,4.48,0.91,3.49,6.09,-6.37


#Create binary version of daily gain loss values

This process changes all gain loss continuous variables into a binary-descrete (dichotomous) variables

!Note - This process should be converted into the previous process when that process is converted into a loop

In [369]:
dataset['prior_day-0b'] = np.where(dataset['prior_day-0'] > 0, 1, 0)
dataset['prior_day-1b'] = np.where(dataset['prior_day-1'] > 0, 1, 0)
dataset['prior_day-2b'] = np.where(dataset['prior_day-2'] > 0, 1, 0)
dataset['prior_day-3b'] = np.where(dataset['prior_day-3'] > 0, 1, 0)
dataset['prior_day-4b'] = np.where(dataset['prior_day-4'] > 0, 1, 0)
dataset.head()

Unnamed: 0,Date,Close,DOW,DOY,Week,Month,Quarter,gain_loss-0,gain_loss-1,gain_loss-2,gain_loss-3,gain_loss-4,gain_loss-0b,gain_loss-1b,gain_loss-2b,gain_loss-3b,gain_loss-4b,gain_loss-total_b,prior_day-0,prior_day-1,prior_day-2,prior_day-3,prior_day-4,prior_day-0b,prior_day-1b,prior_day-2b,prior_day-3b,prior_day-4b
0,2021-07-29,440.65,3,210,30,7,3,1.82,1.64,-0.37,0.71,5.19,1,1,0,1,1,4,1.82,-0.18,-2.01,1.08,4.48,1,0,0,1,1
1,2021-07-28,438.83,2,209,30,7,3,-0.18,-2.19,-1.11,3.37,4.28,0,0,0,1,1,2,-0.18,-2.01,1.08,4.48,0.91,0,0,1,1,1
2,2021-07-27,439.01,1,208,30,7,3,-2.01,-0.93,3.55,4.46,7.95,0,0,1,1,1,3,-2.01,1.08,4.48,0.91,3.49,0,1,1,1,1
3,2021-07-26,441.02,0,207,30,7,3,1.08,5.56,6.47,9.96,16.05,1,1,1,1,1,5,1.08,4.48,0.91,3.49,6.09,1,1,1,1,1
4,2021-07-23,439.94,4,204,29,7,3,4.48,5.39,8.88,14.97,8.6,1,1,1,1,1,5,4.48,0.91,3.49,6.09,-6.37,1,1,1,1,0


#Aggregate the binary daily gain loss values

This creates a true categorical value from the binary descrete values.

The theory is that, having binary values for each period (sparce matrix) and an aggregate (categorical), the values will work together to increase the value of this data

!Note = This process should be indluded in the loop mentioned in notes from the above process (future modifications to the data pre-processing procedures)

In [370]:
dataset['prior_day-total_b'] = dataset['prior_day-0b'] + dataset['prior_day-1b'] + dataset['prior_day-2b'] + dataset['prior_day-3b'] + dataset['prior_day-4b'] 
dataset.head(5)

Unnamed: 0,Date,Close,DOW,DOY,Week,Month,Quarter,gain_loss-0,gain_loss-1,gain_loss-2,gain_loss-3,gain_loss-4,gain_loss-0b,gain_loss-1b,gain_loss-2b,gain_loss-3b,gain_loss-4b,gain_loss-total_b,prior_day-0,prior_day-1,prior_day-2,prior_day-3,prior_day-4,prior_day-0b,prior_day-1b,prior_day-2b,prior_day-3b,prior_day-4b,prior_day-total_b
0,2021-07-29,440.65,3,210,30,7,3,1.82,1.64,-0.37,0.71,5.19,1,1,0,1,1,4,1.82,-0.18,-2.01,1.08,4.48,1,0,0,1,1,3
1,2021-07-28,438.83,2,209,30,7,3,-0.18,-2.19,-1.11,3.37,4.28,0,0,0,1,1,2,-0.18,-2.01,1.08,4.48,0.91,0,0,1,1,1,3
2,2021-07-27,439.01,1,208,30,7,3,-2.01,-0.93,3.55,4.46,7.95,0,0,1,1,1,3,-2.01,1.08,4.48,0.91,3.49,0,1,1,1,1,4
3,2021-07-26,441.02,0,207,30,7,3,1.08,5.56,6.47,9.96,16.05,1,1,1,1,1,5,1.08,4.48,0.91,3.49,6.09,1,1,1,1,1,5
4,2021-07-23,439.94,4,204,29,7,3,4.48,5.39,8.88,14.97,8.6,1,1,1,1,1,5,4.48,0.91,3.49,6.09,-6.37,1,1,1,1,0,4


#Creating Rolling mean attribute values

Rolling mean values are based on daily gain loss and represent the trending direction of the prior n mean values (5, 10, 15, n, row mean values)

the rolling mean works from the top row down - for exampple the mean of row 1 and 2 would appear on row 2. We need the mean of row 1 and 2 to land on row 1. This requires us to reverse the index of each desired mean column. The process to do this creates pandas value lists

!Note: this process can convert into a loop  where n = list of n mean values (as described in the description above)

In [371]:
#Rolling averages based on prior day gain loss
rolling_prior_day = dataset['prior_day-0']

rolling_prior_day_5 = rolling_prior_day[::-1].rolling(5).mean()[::-1]
rolling_prior_day_10 = rolling_prior_day[::-1].rolling(10).mean()[::-1]
rolling_prior_day_15 = rolling_prior_day[::-1].rolling(15).mean()[::-1]
rolling_prior_day_20 = rolling_prior_day[::-1].rolling(20).mean()[::-1]
rolling_prior_day_25 = rolling_prior_day[::-1].rolling(25).mean()[::-1]
rolling_prior_day_30 = rolling_prior_day[::-1].rolling(30).mean()[::-1]

print(rolling_prior_day_10)

0       0.590
1       0.259
2       0.342
3       0.394
4       0.442
        ...  
3433      NaN
3434      NaN
3435      NaN
3436      NaN
3437      NaN
Name: prior_day-0, Length: 3438, dtype: float64


#Remove NaN rows

Need to remove the NaN rows from bottom of dataset. These will cause errors in the analysis if not removed.

Due to this delete, the dataset must contain 35 additional data of history beyond what is desired. This was mentioned in the notation heading of this solution. (due to rolling means and shifts).

In [372]:
#dataset.dropna(inplace = True)

dataset.head(-5)

Unnamed: 0,Date,Close,DOW,DOY,Week,Month,Quarter,gain_loss-0,gain_loss-1,gain_loss-2,gain_loss-3,gain_loss-4,gain_loss-0b,gain_loss-1b,gain_loss-2b,gain_loss-3b,gain_loss-4b,gain_loss-total_b,prior_day-0,prior_day-1,prior_day-2,prior_day-3,prior_day-4,prior_day-0b,prior_day-1b,prior_day-2b,prior_day-3b,prior_day-4b,prior_day-total_b
0,2021-07-29,440.65,3,210,30,7,3,1.82,1.64,-0.37,0.71,5.19,1,1,0,1,1,4,1.82,-0.18,-2.01,1.08,4.48,1,0,0,1,1,3
1,2021-07-28,438.83,2,209,30,7,3,-0.18,-2.19,-1.11,3.37,4.28,0,0,0,1,1,2,-0.18,-2.01,1.08,4.48,0.91,0,0,1,1,1,3
2,2021-07-27,439.01,1,208,30,7,3,-2.01,-0.93,3.55,4.46,7.95,0,0,1,1,1,3,-2.01,1.08,4.48,0.91,3.49,0,1,1,1,1,4
3,2021-07-26,441.02,0,207,30,7,3,1.08,5.56,6.47,9.96,16.05,1,1,1,1,1,5,1.08,4.48,0.91,3.49,6.09,1,1,1,1,1,5
4,2021-07-23,439.94,4,204,29,7,3,4.48,5.39,8.88,14.97,8.60,1,1,1,1,1,5,4.48,0.91,3.49,6.09,-6.37,1,1,1,1,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3428,2007-12-14,147.17,4,348,50,12,4,-1.89,-2.20,-0.74,-4.91,-3.74,0,0,0,0,0,0,-1.89,-0.31,1.46,-4.17,1.17,0,0,1,0,1,2
3429,2007-12-13,149.06,3,347,50,12,4,-0.31,1.15,-3.02,-1.85,-1.88,0,1,0,0,0,1,-0.31,1.46,-4.17,1.17,-0.03,0,1,0,1,0,2
3430,2007-12-12,149.37,2,346,50,12,4,1.46,-2.71,-1.54,-1.57,0.56,1,0,0,0,1,2,1.46,-4.17,1.17,-0.03,2.13,1,0,1,0,1,3
3431,2007-12-11,147.91,1,345,50,12,4,-4.17,-3.00,-3.03,-0.90,1.55,0,0,0,0,1,1,-4.17,1.17,-0.03,2.13,2.45,0,1,0,1,1,3


#Create dependant variables (2 dependants)

As mentioned earlier, the gain_loss-0 attribute is a dependant variable. It's binary conterpart, gain_loss-0b is also a dependant variable.

the dependant variables need shifted down one row. This will adjust all of the independant variable into a position where they are trying to predict the "day ahead". Because the data is shifted down one day the last day must be removed.

!Note: Due to the organizaiton of this dataset (train and test set being time-based) this adjustment for the dependant variables will create results for next day. 


In [373]:
y = pd.DataFrame(dataset['gain_loss-0']).reset_index(drop = True)
y.loc[-1] = [0]
y.index = y.index + 1
y = y.sort_index()
y.drop(y.tail(1).index, inplace = True)
y.rename(columns={'gain_loss-0': 'y'}, inplace=True)
y_df = pd.DataFrame(y, columns=['y'])

yb = pd.DataFrame(dataset['gain_loss-0b']).reset_index(drop = True)
yb.loc[-1] = [0]
yb.index = yb.index + 1
yb = yb.sort_index()
yb.drop(yb.tail(1).index, inplace = True)
yb.rename(columns={'gain_loss-0b': 'yb'}, inplace=True)
yb_df = pd.DataFrame(yb, columns=['yb'])

print(y)
print("----------------")
print(yb)

         y
0     0.00
1     1.82
2    -0.18
3    -2.01
4     1.08
...    ...
3433  1.17
3434 -0.03
3435  2.13
3436  2.45
3437 -1.32

[3438 rows x 1 columns]
----------------
      yb
0      0
1      1
2      0
3      0
4      1
...   ..
3433   1
3434   0
3435   1
3436   1
3437   0

[3438 rows x 1 columns]


In [374]:
a = len(y.index)
b = len(yb.index)
c = len(dataset.index)
d = len(rolling_prior_day_5.index)
a1 = len(y_df.index)
b1 = len(yb_df.index)

print(a)
print(b)
print(c)
print(d)
print(a1)
print(b1)

3438
3438
3438
3438
3438
3438


#Create final dataset and review

A concat procedure is necessary to create the final dataset.

There should be a total of 37 columns

In [375]:
dataset_final = pd.concat([dataset,
           rolling_prior_day_5, 
           rolling_prior_day_10, 
           rolling_prior_day_15, 
           rolling_prior_day_20, 
           rolling_prior_day_25, 
           rolling_prior_day_30,
           y_df,
           yb_df],
           axis = 1)

dataset_final.head(-5)

Unnamed: 0,Date,Close,DOW,DOY,Week,Month,Quarter,gain_loss-0,gain_loss-1,gain_loss-2,gain_loss-3,gain_loss-4,gain_loss-0b,gain_loss-1b,gain_loss-2b,gain_loss-3b,gain_loss-4b,gain_loss-total_b,prior_day-0,prior_day-1,prior_day-2,prior_day-3,prior_day-4,prior_day-0b,prior_day-1b,prior_day-2b,prior_day-3b,prior_day-4b,prior_day-total_b,prior_day-0.1,prior_day-0.2,prior_day-0.3,prior_day-0.4,prior_day-0.5,prior_day-0.6,y,yb
0,2021-07-29,440.65,3,210,30,7,3,1.82,1.64,-0.37,0.71,5.19,1,1,0,1,1,4,1.82,-0.18,-2.01,1.08,4.48,1,0,0,1,1,3,1.038,0.590,0.648667,0.6295,0.7220,0.618000,0.00,0
1,2021-07-28,438.83,2,209,30,7,3,-0.18,-2.19,-1.11,3.37,4.28,0,0,0,1,1,2,-0.18,-2.01,1.08,4.48,0.91,0,0,1,1,1,3,0.856,0.259,0.291333,0.5565,0.6288,0.478333,1.82,1
2,2021-07-27,439.01,1,208,30,7,3,-2.01,-0.93,3.55,4.46,7.95,0,0,1,1,1,3,-2.01,1.08,4.48,0.91,3.49,0,1,1,1,1,4,1.590,0.342,0.405333,0.5770,0.7260,0.458333,-0.18,0
3,2021-07-26,441.02,0,207,30,7,3,1.08,5.56,6.47,9.96,16.05,1,1,1,1,1,5,1.08,4.48,0.91,3.49,6.09,1,1,1,1,1,5,3.210,0.394,0.486667,0.7205,1.0440,0.557000,-2.01,0
4,2021-07-23,439.94,4,204,29,7,3,4.48,5.39,8.88,14.97,8.60,1,1,1,1,1,5,4.48,0.91,3.49,6.09,-6.37,1,1,1,1,0,4,1.720,0.442,0.634000,0.7420,0.7188,0.544333,1.08,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3428,2007-12-14,147.17,4,348,50,12,4,-1.89,-2.20,-0.74,-4.91,-3.74,0,0,0,0,0,0,-1.89,-0.31,1.46,-4.17,1.17,0,0,1,0,1,2,-0.748,,,,,,-2.10,0
3429,2007-12-13,149.06,3,347,50,12,4,-0.31,1.15,-3.02,-1.85,-1.88,0,1,0,0,0,1,-0.31,1.46,-4.17,1.17,-0.03,0,1,0,1,0,2,-0.376,,,,,,-1.89,0
3430,2007-12-12,149.37,2,346,50,12,4,1.46,-2.71,-1.54,-1.57,0.56,1,0,0,0,1,2,1.46,-4.17,1.17,-0.03,2.13,1,0,1,0,1,3,0.112,,,,,,-0.31,0
3431,2007-12-11,147.91,1,345,50,12,4,-4.17,-3.00,-3.03,-0.90,1.55,0,0,0,0,1,1,-4.17,1.17,-0.03,2.13,2.45,0,1,0,1,1,3,0.310,,,,,,1.46,1


#Evaluate dataset for NaN

Throught the processes above there should have been some NaN values created at the tail

In [376]:
dataset_final.dropna(inplace = True)

a = len(dataset.index)
e = len(dataset_final.index)

print("rows dropped = {}".format(a-e))

rows dropped = 30


#Create model variables

Date variables based on today date are required to prevent "hardcoding" dates into the model

For testing the variable "date_var" can be hard coded to a set date. Comment out date.today() and put a 'mm/dd/yyyy' date in place to run test scenarios.

In [377]:
date_var = pd.to_datetime(date.today())
train_end_date = date_var - timedelta(days = 60)
pred_start_date = date_var - timedelta(days = 90)

print(train_end_date)
print(pred_start_date)

2021-06-01 00:00:00
2021-05-02 00:00:00


#Split between training and predict data sets

The top last 90 periods (rows) will generate the pred data set.

All but the top 60 periods (rows) will generate the training data set.

The 30 day overlap can provide a measure of the model's degredation over time

In [378]:
#split text and train datasets
predset = dataset_final.loc[(dataset['Date'] >= pred_start_date)]
trainset = dataset_final.loc[(dataset['Date'] <= train_end_date)]
type(testset)

pandas.core.frame.DataFrame

#Convert dataset into X and y and refine column membership

This process separates the dependant and independant variables

X should not contain the y or yb attributes

for X, "Date" should be removed since it is a time-series value; date attributes will represent time

for X, "Close" should be removed due to its relationship to the indepenant variable

Two models will come out of this model, one for continuous variable y and binary value yb

In [379]:
X = trainset
X.drop(X.tail(31).index, inplace = True)
X = X.drop(['y','yb', 'Date', 'Close'], axis = 1).values
y = trainset['y'].values
yb = trainset['yb'].values
print(X)
print("-------------------------")
print(y)
print("-------------------------")
print(yb)

[[ 1.00000000e+00  1.52000000e+02  2.20000000e+01 ...  7.35000000e-02
   8.24000000e-02  1.48666667e-01]
 [ 4.00000000e+00  1.48000000e+02  2.10000000e+01 ...  1.37000000e-01
   1.32000000e-01  9.26666667e-02]
 [ 3.00000000e+00  1.47000000e+02  2.10000000e+01 ... -3.85000000e-02
   2.80800000e-01  1.14000000e-01]
 ...
 [ 2.00000000e+00  6.50000000e+01  1.00000000e+01 ... -1.50000000e-02
  -8.32000000e-02  1.03666667e-01]
 [ 1.00000000e+00  6.40000000e+01  1.00000000e+01 ... -2.41500000e-01
  -9.00000000e-02  3.10000000e-02]
 [ 0.00000000e+00  6.30000000e+01  1.00000000e+01 ... -3.04000000e-01
   1.84000000e-02  2.33333333e-03]]
-------------------------
[ 0.66 -0.37  0.75 ... -2.77  0.84 -0.51]
-------------------------
[1 0 1 ... 0 1 0]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [380]:
#Validation of row counts

f = len(X)
g = len(y)
h = len(yb)

print(f, g, h)

3336 3336 3336


#Train the models

The model can be extended to use any regression or classificaiton model.

Current model inventory:

1) Random Forest

In [381]:
from sklearn.ensemble import RandomForestRegressor

regressor = RandomForestRegressor(n_estimators = 100, random_state = 0)

regressor.fit(X, y)

regressor_b = RandomForestRegressor(n_estimators = 100, random_state = 0)

regressor_b.fit(X, yb)


RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=0, verbose=0, warm_start=False)

#Create predict dataset

The predict dataset should match the process used to generate the training dataset

In [382]:
#Prepare predict set
Xpred = predset
Xpred = Xpred.drop(['y','yb', 'Date', 'Close'], axis = 1).values
y_actual = predset['y'].values
yb_actual = predset['yb'].values
print(Xpred)
print("-------------------------")
print(y_actual)
print("-------------------------")
print(yb_actual)

[[  3.         210.          30.         ...   0.6295       0.722
    0.618     ]
 [  2.         209.          30.         ...   0.5565       0.6288
    0.47833333]
 [  1.         208.          30.         ...   0.577        0.726
    0.45833333]
 ...
 [  2.         125.          18.         ...   0.458        0.8408
    0.875     ]
 [  1.         124.          18.         ...   0.475        0.7936
    0.76766667]
 [  0.         123.          18.         ...   0.592        0.8888
    0.95733333]]
-------------------------
[ 0.    1.82 -0.18 -2.01  1.08  4.48  0.91  3.49  6.09 -6.37 -3.41 -1.49
  0.65 -1.49  1.56  4.6  -3.54  1.53 -0.79  3.29  2.37  0.36  0.23  0.86
  1.51  2.5  -0.51  2.25  5.94 -7.05 -0.14 -2.37 -0.78  0.95  0.7   1.96
 -0.63  0.09 -0.41  3.83 -1.56  0.66 -0.37  0.75  0.22  0.83 -0.93  4.23
 -0.34  4.42 -1.08 -3.58 -1.06  6.3   4.87 -8.8  -3.73 -4.18  3.05  3.32
  0.13 -2.58]
-------------------------
[0 1 0 0 1 1 1 1 1 0 0 0 1 0 1 1 0 1 0 1 1 1 1 1 1 1 0 1 1 0 0 0 0 

In [383]:
i = len(Xpred)
j = len(y_actual)
k = len(yb_actual)

print(i, j, k)

62 62 62


#Generate predictions

Predictions are made for both continuous and binary

In [384]:
y_pred = regressor.predict(Xpred)

yb_pred = regressor_b.predict(Xpred)

np_array = np.concatenate((y_pred.reshape(len(y_pred),1), 
                           yb_pred.reshape(len(y_pred),1),
                           y_actual.reshape(len(y_actual),1),
                           yb_actual.reshape(len(yb_actual),1),
                           ), axis = 1)

results = pd.DataFrame(np_array, columns = ['y_pred', 'yb_pred', 'y_actual', 'yb_actual'])

print(results)


      y_pred  yb_pred  y_actual  yb_actual
0  -0.740116     0.57      0.00        0.0
1   0.699892     0.51      1.82        1.0
2   0.116892     0.63     -0.18        0.0
3   1.285900     0.55     -2.01        0.0
4  -1.019342     0.56      1.08        1.0
..       ...      ...       ...        ...
57 -2.507821     0.23     -4.18        0.0
58  1.622172     0.84      3.05        1.0
59  1.843287     0.82      3.32        1.0
60  0.352668     0.82      0.13        1.0
61 -1.619800     0.18     -2.58        0.0

[62 rows x 4 columns]


#Create buy/sell indicator based on pred

1 = Buy next day

0 = Sell next day

In [385]:
results['y_pred_arg'] = np.where(results['y_pred'] > 0, 1, 0)
results['yb_pred_arg'] = np.where(results['yb_pred'] > 0.5, 1, 0)
results['y_actual_arg'] = np.where(results['y_actual'] > 0, 1, 0)
results['yb_acutal_arg'] = np.where(results['yb_actual'] > 0, 1, 0)

print(results)


      y_pred  yb_pred  y_actual  ...  yb_pred_arg  y_actual_arg  yb_acutal_arg
0  -0.740116     0.57      0.00  ...            1             0              0
1   0.699892     0.51      1.82  ...            1             1              1
2   0.116892     0.63     -0.18  ...            1             0              0
3   1.285900     0.55     -2.01  ...            1             0              0
4  -1.019342     0.56      1.08  ...            1             1              1
..       ...      ...       ...  ...          ...           ...            ...
57 -2.507821     0.23     -4.18  ...            0             0              0
58  1.622172     0.84      3.05  ...            1             1              1
59  1.843287     0.82      3.32  ...            1             1              1
60  0.352668     0.82      0.13  ...            1             1              1
61 -1.619800     0.18     -2.58  ...            0             0              0

[62 rows x 8 columns]


In [386]:
results.to_csv('/content/drive/MyDrive/Colab Notebooks/results.csv')