<a href="https://colab.research.google.com/github/kappandrew2/DataPreProcessing/blob/main/MarketResearch_v0_0_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Model Purpose

Utilize historical value and time attributes to predict the next day's gain or loss value

In [46]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import datetime
from sklearn.model_selection import train_test_split

#Connect to drive and import data set

Using google drive

Importing historical prices for ticker "SPY"

In [47]:
#Create CSV from data export
#https://www.wsj.com/market-data/quotes/index/SPX/historical-prices

from google.colab import drive
drive.mount('/content/drive', force_remount=True)

dataset = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/HistoricalPricesSPY.csv')

print(dataset)

Mounted at /content/drive
          Date     Open      High     Low   Close     Volume
0     07/29/21  439.815  441.8000  439.81  440.65   46716900
1     07/28/21  439.680  440.3000  437.31  438.83   52472359
2     07/27/21  439.910  439.9400  435.99  439.01   67397133
3     07/26/21  439.310  441.0300  439.26  441.02   43719191
4     07/23/21  437.520  440.3000  436.79  439.94   63766641
...        ...      ...       ...     ...     ...        ...
3433  12/07/07  151.420  151.5000  150.55  150.91  148951391
3434  12/06/07  148.630  151.2100  148.57  150.94  154487203
3435  12/05/07  147.930  149.2000  147.83  148.81  170813406
3436  12/04/07  146.660  147.5409  146.31  146.36  136528609
3437  12/03/07  148.190  148.4500  147.29  147.68  145852797

[3438 rows x 6 columns]


#Modifiy dataset Content and Headers

Remove contents not required for this exercise

Renaming columns to remove leading white space

In [48]:
dataset = dataset.drop([' Open', ' High', ' Low', ' Volume'], axis = 1)

dataset.rename({' Close': 'Close'}, axis=1, inplace = True)


#Dataset information validation

Validate date frame, column contents and data types

In [49]:
dataset['Date'] = pd.to_datetime(dataset['Date'])

dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3438 entries, 0 to 3437
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Date    3438 non-null   datetime64[ns]
 1   Close   3438 non-null   float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 53.8 KB


#Change indext to date

Moving date to the index assists in visually validating processes are working correctly

In [50]:
dataset['Date_Index'] = dataset['Date']
dataset.set_index('Date_Index', inplace=True)

dataset.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3438 entries, 2021-07-29 to 2007-12-03
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Date    3438 non-null   datetime64[ns]
 1   Close   3438 non-null   float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 80.6 KB


#Create time attributes

Time attributes will change date from a continous variable into discrete (a numeric categorical value)

In [51]:
dataset['DOW'] = dataset['Date'].dt.dayofweek
dataset['DOY'] = dataset['Date'].dt.dayofyear
dataset['Week'] = dataset['Date'].dt.week
dataset['Month'] = dataset['Date'].dt.month
dataset['Quarter'] = dataset['Date'].dt.quarter

dataset.dtypes

  This is separate from the ipykernel package so we can avoid doing imports until


Date       datetime64[ns]
Close             float64
DOW                 int64
DOY                 int64
Week                int64
Month               int64
Quarter             int64
dtype: object

#Create skip-day gain loss values (and dependant variable #1)

!Note: gain-loss-0 will ultimately end up being the dependant variable but also an independant variable (we will create a new column later and shift it down a row)

1) Calculate the first day's gain loss by subtracting day -1 from day 0

2) Calculate the second day's gain loss by subtracting day -2 from day 0

3) Calculate the third day's gain loss by subtracting day -n from day 0

!Note: This should be turned into a loop using i=n where n = the rows to be processed (now many previous rows)




In [52]:
dataset['gain_loss-0'] = dataset['Close'].diff(-1)
dataset['gain_loss-1'] = dataset['Close'].diff(-2)
#dataset['gain_loss-1'] = dataset['gain_loss-1'].shift(periods=-1, fill_value=0) #Removed these to experiment 
#with switching around the dependant variable rather than the independant variable
dataset['gain_loss-2'] = dataset['Close'].diff(-3) 
#dataset['gain_loss-2'] = dataset['gain_loss-2'].shift(periods=-1, fill_value=0)
dataset['gain_loss-3'] = dataset['Close'].diff(-4) 
#dataset['gain_loss-3'] = dataset['gain_loss-3'].shift(periods=-1, fill_value=0)
dataset['gain_loss-4'] = dataset['Close'].diff(-5) 
#dataset['gain_loss-4'] = dataset['gain_loss-4'].shift(periods=-1, fill_value=0)

print(dataset)
#dataset.dtypes

                 Date   Close  DOW  ...  gain_loss-2  gain_loss-3  gain_loss-4
Date_Index                          ...                                       
2021-07-29 2021-07-29  440.65    3  ...        -0.37         0.71         5.19
2021-07-28 2021-07-28  438.83    2  ...        -1.11         3.37         4.28
2021-07-27 2021-07-27  439.01    1  ...         3.55         4.46         7.95
2021-07-26 2021-07-26  441.02    0  ...         6.47         9.96        16.05
2021-07-23 2021-07-23  439.94    4  ...         8.88        14.97         8.60
...               ...     ...  ...  ...          ...          ...          ...
2007-12-07 2007-12-07  150.91    4  ...         4.55         3.23          NaN
2007-12-06 2007-12-06  150.94    3  ...         3.26          NaN          NaN
2007-12-05 2007-12-05  148.81    2  ...          NaN          NaN          NaN
2007-12-04 2007-12-04  146.36    1  ...          NaN          NaN          NaN
2007-12-03 2007-12-03  147.68    0  ...          NaN

#Create binary version of skip-day gain loss values (and dependant variable #2)

!Note: gain-loss-0b will ultimately end up being the dependant variable but also an independant variable (we will create a new column later and shift it down a row)

This process changes all gain loss continuous variables into a binary-descrete (dichotomous) variables

!Note - This process should be converted into the previous process when that process is converted into a loop

In [53]:
dataset['gain_loss-0b'] = np.where(dataset['gain_loss-0'] > 0, 1, 0)
dataset['gain_loss-1b'] = np.where(dataset['gain_loss-1'] > 0, 1, 0)
dataset['gain_loss-2b'] = np.where(dataset['gain_loss-2'] > 0, 1, 0)
dataset['gain_loss-3b'] = np.where(dataset['gain_loss-3'] > 0, 1, 0)
dataset['gain_loss-4b'] = np.where(dataset['gain_loss-4'] > 0, 1, 0)

dataset.dtypes

Date            datetime64[ns]
Close                  float64
DOW                      int64
DOY                      int64
Week                     int64
Month                    int64
Quarter                  int64
gain_loss-0            float64
gain_loss-1            float64
gain_loss-2            float64
gain_loss-3            float64
gain_loss-4            float64
gain_loss-0b             int64
gain_loss-1b             int64
gain_loss-2b             int64
gain_loss-3b             int64
gain_loss-4b             int64
dtype: object

#Aggregate the binary skip-day gain loss values

This creates a true categorical value from the binary descrete values.

The theory is that, having binary values for each period (sparce matrix) and an aggregate (categorical), the values will work together to increase the value of this data

!Note = This process should be indluded in the loop mentioned in notes from the above process (future modifications to the data pre-processing procedures)

In [55]:
dataset['gain_loss-total_b'] = dataset['gain_loss-0b'] + dataset['gain_loss-1b'] + dataset['gain_loss-2b'] + dataset['gain_loss-3b'] + dataset['gain_loss-4b']

dataset.head(-1)

Unnamed: 0_level_0,Date,Close,DOW,DOY,Week,Month,Quarter,gain_loss-0,gain_loss-1,gain_loss-2,gain_loss-3,gain_loss-4,gain_loss-0b,gain_loss-1b,gain_loss-2b,gain_loss-3b,gain_loss-4b,gain_loss-total_b
Date_Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2021-07-29,2021-07-29,440.65,3,210,30,7,3,1.82,1.64,-0.37,0.71,5.19,1,1,0,1,1,4
2021-07-28,2021-07-28,438.83,2,209,30,7,3,-0.18,-2.19,-1.11,3.37,4.28,0,0,0,1,1,2
2021-07-27,2021-07-27,439.01,1,208,30,7,3,-2.01,-0.93,3.55,4.46,7.95,0,0,1,1,1,3
2021-07-26,2021-07-26,441.02,0,207,30,7,3,1.08,5.56,6.47,9.96,16.05,1,1,1,1,1,5
2021-07-23,2021-07-23,439.94,4,204,29,7,3,4.48,5.39,8.88,14.97,8.60,1,1,1,1,1,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2007-12-10,2007-12-10,152.08,0,344,50,12,4,1.17,1.14,3.27,5.72,4.40,1,1,1,1,1,5
2007-12-07,2007-12-07,150.91,4,341,49,12,4,-0.03,2.10,4.55,3.23,,0,1,1,1,0,3
2007-12-06,2007-12-06,150.94,3,340,49,12,4,2.13,4.58,3.26,,,1,1,1,0,0,3
2007-12-05,2007-12-05,148.81,2,339,49,12,4,2.45,1.13,,,,1,1,0,0,0,2


#Create daily gain loss and denormalize values

1) Calculate the first day's gain loss by subtracting day -1 from day 0

2) Calculate the second day's gain loss by subtracting day -2 from day -1

3) Calculate the third day's gain loss by subtracting day -n from day -n+1

This process creates a new column and removes the top rows in accordance with the desired "lookback" period - shift over 1 and lift by 1, shift over 2 and lift by 2, shift over n and lift by n

!Note: This should be turned into a loop using i=n where n = the rows to be processed (now many previous rows)


In [56]:
dataset['prior_day-0'] = dataset['gain_loss-0']
#dataset['prior_day-1'] = dataset['prior_day-1'].shift(periods=-1, fill_value=0)#Removed this to experiment 
#with switching around the dependant variable rather than the independant variable
dataset['prior_day-1'] = dataset['gain_loss-0']
dataset['prior_day-1'] = dataset['prior_day-1'].shift(periods=-1, fill_value=0)
dataset['prior_day-2'] = dataset['gain_loss-0']
dataset['prior_day-2'] = dataset['prior_day-2'].shift(periods=-2, fill_value=0)
dataset['prior_day-3'] = dataset['gain_loss-0']
dataset['prior_day-3'] = dataset['prior_day-3'].shift(periods=-3, fill_value=0)
dataset['prior_day-4'] = dataset['gain_loss-0']
dataset['prior_day-4'] = dataset['prior_day-4'].shift(periods=-4, fill_value=0)
dataset.head()

Unnamed: 0_level_0,Date,Close,DOW,DOY,Week,Month,Quarter,gain_loss-0,gain_loss-1,gain_loss-2,gain_loss-3,gain_loss-4,gain_loss-0b,gain_loss-1b,gain_loss-2b,gain_loss-3b,gain_loss-4b,gain_loss-total_b,prior_day-0,prior_day-1,prior_day-2,prior_day-3,prior_day-4
Date_Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
2021-07-29,2021-07-29,440.65,3,210,30,7,3,1.82,1.64,-0.37,0.71,5.19,1,1,0,1,1,4,1.82,-0.18,-2.01,1.08,4.48
2021-07-28,2021-07-28,438.83,2,209,30,7,3,-0.18,-2.19,-1.11,3.37,4.28,0,0,0,1,1,2,-0.18,-2.01,1.08,4.48,0.91
2021-07-27,2021-07-27,439.01,1,208,30,7,3,-2.01,-0.93,3.55,4.46,7.95,0,0,1,1,1,3,-2.01,1.08,4.48,0.91,3.49
2021-07-26,2021-07-26,441.02,0,207,30,7,3,1.08,5.56,6.47,9.96,16.05,1,1,1,1,1,5,1.08,4.48,0.91,3.49,6.09
2021-07-23,2021-07-23,439.94,4,204,29,7,3,4.48,5.39,8.88,14.97,8.6,1,1,1,1,1,5,4.48,0.91,3.49,6.09,-6.37


#Create binary version of daily gain loss values

This process changes all gain loss continuous variables into a binary-descrete (dichotomous) variables

!Note - This process should be converted into the previous process when that process is converted into a loop

In [57]:
dataset['prior_day-0b'] = np.where(dataset['prior_day-0'] > 0, 1, 0)
dataset['prior_day-1b'] = np.where(dataset['prior_day-1'] > 0, 1, 0)
dataset['prior_day-2b'] = np.where(dataset['prior_day-2'] > 0, 1, 0)
dataset['prior_day-3b'] = np.where(dataset['prior_day-3'] > 0, 1, 0)
dataset['prior_day-4b'] = np.where(dataset['prior_day-4'] > 0, 1, 0)
dataset.head()

Unnamed: 0_level_0,Date,Close,DOW,DOY,Week,Month,Quarter,gain_loss-0,gain_loss-1,gain_loss-2,gain_loss-3,gain_loss-4,gain_loss-0b,gain_loss-1b,gain_loss-2b,gain_loss-3b,gain_loss-4b,gain_loss-total_b,prior_day-0,prior_day-1,prior_day-2,prior_day-3,prior_day-4,prior_day-0b,prior_day-1b,prior_day-2b,prior_day-3b,prior_day-4b
Date_Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
2021-07-29,2021-07-29,440.65,3,210,30,7,3,1.82,1.64,-0.37,0.71,5.19,1,1,0,1,1,4,1.82,-0.18,-2.01,1.08,4.48,1,0,0,1,1
2021-07-28,2021-07-28,438.83,2,209,30,7,3,-0.18,-2.19,-1.11,3.37,4.28,0,0,0,1,1,2,-0.18,-2.01,1.08,4.48,0.91,0,0,1,1,1
2021-07-27,2021-07-27,439.01,1,208,30,7,3,-2.01,-0.93,3.55,4.46,7.95,0,0,1,1,1,3,-2.01,1.08,4.48,0.91,3.49,0,1,1,1,1
2021-07-26,2021-07-26,441.02,0,207,30,7,3,1.08,5.56,6.47,9.96,16.05,1,1,1,1,1,5,1.08,4.48,0.91,3.49,6.09,1,1,1,1,1
2021-07-23,2021-07-23,439.94,4,204,29,7,3,4.48,5.39,8.88,14.97,8.6,1,1,1,1,1,5,4.48,0.91,3.49,6.09,-6.37,1,1,1,1,0


#Aggregate the binary daily gain loss values

This creates a true categorical value from the binary descrete values.

The theory is that, having binary values for each period (sparce matrix) and an aggregate (categorical), the values will work together to increase the value of this data

!Note = This process should be indluded in the loop mentioned in notes from the above process (future modifications to the data pre-processing procedures)

In [60]:
dataset['prior_day-total_b'] = dataset['prior_day-0b'] + dataset['prior_day-1b'] + dataset['prior_day-2b'] + dataset['prior_day-3b'] + dataset['prior_day-4b'] 
dataset.head(5)

Unnamed: 0_level_0,Date,Close,DOW,DOY,Week,Month,Quarter,gain_loss-0,gain_loss-1,gain_loss-2,gain_loss-3,gain_loss-4,gain_loss-0b,gain_loss-1b,gain_loss-2b,gain_loss-3b,gain_loss-4b,gain_loss-total_b,prior_day-0,prior_day-1,prior_day-2,prior_day-3,prior_day-4,prior_day-0b,prior_day-1b,prior_day-2b,prior_day-3b,prior_day-4b,prior_day-total_b
Date_Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1
2021-07-29,2021-07-29,440.65,3,210,30,7,3,1.82,1.64,-0.37,0.71,5.19,1,1,0,1,1,4,1.82,-0.18,-2.01,1.08,4.48,1,0,0,1,1,3
2021-07-28,2021-07-28,438.83,2,209,30,7,3,-0.18,-2.19,-1.11,3.37,4.28,0,0,0,1,1,2,-0.18,-2.01,1.08,4.48,0.91,0,0,1,1,1,3
2021-07-27,2021-07-27,439.01,1,208,30,7,3,-2.01,-0.93,3.55,4.46,7.95,0,0,1,1,1,3,-2.01,1.08,4.48,0.91,3.49,0,1,1,1,1,4
2021-07-26,2021-07-26,441.02,0,207,30,7,3,1.08,5.56,6.47,9.96,16.05,1,1,1,1,1,5,1.08,4.48,0.91,3.49,6.09,1,1,1,1,1,5
2021-07-23,2021-07-23,439.94,4,204,29,7,3,4.48,5.39,8.88,14.97,8.6,1,1,1,1,1,5,4.48,0.91,3.49,6.09,-6.37,1,1,1,1,0,4


#Creating Rolling mean attribute values

Rolling mean values are based on daily gain loss and represent the trending direction of the prior n mean values (5, 10, 15, n, row mean values)

the rolling mean works from the top row down - for exampple the mean of row 1 and 2 would appear on row 2. We need the mean of row 1 and 2 to land on row 1. This requires us to reverse the index of each desired mean column. The process to do this creates pandas value lists

!Note: this process can convert into a loop  where n = list of n mean values (as described in the description above)

In [43]:
#Rolling averages based on prior day gain loss
rolling_prior_day = dataset['prior_day-0']

rolling_prior_day_5 = rolling_prior_day[::-1].rolling(5).mean()[::-1]
rolling_prior_day_10 = rolling_prior_day[::-1].rolling(10).mean()[::-1]
rolling_prior_day_15 = rolling_prior_day[::-1].rolling(15).mean()[::-1]
rolling_prior_day_20 = rolling_prior_day[::-1].rolling(20).mean()[::-1]
rolling_prior_day_25 = rolling_prior_day[::-1].rolling(25).mean()[::-1]
rolling_prior_day_30 = rolling_prior_day[::-1].rolling(30).mean()[::-1]

print(rolling_prior_day_10)

0       0.590
1       0.259
2       0.342
3       0.394
4       0.442
        ...  
3433      NaN
3434      NaN
3435      NaN
3436      NaN
3437      NaN
Name: prior_day-0, Length: 3438, dtype: float64


In [44]:
dataset = pd.concat([dataset,
           rolling_prior_day_5, 
           rolling_prior_day_10, 
           rolling_prior_day_15, 
           rolling_prior_day_20, 
           rolling_prior_day_25, 
           rolling_prior_day_30], 
          axis = 1)

dataset.head()

Unnamed: 0,Date,Close,DOW,DOY,Week,Month,Quarter,gain_loss-0,gain_loss-1,gain_loss-2,gain_loss-3,gain_loss-4,gain_loss-0b,gain_loss-1b,gain_loss-2b,gain_loss-3b,gain_loss-4b,gain_loss-total,gain_loss-total_b,prior_day-1,prior_day-2,prior_day-3,prior_day-4,prior_day-5,prior_day-1b,prior_day-2b,prior_day-3b,prior_day-4b,prior_day-5b,prior_day-total_b,prior_day-0,prior_day-0.1,prior_day-0.2,prior_day-0.3,prior_day-0.4,prior_day-0.5,prior_day-0.6
0,2021-07-29,440.65,3,210,30,7,3,1.82,1.64,-0.37,0.71,5.19,1,1,0,1,1,4,4,-0.18,-2.01,1.08,4.48,4.48,1,0,0,1,1,3,1.82,1.038,0.59,0.648667,0.6295,0.722,0.618
1,2021-07-28,438.83,2,209,30,7,3,-0.18,-2.19,-1.11,3.37,4.28,0,0,0,1,1,2,2,-2.01,1.08,4.48,0.91,0.91,0,0,1,1,1,3,-0.18,0.856,0.259,0.291333,0.5565,0.6288,0.478333
2,2021-07-27,439.01,1,208,30,7,3,-2.01,-0.93,3.55,4.46,7.95,0,0,1,1,1,3,3,1.08,4.48,0.91,3.49,3.49,0,1,1,1,1,4,-2.01,1.59,0.342,0.405333,0.577,0.726,0.458333
3,2021-07-26,441.02,0,207,30,7,3,1.08,5.56,6.47,9.96,16.05,1,1,1,1,1,5,5,4.48,0.91,3.49,6.09,6.09,1,1,1,1,1,5,1.08,3.21,0.394,0.486667,0.7205,1.044,0.557
4,2021-07-23,439.94,4,204,29,7,3,4.48,5.39,8.88,14.97,8.6,1,1,1,1,1,5,5,0.91,3.49,6.09,-6.37,-6.37,1,1,1,1,0,4,4.48,1.72,0.442,0.634,0.742,0.7188,0.544333


In [45]:
len(dataset.columns)

37

In [None]:
np.argwhere(np.isnan(X))

array([], shape=(0, 2), dtype=int64)

In [None]:
#split text and train datasets
testset = dataset.loc[(dataset['Date'] >= '12/01/2020')]
dataset = dataset.loc[(dataset['Date'] <= '12/31/2020')]
type(testset)

pandas.core.frame.DataFrame

In [None]:
X = dataset
X.drop(X.tail(31).index, inplace = True)
X = X.drop(['Date', 'Close', 'gain_loss-0'], axis = 1).values
y = dataset['gain_loss-0'].values
#y = y[:-31]
print(X)
print("-------------------------")
print(y)

[[ 3.00000000e+00  3.66000000e+02  5.30000000e+01 ...  2.98500000e-01
   5.81200000e-01  3.14000000e-01]
 [ 2.00000000e+00  3.65000000e+02  5.30000000e+01 ...  4.70000000e-01
   6.45200000e-01  4.45333333e-01]
 [ 1.00000000e+00  3.64000000e+02  5.30000000e+01 ...  4.25000000e-01
   5.75600000e-01  6.32000000e-01]
 ...
 [ 1.00000000e+00  2.20000000e+01  4.00000000e+00 ... -6.91000000e-01
  -6.92400000e-01 -5.58333333e-01]
 [ 4.00000000e+00  1.80000000e+01  3.00000000e+00 ... -6.22500000e-01
  -5.79200000e-01 -4.31000000e-01]
 [ 3.00000000e+00  1.70000000e+01  3.00000000e+00 ... -4.04500000e-01
  -6.04000000e-01 -3.56666667e-01]]
-------------------------
[ 1.89  0.53 -0.71 ... -1.34 -1.37 -3.55]


In [None]:
len(X)

3263

In [None]:
len(y)

3263

In [None]:
from sklearn.ensemble import RandomForestRegressor

regressor = RandomForestRegressor(n_estimators = 100, random_state = 0)

regressor.fit(X, y)


RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=0, verbose=0, warm_start=False)

In [None]:
#Prepare test set
Xtest = testset
Xtest.drop(Xtest.tail(31).index, inplace = True)
Xtest = Xtest.drop(['Date', 'Close', 'gain_loss-0'], axis = 1).values
ytest = testset['gain_loss-0'].values
print(Xtest)
print("-------------------------")
print(ytest)

[[  3.         210.          30.         ...   0.5565       0.6288
    0.47833333]
 [  2.         209.          30.         ...   0.577        0.726
    0.45833333]
 [  1.         208.          30.         ...   0.7205       1.044
    0.557     ]
 ...
 [  2.          20.           3.         ...   0.3205       0.4768
    0.39866667]
 [  1.          19.           3.         ...   0.2765       0.354
    0.297     ]
 [  4.          15.           2.         ...   0.4435       0.3316
    0.41466667]]
-------------------------
[ 1.82 -0.18 -2.01  1.08  4.48  0.91  3.49  6.09 -6.37 -3.41 -1.49  0.65
 -1.49  1.56  4.6  -3.54  1.53 -0.79  3.29  2.37  0.36  0.23  0.86  1.51
  2.5  -0.51  2.25  5.94 -7.05 -0.14 -2.37 -0.78  0.95  0.7   1.96 -0.63
  0.09 -0.41  3.83 -1.56  0.66 -0.37  0.75  0.22  0.83 -0.93  4.23 -0.34
  4.42 -1.08 -3.58 -1.06  6.3   4.87 -8.8  -3.73 -4.18  3.05  3.32  0.13
 -2.58  0.9  -2.76  2.66 -0.12 -0.09  0.87  4.47 -3.8   3.9  -3.04 -2.05
  1.39  4.42 -1.41  1.22  0.15  2.9

In [None]:
y_pred = regressor.predict(Xtest)

np_array = np.concatenate((y_pred.reshape(len(y_pred),1), ytest.reshape(len(ytest), 1)), axis = 1)

results = pd.DataFrame(np_array, columns = ['pred','actual'])

results['pred_b'] = np.where(results['pred'] > 0, 1, 0)
results['actual_b'] = np.where(results['actual'] > 0, 1, 0)
results['total_b'] = results['pred_b'] + results['actual_b']

print(results)



         pred  actual  pred_b  actual_b  total_b
0    1.771760    1.82       1         1        2
1   -1.725320   -0.18       0         0        0
2   -2.226368   -2.01       0         0        0
3    3.079408    1.08       1         1        2
4    2.137418    4.48       1         1        2
..        ...     ...     ...       ...      ...
130 -2.213533   -1.36       0         0        0
131  1.898500    0.35       1         1        2
132  1.703442    5.24       1         1        2
133  1.518966    2.95       1         1        2
134 -1.763628   -2.76       0         0        0

[135 rows x 5 columns]


In [None]:
results.to_csv('/content/drive/MyDrive/Colab Notebooks/results.csv')