In [1]:
import yfinance as yf
import pandas as pd
import datetime as dt
import time
from datetime import timedelta
import pandas_ta as ta

from sklearn.metrics import precision_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

from functools import reduce

from sklearn.model_selection import GridSearchCV

In [2]:
def get_data(ticker, year,month,day):
    start = dt.datetime(1999,1,1)
    end = dt.datetime(2023,8,29)
     

    df = yf.download(ticker,start,end)
    
    return df

In [3]:
# loop through dataframe and add all features to a list
# removes 'Tomorrow' and 'Target' column from the list 
def get_features(dataframe):
    features = []
    for column in dataframe:
        features.append(column)
    
    features.pop(-1)
    features.pop(-1)
    
    return features

In [4]:
def fit_train_score(df, n_est, min_split):    
    features = get_features(df)
    
    # n_estimators = number of decision trees
    # min_samples_split = higher it is set, the less accurtate it is, the less it will overfit
    model = RandomForestClassifier(n_estimators=n_est, min_samples_split=min_split, random_state=1)

    train = df.iloc[:-1000]
    test = df.iloc[-1000:]

    # fit and train model
    model.fit(train[features], train['Target'])
    predictions = model.predict(test[features])
    #predictions_series = pd.Series(predictions, index=test.index)

    # precision of training data
    predictions_training = model.predict(train[features])
    
    test_dict = {
        'n_estimators':n_est,
        'min_samples_split':min_split,
        'training precision':precision_score(train['Target'], predictions_training),
        'testing precision':precision_score(test['Target'], predictions)
                }
    
    return test_dict

In [5]:
# same as fit_train_score but it excepts arguments for max_depth
def fit_train_score_with_depth(df, n_est, min_split, max_depth):
    features = get_features(df)
    
    # n_estimators = number of decision trees
    # min_samples_split = higher it is set, the less accurtate it is, the less it will overfit
    model = RandomForestClassifier(n_estimators=n_est, min_samples_split=min_split, random_state=1)

    train = df.iloc[:-1000]
    test = df.iloc[-1000:]

    # fit and train model
    model.fit(train[features], train['Target'])
    predictions = model.predict(test[features])
    #predictions_series = pd.Series(predictions, index=test.index)

    # precision of training data
    predictions_training = model.predict(train[features])
    
    test_dict = {
        'n_estimators':n_est,
        'min_samples_split':min_split,
        'max_depth': max_depth,
        'training precision':precision_score(train['Target'], predictions_training),
        'testing precision':precision_score(test['Target'], predictions)
                }
    
    return test_dict

In [6]:
def predict(train, test, features, model):
    model.fit(train[features], train["Target"])
    preds = model.predict(test[features])
    preds = pd.Series(preds, index=test.index, name="Predictions")
    combined = pd.concat([test["Target"], preds], axis=1)
    return combined

In [7]:
def backtest(df, model, features, start=2500, step=250):
    all_predictions = []

    for i in range(start, df.shape[0]-step, step):
        train = df.iloc[0:i].copy()
        test = df.iloc[i:(i+step)].copy()
        predictions = predict(train, test, features, model)
        
        print(df.index[i])
        print(precision_score(predictions["Target"], predictions["Predictions"]))
        
        all_predictions.append(predictions)
    
    return pd.concat(all_predictions)

In [8]:
data = get_data('SPY',1999,1,1)

[*********************100%***********************]  1 of 1 completed


In [9]:
df = data.copy()
df

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1999-01-04,123.375000,125.218750,121.718750,123.031250,79.169930,9450400
1999-01-05,122.937500,124.875000,122.937500,124.437500,80.074799,8031000
1999-01-06,125.812500,127.750000,125.750000,127.437500,82.005302,7737700
1999-01-07,126.375000,127.218750,125.781250,126.812500,81.603149,5504900
1999-01-08,128.187500,128.500000,125.968750,127.750000,82.206413,6224400
...,...,...,...,...,...,...
2023-08-22,441.179993,441.179993,437.570007,438.149994,438.149994,65062900
2023-08-23,439.250000,443.670013,439.100006,443.029999,443.029999,68441000
2023-08-24,444.690002,445.220001,436.859985,436.890015,436.890015,88517300
2023-08-25,438.679993,441.299988,435.000000,439.970001,439.970001,102325100


In [10]:
# create new features for SPY
df['Range'] = abs(df['High']-df['Low'])
df['ATR'] = ta.atr(df['High'], df['Low'], df['Close'] )
df['Up'] = (df['Close'] > df['Open']).astype(int)
df['Percent Change'] = (abs(df['Close'] - df['Open']) / df['Open']) *100
df['MOM 5'] = ta.mom(df['Close'],5)
df['MOM 20'] = ta.mom(df['Close'],20)
df['RSI 14'] = ta.rsi(df['Close'], 14)
df['Relative Volatility'] = df['Range'] / df['ATR']

#del df['Range']
#del df['ATR']
del df['Adj Close']

In [11]:
df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Range,ATR,Up,Percent Change,MOM 5,MOM 20,RSI 14,Relative Volatility
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1999-01-04,123.375000,125.218750,121.718750,123.031250,9450400,3.500000,,0,0.278622,,,,
1999-01-05,122.937500,124.875000,122.937500,124.437500,8031000,1.937500,,1,1.220132,,,,
1999-01-06,125.812500,127.750000,125.750000,127.437500,7737700,2.000000,,1,1.291605,,,,
1999-01-07,126.375000,127.218750,125.781250,126.812500,5504900,1.437500,,1,0.346192,,,,
1999-01-08,128.187500,128.500000,125.968750,127.750000,6224400,2.531250,,0,0.341297,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-08-22,441.179993,441.179993,437.570007,438.149994,65062900,3.609985,4.472321,0,0.686794,-4.740021,-17.290009,39.350095,0.807184
2023-08-23,439.250000,443.670013,439.100006,443.029999,68441000,4.570007,4.547157,1,0.860557,3.389984,-12.480011,47.987997,1.005025
2023-08-24,444.690002,445.220001,436.859985,436.890015,88517300,8.360016,4.819504,0,1.754028,0.600006,-15.599976,40.225352,1.734622
2023-08-25,438.679993,441.299988,435.000000,439.970001,102325100,6.299988,4.925253,1,0.294066,3.470001,-16.950012,45.029060,1.279120


In [12]:
# get VIX data
vix_data = get_data('^VIX',1999,1,1)
vix_df = vix_data.copy()

[*********************100%***********************]  1 of 1 completed


In [13]:
vix_df

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1999-01-04,25.379999,26.959999,24.740000,26.170000,26.170000,0
1999-01-05,25.920000,25.980000,24.360001,24.459999,24.459999,0
1999-01-06,23.360001,23.379999,22.680000,23.340000,23.340000,0
1999-01-07,24.420000,24.900000,24.040001,24.370001,24.370001,0
1999-01-08,22.950001,24.080000,22.809999,23.280001,23.280001,0
...,...,...,...,...,...,...
2023-08-22,16.959999,17.580000,16.610001,16.969999,16.969999,0
2023-08-23,16.639999,17.100000,15.910000,15.980000,15.980000,0
2023-08-24,15.570000,17.320000,15.480000,17.200001,17.200001,0
2023-08-25,17.209999,17.360001,15.450000,15.680000,15.680000,0


In [14]:
# add features to VIX and clean VIX dataframe
vix_df['VIX Range'] = abs(vix_df['High']-vix_df['Low'])
vix_df['VIX ATR'] = ta.atr(vix_df['High'], vix_df['Low'], vix_df['Close'])
vix_df['Vix Up'] = (vix_df['Close'] > vix_df['Open']).astype(int)
vix_df['VIX Percent Change'] = (abs(vix_df['Close'] - vix_df['Open']) / vix_df['Open']) *100
vix_df['VIX MOM 5'] = ta.mom(vix_df['Close'],5)
vix_df['VIX MOM 20'] = ta.mom(vix_df['Close'],20)
vix_df['Oil RSI 14'] = ta.rsi(vix_df['Close'], 14)
vix_df['VIX Relative Volatility'] = vix_df['VIX Range'] / vix_df['VIX ATR']

vix_df.rename(columns={
    'High':'VIX High',
    'Low': 'VIX Low',
    'Open':'VIX Open',
    'Close':'VIX Close',
    'Volume':'VIX Volume'
}, inplace=True)

del vix_df['VIX Volume']
del vix_df['Adj Close']
vix_df

Unnamed: 0_level_0,VIX Open,VIX High,VIX Low,VIX Close,VIX Range,VIX ATR,Vix Up,VIX Percent Change,VIX MOM 5,VIX MOM 20,Oil RSI 14,VIX Relative Volatility
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1999-01-04,25.379999,26.959999,24.740000,26.170000,2.219999,,1,3.112691,,,,
1999-01-05,25.920000,25.980000,24.360001,24.459999,1.619999,,0,5.632720,,,,
1999-01-06,23.360001,23.379999,22.680000,23.340000,0.699999,,0,0.085618,,,,
1999-01-07,24.420000,24.900000,24.040001,24.370001,0.859999,,0,0.204747,,,,
1999-01-08,22.950001,24.080000,22.809999,23.280001,1.270000,,1,1.437908,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
2023-08-22,16.959999,17.580000,16.610001,16.969999,0.969999,1.415371,1,0.058964,0.510000,3.110000,58.716554,0.685332
2023-08-23,16.639999,17.100000,15.910000,15.980000,1.190001,1.399273,0,3.966345,-0.800001,2.790000,51.887792,0.850442
2023-08-24,15.570000,17.320000,15.480000,17.200001,1.840000,1.430753,1,10.468857,-0.689999,2.790001,58.320756,1.286036
2023-08-25,17.209999,17.360001,15.450000,15.680000,1.910001,1.464985,0,8.890174,-1.619999,2.350000,49.449487,1.303768


In [15]:
oil_data = get_data('CL=F',1999,1,1)
oil_df = oil_data.copy()

[*********************100%***********************]  1 of 1 completed


In [16]:
oil_df['Oil Range'] = abs(oil_df['High']-oil_df['Low'])
oil_df['Oil ATR'] = ta.atr(oil_df['High'], oil_df['Low'], oil_df['Close'] )
oil_df['Oil Up'] = (oil_df['Close'] > oil_df['Open']).astype(int)
oil_df['Oil Percent Change'] = (abs(oil_df['Close'] - oil_df['Open']) / oil_df['Open']) *100
oil_df['Oil MOM 5'] = ta.mom(oil_df['Close'],5)
oil_df['Oil MOM 20'] = ta.mom(oil_df['Close'],20)
oil_df['Oil RSI 14'] = ta.rsi(oil_df['Close'], 14)
oil_df['Oil Relative Volatility'] = oil_df['Oil Range'] / oil_df['Oil ATR']

oil_df.rename(columns={
    'High':'Oil High',
    'Low': 'Oil Low',
    'Open':'Oil Open',
    'Close':'Oil Close',
    'Volume':'Oil Volume'
},inplace=True)


del oil_df['Oil Volume']
del oil_df['Adj Close']

In [17]:
gold_data = get_data('GC=F',1999,1,1)
gold_df = gold_data.copy()

[*********************100%***********************]  1 of 1 completed


In [18]:
gold_df['Gold Range'] = abs(gold_df['High']-gold_df['Low'])
gold_df['Gold ATR'] = ta.atr(gold_df['High'], gold_df['Low'], gold_df['Close'] )
gold_df['Gold Up'] = (gold_df['Close'] > gold_df['Open']).astype(int)
gold_df['Gold Percent Change'] = (abs(gold_df['Close'] - gold_df['Open']) / gold_df['Open']) *100
gold_df['Gold MOM 5'] = ta.mom(gold_df['Close'],5)
gold_df['Gold MOM 20'] = ta.mom(gold_df['Close'],20)
gold_df['Gold RSI 14'] = ta.rsi(gold_df['Close'], 14)
gold_df['Gold Relative Volatility'] = gold_df['Gold Range'] / gold_df['Gold ATR']

gold_df.rename(columns={
    'High':'Gold High',
    'Low': 'Gold Low',
    'Open':'Gold Open',
    'Close':'Gold Close',
    'Volume':'Gold Volume'
},inplace=True)

del gold_df['Gold Volume']
del gold_df['Adj Close']

In [193]:
# foreign market data
UK_df = get_data('^FTSE',1999,1,1)
China_df = get_data('000001.SS',1999,1,1)
Germany_df = get_data('^GDAXI',1999,1,1)
Japan_df = get_data('^N225',1999,1,1)


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


In [194]:
# UK market data
# add features
UK_df['UK Range'] = abs(UK_df['High']-UK_df['Low'])
UK_df['UK ATR'] = ta.atr(UK_df['High'], UK_df['Low'], UK_df['Close'] )
UK_df['UK Up'] = (UK_df['Close'] > UK_df['Open']).astype(int)
UK_df['UK Percent Change'] = (abs(UK_df['Close'] - UK_df['Open']) / UK_df['Open']) *100
UK_df['UK MOM 5'] = ta.mom(UK_df['Close'],5)
UK_df['UK MOM 20'] = ta.mom(UK_df['Close'],20)
UK_df['UK RSI 14'] = ta.rsi(UK_df['Close'], 14)
UK_df['UK Relative Volatility'] = UK_df['UK Range'] / UK_df['UK ATR']

UK_df.rename(columns={
    'Open':'UK Open',
    'High':'UK High',
    'Low':'UK Low',
    'Close':'UK Close'
},inplace=True)


del UK_df['Volume']
del UK_df['Adj Close']

In [195]:
# China market data
# add features
China_df['China Range'] = abs(China_df['High']-China_df['Low'])
China_df['China ATR'] = ta.atr(China_df['High'], China_df['Low'], China_df['Close'] )
China_df['China Up'] = (China_df['Close'] > China_df['Open']).astype(int)
China_df['China Percent Change'] = (abs(China_df['Close'] - China_df['Open']) / China_df['Open']) *100
China_df['China MOM 5'] = ta.mom(China_df['Close'],5)
China_df['China MOM 20'] = ta.mom(China_df['Close'],20)
China_df['China RSI 14'] = ta.rsi(China_df['Close'], 14)
China_df['China Relative Volatility'] = China_df['China Range'] / China_df['China ATR']

China_df.rename(columns={
    'Open':'China Open',
    'High':'China High',
    'Low':'China Low',
    'Close':'China Close'
},inplace=True)

    
del China_df['Volume']
del China_df['Adj Close']

In [196]:
# add Germany market data
# add features
Germany_df['Germany Range'] = abs(Germany_df['High']-Germany_df['Low'])
Germany_df['Germany ATR'] = ta.atr(Germany_df['High'], Germany_df['Low'], Germany_df['Close'] )
Germany_df['Germany Up'] = (Germany_df['Close'] > Germany_df['Open']).astype(int)
Germany_df['Germany Percent Change'] = (abs(Germany_df['Close'] - Germany_df['Open']) / Germany_df['Open']) *100
Germany_df['Germany MOM 5'] = ta.mom(Germany_df['Close'],5)
Germany_df['Germany MOM 20'] = ta.mom(Germany_df['Close'],20)
Germany_df['Germany RSI 14'] = ta.rsi(Germany_df['Close'], 14)
Germany_df['Germany Relative Volatility'] = Germany_df['Germany Range'] / Germany_df['Germany ATR']
                                                                               
Germany_df.rename(columns={
    'Open':'Germany Open',
    'High':'Germany High',
    'Low':'Germany Low',
    'Close':'Germany Close'
},inplace=True)
                                                                               
del Germany_df['Volume']
del Germany_df['Adj Close']

In [197]:
# add Japan market data
# add features
Japan_df['Japan Range'] = abs(Japan_df['High']-Japan_df['Low'])
Japan_df['Japan ATR'] = ta.atr(Japan_df['High'], Japan_df['Low'], Japan_df['Close'] )
Japan_df['Japan Up'] = (Japan_df['Close'] > Japan_df['Open']).astype(int)
Japan_df['Japan Percent Change'] = (abs(Japan_df['Close'] - Japan_df['Open']) / Japan_df['Open']) *100
Japan_df['Japan MOM 5'] = ta.mom(Japan_df['Close'],5)
Japan_df['Japan MOM 20'] = ta.mom(Japan_df['Close'],20)
Japan_df['Japan RSI 14'] = ta.rsi(Japan_df['Close'], 14)
Japan_df['Japan Relative Volatility'] = Japan_df['Japan Range'] / Japan_df['Japan ATR']

Japan_df.rename(columns={
    'Open':'Japan Open',
    'High':'Japan High',
    'Low':'Japan Low',
    'Close':'Japan Close'
},inplace=True)

del Japan_df['Volume']
del Japan_df['Adj Close']
                                                                               

In [198]:

merge_oil = df.merge(oil_df, how='inner', on='Date')
merge_gold = merge_oil.merge(gold_df, how='inner', on='Date')
merge_vix = merge_gold.merge(vix_df, how='inner', on='Date')
merge_UK = merge_vix.merge(UK_df, how='inner', on='Date')
merge_China = merge_UK.merge(China_df, how='inner', on='Date')
merge_Germany = merge_China.merge(Germany_df, how='inner', on='Date')
merge_Japan = merge_Germany.merge(Japan_df, how='inner', on='Date')
merge_Japan

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Range,ATR,Up,Percent Change,MOM 5,...,Japan Low,Japan Close,Japan Range,Japan ATR,Japan Up,Japan Percent Change,Japan MOM 5,Japan MOM 20,Japan RSI 14,Japan Relative Volatility
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-08-30,151.312500,151.500000,150.343750,150.343750,3964800,1.156250,1.661060,0,0.640231,-0.500000,...,16895.359375,16901.669922,236.000000,282.674885,0,1.340754,465.019531,695.479492,57.486844,0.834881
2000-08-31,151.062500,153.093750,150.906250,152.343750,4863100,2.187500,1.738841,1,0.848159,1.031250,...,16769.490234,16861.259766,287.048828,282.987310,0,0.335556,190.439453,1046.819336,56.534148,1.014352
2000-09-01,153.250000,153.593750,152.000000,152.500000,3191200,1.593750,1.728478,0,0.489396,1.250000,...,16700.359375,16739.779297,318.160156,285.499656,0,1.036118,-171.550781,1072.418945,53.655420,1.114398
2000-09-05,151.875000,152.203125,150.812500,151.281250,3470800,1.390625,1.725551,0,0.390947,-0.484375,...,16401.279297,16452.269531,311.050781,283.101754,0,1.352215,-689.480469,632.159180,47.149848,1.098724
2000-09-06,151.187500,151.953125,149.531250,149.562500,4322200,2.421875,1.775288,0,1.074824,-2.234375,...,16364.950195,16399.869141,166.860352,274.798797,0,0.206771,-501.800781,365.269531,46.039974,0.607209
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-08-21,437.549988,440.109985,435.320007,439.339996,68719000,4.789978,4.538655,1,0.409098,-8.769989,...,31409.859375,31565.640625,348.839844,446.848810,1,0.040538,-494.269531,-738.609375,39.335097,0.780666
2023-08-22,441.179993,441.179993,437.570007,438.149994,65062900,3.609985,4.472321,0,0.686794,-4.740021,...,31693.759766,31856.710938,212.339844,439.249537,1,0.201655,-382.179688,-844.228516,44.340707,0.483415
2023-08-23,439.250000,443.670013,439.100006,443.029999,68441000,4.570007,4.547157,1,0.860557,3.389984,...,31717.910156,32010.259766,321.689453,430.852388,1,0.921718,243.439453,-672.250000,46.832977,0.746635
2023-08-24,444.690002,445.220001,436.859985,436.890015,88517300,8.360016,4.819504,0,1.754028,0.600006,...,32063.140625,32287.210938,234.769531,420.623674,1,0.487672,661.210938,-381.128906,51.087210,0.558146


In [199]:
print(f'SPY length: {len(df)}')
print(f'merged length: {len(merge_Japan)}')

SPY length: 6203
merged length: 4992


In [200]:
merge_Japan.dropna(inplace=True)

In [201]:
print(f'SPY length: {len(df)}')
print(f'merged length: {len(merge_Japan)}')

SPY length: 6203
merged length: 4973


In [202]:
merged_df = merge_Japan

In [203]:
merged_df.columns

Index(['Open', 'High', 'Low', 'Close', 'Volume', 'Range', 'ATR', 'Up',
       'Percent Change', 'MOM 5', 'MOM 20', 'RSI 14', 'Relative Volatility',
       'Oil Open', 'Oil High', 'Oil Low', 'Oil Close', 'Oil Range', 'Oil ATR',
       'Oil Up', 'Oil Percent Change', 'Oil MOM 5', 'Oil MOM 20',
       'Oil RSI 14_x', 'Oil Relative Volatility', 'Gold Open', 'Gold High',
       'Gold Low', 'Gold Close', 'Gold Range', 'Gold ATR', 'Gold Up',
       'Gold Percent Change', 'Gold MOM 5', 'Gold MOM 20', 'Gold RSI 14',
       'Gold Relative Volatility', 'VIX Open', 'VIX High', 'VIX Low',
       'VIX Close', 'VIX Range', 'VIX ATR', 'Vix Up', 'VIX Percent Change',
       'VIX MOM 5', 'VIX MOM 20', 'Oil RSI 14_y', 'VIX Relative Volatility',
       'UK Open', 'UK High', 'UK Low', 'UK Close', 'UK Range', 'UK ATR',
       'UK Up', 'UK Percent Change', 'UK MOM 5', 'UK MOM 20', 'UK RSI 14',
       'UK Relative Volatility', 'China Open', 'China High', 'China Low',
       'China Close', 'China Range', 'Chin

In [204]:
merged_df.count()

Open                         4973
High                         4973
Low                          4973
Close                        4973
Volume                       4973
                             ... 
Japan Percent Change         4973
Japan MOM 5                  4973
Japan MOM 20                 4973
Japan RSI 14                 4973
Japan Relative Volatility    4973
Length: 97, dtype: int64

In [205]:
# create 'Tomorrow' column
# it's the 'Close' value from the previous day
# will be used to create a target
merged_df['Tomorrow'] = merged_df['Close'].shift(-1)
merged_df['Target'] = (merged_df['Tomorrow'] > merged_df['Close']).astype(int)

candlestick_df = pd.DataFrame({
    'Open':merged_df['Open'],
    'High': merged_df['High'],
    'Low': merged_df['Low'],
    'Close': merged_df['Close'],
    'Volume': merged_df['Volume']
})




In [206]:
merged_df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Range,ATR,Up,Percent Change,MOM 5,...,Japan Range,Japan ATR,Japan Up,Japan Percent Change,Japan MOM 5,Japan MOM 20,Japan RSI 14,Japan Relative Volatility,Tomorrow,Target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-09-28,143.187500,146.328125,142.890625,145.000000,7036400,3.437500,2.256395,1,1.265823,2.312500,...,262.479492,282.814378,0,0.101515,-684.089844,-1274.709961,37.028971,0.928098,143.625000,0
2000-09-29,145.468750,145.968750,143.625000,143.625000,9333600,2.343750,2.262635,0,1.267454,-1.656250,...,234.360352,281.978376,1,0.533397,-70.990234,-1114.000000,40.253733,0.831129,137.687500,0
2000-10-10,140.093750,141.250000,137.687500,137.687500,6104700,3.562500,2.503268,0,1.717600,-4.812500,...,165.759766,266.361446,0,0.817141,-74.790039,-673.831055,42.997313,0.622311,136.531250,0
2000-10-11,137.625000,138.625000,135.125000,136.531250,10346000,3.500000,2.574463,0,0.794732,-7.156250,...,370.440430,276.122040,0,1.782700,-398.519531,-617.330078,37.072666,1.341582,133.125000,0
2000-10-12,137.281250,137.593750,132.781250,133.125000,12336900,4.812500,2.734323,0,3.027544,-11.062500,...,187.580078,269.797614,1,0.379813,-598.440430,-489.590820,38.155558,0.695262,137.562500,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-08-21,437.549988,440.109985,435.320007,439.339996,68719000,4.789978,4.538655,1,0.409098,-8.769989,...,348.839844,446.848810,1,0.040538,-494.269531,-738.609375,39.335097,0.780666,438.149994,0
2023-08-22,441.179993,441.179993,437.570007,438.149994,65062900,3.609985,4.472321,0,0.686794,-4.740021,...,212.339844,439.249537,1,0.201655,-382.179688,-844.228516,44.340707,0.483415,443.029999,1
2023-08-23,439.250000,443.670013,439.100006,443.029999,68441000,4.570007,4.547157,1,0.860557,3.389984,...,321.689453,430.852388,1,0.921718,243.439453,-672.250000,46.832977,0.746635,436.890015,0
2023-08-24,444.690002,445.220001,436.859985,436.890015,88517300,8.360016,4.819504,0,1.754028,0.600006,...,234.769531,420.623674,1,0.487672,661.210938,-381.128906,51.087210,0.558146,439.970001,1


In [207]:
most_recent = merged_df[-1:].copy()
merged_df.dropna(inplace=True)

In [179]:
merged_df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Range,ATR,Up,Percent Change,MOM 5,...,Japan Range,Japan ATR,Japan Up,Japan Percent Change,Japan MOM 5,Japan MOM 20,Japan RSI 14,Japan Relative Volatility,Tomorrow,Target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-09-28,143.187500,146.328125,142.890625,145.000000,7036400,3.437500,2.256395,1,1.265823,2.312500,...,262.479492,282.814378,0,0.101515,-684.089844,-1274.709961,37.028971,0.928098,143.625000,0
2000-09-29,145.468750,145.968750,143.625000,143.625000,9333600,2.343750,2.262635,0,1.267454,-1.656250,...,234.360352,281.978376,1,0.533397,-70.990234,-1114.000000,40.253733,0.831129,137.687500,0
2000-10-10,140.093750,141.250000,137.687500,137.687500,6104700,3.562500,2.503268,0,1.717600,-4.812500,...,165.759766,266.361446,0,0.817141,-74.790039,-673.831055,42.997313,0.622311,136.531250,0
2000-10-11,137.625000,138.625000,135.125000,136.531250,10346000,3.500000,2.574463,0,0.794732,-7.156250,...,370.440430,276.122040,0,1.782700,-398.519531,-617.330078,37.072666,1.341582,133.125000,0
2000-10-12,137.281250,137.593750,132.781250,133.125000,12336900,4.812500,2.734323,0,3.027544,-11.062500,...,187.580078,269.797614,1,0.379813,-598.440430,-489.590820,38.155558,0.695262,137.562500,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-08-18,433.369995,437.570007,433.010010,436.500000,98758400,4.559998,4.519322,1,0.722248,-9.149994,...,369.640625,454.387962,1,0.413457,-1022.890625,-1039.759766,37.267446,0.813491,439.339996,1
2023-08-21,437.549988,440.109985,435.320007,439.339996,68719000,4.789978,4.538655,1,0.409098,-8.769989,...,348.839844,446.848810,1,0.040538,-494.269531,-738.609375,39.335097,0.780666,438.149994,0
2023-08-22,441.179993,441.179993,437.570007,438.149994,65062900,3.609985,4.472321,0,0.686794,-4.740021,...,212.339844,439.249537,1,0.201655,-382.179688,-844.228516,44.340707,0.483415,443.029999,1
2023-08-23,439.250000,443.670013,439.100006,443.029999,68441000,4.570007,4.547157,1,0.860557,3.389984,...,321.689453,430.852388,1,0.921718,243.439453,-672.250000,46.832977,0.746635,436.890015,0


In [180]:
candlestick_df

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2000-09-28,143.187500,146.328125,142.890625,145.000000,7036400
2000-09-29,145.468750,145.968750,143.625000,143.625000,9333600
2000-10-10,140.093750,141.250000,137.687500,137.687500,6104700
2000-10-11,137.625000,138.625000,135.125000,136.531250,10346000
2000-10-12,137.281250,137.593750,132.781250,133.125000,12336900
...,...,...,...,...,...
2023-08-21,437.549988,440.109985,435.320007,439.339996,68719000
2023-08-22,441.179993,441.179993,437.570007,438.149994,65062900
2023-08-23,439.250000,443.670013,439.100006,443.029999,68441000
2023-08-24,444.690002,445.220001,436.859985,436.890015,88517300


In [209]:
most_recent

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Range,ATR,Up,Percent Change,MOM 5,...,Japan Range,Japan ATR,Japan Up,Japan Percent Change,Japan MOM 5,Japan MOM 20,Japan RSI 14,Japan Relative Volatility,Tomorrow,Target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-08-25,438.679993,441.299988,435.0,439.970001,102297000,6.299988,4.925253,1,0.294066,3.470001,...,313.46875,441.661297,0,0.680354,173.519531,-1266.880859,42.351505,0.709749,,0


In [210]:
# n_est, min_sample, and max_depth should be lists with values you want to test
def optimize_hyperparams(df, n_est, min_sample, max_depth):
    features = get_features(df)
    
    param_grid = { 
        'n_estimators': n_est,
        'min_samples_split': min_sample,
        'max_depth' : max_depth,
    }

    model = RandomForestClassifier(random_state=1)

    train = df.iloc[:-1000]
    test = df.iloc[-1000:]

    CV_rfc = GridSearchCV(estimator=model, param_grid=param_grid, cv= 5)
    CV_rfc.fit(train[features], train['Target'])
    
    print(CV_rfc.best_params_)

In [211]:
# optimize model hyperparameters
n_est = [100,200,250,300,350,400]
min_samples_split = [300,400, 450, 500, 550, 600,650,700]
max_depth = [4,5,6,7,8]

# uses GridSearchCV from sklearn to find best hyperparameters
# can take a VERY long time to run if there are a lot of hyperparameters to loop through
optimize_hyperparams(merged_df, n_est, min_samples_split, max_depth)

{'max_depth': 6, 'min_samples_split': 300, 'n_estimators': 100}


In [213]:
fit_train_score_with_depth(merged_df, 100, 300, 6)

{'n_estimators': 100,
 'min_samples_split': 300,
 'max_depth': 6,
 'training precision': 0.6286415210058265,
 'testing precision': 0.5488}

In [216]:
fit_train_score_with_depth(merged_df, 100, 200, 6)

{'n_estimators': 100,
 'min_samples_split': 200,
 'max_depth': 6,
 'training precision': 0.6617124394184168,
 'testing precision': 0.5482014388489208}

In [217]:
fit_train_score_with_depth(merged_df, 200, 300, 6)

{'n_estimators': 200,
 'min_samples_split': 300,
 'max_depth': 6,
 'training precision': 0.6295731707317073,
 'testing precision': 0.5660749506903353}

In [218]:
fit_train_score_with_depth(merged_df, 300, 300, 6)

{'n_estimators': 300,
 'min_samples_split': 300,
 'max_depth': 6,
 'training precision': 0.6271289537712895,
 'testing precision': 0.5591985428051002}

In [219]:
fit_train_score_with_depth(merged_df, 100, 300, 5)

{'n_estimators': 100,
 'min_samples_split': 300,
 'max_depth': 5,
 'training precision': 0.6286415210058265,
 'testing precision': 0.5488}

In [221]:
fit_train_score_with_depth(merged_df, 500, 300, 5)

{'n_estimators': 500,
 'min_samples_split': 300,
 'max_depth': 5,
 'training precision': 0.6290864650168042,
 'testing precision': 0.5660377358490566}

In [222]:
fit_train_score_with_depth(merged_df, 600, 300, 6)

{'n_estimators': 600,
 'min_samples_split': 300,
 'max_depth': 6,
 'training precision': 0.6298224127372933,
 'testing precision': 0.5645161290322581}

In [223]:
fit_train_score_with_depth(merged_df, 50, 300, 6)

{'n_estimators': 50,
 'min_samples_split': 300,
 'max_depth': 6,
 'training precision': 0.6343072573044298,
 'testing precision': 0.5553191489361702}

In [225]:
fit_train_score_with_depth(merged_df, 100, 300, 6)

{'n_estimators': 75,
 'min_samples_split': 300,
 'max_depth': 6,
 'training precision': 0.6290471785383904,
 'testing precision': 0.5604982206405694}

In [228]:
fit_train_score_with_depth(merged_df, 100, 750, 6)

{'n_estimators': 100,
 'min_samples_split': 750,
 'max_depth': 6,
 'training precision': 0.5729684908789386,
 'testing precision': 0.5587846763540291}

In [212]:
most_recent

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Range,ATR,Up,Percent Change,MOM 5,...,Japan Range,Japan ATR,Japan Up,Japan Percent Change,Japan MOM 5,Japan MOM 20,Japan RSI 14,Japan Relative Volatility,Tomorrow,Target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-08-25,438.679993,441.299988,435.0,439.970001,102297000,6.299988,4.925253,1,0.294066,3.470001,...,313.46875,441.661297,0,0.680354,173.519531,-1266.880859,42.351505,0.709749,,0


In [60]:
def daily_prediction(df,model,most_recent): 
    features = get_features(df)
    train = df.iloc[:-1000]
    test = df.iloc[-1000:]
    model.fit(train[features], train['Target'])
    prediction = model.predict(most_recent[features])
    print(prediction)

In [61]:
daily_prediction = daily_prediction(merged_df,model,most_recent)

[0]


In [62]:
def predict(train, test, features, model):
    model.fit(train[features], train["Target"])
    preds = model.predict(test[features])
    preds = pd.Series(preds, index=test.index, name="Predictions")
    combined = pd.concat([test["Target"], preds], axis=1)
    return combined

In [63]:
def backtest(df, model, features, start=2500, step=250):
    all_predictions = []
    backtest_data = []
    for i in range(start, df.shape[0]-step, step):
        train = df.iloc[0:i].copy()
        test = df.iloc[i:(i+step)].copy()
        predictions = predict(train, test, features, model)
        all_predictions.append(predictions)
        backtest_dict = {
        'start date':df.index[i-step],
        'end date': df.index[i],
        'precision score': precision_score(predictions["Target"], predictions["Predictions"]),
                }
        backtest_data.append(backtest_dict)
    backtest_df = pd.DataFrame(backtest_data)
    print(backtest_df)
    return pd.concat(all_predictions)

In [None]:
model = RandomForestClassifier(n_estimators=700, min_samples_split=400, max_depth=4, random_state=1)
# run back test for previous periods
predictions = backtest(merged_df, model, get_features(merged_df))
    # returns dataframe with time period column and precision column


In [None]:
features = get_features(merged_df)
model = RandomForestClassifier(n_estimators=700, min_samples_split=400, max_depth=4, random_state=1)
train = merged_df.iloc[:-1000]
test = merged_df.iloc[-1000:]
model.fit(train[features], train['Target'])
prediction = model.predict(most_recent[features])
print(prediction)