In [1]:
import yfinance as yf
import pandas as pd
import datetime as dt
import time
from datetime import timedelta
import pandas_ta as ta

from sklearn.metrics import precision_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

from functools import reduce

from sklearn.model_selection import GridSearchCV

In [2]:
def get_data(ticker, year,month,day):
    start = dt.datetime(1999,1,1)
    end = dt.datetime(2023,8,29)
     

    df = yf.download(ticker,start,end)
    
    return df

In [3]:
# loop through dataframe and add all features to a list
# removes 'Tomorrow' and 'Target' column from the list 
def get_features(dataframe):
    features = []
    for column in dataframe:
        features.append(column)
    
    features.pop(-1)
    features.pop(-1)
    
    return features

In [4]:
def fit_train_score(df, n_est, min_split):    
    features = get_features(df)
    
    # n_estimators = number of decision trees
    # min_samples_split = higher it is set, the less accurtate it is, the less it will overfit
    model = RandomForestClassifier(n_estimators=n_est, min_samples_split=min_split, random_state=1)

    train = df.iloc[:-1000]
    test = df.iloc[-1000:]

    # fit and train model
    model.fit(train[features], train['Target'])
    predictions = model.predict(test[features])
    #predictions_series = pd.Series(predictions, index=test.index)

    # precision of training data
    predictions_training = model.predict(train[features])
    
    test_dict = {
        'n_estimators':n_est,
        'min_samples_split':min_split,
        'training precision':precision_score(train['Target'], predictions_training),
        'testing precision':precision_score(test['Target'], predictions)
                }
    
    return test_dict

In [5]:
# same as fit_train_score but it excepts arguments for max_depth
def fit_train_score_with_depth(df, n_est, min_split, max_depth):
    features = get_features(df)
    
    # n_estimators = number of decision trees
    # min_samples_split = higher it is set, the less accurtate it is, the less it will overfit
    model = RandomForestClassifier(n_estimators=n_est, min_samples_split=min_split, random_state=1)

    train = df.iloc[:-1000]
    test = df.iloc[-1000:]

    # fit and train model
    model.fit(train[features], train['Target'])
    predictions = model.predict(test[features])
    #predictions_series = pd.Series(predictions, index=test.index)

    # precision of training data
    predictions_training = model.predict(train[features])
    
    test_dict = {
        'n_estimators':n_est,
        'min_samples_split':min_split,
        'max_depth': max_depth,
        'training precision':precision_score(train['Target'], predictions_training),
        'testing precision':precision_score(test['Target'], predictions)
                }
    
    return test_dict

In [6]:
# n_est, min_sample, and max_depth should be lists with values you want to test
def optimize_hyperparams(df, n_est, min_sample, max_depth):
    features = get_features(df)
    
    param_grid = { 
        'n_estimators': n_est,
        'min_samples_split': min_sample,
        'max_depth' : max_depth,
    }

    model = RandomForestClassifier(random_state=1)

    train = df.iloc[:-1000]
    test = df.iloc[-1000:]

    CV_rfc = GridSearchCV(estimator=model, param_grid=param_grid, cv= 5)
    CV_rfc.fit(train[features], train['Target'])
    
    print(CV_rfc.best_params_)

In [23]:
df = get_data('SPY',1999,1,1)

[*********************100%***********************]  1 of 1 completed


In [24]:
# create new features
df['Range'] = abs(df['High']-df['Low'])
df['RSI 14'] = ta.rsi(df['Close'], 14)
df['SMA 20'] = ta.sma(df['Close'], 20)
df['SMA 50'] = ta.sma(df['Close'], 50)
df['SMA 200'] = ta.sma(df['Close'], 200)
df['EMA 12'] = ta.ema(df['Close'], 12)
df['EMA 26'] = ta.ema(df['Close'], 26)
df['ATR'] = ta.atr(df['High'], df['Low'], df['Close'] )

# create 'Tomorrow' column
# it's the 'Close' value from the previous day
# will be used to create a target
df['Tomorrow'] = df['Close'].shift(-1)
df['Target'] = (df['Tomorrow'] > df['Close']).astype(int)
df = df.loc['2000-01-01':].copy()
df.drop('Adj Close', axis=1, inplace=True)

# save most recent daily data
#df_most_recent = pd.DataFrame(df.tail(1))

# drop most recent day from dataframe
#df.drop(df.tail(1).index, inplace=True)

In [9]:
len(df)

5951

In [10]:
# get foreign market and gold data
UK_df = get_data('^FTSE',1999,1,1)
China_df = get_data('000001.SS',1999,1,1)
Germany_df = get_data('^GDAXI',1999,1,1)
Japan_df = get_data('^N225',1999,1,1)
gold_df = get_data('GC=F',1999,1,1)

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


In [11]:
print(len(UK_df))
print(len(China_df))
print(len(Germany_df))
print(len(Japan_df))
print(len(gold_df))

6225
5965
6260
6041
5768


In [12]:
# clean foreign market data
# some have empty volume columns for early 2000's

# drop volume from foreign data dfs
#rename columns
# remove first year of dat

UK_df.drop(['Volume', 'Adj Close'], axis=1, inplace=True)
UK_df.rename(columns={
    'Open':'UK Open',
    'High':'UK High',
    'Low':'UK Low',
    'Close':'UK Close'
},inplace=True)
UK_df.loc['2000-01-01':]

China_df.drop(['Volume', 'Adj Close'], axis=1, inplace=True)
China_df.rename(columns={
    'Open':'China Open',
    'High':'China High',
    'Low':'China Low',
    'Close':'China Close'
},inplace=True)
China_df.loc['2000-01-01':]

Germany_df.drop(['Volume', 'Adj Close'], axis=1, inplace=True)
Germany_df.rename(columns={
    'Open':'Germany Open',
    'High':'Germany High',
    'Low':'Germany Low',
    'Close':'Germany Close'
},inplace=True)
Germany_df.loc['2000-01-01':]

Japan_df.drop(['Volume', 'Adj Close'], axis=1, inplace=True)
Japan_df.rename(columns={
    'Open':'Japan Open',
    'High':'Japan High',
    'Low':'Japan Low',
    'Close':'Japan Close'
},inplace=True)
Japan_df.loc['2000-01-01':]

gold_df.drop(['Volume', 'Adj Close'], axis=1, inplace=True)
gold_df.rename(columns={
    'Open':'Gold Open',
    'High':'Gold High',
    'Low':'Gold Low',
    'Close':'Gold Close'
},inplace=True)
gold_df.loc['2000-01-01':]

Unnamed: 0_level_0,Gold Open,Gold High,Gold Low,Gold Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2000-08-30,273.899994,273.899994,273.899994,273.899994
2000-08-31,274.799988,278.299988,274.799988,278.299988
2000-09-01,277.000000,277.000000,277.000000,277.000000
2000-09-05,275.799988,275.799988,275.799988,275.799988
2000-09-06,274.200012,274.200012,274.200012,274.200012
...,...,...,...,...
2023-08-22,1894.599976,1896.800049,1892.800049,1896.400024
2023-08-23,1909.599976,1918.500000,1909.599976,1918.500000
2023-08-24,1920.000000,1920.800049,1918.199951,1918.199951
2023-08-25,1919.800049,1919.800049,1911.099976,1911.099976


In [13]:
print(len(UK_df))
print(len(China_df))
print(len(Germany_df))
print(len(Japan_df))
print(len(gold_df))

6225
5965
6260
6041
5768


In [20]:
# merge into one df
dfs = [UK_df, China_df, Germany_df, Japan_df, gold_df, df]
df_merged = reduce(lambda  left,right: pd.merge(left,right,on=['Date'],
                                            how='outer'), dfs)

# drop most recent day from dataframe
df_merged.drop(df_merged.tail(1).index, inplace=True)

df_merged

Unnamed: 0_level_0,UK Open,UK High,UK Low,UK Close,China Open,China High,China Low,China Close,Germany Open,Germany High,...,Range,RSI 14,SMA 20,SMA 50,SMA 200,EMA 12,EMA 26,ATR,Tomorrow,Target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1999-01-04,5909.399902,5916.899902,5811.299805,5879.399902,1144.887939,1144.889038,1124.213013,1125.818970,4978.379883,5290.359863,...,,,,,,,,,,
1999-01-05,5882.299805,5980.500000,5875.799805,5958.200195,1123.696045,1123.696045,1104.500977,1119.964966,5171.770020,5313.609863,...,,,,,,,,,,
1999-01-06,5968.899902,6157.399902,5968.899902,6148.799805,1120.667969,1137.208008,1120.401978,1132.589966,5337.899902,5460.200195,...,,,,,,,,,,
1999-01-07,6145.899902,6153.700195,6042.500000,6101.200195,1133.165039,1140.482056,1125.668945,1137.729004,5482.620117,5509.299805,...,,,,,,,,,,
1999-01-08,6115.399902,6195.600098,6114.799805,6147.200195,1138.767944,1168.838013,1138.767944,1168.813965,5357.080078,5458.259766,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2010-04-05,,,,,,,,,,,...,0.919998,72.321706,116.448500,112.404800,106.433400,117.044569,115.583474,1.211515,119.040001,1.0
2014-12-25,,,,,,,,,,,...,,,,,,,,,,
2015-04-06,,,,,,,,,,,...,3.239990,50.722610,207.396001,207.433401,201.454800,207.225337,207.583408,2.162638,207.279999,0.0
2017-05-01,,,,,,,,,,,...,0.970001,62.029661,235.722999,236.194399,224.041400,236.940016,236.096638,1.600913,238.770004,1.0


In [22]:
df_merged.tail()

Unnamed: 0_level_0,UK Open,UK High,UK Low,UK Close,China Open,China High,China Low,China Close,Germany Open,Germany High,...,Range,RSI 14,SMA 20,SMA 50,SMA 200,EMA 12,EMA 26,ATR,Tomorrow,Target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-04-05,,,,,,,,,,,...,0.919998,72.321706,116.4485,112.4048,106.4334,117.044569,115.583474,1.211515,119.040001,1.0
2014-12-25,,,,,,,,,,,...,,,,,,,,,,
2015-04-06,,,,,,,,,,,...,3.23999,50.72261,207.396001,207.433401,201.4548,207.225337,207.583408,2.162638,207.279999,0.0
2017-05-01,,,,,,,,,,,...,0.970001,62.029661,235.722999,236.194399,224.0414,236.940016,236.096638,1.600913,238.770004,1.0
2021-04-05,,,,,,,,,,,...,3.559998,67.670551,393.285001,388.377801,355.071351,396.012714,392.516068,5.233052,406.119995,0.0


In [21]:
df_merged.count()

UK Open          6225
UK High          6225
UK Low           6225
UK Close         6225
China Open       5965
China High       5965
China Low        5965
China Close      5965
Germany Open     6260
Germany High     6260
Germany Low      6260
Germany Close    6260
Japan Open       6040
Japan High       6040
Japan Low        6040
Japan Close      6040
Gold Open        5767
Gold High        5767
Gold Low         5767
Gold Close       5767
Open             5950
High             5950
Low              5950
Close            5950
Volume           5950
Range            5950
RSI 14           5950
SMA 20           5950
SMA 50           5950
SMA 200          5950
EMA 12           5950
EMA 26           5950
ATR              5950
Tomorrow         5949
Target           5950
dtype: int64

In [16]:
df_merged['Target'].value_counts()

1    2699
0    2293
Name: Target, dtype: int64

In [None]:
fit_train_score(df_merged,200,750)

In [None]:
fit_train_score(df_merged,200,650)

In [93]:
def optimize_hyperparams(df, n_est, min_sample, max_depth):
    features = get_features(df)
    
    param_grid = { 
        'n_estimators': n_est,
        'min_samples_split': min_sample,
        'max_depth' : max_depth,
    }

    model = RandomForestClassifier(random_state=1)

    train = df.iloc[:-1000]
    test = df.iloc[-1000:]

    CV_rfc = GridSearchCV(estimator=model, param_grid=param_grid, cv= 5)
    CV_rfc.fit(train[features], train['Target'])
    
    print(CV_rfc.best_params_)

In [94]:
n_est = [200,250,300,350,400]
min_samples_split = [400, 450, 500, 550, 600,650,700]
max_depth = [4,5,6,7,8]

# uses GridSearchCV from sklearn to find best hyperparameters
# can take a VERY long time to run if there are a lot of hyperparameters to loop through
optimize_hyperparams(df_merged, n_est, min_samples_split, max_depth)

{'max_depth': 4, 'min_samples_split': 700, 'n_estimators': 350}


In [25]:
# using optimized hyperparameters from above
fit_train_score_with_depth(df, 700, 350, 4)

{'n_estimators': 700,
 'min_samples_split': 350,
 'max_depth': 4,
 'training precision': 0.6168662155127538,
 'testing precision': 0.5909090909090909}

In [19]:
df_most_recent = df_merged[-1:].copy()
df_most_recent

Unnamed: 0_level_0,UK Open,UK High,UK Low,UK Close,China Open,China High,China Low,China Close,Germany Open,Germany High,...,Range,RSI 14,SMA 20,SMA 50,SMA 200,EMA 12,EMA 26,ATR,Tomorrow,Target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-08-24,7320.5,7386.100098,7320.5,7333.600098,3085.909912,3106.179932,3072.949951,3082.23999,15876.219727,15896.740234,...,8.360016,40.225352,445.729002,444.707801,413.2533,441.560757,443.776542,4.819504,439.970001,1


In [None]:
# check if model could correctly predict today 

features = get_features(df_merged)

# n_estimators = number of decision trees
# min_samples_split = higher it is set, the less accurtate it is, the less it will overfit
model = RandomForestClassifier(n_estimators=200, min_samples_split=400, max_depth=5, random_state=1)

train = df_merged.iloc[:-1000].copy()
test = df_merged.iloc[-1000:].copy()

# fit and train model
model.fit(train[features], train['Target'])
predictions = model.predict(df_most_recent[features])
#predictions_series = pd.Series(predictions, index=test.index)



print(predictions)

# model predict a down day, but Monday was an up day            

In [28]:
# model to use for backtesting
final_model = RandomForestClassifier(n_estimators=200, min_samples_split=400, max_depth=5, random_state=1)

In [27]:
# make predictions

# arguments for funtion
    # train = df_merged.iloc[:-1000].copy()
    # test = df_merged.iloc[-1000:].copy()
    # final_model
    # features = get_features(merged_df)
def predict(train, test, features, model):
    model.fit(train[features], train["Target"])
    preds = model.predict(test[features])
    preds = pd.Series(preds, index=test.index, name="Predictions")
    combined = pd.concat([test["Target"], preds], axis=1)
    return combined

In [29]:
def backtest(df, model, features, start=2500, step=250):
    all_predictions = []

    for i in range(start, df.shape[0]-step, step):
        train = df.iloc[0:i].copy()
        test = df.iloc[i:(i+step)].copy()
        predictions = predict(train, test, features, model)
        
        print(df.index[i])
        print(precision_score(predictions["Target"], predictions["Predictions"]))
        
        all_predictions.append(predictions)
    
    return pd.concat(all_predictions)

In [30]:
predictions = backtest(df, final_model, get_features(df))

2009-12-10 00:00:00
0.5591836734693878
2010-12-08 00:00:00
0.5349794238683128
2011-12-05 00:00:00
0.5365853658536586
2012-12-04 00:00:00
0.5882352941176471
2013-12-02 00:00:00
0.6108597285067874
2014-11-28 00:00:00
0.484
2015-11-25 00:00:00
0.5514705882352942
2016-11-22 00:00:00
0.0


  _warn_prf(average, modifier, msg_start, len(result))


2017-11-20 00:00:00
0.536
2018-11-16 00:00:00
0.6
2019-11-15 00:00:00
0.588
2020-11-12 00:00:00
0.0


  _warn_prf(average, modifier, msg_start, len(result))


2021-11-10 00:00:00
0.452


In [None]:
df_merged

In [41]:
most_recent = yf.download('SPY',pd.Timestamp.now())

[*********************100%***********************]  1 of 1 completed


In [42]:
most_recent

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-08-28,442.23999,443.399994,439.972809,442.76001,442.76001,56650963


In [96]:
model2 = RandomForestClassifier(n_estimators=700, min_samples_split=350, max_depth=4, random_state=1)

In [100]:
predictions2 = backtest(df_merged, model2, get_features(df_merged))

2012-03-07 00:00:00
0.5523809523809524
2013-05-08 00:00:00
0.5666666666666667
2014-06-30 00:00:00
0.516
2015-08-13 00:00:00
0.5380434782608695
2016-10-18 00:00:00
0.5333333333333333
2017-12-04 00:00:00
0.5602094240837696
2019-01-28 00:00:00
0.59375
2020-03-31 00:00:00
0.6415094339622641
2021-05-26 00:00:00
0.5193370165745856
