In [13]:
import yfinance as yf
import pandas as pd
import datetime as dt
import time
from datetime import timedelta
import pandas_ta as ta

from sklearn.metrics import precision_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

from functools import reduce

from sklearn.model_selection import GridSearchCV

In [14]:
def get_data(ticker, year,month,day):
    start = dt.datetime(1999,1,1)
    end = dt.datetime.now()
     

    df = yf.download(ticker,start,end)
    
    return df

In [15]:
# loop through dataframe and add all features to a list
# removes 'Tomorrow' and 'Target' column from the list 
def get_features(dataframe):
    features = []
    for column in dataframe:
        features.append(column)
    
    features.pop(-1)
    features.pop(-1)
    
    return features

In [16]:
def fit_train_score(df, n_est, min_split):    
    features = get_features(df)
    
    # n_estimators = number of decision trees
    # min_samples_split = higher it is set, the less accurtate it is, the less it will overfit
    model = RandomForestClassifier(n_estimators=n_est, min_samples_split=min_split, random_state=1)

    train = df.iloc[:-1000]
    test = df.iloc[-1000:]

    # fit and train model
    model.fit(train[features], train['Target'])
    predictions = model.predict(test[features])
    #predictions_series = pd.Series(predictions, index=test.index)

    # precision of training data
    predictions_training = model.predict(train[features])
    
    test_dict = {
        'n_estimators':n_est,
        'min_samples_split':min_split,
        'training precision':precision_score(train['Target'], predictions_training),
        'testing precision':precision_score(test['Target'], predictions)
                }
    
    return test_dict

In [17]:
def fit_train_score_with_depth(df, n_est, min_split, max_depth):
    features = get_features(df)
    
    # n_estimators = number of decision trees
    # min_samples_split = higher it is set, the less accurtate it is, the less it will overfit
    model = RandomForestClassifier(n_estimators=n_est, min_samples_split=min_split, random_state=1)

    train = df.iloc[:-1000]
    test = df.iloc[-1000:]

    # fit and train model
    model.fit(train[features], train['Target'])
    predictions = model.predict(test[features])
    #predictions_series = pd.Series(predictions, index=test.index)

    # precision of training data
    predictions_training = model.predict(train[features])
    
    test_dict = {
        'n_estimators':n_est,
        'min_samples_split':min_split,
        'max_depth': max_depth,
        'training precision':precision_score(train['Target'], predictions_training),
        'testing precision':precision_score(test['Target'], predictions)
                }
    
    return test_dict

In [18]:
# n_est, min_sample, and max_depth should be lists with values you want to test
def optimize_hyperparams(df, n_est, min_sample, max_depth):
    features = get_features(df)
    
    param_grid = { 
        'n_estimators': n_est,
        'min_samples_split': min_sample,
        'max_depth' : max_depth,
    }

    model = RandomForestClassifier(random_state=1)

    train = df.iloc[:-1000]
    test = df.iloc[-1000:]

    CV_rfc = GridSearchCV(estimator=model, param_grid=param_grid, cv= 5)
    CV_rfc.fit(train[features], train['Target'])
    
    print(CV_rfc.best_params_)

In [19]:
df = get_data('SPY',1999,1,1)

[*********************100%***********************]  1 of 1 completed


In [20]:
# create new features
df['Range'] = abs(df['High']-df['Low'])
df['RSI 14'] = ta.rsi(df['Close'], 14)
df['SMA 20'] = ta.sma(df['Close'], 20)
df['SMA 50'] = ta.sma(df['Close'], 50)
df['SMA 200'] = ta.sma(df['Close'], 200)
df['EMA 12'] = ta.ema(df['Close'], 12)
df['EMA 26'] = ta.ema(df['Close'], 26)
df['ATR'] = ta.atr(df['High'], df['Low'], df['Close'] )

# create 'Tomorrow' column
# it's the 'Close' value from the previous day
# will be used to create a target
df['Tomorrow'] = df['Close'].shift(-1)
df['Target'] = (df['Tomorrow'] > df['Close']).astype(int)
df = df.copy().loc['2000-01-01':]
df.drop('Adj Close', axis=1, inplace=True)
df.drop(df.tail(1).index, inplace=True)

In [21]:
# get foreign market and gold data
UK_df = get_data('^FTSE',1999,1,1)
China_df = get_data('000001.SS',1999,1,1)
Germany_df = get_data('^GDAXI',1999,1,1)
Japan_df = get_data('^N225',1999,1,1)
gold_df = get_data('GC=F',1999,1,1)

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


In [22]:
# clean foreign market data
# some have empty volume columns for early 2000's

# drop volume from foreign data dfs
#rename columns
# remove first year of dat

UK_df.drop(['Volume', 'Adj Close'], axis=1, inplace=True)
UK_df.rename(columns={
    'Open':'UK Open',
    'High':'UK High',
    'Low':'UK Low',
    'Close':'UK Close'
},inplace=True)
UK_df.loc['2000-01-01':]

China_df.drop(['Volume', 'Adj Close'], axis=1, inplace=True)
China_df.rename(columns={
    'Open':'China Open',
    'High':'China High',
    'Low':'China Low',
    'Close':'China Close'
},inplace=True)
China_df.loc['2000-01-01':]

Germany_df.drop(['Volume', 'Adj Close'], axis=1, inplace=True)
Germany_df.rename(columns={
    'Open':'Germany Open',
    'High':'Germany High',
    'Low':'Germany Low',
    'Close':'Germany Close'
},inplace=True)
Germany_df.loc['2000-01-01':]

Japan_df.drop(['Volume', 'Adj Close'], axis=1, inplace=True)
Japan_df.rename(columns={
    'Open':'Japan Open',
    'High':'Japan High',
    'Low':'Japan Low',
    'Close':'Japan Close'
},inplace=True)
Japan_df.loc['2000-01-01':]

gold_df.drop(['Volume', 'Adj Close'], axis=1, inplace=True)
gold_df.rename(columns={
    'Open':'Gold Open',
    'High':'Gold High',
    'Low':'Gold Low',
    'Close':'Gold Close'
},inplace=True)
gold_df.loc['2000-01-01':]

Unnamed: 0_level_0,Gold Open,Gold High,Gold Low,Gold Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2000-08-30,273.899994,273.899994,273.899994,273.899994
2000-08-31,274.799988,278.299988,274.799988,278.299988
2000-09-01,277.000000,277.000000,277.000000,277.000000
2000-09-05,275.799988,275.799988,275.799988,275.799988
2000-09-06,274.200012,274.200012,274.200012,274.200012
...,...,...,...,...
2023-08-21,1893.599976,1893.599976,1893.300049,1893.300049
2023-08-22,1894.599976,1896.800049,1892.800049,1896.400024
2023-08-23,1909.599976,1918.500000,1909.599976,1918.500000
2023-08-24,1920.000000,1920.800049,1918.199951,1918.199951


In [23]:
# merge into one df
dfs = [UK_df, China_df, Germany_df, Japan_df, gold_df, df]
df_merged = reduce(lambda  left,right: pd.merge(left,right,on=['Date'],
                                            how='inner'), dfs)
df_merged

Unnamed: 0_level_0,UK Open,UK High,UK Low,UK Close,China Open,China High,China Low,China Close,Germany Open,Germany High,...,Range,RSI 14,SMA 20,SMA 50,SMA 200,EMA 12,EMA 26,ATR,Tomorrow,Target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-08-30,6586.299805,6624.899902,6586.100098,6615.100098,2107.019043,2108.096924,2075.159912,2076.143066,7292.459961,7292.459961,...,1.156250,55.725080,149.267188,147.763125,144.410859,150.345132,149.168169,1.661060,152.343750,1
2000-08-31,6615.100098,6675.700195,6585.200195,6672.700195,2062.834961,2064.760010,2019.447021,2021.198975,7191.000000,7251.310059,...,2.187500,62.639763,149.604688,147.852500,144.466328,150.652612,149.403397,1.738841,152.500000,1
2000-09-01,6672.700195,6838.600098,6672.700195,6795.000000,2009.151001,2026.350952,1997.717041,1999.859009,7221.410156,7397.120117,...,1.593750,63.124303,149.910937,147.990000,144.520703,150.936826,149.632775,1.728478,151.281250,0
2000-09-05,6798.100098,6809.500000,6737.600098,6752.500000,2015.062012,2020.847046,1959.592041,1963.791992,7448.080078,7456.709961,...,1.390625,56.922949,150.068750,148.128125,144.563984,150.989814,149.754884,1.725551,149.562500,0
2000-09-06,6752.500000,6766.000000,6688.399902,6694.700195,1955.499023,1964.265015,1923.598022,1925.906982,7397.049805,7434.020020,...,2.421875,49.532617,150.112500,148.194687,144.599297,150.770227,149.740634,1.775288,150.843750,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-08-21,7262.399902,7316.000000,7250.899902,7257.799805,3125.989990,3134.959961,3092.979980,3092.979980,15573.950195,15710.209961,...,4.789978,40.887683,447.997502,444.353600,412.274100,443.051299,444.933763,4.538655,438.149994,0
2023-08-22,7257.799805,7310.500000,7257.600098,7270.799805,3103.780029,3126.780029,3074.530029,3120.330078,15684.639648,15799.339844,...,3.609985,39.350095,447.133002,444.518600,412.590500,442.297252,444.431262,4.472321,443.029999,1
2023-08-23,7270.799805,7348.799805,7270.799805,7320.500000,3116.270020,3116.379883,3077.610107,3078.399902,15761.250000,15820.950195,...,4.570007,47.987997,446.509001,444.703201,412.950600,442.409982,444.327464,4.547157,436.890015,0
2023-08-24,7320.500000,7386.100098,7320.500000,7333.600098,3085.909912,3106.179932,3072.949951,3082.239990,15876.219727,15896.740234,...,8.360016,40.225352,445.729002,444.707801,413.253300,441.560757,443.776542,4.819504,439.970001,1


In [24]:
fit_train_score(df_merged,200,750)

{'n_estimators': 200,
 'min_samples_split': 750,
 'training precision': 0.5844036697247706,
 'testing precision': 0.573170731707317}

In [25]:
fit_train_score(df_merged,200,650)

{'n_estimators': 200,
 'min_samples_split': 650,
 'training precision': 0.5929944203347799,
 'testing precision': 0.5636363636363636}

In [26]:
n_est = [200] #[200,250,300]
min_samples_split = [400,450,500] #[400, 450, 500, 550, 600,650,700]
max_depth = [5] #[4,5,6,7,8]

# uses GridSearchCV from sklearn to find best hyperparameters
# can take a VERY long time to run if there are a lot of hyperparameters to loop through
optimize_hyperparams(df_merged, n_est, min_samples_split, max_depth)

{'max_depth': 5, 'min_samples_split': 400, 'n_estimators': 200}


In [27]:
# using optimized hyperparameters from above
fit_train_score_with_depth(df_merged, 200, 400, 5)

{'n_estimators': 200,
 'min_samples_split': 400,
 'max_depth': 5,
 'training precision': 0.6148604802076574,
 'testing precision': 0.6111111111111112}