In [38]:
import yfinance as yf
import pandas as pd
import datetime as dt
import time
from datetime import timedelta
import pandas_ta as ta

from sklearn.metrics import precision_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

from functools import reduce
from sklearn.model_selection import GridSearchCV

#######################################################################################
# pulls stock data from yfinance module
def get_data(ticker, year,month,day):
    """Download price history for ``ticker`` through yfinance.

    Parameters
    ----------
    ticker : str
        Symbol to request, e.g. 'SPY', 'CL=F' or 'GC=F'.
    year, month, day : int
        Calendar date of the first day to request.

    Returns
    -------
    The frame returned by ``yf.download`` for the range from the start
    date up to the moment of the call.
    """
    # Build the start date from the arguments. It was previously hard-coded
    # to 1999-01-01, which silently ignored whatever the caller passed in.
    start = dt.datetime(year, month, day)
    end = dt.datetime.now()

    return yf.download(ticker, start=start, end=end)

# loop through dataframe and add all features to a list
# removes 'Tomorrow' and 'Target' column from the list 
def get_features(dataframe, exclude=('Tomorrow', 'Target')):
    """Return the column names of ``dataframe`` to use as model inputs.

    Columns are excluded by name rather than by position, so the label
    columns can never leak into the feature list if further columns are
    appended after them.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose column labels are inspected; it is not modified.
    exclude : iterable of column labels, optional
        Labels to leave out of the result. Defaults to the label columns
        'Tomorrow' and 'Target'.

    Returns
    -------
    list
        Remaining column labels, in their original order.
    """
    excluded = set(exclude)
    # iterating a DataFrame yields its column labels
    return [column for column in dataframe if column not in excluded]

########################################################################################
# get raw data (requested from 1999-01-01 to now)
# one download per instrument: SPY, crude oil futures (CL=F), gold futures (GC=F)
data, oil_data, gold_data = (
    get_data(symbol, 1999, 1, 1) for symbol in ('SPY', 'CL=F', 'GC=F')
)
# the *_data frames stay as downloaded; the copies below are the ones that get modified
df = data.copy()
oil_df = oil_data.copy()
gold_df = gold_data.copy()
#############################################################################################
# add and remove features
# SPY
# uses Pandas TA (technical analysis) module to generate indicators like ATR, MOM, and RSI
# daily high-low spread
df['Range'] = abs(df['High']-df['Low'])
df['ATR'] = ta.atr(df['High'], df['Low'], df['Close'] )
# 1 when the day closed above its open, else 0
df['Up'] = (df['Close'] > df['Open']).astype(int)
# unsigned size of the open-to-close move, as a percentage of the open
df['Percent Change'] = (abs(df['Close'] - df['Open']) / df['Open']) *100
df['MOM 5'] = ta.mom(df['Close'],5)
df['MOM 20'] = ta.mom(df['Close'],20)
df['RSI 14'] = ta.rsi(df['Close'], 14)
# remove the adjusted close and volume columns (Open/High/Low/Close are kept)
del df['Adj Close']
del df['Volume']

# add features to Oil and Gold data
# uses Pandas TA (technical analysis) module to generate indicators like ATR, MOM, and RSI
def add_prefixed_indicators(prices, prefix):
    """Build the indicator columns for one instrument and drop its raw prices.

    Parameters
    ----------
    prices : pandas.DataFrame
        Frame with 'Open', 'High', 'Low', 'Close', 'Volume' and 'Adj Close'
        columns. It is not modified; the work is done on a copy.
    prefix : str
        Label put in front of every generated column name, e.g. 'Oil'
        gives 'Oil Range', 'Oil ATR', ...

    Returns
    -------
    pandas.DataFrame
        Copy of ``prices`` with the seven prefixed indicator columns added
        and the six raw price/volume columns removed.
    """
    out = prices.copy()
    # daily high-low spread
    out[f'{prefix} Range'] = abs(out['High'] - out['Low'])
    out[f'{prefix} ATR'] = ta.atr(out['High'], out['Low'], out['Close'])
    # 1 when the day closed above its open, else 0
    out[f'{prefix} Up'] = (out['Close'] > out['Open']).astype(int)
    # unsigned size of the open-to-close move, as a percentage of the open
    out[f'{prefix} Percent Change'] = (abs(out['Close'] - out['Open']) / out['Open']) * 100
    out[f'{prefix} MOM 5'] = ta.mom(out['Close'], 5)
    out[f'{prefix} MOM 20'] = ta.mom(out['Close'], 20)
    out[f'{prefix} RSI 14'] = ta.rsi(out['Close'], 14)
    # remove price columns so only the prefixed indicators remain
    return out.drop(columns=['Open', 'Close', 'High', 'Low', 'Volume', 'Adj Close'])


oil_df = add_prefixed_indicators(oil_df, 'Oil')
gold_df = add_prefixed_indicators(gold_df, 'Gold')
###################################################################################################
# merge dataframes
# join the SPY frame with the oil indicators on 'Date';
# the inner join keeps only dates that appear in both frames
merged_oil = pd.merge(df, oil_df, how='inner', on='Date')
# join the gold indicators onto that result in the same way
merged_df = pd.merge(merged_oil, gold_df, how='inner', on='Date')
#####################################################################################################
# create tomorrow and target columns
# 'Tomorrow' is the 'Close' value of the next row (shift(-1) pulls it up one row),
# so the final row has no value for it
next_close = merged_df['Close'].shift(-1)
merged_df['Tomorrow'] = next_close
# 'Target' is 1 when the next close is above the current close, else 0
merged_df['Target'] = (next_close > merged_df['Close']).astype(int)



[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


In [39]:
# preview the merged frame before rows with missing values are dropped
merged_df

Unnamed: 0_level_0,Open,High,Low,Close,Range,ATR,Up,Percent Change,MOM 5,MOM 20,...,Oil RSI 14,Gold Range,Gold ATR,Gold Up,Gold Percent Change,Gold MOM 5,Gold MOM 20,Gold RSI 14,Tomorrow,Target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-08-30,151.312500,151.500000,150.343750,150.343750,1.156250,1.661060,0,0.640231,-0.500000,5.750000,...,,0.000000,,0,0.000000,,,,152.343750,1
2000-08-31,151.062500,153.093750,150.906250,152.343750,2.187500,1.738841,1,0.848159,1.031250,6.750000,...,,3.500000,,1,1.273654,,,,152.500000,1
2000-09-01,153.250000,153.593750,152.000000,152.500000,1.593750,1.728478,0,0.489396,1.250000,6.125000,...,,0.000000,,0,0.000000,,,,151.281250,0
2000-09-05,151.875000,152.203125,150.812500,151.281250,1.390625,1.725551,0,0.390947,-0.484375,3.156250,...,,0.000000,,0,0.000000,,,,149.562500,0
2000-09-06,151.187500,151.953125,149.531250,149.562500,2.421875,1.775288,0,1.074824,-2.234375,0.875000,...,,0.000000,,0,0.000000,,,,150.843750,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-08-28,442.239990,443.399994,439.970001,442.760010,3.429993,4.818449,1,0.117588,3.420013,-15.029999,...,51.763433,6.000000,12.623303,1,0.125295,24.599976,-52.599976,48.752948,449.160004,1
2023-08-29,442.649994,449.450012,442.459991,449.160004,6.990021,4.973561,1,1.470690,11.010010,-7.320007,...,55.567108,19.599976,13.121636,1,0.969809,40.099976,-4.199951,56.795240,451.010010,1
2023-08-30,449.510010,451.670013,448.779999,451.010010,2.890015,4.824736,1,0.333697,7.980011,0.880005,...,57.179416,11.800049,13.027237,1,0.428722,25.800049,6.900024,59.654636,450.350006,0
2023-08-31,451.649994,452.829987,450.160004,450.350006,2.669983,4.670825,0,0.287831,13.459991,1.510010,...,63.284792,6.200073,12.539583,0,0.318868,20.000000,6.199951,56.505046,451.190002,1


In [40]:
# keep only the rows in which every column has a value
merged_df = merged_df.dropna()
merged_df

Unnamed: 0_level_0,Open,High,Low,Close,Range,ATR,Up,Percent Change,MOM 5,MOM 20,...,Oil RSI 14,Gold Range,Gold ATR,Gold Up,Gold Percent Change,Gold MOM 5,Gold MOM 20,Gold RSI 14,Tomorrow,Target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-09-28,143.187500,146.328125,142.890625,145.000000,3.437500,2.256395,1,1.265823,2.312500,-5.343750,...,38.721348,2.399994,1.494902,0,0.684682,5.300018,1.700012,54.697250,143.625000,0
2000-09-29,145.468750,145.968750,143.625000,143.625000,2.343750,2.262635,0,1.267454,-1.656250,-8.718750,...,41.617324,2.600006,1.645266,0,0.364166,1.800018,-4.699982,49.480396,143.843750,1
2000-10-02,144.281250,144.906250,143.140625,143.843750,1.765625,2.227134,0,0.303227,-0.406250,-8.656250,...,47.743780,1.000000,1.606319,1,0.109977,-1.000000,-3.899994,48.241629,142.500000,0
2000-10-03,144.531250,145.750000,142.281250,142.500000,3.468750,2.315821,0,1.405405,0.093750,-8.781250,...,47.411501,1.299988,1.605868,0,0.183756,-2.299988,-4.199982,44.631643,143.687500,1
2000-10-04,142.875000,144.250000,141.750000,143.687500,2.500000,2.328977,1,0.568679,0.531250,-5.875000,...,45.008044,0.899994,1.598306,0,0.073942,-8.100006,-3.900024,41.717915,144.187500,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-08-25,438.679993,441.299988,435.000000,439.970001,6.299988,4.925253,1,0.294066,3.470001,-16.950012,...,50.766595,8.700073,12.794324,0,0.453176,25.000000,-49.300049,45.296089,442.760010,1
2023-08-28,442.239990,443.399994,439.970001,442.760010,3.429993,4.818449,1,0.117588,3.420013,-15.029999,...,51.763433,6.000000,12.623303,1,0.125295,24.599976,-52.599976,48.752948,449.160004,1
2023-08-29,442.649994,449.450012,442.459991,449.160004,6.990021,4.973561,1,1.470690,11.010010,-7.320007,...,55.567108,19.599976,13.121636,1,0.969809,40.099976,-4.199951,56.795240,451.010010,1
2023-08-30,449.510010,451.670013,448.779999,451.010010,2.890015,4.824736,1,0.333697,7.980011,0.880005,...,57.179416,11.800049,13.027237,1,0.428722,25.800049,6.900024,59.654636,450.350006,0


In [41]:
# collect the column names used as model inputs (get_features leaves out 'Tomorrow' and 'Target')
features = get_features(merged_df)
print(features)

['Open', 'High', 'Low', 'Close', 'Range', 'ATR', 'Up', 'Percent Change', 'MOM 5', 'MOM 20', 'RSI 14', 'Oil Range', 'Oil ATR', 'Oil Up', 'Oil Percent Change', 'Oil MOM 5', 'Oil MOM 20', 'Oil RSI 14', 'Gold Range', 'Gold ATR', 'Gold Up', 'Gold Percent Change', 'Gold MOM 5', 'Gold MOM 20', 'Gold RSI 14']


In [44]:
# fit a random forest on the older rows, then display the confusion matrix
# for the held-out rows and the fitted feature importances

# hyperparameters
n_estimators, min_samples_split, max_depth = 700, 400, 4

model = RandomForestClassifier(
    n_estimators=n_estimators,
    min_samples_split=min_samples_split,
    max_depth=max_depth,
    random_state=1,
)

# positional split: the last 1000 rows are held out for testing
train, test = merged_df.iloc[:-1000], merged_df.iloc[-1000:]

# fit and train model
model.fit(train[features], train['Target'])
predictions = model.predict(test[features])

# predictions on the training rows themselves (no metric is computed from them here)
predictions_training = model.predict(train[features])

# rows are the actual classes, columns are the predicted classes
cm = confusion_matrix(test['Target'], predictions)

print(cm)
print()
# importances are printed in the same order as the `features` list
print(model.feature_importances_)


[[366  96]
 [419 119]]

[0.0259923  0.03919834 0.02600499 0.02898092 0.05148255 0.04641173
 0.0066624  0.02932279 0.05451818 0.04966133 0.0565561  0.08234434
 0.0522897  0.00159372 0.03276679 0.02839118 0.11412488 0.03577927
 0.01743284 0.04981208 0.00043671 0.01624476 0.0246803  0.05775202
 0.07155978]
