In [43]:
# %pip install xgboost

%pip install tensorflow keras

Collecting tensorflow
  Downloading tensorflow-2.16.1-cp311-cp311-macosx_12_0_arm64.whl.metadata (4.1 kB)
Collecting keras
  Downloading keras-3.3.3-py3-none-any.whl.metadata (5.7 kB)
Collecting absl-py>=1.0.0 (from tensorflow)
  Downloading absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow)
  Downloading gast-0.5.4-py3-none-any.whl.metadata (1.3 kB)
Collecting google-pasta>=0.1.1 (from tensorflow)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting h5py>=3.10.0 (from tensorflow)
  Downloading h5py-3.11.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (2.5 kB)
Collecting libclang>=13.0.0 (from tensorflow)
  Downloading libclang-18.1.1-py2.py3-none-macosx_11_0_arm64.whl.metadata (5.2 kB)
Collecting ml-dtypes~=0.3.1 (from tensorflow)
  Downloading ml_dtypes-0.3.2-cp311-cp311-macosx_10_

In [62]:
import yfinance as yf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.feature_selection import RFE
from sklearn.model_selection import GridSearchCV

# Fetch ETH data
eth_data = yf.download('ETH-USD', start='2020-01-01', end='2024-06-01')

# Calculate EMA
eth_data['EMA_12'] = eth_data['Close'].ewm(span=12, adjust=False).mean()
eth_data['EMA_26'] = eth_data['Close'].ewm(span=26, adjust=False).mean()

# Calculate MACD
eth_data['MACD'] = eth_data['EMA_12'] - eth_data['EMA_26']
eth_data['Signal_Line'] = eth_data['MACD'].ewm(span=9, adjust=False).mean()

# Calculate RSI
def calculate_rsi(data, window):
    delta = data['Close'].diff(1)
    gain = (delta.where(delta > 0, 0)).rolling(window=window).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=window).mean()
    rs = gain / loss
    rsi = 100 - (100 / (1 + rs))
    return rsi

eth_data['RSI'] = calculate_rsi(eth_data, 14)

eth_data.dropna(inplace=True)
eth_data.head()

import numpy as np

# Bollinger Bands
eth_data['BB_Middle'] = eth_data['Close'].rolling(window=20).mean()
eth_data['BB_Upper'] = eth_data['BB_Middle'] + (eth_data['Close'].rolling(window=20).std() * 2)
eth_data['BB_Lower'] = eth_data['BB_Middle'] - (eth_data['Close'].rolling(window=20).std() * 2)

# Stochastic Oscillator
low_14 = eth_data['Low'].rolling(window=14).min()
high_14 = eth_data['High'].rolling(window=14).max()
eth_data['Stochastic'] = ((eth_data['Close'] - low_14) / (high_14 - low_14)) * 100

# Average True Range (ATR)
high_low = eth_data['High'] - eth_data['Low']
high_close = np.abs(eth_data['High'] - eth_data['Close'].shift())
low_close = np.abs(eth_data['Low'] - eth_data['Close'].shift())
tr = high_low.combine(high_close, max).combine(low_close, max)
eth_data['ATR'] = tr.rolling(window=14).mean()

# On-Balance Volume (OBV)
eth_data['OBV'] = (np.sign(eth_data['Close'].diff()) * eth_data['Volume']).fillna(0).cumsum()

# MACD Histogram
eth_data['MACD_Hist'] = eth_data['MACD'] - eth_data['Signal_Line']

# Volume-weighted Average Price (VWAP)
vwap = (eth_data['Volume'] * (eth_data['High'] + eth_data['Low'] + eth_data['Close']) / 3).cumsum() / eth_data['Volume'].cumsum()
eth_data['VWAP'] = vwap

eth_data.dropna(inplace=True)

# Commodity Channel Index (CCI)
def calculate_cci(data, ndays): 
    TP = (data['High'] + data['Low'] + data['Close']) / 3 
    CCI = pd.Series((TP - TP.rolling(ndays).mean()) / (0.015 * TP.rolling(ndays).std()), name = 'CCI') 
    return CCI

# Chaikin Money Flow (CMF)
def calculate_cmf(data, ndays):
    mfv = ((data['Close'] - data['Low']) - (data['High'] - data['Close'])) / (data['High'] - data['Low']) * data['Volume']
    cmf = mfv.rolling(ndays).sum() / data['Volume'].rolling(ndays).sum()
    return cmf

# Money Flow Index (MFI)
def calculate_mfi(data, window):
    typical_price = (data['High'] + data['Low'] + data['Close']) / 3
    raw_money_flow = typical_price * data['Volume']
    positive_flow = raw_money_flow.copy()
    negative_flow = raw_money_flow.copy()
    positive_flow[data['Close'] <= data['Close'].shift(1)] = 0
    negative_flow[data['Close'] > data['Close'].shift(1)] = 0
    positive_mf = positive_flow.rolling(window).sum()
    negative_mf = negative_flow.rolling(window).sum()
    mfi = 100 - (100 / (1 + positive_mf / negative_mf))
    return mfi

# Additional features
eth_data['RSI_7'] = calculate_rsi(eth_data, 7)
eth_data['RSI_21'] = calculate_rsi(eth_data, 21)
eth_data['Momentum'] = eth_data['Close'].diff(10)
eth_data['ROC'] = eth_data['Close'].pct_change(periods=10) * 100
eth_data['CCI'] = calculate_cci(eth_data, 20)
eth_data['Williams_%R'] = ((high_14 - eth_data['Close']) / (high_14 - low_14)) * -100
eth_data['CMF'] = calculate_cmf(eth_data, 20)
eth_data['MFI'] = calculate_mfi(eth_data, 14)
eth_data['Force_Index'] = eth_data['Close'].diff(1) * eth_data['Volume']

eth_data.dropna(inplace=True)



[*********************100%%**********************]  1 of 1 completed


In [63]:
def ichimoku_cloud(data):
    high_9 = data['High'].rolling(window=9).max()
    low_9 = data['Low'].rolling(window=9).min()
    high_26 = data['High'].rolling(window=26).max()
    low_26 = data['Low'].rolling(window=26).min()
    high_52 = data['High'].rolling(window=52).max()
    low_52 = data['Low'].rolling(window=52).min()

    data['Tenkan_Sen'] = (high_9 + low_9) / 2
    data['Kijun_Sen'] = (high_26 + low_26) / 2
    data['Senkou_Span_A'] = ((data['Tenkan_Sen'] + data['Kijun_Sen']) / 2).shift(26)
    data['Senkou_Span_B'] = ((high_52 + low_52) / 2).shift(26)
    data['Chikou_Span'] = data['Close'].shift(-26)
    return data

eth_data = ichimoku_cloud(eth_data)

In [64]:
def calculate_rvi(data, period=14):
    close_open = data['Close'] - data['Open']
    high_low = data['High'] - data['Low']
    
    rvi = pd.Series((close_open.rolling(window=period).mean() / high_low.rolling(window=period).mean()), name='RVI')
    data['RVI'] = rvi
    return data

eth_data = calculate_rvi(eth_data)


In [65]:
def calculate_keltner_channel(data, period=20):
    typical_price = (data['High'] + data['Low'] + data['Close']) / 3
    ema_tp = typical_price.ewm(span=period, adjust=False).mean()
    atr = data['High'] - data['Low']
    data['Keltner_Upper'] = ema_tp + (2 * atr)
    data['Keltner_Lower'] = ema_tp - (2 * atr)
    return data

eth_data = calculate_keltner_channel(eth_data)


In [66]:
def calculate_donchian_channel(data, period=20):
    data['Donchian_Upper'] = data['High'].rolling(window=period).max()
    data['Donchian_Lower'] = data['Low'].rolling(window=period).min()
    return data

eth_data = calculate_donchian_channel(eth_data)


In [67]:
def calculate_force_index(data, period=13):
    force_index = data['Close'].diff(period) * data['Volume']
    data['Force_Index'] = force_index
    return data

eth_data = calculate_force_index(eth_data)


In [68]:
def calculate_vortex(data, period=14):
    tr = pd.Series(np.maximum((data['High'] - data['Low']), 
                              np.maximum(abs(data['High'] - data['Close'].shift(1)), 
                                         abs(data['Low'] - data['Close'].shift(1)))), name='TR')
    vmp = abs(data['High'] - data['Low'].shift(1))
    vmm = abs(data['Low'] - data['High'].shift(1))
    
    vip = vmp.rolling(window=period).sum() / tr.rolling(window=period).sum()
    vim = vmm.rolling(window=period).sum() / tr.rolling(window=period).sum()
    
    data['Vortex_Positive'] = vip
    data['Vortex_Negative'] = vim
    return data

eth_data = calculate_vortex(eth_data)


In [69]:
print(eth_data.shape)
print(eth_data.index.min(), eth_data.index.max())

(1561, 40)
2020-02-22 00:00:00 2024-05-31 00:00:00


In [70]:
# Fetch S&P 500 data
sp500_data = yf.download('^GSPC', start='2020-01-01', end='2024-06-01')

# We are only interested in the closing prices
sp500_data = sp500_data[['Close']].rename(columns={'Close': 'SP500_Close'})


[*********************100%%**********************]  1 of 1 completed


In [71]:
# Merge S&P 500 data with ETH data on the same dates
eth_sp500_data = eth_data[['Close']].rename(columns={'Close': 'ETH_Close'})
merged_data = pd.merge(eth_sp500_data, sp500_data, left_index=True, right_index=True, how='inner')

# Calculate rolling correlations
correlation_periods = [7, 14, 21, 28]
for period in correlation_periods:
    merged_data[f'Corr_{period}'] = merged_data['ETH_Close'].rolling(window=period).corr(merged_data['SP500_Close'])

# Drop NaN values resulting from rolling calculations
merged_data.dropna(inplace=True)


In [72]:
# Add correlation features to the original ETH data
eth_data = eth_data.join(merged_data[[f'Corr_{period}' for period in correlation_periods]])
eth_data.dropna(inplace=True)


In [73]:
print(eth_data.shape)
print(eth_data.index.min(), eth_data.index.max())

(1003, 44)
2020-05-11 00:00:00 2024-05-03 00:00:00


In [74]:
# Define the indices to fetch
indices = {
    'DJIA': '^DJI',
    'NASDAQ': '^IXIC',
    # 'FTSE': '^FTSE',
    # 'DAX': '^GDAXI',
    # 'Nikkei': '^N225',
    # 'Hang_Seng': '^HSI',
    # 'Crude_Oil': 'CL=F',
    'Gold': 'GC=F',
    # 'Dollar_Index': 'DX-Y.NYB'
}

# Fetch data for each index
index_data = {}
for name, ticker in indices.items():
    index_data[name] = yf.download(ticker, start='2020-01-01', end='2024-06-01')['Close'].rename(f'{name}_Close')
    
# Merge all index data into a single DataFrame
index_data_df = pd.concat(index_data.values(), axis=1)

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


In [75]:
index_data_df.head()

Unnamed: 0_level_0,DJIA_Close,NASDAQ_Close,Gold_Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-01-02,28868.800781,9092.19043,1524.5
2020-01-03,28634.880859,9020.769531,1549.199951
2020-01-06,28703.380859,9071.469727,1566.199951
2020-01-07,28583.679688,9068.580078,1571.800049
2020-01-08,28745.089844,9129.240234,1557.400024


In [76]:
index_data_df.isna().sum()

DJIA_Close      1
NASDAQ_Close    1
Gold_Close      0
dtype: int64

In [77]:
# Merge ETH data with index data
eth_index_data = eth_data[['Close']].rename(columns={'Close': 'ETH_Close'})
merged_data = pd.merge(eth_index_data, index_data_df, left_index=True, right_index=True, how='inner')

# Calculate rolling correlations for each index
correlation_periods = [7, 14, 21, 28]
for name in indices.keys():
    for period in correlation_periods:
        merged_data[f'Corr_{name}_{period}'] = merged_data['ETH_Close'].rolling(window=period).corr(merged_data[f'{name}_Close'])

# Drop NaN values resulting from rolling calculations
# merged_data.dropna(inplace=True)

In [78]:
merged_data.head()

Unnamed: 0_level_0,ETH_Close,DJIA_Close,NASDAQ_Close,Gold_Close,Corr_DJIA_7,Corr_DJIA_14,Corr_DJIA_21,Corr_DJIA_28,Corr_NASDAQ_7,Corr_NASDAQ_14,Corr_NASDAQ_21,Corr_NASDAQ_28,Corr_Gold_7,Corr_Gold_14,Corr_Gold_21,Corr_Gold_28
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2020-05-11,185.912842,24221.990234,9192.339844,1695.300049,,,,,,,,,,,,
2020-05-12,189.3125,23764.779297,9002.549805,1704.400024,,,,,,,,,,,,
2020-05-13,199.193283,23247.970703,8863.169922,1713.900024,,,,,,,,,,,,
2020-05-14,202.949097,23625.339844,8943.719727,1738.099976,,,,,,,,,,,,
2020-05-15,195.622665,23685.419922,9014.55957,1753.400024,,,,,,,,,,,,


In [79]:
merged_data.isna().sum()

ETH_Close          0
DJIA_Close         0
NASDAQ_Close       0
Gold_Close         0
Corr_DJIA_7        6
Corr_DJIA_14      13
Corr_DJIA_21      20
Corr_DJIA_28      27
Corr_NASDAQ_7      6
Corr_NASDAQ_14    13
Corr_NASDAQ_21    20
Corr_NASDAQ_28    27
Corr_Gold_7        6
Corr_Gold_14      13
Corr_Gold_21      20
Corr_Gold_28      27
dtype: int64

In [80]:
merged_data.isnull().sum()

ETH_Close          0
DJIA_Close         0
NASDAQ_Close       0
Gold_Close         0
Corr_DJIA_7        6
Corr_DJIA_14      13
Corr_DJIA_21      20
Corr_DJIA_28      27
Corr_NASDAQ_7      6
Corr_NASDAQ_14    13
Corr_NASDAQ_21    20
Corr_NASDAQ_28    27
Corr_Gold_7        6
Corr_Gold_14      13
Corr_Gold_21      20
Corr_Gold_28      27
dtype: int64

In [81]:
merged_data.shape

(1003, 16)

In [82]:
# Add correlation features to the original ETH data
for name in indices.keys():
    for period in correlation_periods:
        eth_data[f'Corr_{name}_{period}'] = merged_data[f'Corr_{name}_{period}']

eth_data.dropna(inplace=True)


In [83]:
eth_data.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,EMA_12,EMA_26,MACD,Signal_Line,...,Corr_DJIA_21,Corr_DJIA_28,Corr_NASDAQ_7,Corr_NASDAQ_14,Corr_NASDAQ_21,Corr_NASDAQ_28,Corr_Gold_7,Corr_Gold_14,Corr_Gold_21,Corr_Gold_28
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-06-18,232.898697,234.570648,228.951431,232.101166,232.101166,6713800872,235.106484,230.643579,4.462905,7.029998,...,0.799682,0.871491,0.415533,0.1227,0.77253,0.855128,-0.616467,-0.400021,-0.242599,-0.136306
2020-06-19,231.954971,232.154114,226.795181,227.13829,227.13829,6946372590,233.880608,230.383928,3.49668,6.323334,...,0.77991,0.877083,-0.339848,0.1188,0.714548,0.844781,-0.410506,-0.674338,-0.149855,-0.251909
2020-06-22,229.003372,243.776016,228.934738,242.533188,242.533188,9079586552,234.067714,231.117785,2.949929,4.566757,...,0.727426,0.865163,0.105267,0.17095,0.672941,0.833719,0.439227,-0.481528,-0.067796,-0.269292
2020-06-23,242.537018,244.86441,239.759735,244.142151,244.142151,6624530348,235.617627,232.082552,3.535075,4.360421,...,0.66007,0.851612,0.769549,0.315898,0.63089,0.82178,0.750468,-0.211824,0.098633,-0.197483
2020-06-24,244.185928,248.508026,232.807739,235.772461,235.772461,8815030025,235.641448,232.355879,3.285569,4.14545,...,0.587545,0.830252,0.78717,0.451944,0.541788,0.805858,0.642053,-0.214069,-0.020459,-0.132861


In [84]:
print(eth_data.shape)
eth_data.dropna(inplace=True)
print(eth_data.shape)

(976, 56)
(976, 56)


In [85]:
eth_data

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,EMA_12,EMA_26,MACD,Signal_Line,...,Corr_DJIA_21,Corr_DJIA_28,Corr_NASDAQ_7,Corr_NASDAQ_14,Corr_NASDAQ_21,Corr_NASDAQ_28,Corr_Gold_7,Corr_Gold_14,Corr_Gold_21,Corr_Gold_28
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-06-18,232.898697,234.570648,228.951431,232.101166,232.101166,6713800872,235.106484,230.643579,4.462905,7.029998,...,0.799682,0.871491,0.415533,0.122700,0.772530,0.855128,-0.616467,-0.400021,-0.242599,-0.136306
2020-06-19,231.954971,232.154114,226.795181,227.138290,227.138290,6946372590,233.880608,230.383928,3.496680,6.323334,...,0.779910,0.877083,-0.339848,0.118800,0.714548,0.844781,-0.410506,-0.674338,-0.149855,-0.251909
2020-06-22,229.003372,243.776016,228.934738,242.533188,242.533188,9079586552,234.067714,231.117785,2.949929,4.566757,...,0.727426,0.865163,0.105267,0.170950,0.672941,0.833719,0.439227,-0.481528,-0.067796,-0.269292
2020-06-23,242.537018,244.864410,239.759735,244.142151,244.142151,6624530348,235.617627,232.082552,3.535075,4.360421,...,0.660070,0.851612,0.769549,0.315898,0.630890,0.821780,0.750468,-0.211824,0.098633,-0.197483
2020-06-24,244.185928,248.508026,232.807739,235.772461,235.772461,8815030025,235.641448,232.355879,3.285569,4.145450,...,0.587545,0.830252,0.787170,0.451944,0.541788,0.805858,0.642053,-0.214069,-0.020459,-0.132861
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-04-29,3262.340820,3285.468750,3116.199951,3215.428955,3215.428955,15032246816,3196.545648,3240.960923,-44.415275,-66.236104,...,0.709003,0.772927,0.474255,0.712441,0.764125,0.808132,-0.689733,-0.459548,-0.477669,-0.640666
2024-04-30,3215.381104,3249.378418,2918.228760,3012.286865,3012.286865,18266894653,3168.198143,3224.022104,-55.823961,-64.153675,...,0.704735,0.779383,0.069502,0.692764,0.752990,0.812635,0.866043,-0.123491,-0.280332,-0.593374
2024-05-01,3011.015625,3020.173340,2815.923340,2969.784668,2969.784668,20005057445,3137.672993,3205.189701,-67.516708,-64.826282,...,0.764990,0.785961,0.511074,0.413389,0.775077,0.817704,0.890584,-0.015702,-0.189027,-0.545821
2024-05-02,2969.794434,3015.050293,2894.329834,2988.168457,2988.168457,13163903903,3114.672295,3189.114053,-74.441758,-66.749377,...,0.786732,0.798992,0.460204,0.138136,0.763070,0.823772,0.950126,0.077838,-0.079934,-0.554518


In [86]:
# Create target variable (1 if next day's close price is higher, else 0)
eth_data['Target'] = (eth_data['Close'].shift(-1) > eth_data['Close']).astype(int)

# Features including the new ones
features = [
    'Open', 'High', 'Low', 'Close', 'Volume',
    'MACD', 'Signal_Line', 'RSI',
    'BB_Middle', 'BB_Upper', 'BB_Lower',
    'Stochastic', 'ATR', 'OBV', 'MACD_Hist', 'VWAP',
    'RSI_7', 'RSI_21', 'Momentum', 'ROC', 'CCI', 'Williams_%R', 'CMF', 'MFI', 'Force_Index',
    'Tenkan_Sen', 'Kijun_Sen', 'Senkou_Span_A', 'Senkou_Span_B', 'Chikou_Span', 'RVI',
    'Keltner_Upper', 'Keltner_Lower', 'Donchian_Upper', 'Donchian_Lower',
    'Vortex_Positive', 'Vortex_Negative',
    'Corr_7', 'Corr_14', 'Corr_21', 'Corr_28'
] + [f'Corr_{name}_{period}' for name in indices.keys() for period in correlation_periods]

X = eth_data[features]
y = eth_data['Target']

# Drop the last row as it will have NaN target value
X = X[:-1]
y = y[:-1]

# Polynomial Features
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
poly_features = poly.fit_transform(X)
poly_feature_names = poly.get_feature_names_out(X.columns)

# Create a DataFrame with the new features
poly_df = pd.DataFrame(poly_features, columns=poly_feature_names, index=X.index)

# Combine with the original features
X_poly = pd.concat([X, poly_df], axis=1)

# Recursive Feature Elimination
rf = RandomForestClassifier(n_estimators=100, random_state=42)
# rfe = RFE(estimator=rf, n_features_to_select=20)
# X_rfe = rfe.fit_transform(X_poly, y)
# selected_features = X_poly.columns[rfe.support_]

selected_features = ['OBV VWAP', 'VWAP Keltner_Lower', 'RSI_7 Vortex_Negative',
       'Williams_%R RVI', 'Williams_%R Corr_DJIA_21', 'CMF Corr_Gold_14',
       'Vortex_Positive Vortex_Negative', 'Corr_DJIA_7 Corr_Gold_7',
       'Corr_DJIA_7 Corr_Gold_14', 'Corr_DJIA_21 Corr_DJIA_28']

# selected_features = ['Stochastic Vortex_Negative', 'OBV VWAP', 'MACD_Hist ROC',
#        'VWAP Keltner_Lower', 'VWAP Vortex_Negative', 'RSI_7 Vortex_Negative',
#        'RSI_21 Vortex_Negative', 'CCI RVI', 'Williams_%R Kijun_Sen',
#        'Williams_%R RVI', 'Williams_%R Corr_DJIA_21', 'CMF Corr_Gold_14',
#        'Vortex_Positive Vortex_Negative', 'Corr_DJIA_7 Corr_DJIA_28',
#        'Corr_DJIA_7 Corr_Gold_7', 'Corr_DJIA_7 Corr_Gold_14',
#        'Corr_DJIA_7 Corr_Gold_28', 'Corr_DJIA_21 Corr_DJIA_28',
#        'Corr_DJIA_21 Corr_NASDAQ_28', 'Corr_Gold_7 Corr_Gold_14']

X_selected = X_poly[selected_features]

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_selected)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Hyperparameter tuning with GridSearchCV
# param_grid = {
#     'n_estimators': [100, 200, 300],
#     'max_features': ['sqrt', 'log2', None],
#     'max_depth': [4, 6, 8, 10, None],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4]
# }
# grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
# grid_search.fit(X_train, y_train)
# best_params = grid_search.best_params_
best_params = {}

# Train with best parameters
rf_best = RandomForestClassifier(**best_params, random_state=42)
rf_best.fit(X_train, y_train)

# Initialize the models
log_clf = LogisticRegression(random_state=42)
svm_clf = SVC(probability=True, random_state=42)
rf_clf = RandomForestClassifier(**best_params, random_state=42)

# Create a voting classifier
voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('svc', svm_clf), ('rf', rf_clf)],
    voting='soft'
)

# Train the voting classifier
voting_clf.fit(X_train, y_train)

# Predict and evaluate the model
y_pred = voting_clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.6153846153846154
Classification Report:
               precision    recall  f1-score   support

           0       0.58      0.56      0.57        88
           1       0.65      0.66      0.65       107

    accuracy                           0.62       195
   macro avg       0.61      0.61      0.61       195
weighted avg       0.61      0.62      0.61       195



In [87]:
# Predict and evaluate the model
y_pred = rf_best.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.5846153846153846
Classification Report:
               precision    recall  f1-score   support

           0       0.54      0.55      0.54        88
           1       0.62      0.62      0.62       107

    accuracy                           0.58       195
   macro avg       0.58      0.58      0.58       195
weighted avg       0.59      0.58      0.58       195



In [88]:
from xgboost import XGBClassifier

m = XGBClassifier()
m.fit(X_train, y_train)

y_pred = m.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.5538461538461539
Classification Report:
               precision    recall  f1-score   support

           0       0.51      0.56      0.53        88
           1       0.60      0.55      0.58       107

    accuracy                           0.55       195
   macro avg       0.55      0.55      0.55       195
weighted avg       0.56      0.55      0.55       195



In [89]:
from xgboost import XGBRFClassifier

m = XGBRFClassifier()
m.fit(X_train, y_train)

y_pred = m.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.6051282051282051
Classification Report:
               precision    recall  f1-score   support

           0       0.57      0.53      0.55        88
           1       0.63      0.66      0.65       107

    accuracy                           0.61       195
   macro avg       0.60      0.60      0.60       195
weighted avg       0.60      0.61      0.60       195



In [91]:
# Initialize the XGBoost classifier
xgb_clf = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5, 6],
    'min_child_weight': [1, 3, 5],
    'gamma': [0, 0.1, 0.2, 0.3],
    'subsample': [0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.7, 0.8, 0.9, 1.0],
    'reg_alpha': [0, 0.01, 0.1, 1],
    'reg_lambda': [0, 0.01, 0.1, 1]
}

# Perform Grid Search with 5-fold cross-validation
grid_search = GridSearchCV(estimator=xgb_clf, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Print the best parameters and the best score
print("Best parameters found: ", grid_search.best_params_)
print("Best accuracy found: ", grid_search.best_score_)

# Train the best model
best_xgb = grid_search.best_estimator_
best_xgb.fit(X_train, y_train)

# Make predictions and evaluate the model
y_pred = best_xgb.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Test Accuracy: {accuracy:.4f}')

Fitting 5 folds for each of 110592 candidates, totalling 552960 fits


In [56]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import StandardScaler

# Build the model
model = Sequential()
model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Define early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the model
history = model.fit(X_train, y_train, epochs=100, batch_size=16, validation_split=0.2, callbacks=[early_stopping])

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy:.4f}')


Epoch 1/100
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.5794 - loss: 0.7543 - val_accuracy: 0.5192 - val_loss: 0.7105
Epoch 2/100
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.5130 - loss: 0.7686 - val_accuracy: 0.5449 - val_loss: 0.6940
Epoch 3/100
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.5343 - loss: 0.7248 - val_accuracy: 0.5769 - val_loss: 0.6930
Epoch 4/100
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 936us/step - accuracy: 0.5335 - loss: 0.7208 - val_accuracy: 0.5128 - val_loss: 0.6893
Epoch 5/100
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.5341 - loss: 0.7029 - val_accuracy: 0.5449 - val_loss: 0.6885
Epoch 6/100
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 991us/step - accuracy: 0.4995 - loss: 0.7368 - val_accuracy: 0.5769 - val_loss: 0.6891
Epoch 7/100
[1m39/39[0m [32

In [57]:
from tensorflow.keras.layers import Conv1D, GlobalAveragePooling1D, Reshape

# Reshape data for 1D CNN
X_train_cnn = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test_cnn = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

# Build the model
cnn_model = Sequential()
cnn_model.add(Conv1D(64, kernel_size=3, activation='relu', input_shape=(X_train_cnn.shape[1], 1)))
cnn_model.add(GlobalAveragePooling1D())
cnn_model.add(Dense(32, activation='relu'))
cnn_model.add(Dropout(0.5))
cnn_model.add(Dense(1, activation='sigmoid'))

# Compile the model
cnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history_cnn = cnn_model.fit(X_train_cnn, y_train, epochs=100, batch_size=16, validation_split=0.2, callbacks=[early_stopping])

# Evaluate the model
loss, accuracy = cnn_model.evaluate(X_test_cnn, y_test)
print(f'Test Accuracy: {accuracy:.4f}')


Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.5394 - loss: 0.6925 - val_accuracy: 0.4423 - val_loss: 0.6949
Epoch 2/100
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.5344 - loss: 0.6911 - val_accuracy: 0.4487 - val_loss: 0.6955
Epoch 3/100
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.5447 - loss: 0.6898 - val_accuracy: 0.4487 - val_loss: 0.6968
Epoch 4/100
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 925us/step - accuracy: 0.5410 - loss: 0.6851 - val_accuracy: 0.4551 - val_loss: 0.6962
Epoch 5/100
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.5248 - loss: 0.6907 - val_accuracy: 0.5128 - val_loss: 0.6937
Epoch 6/100
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 949us/step - accuracy: 0.5179 - loss: 0.6916 - val_accuracy: 0.5256 - val_loss: 0.6910
Epoch 7/100
[1m39/39[0m [32m━━━━━━━━━━━

In [59]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import StandardScaler

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_poly)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Build the model
model = Sequential()
model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Define early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the model
history = model.fit(X_train, y_train, epochs=100, batch_size=16, validation_split=0.2, callbacks=[early_stopping])

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy:.4f}')


Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.4983 - loss: 1.2519 - val_accuracy: 0.5705 - val_loss: 0.6863
Epoch 2/100
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5473 - loss: 1.0972 - val_accuracy: 0.5449 - val_loss: 0.6998
Epoch 3/100
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.4985 - loss: 1.1533 - val_accuracy: 0.4551 - val_loss: 0.6888
Epoch 4/100
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.4711 - loss: 0.9777 - val_accuracy: 0.5385 - val_loss: 0.6769
Epoch 5/100
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.4948 - loss: 0.8936 - val_accuracy: 0.5256 - val_loss: 0.6815
Epoch 6/100
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5276 - loss: 0.8326 - val_accuracy: 0.5192 - val_loss: 0.6902
Epoch 7/100
[1m39/39[0m [32m━━━━━━━━━━━━━━━

In [61]:
print(X_train.shape)

(780, 1484)


In [None]:
# poly selected features
['OBV VWAP', 'VWAP Keltner_Lower', 'RSI_7 Vortex_Negative',
       'Williams_%R RVI', 'Williams_%R Corr_DJIA_21', 'CMF Corr_Gold_14',
       'Vortex_Positive Vortex_Negative', 'Corr_DJIA_7 Corr_Gold_7',
       'Corr_DJIA_7 Corr_Gold_14', 'Corr_DJIA_21 Corr_DJIA_28']

['Stochastic Vortex_Negative', 'OBV VWAP', 'MACD_Hist ROC',
       'VWAP Keltner_Lower', 'VWAP Vortex_Negative', 'RSI_7 Vortex_Negative',
       'RSI_21 Vortex_Negative', 'CCI RVI', 'Williams_%R Kijun_Sen',
       'Williams_%R RVI', 'Williams_%R Corr_DJIA_21', 'CMF Corr_Gold_14',
       'Vortex_Positive Vortex_Negative', 'Corr_DJIA_7 Corr_DJIA_28',
       'Corr_DJIA_7 Corr_Gold_7', 'Corr_DJIA_7 Corr_Gold_14',
       'Corr_DJIA_7 Corr_Gold_28', 'Corr_DJIA_21 Corr_DJIA_28',
       'Corr_DJIA_21 Corr_NASDAQ_28', 'Corr_Gold_7 Corr_Gold_14']