In [60]:
import pandas as pd
import numpy as np
from scipy.stats import skew, kurtosis
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, BayesianRidge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [61]:
btc_df = pd.read_csv('data/btc_201901_202411.csv')
print(btc_df.shape)
btc_df.head(2)

(2136, 7)


Unnamed: 0.1,Unnamed: 0,date,open,high,low,close,volume
0,0,2019-01-01,3693.85,3845.46,3629.66,3823.44,5149.606277
1,1,2019-01-02,3825.41,3918.67,3770.0,3885.87,5534.469515


In [62]:
## volume in usd
btc_df['avg_price_day'] = (btc_df['open']+btc_df['high']+btc_df['low']+btc_df['close'])/4
btc_df['volume_usd'] = btc_df['volume'] * btc_df['avg_price_day']
btc_df.drop(['Unnamed: 0','volume','avg_price_day'], axis=1, inplace=True)

In [63]:
btc_df.tail(5)

Unnamed: 0,date,open,high,low,close,volume_usd
2131,2024-11-01,70202.0,71596.0,68768.0,69490.0,190261100.0
2132,2024-11-02,69468.0,69887.0,69015.0,69330.0,72696630.0
2133,2024-11-03,69339.0,69354.0,67459.0,68738.0,131636000.0
2134,2024-11-04,68738.0,69444.0,66798.0,67818.0,173113600.0
2135,2024-11-05,67817.0,70538.0,67443.0,69386.0,163095000.0


In [64]:
btc_df['return_today'] = btc_df['close'].pct_change()

for i in range(1, 9):
    btc_df[f'close_{i}'] = btc_df['close'].shift(i)
    btc_df[f'volume_{i}'] = btc_df['volume_usd'].shift(i)
    
btc_df['high_pct'] = ((btc_df['high'] - btc_df['close_1']) / btc_df['close_1']) * 100
btc_df['low_pct'] = ((btc_df['low'] - btc_df['close_1']) / btc_df['close_1']) * 100

## log returns
btc_df['log_return_today'] = np.log(btc_df['close'] / btc_df['close'].shift(1))
for i in range(1, 8):
    btc_df[f'log_return_{i}'] = np.log(btc_df[f'close_{i}'] / btc_df[f'close_{i+1}'])

btc_df['close_tomorrow'] = btc_df['close'].shift(-1)
btc_df['log_return_tomorrow'] = np.log(btc_df['close_tomorrow'] / btc_df['close'])
    
## price moving average & change in ema
btc_df['ema_7'] = btc_df['close'].ewm(span=3, adjust=False).mean()
btc_df['ema_14'] = btc_df['close'].ewm(span=7, adjust=False).mean()
btc_df['ema_7_pct'] = btc_df['ema_7'].pct_change()
btc_df['ema_14_pct'] = btc_df['ema_14'].pct_change()


## volume moving avg
btc_df['volume_sma_3'] = btc_df['volume_usd'].rolling(window=3).mean()
btc_df['volume_sma_7'] = btc_df['volume_usd'].rolling(window=7).mean()
btc_df['volume_sma_3_pct'] = btc_df['volume_sma_3'].pct_change()
btc_df['volume_sma_7_pct'] = btc_df['volume_sma_7'].pct_change()

## Momentum indicator
btc_df['roc_7'] = ((btc_df['close'] - btc_df['close_6']) / btc_df['close_7']) * 100


## volatility features
btc_df['volatility_3'] = btc_df['close'].rolling(window=3).std()
btc_df['volatility_7'] = btc_df['close'].rolling(window=7).std()

## STD+Mean -> bbands
btc_df['bb_middle'] = btc_df['close'].rolling(window=20).mean()
btc_df['bb_std'] = btc_df['close'].rolling(window=20).std()
btc_df['bb_upper'] = btc_df['bb_middle'] + (2 * btc_df['bb_std'])
btc_df['bb_lower'] = btc_df['bb_middle'] - (2 * btc_df['bb_std'])

## ATR
btc_df['tr'] = btc_df[['high', 'low', 'close']].apply(
    lambda x: max(x['high'] - x['low'], abs(x['high'] - x['close']), abs(x['low'] - x['close'])), axis=1)
btc_df['atr'] = btc_df['tr'].rolling(window=14).mean()


##Stochaistic Oscillators
lowest_low = btc_df['low'].rolling(window=14).min()
highest_high = btc_df['high'].rolling(window=14).max()
btc_df['stochastic'] = ((btc_df['close'] - lowest_low) / (highest_high - lowest_low)) * 100

## Highlow price diifferences
btc_df['daily_range'] = btc_df['high'] - btc_df['low']
btc_df['range_pct'] = btc_df['daily_range'] / btc_df['close'] * 100

## volume price interactions
# obv
btc_df['obv'] = (np.sign(btc_df['close'].diff()) * btc_df['volume_usd']).fillna(0).cumsum()

#mfi
typical_price = (btc_df['high'] + btc_df['low'] + btc_df['close']) / 3
money_flow = typical_price * btc_df['volume_usd']
positive_flow = money_flow.where(typical_price > typical_price.shift(1), 0)
negative_flow = money_flow.where(typical_price < typical_price.shift(1), 0)
positive_mf = positive_flow.rolling(window=14).sum()
negative_mf = negative_flow.rolling(window=14).sum()
mfi = 100 - (100 / (1 + (positive_mf / negative_mf)))
btc_df['mfi'] = mfi

# cummulative features
btc_df['cumulative_return'] = (1 + btc_df['return_today']).cumprod() - 1

## Statistical features

btc_df['rolling_skew'] = btc_df['close'].rolling(window=7).apply(lambda x: skew(x))
btc_df['rolling_kurtosis'] = btc_df['close'].rolling(window=7).apply(lambda x: kurtosis(x))

## Trend indicators
btc_df['return_today'] = btc_df['close'].pct_change()
for i in range(1, 8):
    btc_df[f'return_{i}'] = btc_df[f'close_{i}'].pct_change()

btc_df.dropna(inplace=True)

In [65]:
btc_df.tail()

Unnamed: 0,date,open,high,low,close,volume_usd,return_today,close_1,volume_1,close_2,...,cumulative_return,rolling_skew,rolling_kurtosis,return_1,return_2,return_3,return_4,return_5,return_6,return_7
2130,2024-10-31,72330.0,72660.0,69591.0,70231.0,126485700.0,-0.029033,72331.0,163720300.0,72729.0,...,17.368537,0.10789,-1.4663,-0.005472,0.040204,0.029144,0.013955,0.006383,-0.023138,0.023333
2131,2024-11-01,70202.0,71596.0,68768.0,69490.0,190261100.0,-0.010551,70231.0,126485700.0,72331.0,...,17.174733,0.024535,-1.157767,-0.029033,-0.005472,0.040204,0.029144,0.013955,0.006383,-0.023138
2132,2024-11-02,69468.0,69887.0,69015.0,69330.0,72696630.0,-0.002302,69490.0,190261100.0,70231.0,...,17.132886,0.330247,-1.024698,-0.010551,-0.029033,-0.005472,0.040204,0.029144,0.013955,0.006383
2133,2024-11-03,69339.0,69354.0,67459.0,68738.0,131636000.0,-0.008539,69330.0,72696630.0,69490.0,...,16.978051,0.663513,-1.1378,-0.002302,-0.010551,-0.029033,-0.005472,0.040204,0.029144,0.013955
2134,2024-11-04,68738.0,69444.0,66798.0,67818.0,173113600.0,-0.013384,68738.0,131636000.0,69330.0,...,16.73743,0.430042,-1.170232,-0.008539,-0.002302,-0.010551,-0.029033,-0.005472,0.040204,0.029144


In [72]:
btc_df = pd.read_csv('data/btc_201901_202411.csv')
btc_df.tail(2)

Unnamed: 0.1,Unnamed: 0,date,open,high,low,close,volume
2134,2134,2024-11-04,68738.0,69444.0,66798.0,67818.0,2538.340802
2135,2135,2024-11-05,67817.0,70538.0,67443.0,69386.0,2370.705183


In [90]:
btc_df.columns

Index(['Unnamed: 0', 'date', 'open', 'high', 'low', 'close', 'volume',
       'open_pct', 'high_pct', 'low_pct', 'close_pct'],
      dtype='object')

In [67]:
df = btc_df[['high_pct','low_pct','log_return_today',
            'ema_7_pct','ema_14_pct',
             'range_pct', 'roc_7',
             'volume_sma_3_pct','volume_sma_7_pct',
             'volatility_3','volatility_7',
             'obv', 'mfi',
             'rolling_skew', 'rolling_kurtosis',
             'log_return_tomorrow'
            ]]

In [68]:
df.tail(10)

Unnamed: 0,high_pct,low_pct,log_return_today,ema_7_pct,ema_14_pct,range_pct,roc_7,volume_sma_3_pct,volume_sma_7_pct,volatility_3,volatility_7,obv,mfi,rolling_skew,rolling_kurtosis,log_return_tomorrow
2125,1.162546,-0.351467,0.006363,-0.000798,-0.000777,1.50441,-2.937433,-0.209632,0.005732,815.951183,878.497744,5207358000.0,51.640001,0.782472,-0.566694,0.013858
2126,1.834246,-0.228348,0.013858,0.006573,0.002897,2.034208,0.866529,-0.261947,0.010936,695.754986,611.41522,5257312000.0,54.669433,0.189242,-1.289131,0.028728
2127,3.356001,-0.584356,0.028728,0.017932,0.009516,3.828771,3.765964,-0.155572,0.029603,1488.39119,1170.592565,5399519000.0,54.589892,1.016171,0.014782,0.039417
2128,5.266169,-0.28891,0.039417,0.029264,0.017405,5.340373,9.094417,1.118615,0.221036,2407.481464,2229.484309,5683744000.0,58.59645,1.110393,-0.032678,-0.005487
2129,0.273619,-1.946954,-0.005487,0.011402,0.011392,2.232791,6.270176,0.238811,0.03659,1521.112203,2488.299821,5520024000.0,60.083807,0.458263,-1.37785,-0.029463
2130,0.454853,-3.788141,-0.029463,-0.009043,0.000944,4.369865,5.359842,-0.02664,-0.017961,1342.162931,2461.557695,5393538000.0,58.468422,0.10789,-1.4663,-0.010607
2131,1.943586,-2.083126,-0.010607,-0.00979,-0.001938,4.06965,3.735468,-0.163578,0.006743,1473.678052,2098.274121,5203277000.0,49.743273,0.024535,-1.157767,-0.002305
2132,0.571305,-0.683552,-0.002305,-0.006083,-0.002028,1.257753,2.077519,-0.189448,0.040423,480.708158,1700.451117,5130580000.0,46.932218,0.330247,-1.024698,-0.008576
2133,0.034617,-2.698687,-0.008576,-0.007304,-0.003646,2.756845,-1.736878,0.013225,0.079338,396.141389,1535.631001,4998944000.0,42.50964,0.663513,-1.1378,-0.013475
2134,1.027088,-2.822311,-0.013475,-0.010322,-0.006054,3.901619,-7.023942,-0.043456,0.027813,761.906381,1822.707484,4825831000.0,41.344852,0.430042,-1.170232,0.022857


In [168]:
btc_df.head(2)

Unnamed: 0.1,Unnamed: 0,date,open,high,low,close,volume,open_log_pct,high_log_pct,low_log_pct,close_log_pct,close_tomorrow_log_pct,ema_7,ema_14,ema_7_pct,ema_14_pct
0,0,2019-01-01,3693.85,3845.46,3629.66,3823.44,5149.606277,,,,,0.016196,3823.44,3823.44,,
1,1,2019-01-02,3825.41,3918.67,3770.0,3885.87,5534.469515,0.000515,0.024602,-0.014076,0.016196,-0.025559,3835.926,3829.115455,0.003266,0.001484


In [254]:
## Feature Engineering 2.0
btc_df = pd.read_csv('data/btc_201901_202411.csv')

btc_df['close_tomro'] = btc_df['close'].shift(-1)
## volume USD
btc_df['volume_usd'] = btc_df['close']*btc_df['volume']

## log volume pct change
btc_df['volume_log_pct'] = np.log(btc_df['volume_usd'] / btc_df['volume_usd'].shift(1))

## log pct
btc_df['open_log_pct'] = np.log(btc_df['open'] / btc_df['close'].shift(1)) * 100
btc_df['high_log_pct'] = np.log(btc_df['high'] / btc_df['close'].shift(1)) * 100
btc_df['low_log_pct'] = np.log(btc_df['low'] / btc_df['close'].shift(1)) * 100
btc_df['close_log_pct'] = np.log(btc_df['close'] / btc_df['close'].shift(1)) *100
btc_df['close_tomorrow_log_pct'] = np.log(btc_df['close'].shift(-1) / btc_df['close']) *100

## lag price and volume
for i in range(1, 9):
    btc_df[f'close_{i}_log_pct'] = btc_df['close_log_pct'].shift(i)
    btc_df[f'volume_{i}_log_pct'] = btc_df['volume_log_pct'].shift(i)

## volume confirmation
btc_df['price_change'] = btc_df['close_log_pct']
btc_df['volume_change'] = btc_df['volume_log_pct']
def categorize_alignment(row):
    if row['price_change'] > 0 and row['volume_change'] > 0:
        return 'Bullish'
    elif row['price_change'] < 0 and row['volume_change'] > 0:
        return 'Bearish'
    else:
        return 'Neutral'
btc_df['price_volume_alignment'] = btc_df.apply(categorize_alignment, axis=1)
btc_df = btc_df.drop(columns=['price_change', 'volume_change'])

## past 7 days price volume alignment
def rolling_count(category):
    return (btc_df['price_volume_alignment'] == category).rolling(window=7, min_periods=1).sum().shift(1)
btc_df['bullish_count_7day'] = rolling_count('Bullish')
btc_df['bearish_count_7day'] = rolling_count('Bearish')
btc_df['neutral_count_7day'] = rolling_count('Neutral')


## EMA 
btc_df['ema_1'] = btc_df['close'].ewm(span=9, adjust=False).mean()
btc_df['ema_2'] = btc_df['close'].ewm(span=21, adjust=False).mean()
btc_df['ema_1_pct'] = btc_df['ema_1'].pct_change()
btc_df['ema_2_pct'] = btc_df['ema_2'].pct_change()

## Price above EMA or Below EMA
btc_df['close_above_ema_1'] = (btc_df['close'] > btc_df['ema_1']).astype(int)
btc_df['close_above_ema_2'] = (btc_df['close'] > btc_df['ema_2']).astype(int)


##Volume MA
btc_df['volume_sma_3'] = btc_df['volume_usd'].rolling(window=3).mean()
btc_df['volume_sma_7'] = btc_df['volume_usd'].rolling(window=7).mean()
btc_df['volume_sma_3_pct'] = btc_df['volume_sma_3'].pct_change()
btc_df['volume_sma_7_pct'] = btc_df['volume_sma_7'].pct_change()

btc_df.dropna(inplace=True)
btc_df.columns

Index(['Unnamed: 0', 'date', 'open', 'high', 'low', 'close', 'volume',
       'close_tomro', 'volume_usd', 'volume_log_pct', 'open_log_pct',
       'high_log_pct', 'low_log_pct', 'close_log_pct',
       'close_tomorrow_log_pct', 'close_1_log_pct', 'volume_1_log_pct',
       'close_2_log_pct', 'volume_2_log_pct', 'close_3_log_pct',
       'volume_3_log_pct', 'close_4_log_pct', 'volume_4_log_pct',
       'close_5_log_pct', 'volume_5_log_pct', 'close_6_log_pct',
       'volume_6_log_pct', 'close_7_log_pct', 'volume_7_log_pct',
       'close_8_log_pct', 'volume_8_log_pct', 'price_volume_alignment',
       'bullish_count_7day', 'bearish_count_7day', 'neutral_count_7day',
       'ema_1', 'ema_2', 'ema_1_pct', 'ema_2_pct', 'close_above_ema_1',
       'close_above_ema_2', 'volume_sma_3', 'volume_sma_7', 'volume_sma_3_pct',
       'volume_sma_7_pct'],
      dtype='object')

In [255]:
df = btc_df[[ #'open_log_pct','high_log_pct','low_log_pct','close_log_pct',
             #'volume_log_pct',
             #'price_volume_alignment', 'bullish_count_7day', 'bearish_count_7day', 'neutral_count_7day',
             #'close_1_log_pct','close_2_log_pct','close_3_log_pct','close_4_log_pct','close_5_log_pct','close_6_log_pct','close_7_log_pct',
             #'ema_9','ema_21',
            #'volume_sma_3_pct','volume_sma_7_pct',
             #'close_above_ema_1','close_above_ema_2',
            'open','high','low','close','close_tomro'
             #'close_tomorrow_log_pct'
            ]]
# df = pd.get_dummies(df, columns=['price_volume_alignment'], drop_first=True)

In [260]:
df.head()

Unnamed: 0,open,high,low,close,close_tomro
9,4003.09,4036.22,3503.44,3627.51,3632.02
10,3627.65,3704.6,3569.85,3632.02,3617.13
11,3631.0,3650.35,3557.09,3617.13,3515.95
12,3617.75,3654.02,3476.0,3515.95,3664.28
13,3517.03,3714.0,3506.0,3664.28,3576.93


In [256]:
def train_test_funk(df):
    y = df.iloc[:,-1]

    X = df.iloc[:,:-1]
    #X = pd.get_dummies(X, columns=['price_volume_alignment'], drop_first=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    return X, y, X_train, X_test, y_train, y_test

X, y, X_train, X_test, y_train, y_test = train_test_funk(df)

Unnamed: 0,open,high,low,close
9,4003.09,4036.22,3503.44,3627.51
10,3627.65,3704.60,3569.85,3632.02
11,3631.00,3650.35,3557.09,3617.13
12,3617.75,3654.02,3476.00,3515.95
13,3517.03,3714.00,3506.00,3664.28
...,...,...,...,...
2130,72330.00,72660.00,69591.00,70231.00
2131,70202.00,71596.00,68768.00,69490.00
2132,69468.00,69887.00,69015.00,69330.00
2133,69339.00,69354.00,67459.00,68738.00


In [265]:


def vanillaMLModels(X_train,y_train,X_test,y_test,X):
    models = {
        "Linear Regression": LinearRegression(),
        #"Lasso": Lasso(alpha=0.1),
        "Ridge": Ridge(alpha=0.2),
        "Random Forest": RandomForestRegressor(random_state=42),
        "XGBoost": XGBRegressor(objective='reg:squarederror', alpha=1, max_depth=3, random_state=42),
        #"ElasticNet": ElasticNet(alpha=1.0, l1_ratio=0.5),
        "SVR (SVM Regression)": SVR(kernel='rbf', C=1.0, epsilon=0.1),
        "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42),
        "K-Nearest Neighbors": KNeighborsRegressor(n_neighbors=5),
        "Bayesian Ridge": BayesianRidge()
    }
    
    feature_names = X.columns.tolist()
    
    # Collect results
    results = []
    for name, model in models.items():
        model.fit(X_train, y_train)
        train_pred = model.predict(X_train)
        test_pred = model.predict(X_test)
        train_mse = mean_squared_error(y_train, train_pred)
        test_mse = mean_squared_error(y_test, test_pred)
        y_train_np = np.array(y_train)
        train_pred_np = np.array(train_pred)
        y_test_np = np.array(y_test)
        test_pred_np = np.array(test_pred)
        correct_directions_train = np.sign(y_train_np) == np.sign(train_pred_np)
        directional_accuracy_train = correct_directions_train.mean() * 100 
        correct_directions_test = np.sign(y_test_np) == np.sign(test_pred_np)
        directional_accuracy_test = correct_directions_test.mean() * 100 
        results.append([name, train_mse, test_mse, directional_accuracy_train, directional_accuracy_test])
        for actual, predicted in zip(y_test, test_pred):
            print(f"Actual: {actual:.4f}, Predicted: {predicted:.4f}")
        break
#         if True:
#             plt.figure(figsize=(10, 6))
#             plt.plot(y_test.values, label="Actual", color="blue", alpha=0.6, linestyle='dotted')
#             plt.plot(test_pred, label="Predicted", color="red", alpha=0.6)
#             plt.title(f"{name}: Actual vs Predicted")
#             plt.xlabel("Sample")
#             plt.ylabel("Value")
# #             plt.ylim(-10, 10)
#             plt.legend()
#             plt.show()
        
            
#         ##feature importance
#         if name in ['Random Forest','XGBoost']:
#             plt.figure(figsize=(5, 3))
#             importances = model.feature_importances_
#             plt.barh(feature_names, importances, color="skyblue")
#             plt.xlabel("Importance")
#             plt.title(f"Feature Importance - {name}")
#             plt.gca().invert_yaxis()
#             plt.show()
            
#         if name in ['Lasso','Ridge','ElasticNet']:
#             plt.figure(figsize=(5, 3))
#             importances = np.abs(model.coef_)
#             plt.barh(feature_names, importances, color="skyblue")
#             plt.xlabel("Importance")
#             plt.title(f"Feature Importance - {name}")
#             plt.gca().invert_yaxis()
#             plt.show()
        
            
        

    # Convert results to DataFrame for display
    results_df = pd.DataFrame(results, columns=["Model", "Training MSE", "Test MSE", "TrainDirection","Test Direction"])
    results_df.set_index('Model',inplace=True)
    print(results_df)
    print("----------")
    print(results_df['Test Direction'].mean())
    return 

vanillaMLModels(X_train,y_train,X_test,y_test,X)

291      8235.74
2008    62033.00
1718    26571.00
997     42722.19
2027    68158.00
          ...   
314      8813.91
982     44857.38
620     10334.78
2117    68359.00
201     10328.83
Name: close_tomro, Length: 426, dtype: float64 [ 8015.65387035 62854.04626689 26629.2806173  42968.57410325
 67102.97897842  9260.57133457 26892.48638913 13033.21762122
  4031.56876097  9614.3950728  27235.65905871 61607.13958284
  9508.7891915  19570.81939627 33235.68135906  6934.15471029
 25986.36108731 37467.85542666  9952.74637245  5199.88538508
 23240.39367712 45633.5229642   9463.35901631 30279.82107359
 50802.20527154  7330.19024683 62463.3473044  17143.43109351
  5370.07394346 64941.00372834 26377.50358079 19166.22934014
 44116.990844   19457.94542045 47034.66139458 11472.54113831
 67673.3187001  69709.39289317  8853.92965413  8219.38829109
  5447.57183415  7570.44159485 45925.11558856 11815.39089516
 51272.87511534 68109.97540153  7306.96833627  7607.28027808
 40835.21909907 26021.95636183 218

In [263]:
print(y_test,testpred)

NameError: name 'testpred' is not defined