In [1]:
import pandas as pd

df = pd.read_csv('BIT_202324_4차.csv')
df.head(3)

Unnamed: 0.1,Unnamed: 0,Open,High,Low,Close,Volume,returns,volatility,price,quote_qty,...,upper_band_10,lower_band_10,roc_1,roc_2,rsi_7,rsi_9,rsi_14,UO_71014,UO_7911,UO_71012
0,2023-01-01 0:00,16537.5,16540.9,16504.0,16527.0,5381.399,9.9e-05,0.006528,16524.52574,7264.129209,...,42075.64143,41720.20402,0.141586,0.093181,100.0,100.0,100.0,0.0,0.0,0.0
1,2023-01-01 1:00,16527.1,16554.3,16524.1,16550.4,3210.826,0.001416,0.006528,16537.21599,6819.889969,...,42075.64143,41720.20402,0.141586,0.093181,100.0,100.0,100.0,40.577816,40.577816,40.577816
2,2023-01-01 2:00,16550.5,16557.1,16534.8,16542.4,2399.668,-0.000483,0.006528,16545.81814,6030.420093,...,42075.64143,41720.20402,-0.048337,0.093181,74.522293,74.522293,74.522293,45.684084,45.684084,45.684084


In [2]:
df = df.rename(columns={'Unnamed: 0': 'time'})
df = df.set_index('time')
df.index=pd.to_datetime(df.index)

In [3]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import TimeSeriesSplit #Timeseires Split

In [4]:
def calculate_rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [5]:
def rf_base(df, df_name, n_splits): 
    
    # Feature와 Target 나누기
    X = df.drop(columns=['volatility'])
    y = df['volatility']

    tscv = TimeSeriesSplit(n_splits)
    
    mape_list = []
    rmse_list = []

    # TimeSeriesSplit을 사용하여 데이터 분할
    for train_index, test_index in tscv.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        

        # RandomForestRegressor
        rf_model = RandomForestRegressor(random_state=42)  # random_state 추가

        # 모델 학습
        rf_model.fit(X_train, y_train)

        # 예측
        y_pred = rf_model.predict(X_test)

        # 평가지표(MAPE, RMSE)
        test_mape = mean_absolute_percentage_error(y_test, y_pred)
        test_rmse = calculate_rmse(y_test, y_pred)
        
        # 결과 저장
        mape_list.append(test_mape)
        rmse_list.append(test_rmse)
        
        print(f'{df_name} : MAPE: {test_mape}, RMSE: {test_rmse}')
    
    # 평균 결과a
    print(f'{df_name} : Average MAPE: {np.mean(mape_list)}, Average RMSE: {np.mean(rmse_list)}')

In [6]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

In [7]:
df5 = df.loc[:,[ 'volatility', 'atr_14','atr_20',
                      'SO low_202', 'SO low_142', 'SO low_302', 'SO low_203',
                         'SO low_303','SO low_205', 'SO low_145', 
                         'SO low_305','SO low_2010', 'SO low_1410', 'SO low_3010',
                      'SO high_202','SO high_142','SO high_302','SO high_203', 
                         'SO high_303','SO high_205', 'SO high_145',
                         'SO high_305','SO high_2010','SO high_1410','SO high_3010',
                         'MACD_1226', 'Signal_Line_9', 'MACD_613','Signal_Line_4',
                       'roc_2', 'is_buyer_maker' ]]

In [8]:
rf_base(df,df5,5)

                     volatility      atr_14   atr_20   SO low_202  \
time                                                                
2023-01-01 00:00:00    0.006528   36.900000   36.900  39882.52696   
2023-01-01 01:00:00    0.006528   33.550000   33.550  39882.52696   
2023-01-01 02:00:00    0.006528   29.800000   29.800  39882.52696   
2023-01-01 03:00:00    0.006528   29.225000   29.225  39882.52696   
2023-01-01 04:00:00    0.006528   27.700000   27.700  39882.52696   
...                         ...         ...      ...          ...   
2024-01-26 22:00:00    0.006608  405.057143  330.215  39880.90000   
2024-01-26 23:00:00    0.006610  390.257143  333.060  39880.90000   
2024-01-27 00:00:00    0.006643  352.507143  333.315  39880.90000   
2024-01-27 01:00:00    0.006611  315.714286  336.870  39880.90000   
2024-01-27 02:00:00    0.006532  288.907143  329.685  39880.90000   

                      SO low_142   SO low_302   SO low_203   SO low_303  \
time                       

                     volatility      atr_14   atr_20   SO low_202  \
time                                                                
2023-01-01 00:00:00    0.006528   36.900000   36.900  39882.52696   
2023-01-01 01:00:00    0.006528   33.550000   33.550  39882.52696   
2023-01-01 02:00:00    0.006528   29.800000   29.800  39882.52696   
2023-01-01 03:00:00    0.006528   29.225000   29.225  39882.52696   
2023-01-01 04:00:00    0.006528   27.700000   27.700  39882.52696   
...                         ...         ...      ...          ...   
2024-01-26 22:00:00    0.006608  405.057143  330.215  39880.90000   
2024-01-26 23:00:00    0.006610  390.257143  333.060  39880.90000   
2024-01-27 00:00:00    0.006643  352.507143  333.315  39880.90000   
2024-01-27 01:00:00    0.006611  315.714286  336.870  39880.90000   
2024-01-27 02:00:00    0.006532  288.907143  329.685  39880.90000   

                      SO low_142   SO low_302   SO low_203   SO low_303  \
time                       

                     volatility      atr_14   atr_20   SO low_202  \
time                                                                
2023-01-01 00:00:00    0.006528   36.900000   36.900  39882.52696   
2023-01-01 01:00:00    0.006528   33.550000   33.550  39882.52696   
2023-01-01 02:00:00    0.006528   29.800000   29.800  39882.52696   
2023-01-01 03:00:00    0.006528   29.225000   29.225  39882.52696   
2023-01-01 04:00:00    0.006528   27.700000   27.700  39882.52696   
...                         ...         ...      ...          ...   
2024-01-26 22:00:00    0.006608  405.057143  330.215  39880.90000   
2024-01-26 23:00:00    0.006610  390.257143  333.060  39880.90000   
2024-01-27 00:00:00    0.006643  352.507143  333.315  39880.90000   
2024-01-27 01:00:00    0.006611  315.714286  336.870  39880.90000   
2024-01-27 02:00:00    0.006532  288.907143  329.685  39880.90000   

                      SO low_142   SO low_302   SO low_203   SO low_303  \
time                       

결과 Average MAPE: 0.2685750040157949, Average RMSE: 0.0016954874658810085

In [9]:
from sklearn.model_selection import train_test_split

X = df5.drop(columns=['volatility'])
y = df['volatility']

# 훈련 데이터 / 테스트 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 랜덤 서치

In [10]:
from sklearn.model_selection import RandomizedSearchCV

In [11]:
# 랜덤 포레스트 모델 생성
rf_model = RandomForestRegressor(random_state=42)

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'] # 오류 발생해 수정
}

# 랜덤 서치를 통한 하이퍼파라미터 탐색
random_search = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_grid,  # 랜덤 서치 대상 하이퍼파라미터 후보들
    n_iter=10,  # 랜덤하게 시도할 조합 수
    cv=5,
    random_state=42
)

# 데이터 학습 및 랜덤 서치 수행
random_search.fit(X_train, y_train)

# 랜덤 서치의 최적 하이퍼파라미터와 성능
print("Random Search - Best Parameters:", random_search.best_params_)
print("Random Search - Best Score:", random_search.best_score_)

Random Search - Best Parameters: {'n_estimators': 300, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': None}
Random Search - Best Score: 0.9551228770053722


결과 Best Parameters: {'n_estimators': 300, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': None}   
     Best Score: 0.9551228770053722

# 베이지안 옵티마이저

In [3]:
from skopt import BayesSearchCV

In [15]:
# 베이지안 옵티마이저를 통한 하이퍼파라미터 탐색
bayes_search = BayesSearchCV(
    estimator=rf_model,
    search_spaces=param_grid,  # 베이지안 옵티마이저 대상 하이퍼파라미터 후보들
    n_iter=10,  # 베이지안 옵티마이저 반복 횟수
    cv=5,
    random_state=42
)

# 데이터 학습 및 베이지안 옵티마이저 수행
bayes_search.fit(X_train, y_train)

# 베이지안 옵티마이저의 최적 하이퍼파라미터와 성능
print("Bayesian Optimization - Best Parameters:", bayes_search.best_params_)
print("Bayesian Optimization - Best Score:", bayes_search.best_score_)

Bayesian Optimization - Best Parameters: OrderedDict([('max_depth', None), ('max_features', 'sqrt'), ('min_samples_leaf', 2), ('min_samples_split', 5), ('n_estimators', 200)])
Bayesian Optimization - Best Score: 0.9454776679757156


결과 Best Parameters: OrderedDict([('max_depth', None), ('max_features', 'sqrt'), ('min_samples_leaf', 2), ('min_samples_split', 5), ('n_estimators', 200)])   
 Best Score: 0.9454776679757156

# 그리드 서치

In [16]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [19]:
# 랜덤 서치, 베이지안 옵티마이저의 결과 기반 - 최적의 하이퍼파라미터 설정
best_hyperparameters = {
    'n_estimators': 200,
    'min_samples_split': 3,
    'min_samples_leaf': 1,
    'max_features': 'sqrt',
    'max_depth': None
}

# 그리드 서치에 사용할 하이퍼파라미터 후보 설정
param_grid = {
    'n_estimators': [best_hyperparameters['n_estimators']],
    'min_samples_split': [best_hyperparameters['min_samples_split']],
    'min_samples_leaf': [best_hyperparameters['min_samples_leaf']],
    'max_features': [best_hyperparameters['max_features']],
    'max_depth': [best_hyperparameters['max_depth']]
}

# 모델 생성
model = RandomForestRegressor()

# Set the error_score parameter to 'raise'
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, error_score='raise')

# execution
try:
    grid_search.fit(X, y)
except Exception as e:
    print("An error occurred during the grid search fitting process:")
    print(e)

grid_search.fit(X, y)

# 최적의 하이퍼파라미터와 점수
print("Grid Search - Best Parameters:", grid_search.best_params_)
print("Grid Search - Best Score:", grid_search.best_score_)

Grid Search - Best Parameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 200}
Grid Search - Best Score: 0.5428481705885037
