In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, recall_score, precision_score, mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from datetime import datetime

In [None]:
# Load the two CSV exports into separate frames
# (presumably months 10 and 11, judging by the file names — TODO confirm).
df_10, df_11 = map(pd.read_csv, ('resample_10.csv', 'resample_11.csv'))

In [None]:
# Stack both frames row-wise. ignore_index=True gives the combined frame a fresh,
# unique 0..n-1 index instead of repeating each input frame's own row labels.
df_new = pd.concat([df_10, df_11], ignore_index=True)
df_new

In [None]:
# Parse the 'time' column into pandas datetimes (needed for the time-based resampling below).
df_new['time'] = df_new['time'].pipe(pd.to_datetime)
df_new

In [None]:
def convert_tick_to_ohlcv(data, freq='1min'):
    """
    Aggregate tick data into fixed-interval OHLCV (Open, High, Low, Close, Volume) bars.

    Rows are grouped into bins of width ``freq`` based on their 'time' value. Within each
    bin the first/max/min/last 'price' become Open/High/Low/Close and the summed 'qty'
    becomes Volume.
    :param data: DataFrame with a datetime 'time' column plus 'price' and 'qty' columns
    :param freq: pandas offset alias for the bar width; defaults to one minute ('1min')
    :return: DataFrame indexed by the resampled 'time' bins with the columns
             Open, High, Low, Close, Volume
    """
    ohlcv = data.resample(freq, on='time').agg({
        'price': ['first', 'max', 'min', 'last'],
        'qty': 'sum',
    })

    # The dict-of-lists aggregation produces two-level column labels; flatten them.
    ohlcv.columns = ['Open', 'High', 'Low', 'Close', 'Volume']
    return ohlcv

def calculate_volatility(data, window=20):
    """
    Add simple returns and their rolling standard deviation to an OHLCV frame.

    Two columns are written into ``data`` itself (the frame is modified in place and
    then returned): 'returns', the percentage change of 'Close' from one row to the
    next, and 'volatility', the standard deviation of 'returns' over the trailing
    ``window`` rows.
    :param data: DataFrame with a 'Close' column
    :param window: Number of rows in each rolling window
    :return: The same DataFrame object, now carrying 'returns' and 'volatility'
    """
    # Row-over-row percentage change of the closing price.
    close_to_close = data['Close'].pct_change()
    data['returns'] = close_to_close

    # Dispersion of those returns over the trailing window.
    data['volatility'] = close_to_close.rolling(window=window).std()
    return data

In [None]:
# Build the OHLCV bars from the combined tick frame.
ohlcv = convert_tick_to_ohlcv(data=df_new)
ohlcv

In [None]:
# Add 'returns' and a 20-row rolling 'volatility' column to the bars.
# calculate_volatility writes into the frame it is given and returns it, so `result`
# and `ohlcv` refer to the same object afterwards.
result = calculate_volatility(data=ohlcv, window=20)
result

In [None]:
# Keep only the volatility column, as a one-column DataFrame with the bar-time index.
a = result.loc[:, ['volatility']]
a

In [None]:
# Turn the bar-time index into a regular 'time' column so it can serve as a merge key.
a = a.reset_index(drop=False)
a

In [None]:
# Summarise the column dtypes and non-null counts of the volatility frame.
a.info()

In [None]:
# Attach each bar's volatility to the rows of df_new that carry the same 'time' value.
# This is the default inner join, so rows whose 'time' matches no bar are dropped.
b = df_new.merge(a, on='time')
b

In [None]:
#b=b.set_index('time')

In [None]:
# Remove the rolling-window warm-up rows, i.e. the rows whose volatility is still NaN.
# Filtering on the value instead of slicing off a fixed 21 rows keeps this cell
# idempotent: re-running it does not discard a further 21 rows each time.
b = b.dropna(subset=['volatility'])
b

In [None]:
# Box plot of the volatility values, to eyeball their spread and any extreme points.
sns.boxplot(b.volatility)

In [None]:
def replace_outlier(value, series=None, rev_range=3):
    """
    Return ``value``, or NaN when it falls outside the IQR fences of a reference series.

    The fences are ``Q1 - rev_range * IQR`` and ``Q3 + rev_range * IQR``, where Q1/Q3 are
    the 25th/75th percentiles of the reference series. Values exactly on a fence are kept,
    and a NaN input is returned as NaN.

    The quartiles are recomputed on every call, so for a whole column it is much cheaper
    to compute the fences once and mask the column in a single vectorised step.
    :param value: scalar to test
    :param series: reference values used for the quartiles; when omitted, the notebook-level
                   ``b['volatility']`` column is used
    :param rev_range: multiplier applied to the IQR when building the fences (default 3)
    :return: ``value`` if it lies within the fences, otherwise ``np.nan``
    """
    if series is None:
        # Default reference: the module-level frame `b`.
        series = b['volatility']

    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - rev_range * iqr
    upper = q3 + rev_range * iqr

    # Comparisons with NaN are False, so a NaN input falls through and is returned as-is.
    if value < lower or value > upper:
        return np.nan
    return value

In [None]:
# Blank out extreme volatility values (set them to NaN) using the same 3 x IQR fences as
# replace_outlier, but compute the quartiles once for the whole column. Calling
# replace_outlier through .apply() recomputes both quantiles for every single row.
vol = b['volatility']
q1, q3 = vol.quantile(0.25), vol.quantile(0.75)
fence = 3 * (q3 - q1)
# Work on an explicit copy so the write below never targets a slice/view of an earlier
# frame (this avoids pandas' SettingWithCopyWarning).
b = b.copy()
b.loc[(vol < q1 - fence) | (vol > q3 + fence), 'volatility'] = np.nan

In [None]:
# Count missing values per column (the volatility NaNs include the blanked-out outliers).
b.isna().sum()

In [None]:
# Keep only rows with no missing value in any column; this also removes the rows whose
# volatility was set to NaN by the outlier filter.
result = b.dropna(axis=0, how='any')
result

In [None]:
# Features: every column except the target, minus datetime columns. 'time' was parsed to
# datetime64 earlier, and scikit-learn estimators need numeric input, so it must not be
# passed to .fit().
X = result.drop(columns='volatility').select_dtypes(exclude=['datetime', 'datetimetz'])
# Target: the rolling volatility.
y = result['volatility']

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the shuffle reproducible.
# NOTE(review): the rows are time-ordered and the target is a rolling statistic, so a
# random split may leak information between train and test — consider a chronological split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=111,
)

In [None]:
# Tune a RandomForestRegressor over a hyperparameter grid, scored by MAPE.

# Helper that computes MAPE.
def mape(y_true, y_pred):
    """
    Mean absolute percentage error, expressed in percent.

    Samples whose true value is exactly 0 are excluded from the average, because the
    relative error is undefined there (division by zero).

    Parameters:
    - y_true: array-like of actual values
    - y_pred: array-like of predicted values

    Returns:
    - MAPE as a percentage (e.g. 25.0 means 25%); NaN when no sample has a non-zero
      true value
    """
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)

    # Keep only the samples with a non-zero true value.
    nonzero = y_true != 0
    if not nonzero.any():
        # Nothing left to average: return NaN explicitly rather than letting np.mean
        # emit a "Mean of empty slice" RuntimeWarning.
        return np.nan
    actual, predicted = y_true[nonzero], y_pred[nonzero]

    return np.mean(np.abs((actual - predicted) / actual)) * 100

# Register MAPE as a custom scorer. greater_is_better=False tells scikit-learn that
# lower values are better, so the search minimises MAPE.
mape_scorer = make_scorer(mape, greater_is_better=False)

# Hyperparameter grid to search for the RandomForestRegressor (3 x 3 = 9 combinations).
param_grid = {
    'n_estimators': [100, 150, 200],
    'max_depth': [10, 15, 20]
}

# Base model.
rf_model = RandomForestRegressor(random_state=111)

# 5-fold grid search. n_jobs=-1 spreads the 9 x 5 = 45 cross-validation fits over all
# available CPU cores instead of running them one after another; the selected
# parameters are unaffected because the forest's random_state is fixed.
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    cv=5,
    scoring=mape_scorer,
    n_jobs=-1,
)

# Fit on the training data and search for the best hyperparameters.
grid_search.fit(X_train, y_train)

# Print the best hyperparameters.
print("최적의 하이퍼파라미터:", grid_search.best_params_)

* 1시간 넘게 돌려도 끝나지 않은 듯하다. 코랩에서 GPU를 써서 다시 돌려 봐야겠다.

In [None]:
# Train the final model with fixed hyperparameters (the largest values of the search grid).
rf_model = RandomForestRegressor(n_estimators=200, max_depth=20, random_state=111)
rf_model.fit(X_train, y_train)

# Predict the held-out test data with the trained model.
y_pred = rf_model.predict(X_test)

r2score = r2_score(y_true=y_test, y_pred=y_pred)  # R-squared score
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)  # MSE (Mean Squared Error)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)  # MAE (Mean Absolute Error)
# Stored as `mape_score` so that the notebook's mape() function is not overwritten by a
# float. scikit-learn reports this metric as a fraction (0.05 means 5%), whereas mape()
# returns a percentage.
mape_score = mean_absolute_percentage_error(y_true=y_test, y_pred=y_pred)  # MAPE

print("r2score :", r2score)
print("mse :", mse)
print("mae :", mae)
print("mape :", mape_score)