In [None]:
pip install pandas_ta



In [None]:
pip install ta



In [None]:
pip install shap



In [None]:
pip install SMOTE



In [None]:
pip install imbalanced-learn



In [None]:
pip install joblib



In [None]:
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import LSTM, Dense
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm
import ta  # Technical Analysis library
import pandas_ta as pata
import shap
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.impute import SimpleImputer
import joblib

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


######################

**테스트 데이터 생성**
######################

In [None]:
# Load the dataset from Kaggle
file_path = '/content/drive/MyDrive/Data/SOL_data.csv'
data_test = pd.read_csv(file_path)

# Display the first few rows of the dataset to ensure it is loaded correctly
data_test.head()

Unnamed: 0,open_time,open,high,low,close,volume
0,2020-12-31 15:00:00,1.4458,1.4458,1.4444,1.4456,161.34
1,2020-12-31 15:01:00,1.4477,1.4478,1.4463,1.4463,148.86
2,2020-12-31 15:02:00,1.4479,1.453,1.4466,1.4484,3208.13
3,2020-12-31 15:03:00,1.4503,1.4559,1.4503,1.4558,1639.76
4,2020-12-31 15:04:00,1.4558,1.4569,1.4545,1.4569,900.91


In [None]:
# 데이터프레임의 길이 계산
total_length = len(data_test)

# 마지막 1/3 데이터 분리
data_test = data_test.iloc[-(total_length // 3):]

In [None]:
data_test

Unnamed: 0,open_time,open,high,low,close,volume
1237691,2023-05-10 20:56:00,21.27,21.27,21.24,21.24,724.800
1237692,2023-05-10 20:57:00,21.25,21.26,21.25,21.25,1058.840
1237693,2023-05-10 20:58:00,21.25,21.25,21.23,21.24,2098.960
1237694,2023-05-10 20:59:00,21.24,21.25,21.20,21.22,7424.210
1237695,2023-05-10 21:00:00,21.21,21.24,21.16,21.18,5169.380
...,...,...,...,...,...,...
1856531,2024-07-13 14:56:00,139.93,140.05,139.93,139.99,929.251
1856532,2024-07-13 14:57:00,139.98,139.99,139.90,139.90,71.095
1856533,2024-07-13 14:58:00,139.91,139.93,139.88,139.90,171.879
1856534,2024-07-13 14:59:00,139.90,139.95,139.89,139.91,425.029


######################

**테스트 데이터 로딩**
######################

In [None]:
data_test = pd.read_csv("/content/drive/MyDrive/Data/SOL_Data_Test.csv")

# Display the first few rows of the dataset to ensure it is loaded correctly
data_test.head()

Unnamed: 0,open_time,open,high,low,close,volume
0,2024-07-14 15:00:00,145.02,145.2,145.0,145.2,1788.764
1,2024-07-14 15:01:00,145.21,145.5,145.18,145.49,2510.833
2,2024-07-14 15:02:00,145.5,145.57,145.45,145.45,1708.171
3,2024-07-14 15:03:00,145.45,145.49,145.32,145.44,1655.282
4,2024-07-14 15:04:00,145.44,145.63,145.44,145.6,2003.321


######################

**테스트 데이터 처리**
######################

In [None]:
# open_time 열을 datetime 형식으로 변환
data_test['open_time'] = pd.to_datetime(data['open_time'])

# 시각과 분만 추출하여 time 열 생성
data_test['time'] = data_test['open_time'].dt.strftime('%H:%M')

# ATR 계산
atr_periods = [5, 10, 14, 20, 50]
for period in atr_periods:
    data_test[f'atr_{period}'] = ta.volatility.average_true_range(data_test['high'], data_test['low'], data_test['close'], window=period)

# VWAP 계산
data_test['vwap'] = ta.volume.volume_weighted_average_price(data_test['high'], data_test['low'], data_test['close'], data_test['volume'])

# Stochastic Oscillator 계산
stoch_periods = [(14, 3), (21, 5), (9, 3), (5, 2), (20, 7)]
for period, smooth in stoch_periods:
    data_test[f'stoch_%k_{period}_{smooth}'] = ta.momentum.stoch(data_test['high'], data_test['low'], data_test['close'], window=period, smooth_window=smooth)
    data_test[f'stoch_%d_{period}_{smooth}'] = ta.momentum.stoch_signal(data_test['high'], data['low'], data_test['close'], window=period, smooth_window=smooth)

# OBV 계산
data_test['obv'] = ta.volume.on_balance_volume(data_test['close'], data_test['volume'])


# Bollinger Bands 계산
bollinger_periods = [10, 20, 50, 100, 200]
for period in bollinger_periods:
    data_test[f'bollinger_hband_{period}'] = ta.volatility.BollingerBands(data_test['close'], window=period).bollinger_hband()
    data_test[f'bollinger_lband_{period}'] = ta.volatility.BollingerBands(data_test['close'], window=period).bollinger_lband()

# Ichimoku 계산
ichimoku_periods = [9, 26, 52, 100, 200]
for period in ichimoku_periods:
    data_test[f'ichimoku_base_{period}'] = ta.trend.ichimoku_base_line(data_test['high'], data_test['low'], window1=period)
    data_test[f'ichimoku_conversion_{period}'] = ta.trend.ichimoku_conversion_line(data_test['high'], data_test['low'], window1=period)
# Supertrend 계산 함수
def calculate_supertrend(df, period=7, multiplier=3, atr_period=14):
    df = df.copy()  # 데이터프레임을 복사하여 사용
    hl2 = (df['high'] + df['low']) / 2
    df['atr'] = ta.volatility.average_true_range(df['high'], df['low'], df['close'], window=atr_period)
    df['upperband'] = hl2 + (multiplier * df['atr'])
    df['lowerband'] = hl2 - (multiplier * df['atr'])
    df['in_uptrend'] = True

    for current in tqdm(range(1, len(df.index)), desc=f'Calculating Supertrend {period}-{multiplier}-{atr_period}'):
        previous = current - 1

        if df['close'].iloc[current] > df['upperband'].iloc[previous]:
            df.loc[df.index[current], 'in_uptrend'] = True
        elif df['close'].iloc[current] < df['lowerband'].iloc[previous]:
            df.loc[df.index[current], 'in_uptrend'] = False
        else:
            df.loc[df.index[current], 'in_uptrend'] = df['in_uptrend'].iloc[previous]

            if df['in_uptrend'].iloc[current] and df['lowerband'].iloc[current] < df['lowerband'].iloc[previous]:
                df.loc[df.index[current], 'lowerband'] = df['lowerband'].iloc[previous]

            if not df['in_uptrend'].iloc[current] and df['upperband'].iloc[current] > df['upperband'].iloc[previous]:
                df.loc[df.index[current], 'upperband'] = df['upperband'].iloc[previous]

    return df

# Supertrend 계산
supertrend_settings = [(7, 3, 14), (10, 3, 20), (14, 2, 10), (20, 4, 50), (50, 5, 5)]
for period, multiplier, atr_period in supertrend_settings:
    data_test = calculate_supertrend(data_test, period, multiplier, atr_period)
    data_test[f'supertrend_upper_{period}_{multiplier}_{atr_period}'] = data_test['upperband']
    data_test[f'supertrend_lower_{period}_{multiplier}_{atr_period}'] = data_test['lowerband']
    data_test[f'supertrend_in_uptrend_{period}_{multiplier}_{atr_period}'] = data_test['in_uptrend']

KeyError: 'open_time'

######################

**테스트 데이터 60분 처리**
######################

In [None]:
def calculate_max_min_returns(df):
    window_size = 60

    # 'open_time' 열이 데이터프레임에 있는지 확인
    if 'open_time' not in df.columns:
        raise KeyError("'open_time' 열이 데이터프레임에 포함되어 있어야 합니다.")

    # 'open_time' 열을 datetime으로 변환
    df['open_time'] = pd.to_datetime(df['open_time'])

    # 인덱스 중복 확인 및 제거
    df = df[~df.duplicated(subset='open_time', keep='first')].copy()

    # 각 행에 대해 60분 윈도우를 적용하여 최대 및 최소 가격 계산
    df['max_price'] = df.apply(lambda row: df[(df['open_time'] >= row['open_time']) &
                                              (df['open_time'] < row['open_time'] + pd.Timedelta(minutes=window_size))]['high'].max(), axis=1)
    df['min_price'] = df.apply(lambda row: df[(df['open_time'] >= row['open_time']) &
                                              (df['open_time'] < row['open_time'] + pd.Timedelta(minutes=window_size))]['low'].min(), axis=1)

    # 결측값을 적절히 처리
    df['max_price'].fillna(df['high'], inplace=True)
    df['min_price'].fillna(df['low'], inplace=True)

    # 현재 가격
    current_price = df['close']

    # 최대 및 최소 수익률 계산
    df['max_return_60min'] = ((df['max_price'] - current_price) / current_price) * 100
    df['min_return_60min'] = ((df['min_price'] - current_price) / current_price) * 100

    # 필요없는 열 삭제
    df.drop(columns=['max_price', 'min_price'], inplace=True)

    return df

# 최대 상승률과 최대 하락률 계산
data_test = calculate_max_min_returns(data_test)

In [None]:
data_test

Unnamed: 0,open_time,open,high,low,close,volume,time,atr_5,atr_10,atr_14,...,supertrend_lower_14_2_10,supertrend_in_uptrend_14_2_10,supertrend_upper_20_4_50,supertrend_lower_20_4_50,supertrend_in_uptrend_20_4_50,supertrend_upper_50_5_5,supertrend_lower_50_5_5,supertrend_in_uptrend_50_5_5,max_return_60min,min_return_60min
0,2024-07-14 15:00:00,145.02,145.20,145.00,145.20,1788.764,15:00,0.000000,0.000000,0.000000,...,145.100000,True,145.100000,145.100000,True,145.100000,145.100000,True,0.578512,-0.399449
1,2024-07-14 15:01:00,145.21,145.50,145.18,145.49,2510.833,15:01,0.000000,0.000000,0.000000,...,145.340000,True,145.340000,145.340000,True,145.340000,145.340000,True,0.378033,-0.597979
2,2024-07-14 15:02:00,145.50,145.57,145.45,145.45,1708.171,15:02,0.000000,0.000000,0.000000,...,145.510000,True,145.510000,145.510000,True,145.510000,145.510000,True,0.405638,-0.570643
3,2024-07-14 15:03:00,145.45,145.49,145.32,145.44,1655.282,15:03,0.000000,0.000000,0.000000,...,145.405000,False,145.405000,145.405000,False,145.405000,145.405000,False,0.412541,-0.563806
4,2024-07-14 15:04:00,145.44,145.63,145.44,145.60,2003.321,15:04,0.200000,0.000000,0.000000,...,145.535000,True,145.535000,145.535000,True,146.535000,144.535000,True,0.302198,-0.673077
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8636,2024-07-20 14:56:00,168.93,168.96,168.90,168.96,746.716,14:56,0.117171,0.137214,0.141106,...,168.655572,False,169.523561,168.772383,True,169.515857,168.536343,True,0.159801,-0.035511
8637,2024-07-20 14:57:00,168.96,169.05,168.96,169.05,1043.465,14:57,0.111737,0.132493,0.137456,...,168.740015,False,169.593890,168.772383,True,169.563686,168.536343,True,0.106477,-0.053239
8638,2024-07-20 14:58:00,169.05,169.10,169.05,169.10,377.556,14:58,0.099390,0.124243,0.131209,...,168.826513,False,169.656112,168.772383,True,169.571948,168.578052,True,0.076878,-0.029568
8639,2024-07-20 14:59:00,169.10,169.23,169.06,169.07,1579.751,14:59,0.113512,0.128819,0.133980,...,168.887362,False,169.728090,168.772383,True,169.712559,168.578052,True,0.094635,-0.011829


In [None]:
data_test.columns

Index(['open_time', 'open', 'high', 'low', 'close', 'volume', 'time', 'atr_5',
       'atr_10', 'atr_14', 'atr_20', 'atr_50', 'vwap', 'stoch_%k_14_3',
       'stoch_%d_14_3', 'stoch_%k_21_5', 'stoch_%d_21_5', 'stoch_%k_9_3',
       'stoch_%d_9_3', 'stoch_%k_5_2', 'stoch_%d_5_2', 'stoch_%k_20_7',
       'stoch_%d_20_7', 'obv', 'bollinger_hband_10', 'bollinger_lband_10',
       'bollinger_hband_20', 'bollinger_lband_20', 'bollinger_hband_50',
       'bollinger_lband_50', 'bollinger_hband_100', 'bollinger_lband_100',
       'bollinger_hband_200', 'bollinger_lband_200', 'ichimoku_base_9',
       'ichimoku_conversion_9', 'ichimoku_base_26', 'ichimoku_conversion_26',
       'ichimoku_base_52', 'ichimoku_conversion_52', 'ichimoku_base_100',
       'ichimoku_conversion_100', 'ichimoku_base_200',
       'ichimoku_conversion_200', 'atr', 'upperband', 'lowerband',
       'in_uptrend', 'supertrend_upper_7_3_14', 'supertrend_lower_7_3_14',
       'supertrend_in_uptrend_7_3_14', 'supertrend_upper_10

######################

**테스트 처리 데이터 저장**
######################

In [None]:
# Load the dataset from Kaggle
path2 = '/content/drive/MyDrive/Data/SOL_Data_Test_Indicator.csv'
data_test.to_csv(path2, index=True);

######################

**초기 데이터 처리**
######################

In [None]:
# Load the dataset from Kaggle
file_path = '/content/drive/MyDrive/Data/SOL_data.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataset to ensure it is loaded correctly
data.head()

Unnamed: 0,open_time,open,high,low,close,volume
0,2020-12-31 15:00:00,1.4458,1.4458,1.4444,1.4456,161.34
1,2020-12-31 15:01:00,1.4477,1.4478,1.4463,1.4463,148.86
2,2020-12-31 15:02:00,1.4479,1.453,1.4466,1.4484,3208.13
3,2020-12-31 15:03:00,1.4503,1.4559,1.4503,1.4558,1639.76
4,2020-12-31 15:04:00,1.4558,1.4569,1.4545,1.4569,900.91


######################

**1/3 데이터 처리**
######################

In [None]:
# 데이터의 첫 번째 1/3만 사용
n_samples = len(data)
data = data.iloc[:n_samples // 3]

######################

**가용 지표 입력**
######################

In [None]:
# open_time 열을 datetime 형식으로 변환
data['open_time'] = pd.to_datetime(data['open_time'])

# 시각과 분만 추출하여 time 열 생성
data['time'] = data['open_time'].dt.strftime('%H:%M')

# ATR 계산
atr_periods = [5, 10, 14, 20, 50]
for period in atr_periods:
    data[f'atr_{period}'] = ta.volatility.average_true_range(data['high'], data['low'], data['close'], window=period)

# VWAP 계산
data['vwap'] = ta.volume.volume_weighted_average_price(data['high'], data['low'], data['close'], data['volume'])

# Stochastic Oscillator 계산
stoch_periods = [(14, 3), (21, 5), (9, 3), (5, 2), (20, 7)]
for period, smooth in stoch_periods:
    data[f'stoch_%k_{period}_{smooth}'] = ta.momentum.stoch(data['high'], data['low'], data['close'], window=period, smooth_window=smooth)
    data[f'stoch_%d_{period}_{smooth}'] = ta.momentum.stoch_signal(data['high'], data['low'], data['close'], window=period, smooth_window=smooth)

# Supertrend 계산 함수
def calculate_supertrend(df, period=7, multiplier=3, atr_period=14):
    df = df.copy()  # 데이터프레임을 복사하여 사용
    hl2 = (df['high'] + df['low']) / 2
    df['atr'] = ta.volatility.average_true_range(df['high'], df['low'], df['close'], window=atr_period)
    df['upperband'] = hl2 + (multiplier * df['atr'])
    df['lowerband'] = hl2 - (multiplier * df['atr'])
    df['in_uptrend'] = True

    for current in tqdm(range(1, len(df.index)), desc=f'Calculating Supertrend {period}-{multiplier}-{atr_period}'):
        previous = current - 1

        if df['close'].iloc[current] > df['upperband'].iloc[previous]:
            df.loc[df.index[current], 'in_uptrend'] = True
        elif df['close'].iloc[current] < df['lowerband'].iloc[previous]:
            df.loc[df.index[current], 'in_uptrend'] = False
        else:
            df.loc[df.index[current], 'in_uptrend'] = df['in_uptrend'].iloc[previous]

            if df['in_uptrend'].iloc[current] and df['lowerband'].iloc[current] < df['lowerband'].iloc[previous]:
                df.loc[df.index[current], 'lowerband'] = df['lowerband'].iloc[previous]

            if not df['in_uptrend'].iloc[current] and df['upperband'].iloc[current] > df['upperband'].iloc[previous]:
                df.loc[df.index[current], 'upperband'] = df['upperband'].iloc[previous]

    return df

# Supertrend 계산
supertrend_settings = [(7, 3, 14), (10, 3, 20), (14, 2, 10), (20, 4, 50), (50, 5, 5)]
for period, multiplier, atr_period in supertrend_settings:
    data = calculate_supertrend(data, period, multiplier, atr_period)
    data[f'supertrend_upper_{period}_{multiplier}_{atr_period}'] = data['upperband']
    data[f'supertrend_lower_{period}_{multiplier}_{atr_period}'] = data['lowerband']
    data[f'supertrend_in_uptrend_{period}_{multiplier}_{atr_period}'] = data['in_uptrend']

######################

**비가용 지표**
######################

In [None]:
# Bollinger Bands 계산
bollinger_periods = [10, 20, 50, 100, 200]
for period in bollinger_periods:
    data[f'bollinger_hband_{period}'] = ta.volatility.BollingerBands(data['close'], window=period).bollinger_hband()
    data[f'bollinger_lband_{period}'] = ta.volatility.BollingerBands(data['close'], window=period).bollinger_lband()

# OBV 계산
data['obv'] = ta.volume.on_balance_volume(data['close'], data['volume'])

# Ichimoku 계산
ichimoku_periods = [9, 26, 52, 100, 200]
for period in ichimoku_periods:
    data[f'ichimoku_base_{period}'] = ta.trend.ichimoku_base_line(data['high'], data['low'], window1=period)
    data[f'ichimoku_conversion_{period}'] = ta.trend.ichimoku_conversion_line(data['high'], data['low'], window1=period)


######################

**60봉 이내 최대 상승 및 하락 %**
######################

In [None]:
def calculate_max_min_returns(df):
    window_size = 60

    # 'open_time'이 이미 인덱스로 설정되어 있는지 확인
    if df.index.name != 'open_time':
        raise KeyError("'open_time' 열이 인덱스로 설정되어 있어야 합니다.")

    # 인덱스 중복 확인 및 제거
    df = df[~df.index.duplicated(keep='first')].copy()

    # 60분 윈도우를 적용하여 최대 및 최소 가격 계산
    df['max_price'] = df['high'].rolling(f'{window_size}T').max().shift(-window_size)
    df['min_price'] = df['low'].rolling(f'{window_size}T').min().shift(-window_size)

    # 결측값을 적절히 처리 (예: 마지막 몇 행)
    df['max_price'].fillna(df['high'], inplace=True)
    df['min_price'].fillna(df['low'], inplace=True)

    # 현재 가격
    current_price = df['close']

    # 최대 및 최소 수익률 계산
    df['max_return_60min'] = ((df['max_price'] - current_price) / current_price) * 100
    df['min_return_60min'] = ((df['min_price'] - current_price) / current_price) * 100

    # 필요없는 열 삭제
    df.drop(columns=['max_price', 'min_price'], inplace=True)

    return df

# 최대 상승률과 최대 하락률 계산
data = calculate_max_min_returns(data)

In [None]:
# 인덱스를 리셋하여 'open_time'을 열로 되돌림
data.reset_index(inplace=True)

data.head()

In [None]:
data.columns

######################

**처리 데이터 저장**
######################

In [None]:
path2 = "/content/drive/MyDrive/Data/SOL60_INDICATOR2_SMALL";
data.to_csv(path2, index=True);

######################

**테스트 처리 데이터 로딩(S)**
######################

In [None]:
data_test = pd.read_csv("/content/drive/MyDrive/Data/SOL_Data_Test_Indicator.csv")

# Display the first few rows of the dataset to ensure it is loaded correctly
data_test.head()

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/Data/SOL_Data_Test_Indicator.csv'

In [None]:
#2nd Account
data_test = pd.read_csv("/content/drive/MyDrive/SOL_Data_Test_Indicator.csv")

# Display the first few rows of the dataset to ensure it is loaded correctly
data_test.head()

Unnamed: 0.1,Unnamed: 0,open_time,open,high,low,close,volume,time,atr_5,atr_10,...,supertrend_lower_14_2_10,supertrend_in_uptrend_14_2_10,supertrend_upper_20_4_50,supertrend_lower_20_4_50,supertrend_in_uptrend_20_4_50,supertrend_upper_50_5_5,supertrend_lower_50_5_5,supertrend_in_uptrend_50_5_5,max_return_60min,min_return_60min
0,0,2024-07-14 15:00:00,145.02,145.2,145.0,145.2,1788.764,15:00,0.0,0.0,...,145.1,True,145.1,145.1,True,145.1,145.1,True,0.578512,-0.399449
1,1,2024-07-14 15:01:00,145.21,145.5,145.18,145.49,2510.833,15:01,0.0,0.0,...,145.34,True,145.34,145.34,True,145.34,145.34,True,0.378033,-0.597979
2,2,2024-07-14 15:02:00,145.5,145.57,145.45,145.45,1708.171,15:02,0.0,0.0,...,145.51,True,145.51,145.51,True,145.51,145.51,True,0.405638,-0.570643
3,3,2024-07-14 15:03:00,145.45,145.49,145.32,145.44,1655.282,15:03,0.0,0.0,...,145.405,False,145.405,145.405,False,145.405,145.405,False,0.412541,-0.563806
4,4,2024-07-14 15:04:00,145.44,145.63,145.44,145.6,2003.321,15:04,0.2,0.0,...,145.535,True,145.535,145.535,True,146.535,144.535,True,0.302198,-0.673077


######################

**테스트 처리 데이터 로딩(L)**
######################

In [None]:
data_test = pd.read_csv("/content/drive/MyDrive/Data/SOL60_INDICATOR2.csv")

# Display the first few rows of the dataset to ensure it is loaded correctly
data_test.head()

Unnamed: 0.1,Unnamed: 0,open_time,open,high,low,close,volume,time,atr_5,atr_10,...,supertrend_lower_14_2_10,supertrend_in_uptrend_14_2_10,supertrend_upper_20_4_50,supertrend_lower_20_4_50,supertrend_in_uptrend_20_4_50,supertrend_upper_50_5_5,supertrend_lower_50_5_5,supertrend_in_uptrend_50_5_5,max_return_60min,min_return_60min
0,0,2020-12-31 15:00:00,1.4458,1.4458,1.4444,1.4456,161.34,15:00,0.0,0.0,...,1.4451,True,1.4451,1.4451,True,1.4451,1.4451,True,1.203652,-0.387382
1,1,2020-12-31 15:01:00,1.4477,1.4478,1.4463,1.4463,148.86,15:01,0.0,0.0,...,1.44705,True,1.44705,1.44705,True,1.44705,1.44705,True,1.154671,-0.435594
2,2,2020-12-31 15:02:00,1.4479,1.453,1.4466,1.4484,3208.13,15:02,0.0,0.0,...,1.4498,True,1.4498,1.4498,True,1.4498,1.4498,True,1.008009,-0.57995
3,3,2020-12-31 15:03:00,1.4503,1.4559,1.4503,1.4558,1639.76,15:03,0.0,0.0,...,1.4531,True,1.4531,1.4531,True,1.4531,1.4531,True,0.494573,-1.085314
4,4,2020-12-31 15:04:00,1.4558,1.4569,1.4545,1.4569,900.91,15:04,0.00404,0.0,...,1.4557,True,1.4557,1.4557,True,1.4759,1.4355,True,0.418697,-1.159997


In [None]:
#2nd Account
data_test = pd.read_csv("/content/drive/MyDrive/SOL60_INDICATOR2_SMALL")

# Display the first few rows of the dataset to ensure it is loaded correctly
data_test.head()

Unnamed: 0,open_time,open,high,low,close,volume,time,atr_5,atr_10,atr_14,...,supertrend_lower_14_2_10,supertrend_in_uptrend_14_2_10,supertrend_upper_20_4_50,supertrend_lower_20_4_50,supertrend_in_uptrend_20_4_50,supertrend_upper_50_5_5,supertrend_lower_50_5_5,supertrend_in_uptrend_50_5_5,max_return_60min,min_return_60min
0,2020-12-31 15:00:00,1.4458,1.4458,1.4444,1.4456,161.34,15:00,0.0,0.0,0.0,...,1.4451,True,1.4451,1.4451,True,1.4451,1.4451,True,1.203652,-0.387382
1,2020-12-31 15:01:00,1.4477,1.4478,1.4463,1.4463,148.86,15:01,0.0,0.0,0.0,...,1.44705,True,1.44705,1.44705,True,1.44705,1.44705,True,1.154671,-0.435594
2,2020-12-31 15:02:00,1.4479,1.453,1.4466,1.4484,3208.13,15:02,0.0,0.0,0.0,...,1.4498,True,1.4498,1.4498,True,1.4498,1.4498,True,1.008009,-0.57995
3,2020-12-31 15:03:00,1.4503,1.4559,1.4503,1.4558,1639.76,15:03,0.0,0.0,0.0,...,1.4531,True,1.4531,1.4531,True,1.4531,1.4531,True,0.494573,-1.085314
4,2020-12-31 15:04:00,1.4558,1.4569,1.4545,1.4569,900.91,15:04,0.00404,0.0,0.0,...,1.4557,True,1.4557,1.4557,True,1.4759,1.4355,True,0.418697,-1.159997


In [None]:
# 데이터프레임의 길이 계산
total_length = len(data_test)

# 마지막 1/3 데이터 분리
data_test = data_test.iloc[-(total_length // 3):]

######################

**테스트 처리 전처리(공통)**
######################

In [None]:
# open_time 열을 datetime 형식으로 변환
if not np.issubdtype(data_test['open_time'].dtype, np.datetime64):
    data_test['open_time'] = pd.to_datetime(data_test['open_time'])

# time 열을 분 단위로 변환
data_test['time'] = data_test['open_time'].dt.hour * 60 + data_test['open_time'].dt.minute

######################

**테스트 처리 전처리(시계열)**
######################

In [None]:
# 사용하지 않을 열 제외
data_test_predict = data_test.drop(columns=['open_time', 'max_return_60min', 'min_return_60min'])

In [None]:
# 무한대 값을 NaN으로 대체
data_test.replace([np.inf, -np.inf], np.nan, inplace=True)

# NaN 값을 평균으로 대체
imputer = SimpleImputer(strategy='mean')
data_test_predict_imputed = imputer.fit_transform(data_test_predict)  # 같은 imputer 사용

# 데이터 정규화
scaler = MinMaxScaler()
data_test_predict_scaled = scaler.fit_transform(data_test_predict_imputed)  # 같은 scaler 사용

# 예측 데이터를 시퀀스 형태로 변환 (LSTM용)
def create_sequences_for_prediction(data, sequence_length):
    sequences = []
    for i in range(len(data) - sequence_length + 1):
        seq = data[i:i + sequence_length]
        sequences.append(seq)
    return np.array(sequences)

# 시퀀스 길이 설정
sequence_length = 10

# 예측용 시퀀스 데이터 생성
X_test_seq = create_sequences_for_prediction(data_test_predict_scaled, sequence_length)


######################

**테스트 처리 전처리(비시계열)**
######################

In [None]:
# 무한대 값을 NaN으로 대체
data_test.replace([np.inf, -np.inf], np.nan, inplace=True)

# 사용하지 않을 열 제외
data_test_predict = data_test.drop(columns=['open_time', 'max_return_60min', 'min_return_60min'])

# NaN 값을 평균으로 대체
imputer = SimpleImputer(strategy='mean')
data_test_predict_imputed = imputer.fit_transform(data_test_predict)

# 데이터 정규화
scaler = StandardScaler()
data_test_predict_scaled = scaler.fit_transform(data_test_predict_imputed)

######################

**처리 데이터 로딩**
######################

In [None]:
data = pd.read_csv("/content/drive/MyDrive/Data/SOL60_INDICATOR2_SMALL.csv")

# Display the first few rows of the dataset to ensure it is loaded correctly
data.head()

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/Data/SOL60_INDICATOR2_SMALL.csv'

In [None]:
#2nd 계정 로드
data = pd.read_csv("/content/drive/MyDrive/SOL60_INDICATOR2_SMALL")

# Display the first few rows of the dataset to ensure it is loaded correctly
data.head()

Unnamed: 0,open_time,open,high,low,close,volume,time,atr_5,atr_10,atr_14,...,supertrend_lower_14_2_10,supertrend_in_uptrend_14_2_10,supertrend_upper_20_4_50,supertrend_lower_20_4_50,supertrend_in_uptrend_20_4_50,supertrend_upper_50_5_5,supertrend_lower_50_5_5,supertrend_in_uptrend_50_5_5,max_return_60min,min_return_60min
0,2020-12-31 15:00:00,1.4458,1.4458,1.4444,1.4456,161.34,15:00,0.0,0.0,0.0,...,1.4451,True,1.4451,1.4451,True,1.4451,1.4451,True,1.203652,-0.387382
1,2020-12-31 15:01:00,1.4477,1.4478,1.4463,1.4463,148.86,15:01,0.0,0.0,0.0,...,1.44705,True,1.44705,1.44705,True,1.44705,1.44705,True,1.154671,-0.435594
2,2020-12-31 15:02:00,1.4479,1.453,1.4466,1.4484,3208.13,15:02,0.0,0.0,0.0,...,1.4498,True,1.4498,1.4498,True,1.4498,1.4498,True,1.008009,-0.57995
3,2020-12-31 15:03:00,1.4503,1.4559,1.4503,1.4558,1639.76,15:03,0.0,0.0,0.0,...,1.4531,True,1.4531,1.4531,True,1.4531,1.4531,True,0.494573,-1.085314
4,2020-12-31 15:04:00,1.4558,1.4569,1.4545,1.4569,900.91,15:04,0.00404,0.0,0.0,...,1.4557,True,1.4557,1.4557,True,1.4759,1.4355,True,0.418697,-1.159997


######################

지표 목록 파악

######################



In [None]:
data.columns

Index(['open_time', 'open', 'high', 'low', 'close', 'volume', 'time', 'atr_5',
       'atr_10', 'atr_14', 'atr_20', 'atr_50', 'vwap', 'stoch_%k_14_3',
       'stoch_%d_14_3', 'stoch_%k_21_5', 'stoch_%d_21_5', 'stoch_%k_9_3',
       'stoch_%d_9_3', 'stoch_%k_5_2', 'stoch_%d_5_2', 'stoch_%k_20_7',
       'stoch_%d_20_7', 'obv', 'bollinger_hband_10', 'bollinger_lband_10',
       'bollinger_hband_20', 'bollinger_lband_20', 'bollinger_hband_50',
       'bollinger_lband_50', 'bollinger_hband_100', 'bollinger_lband_100',
       'bollinger_hband_200', 'bollinger_lband_200', 'ichimoku_base_9',
       'ichimoku_conversion_9', 'ichimoku_base_26', 'ichimoku_conversion_26',
       'ichimoku_base_52', 'ichimoku_conversion_52', 'ichimoku_base_100',
       'ichimoku_conversion_100', 'ichimoku_base_200',
       'ichimoku_conversion_200', 'atr', 'upperband', 'lowerband',
       'in_uptrend', 'supertrend_upper_7_3_14', 'supertrend_lower_7_3_14',
       'supertrend_in_uptrend_7_3_14', 'supertrend_upper_10

######################

**불필요 지표 삭제**
######################

In [None]:
data = data.drop(columns=['ichimoku_base_9',
       'ichimoku_conversion_9', 'ichimoku_base_26', 'ichimoku_conversion_26',
       'ichimoku_base_52', 'ichimoku_conversion_52', 'ichimoku_base_100',
       'ichimoku_conversion_100', 'ichimoku_base_200',
       'ichimoku_conversion_200','bollinger_hband_10', 'bollinger_lband_10',
       'bollinger_hband_20', 'bollinger_lband_20', 'bollinger_hband_50',
       'bollinger_lband_50', 'bollinger_hband_100', 'bollinger_lband_100',
       'bollinger_hband_200', 'bollinger_lband_200', 'obv'])

In [None]:
data.columns

Index(['open_time', 'open', 'high', 'low', 'close', 'volume', 'time', 'atr_5',
       'atr_10', 'atr_14', 'atr_20', 'atr_50', 'vwap', 'stoch_%k_14_3',
       'stoch_%d_14_3', 'stoch_%k_21_5', 'stoch_%d_21_5', 'stoch_%k_9_3',
       'stoch_%d_9_3', 'stoch_%k_5_2', 'stoch_%d_5_2', 'stoch_%k_20_7',
       'stoch_%d_20_7', 'atr', 'upperband', 'lowerband', 'in_uptrend',
       'supertrend_upper_7_3_14', 'supertrend_lower_7_3_14',
       'supertrend_in_uptrend_7_3_14', 'supertrend_upper_10_3_20',
       'supertrend_lower_10_3_20', 'supertrend_in_uptrend_10_3_20',
       'supertrend_upper_14_2_10', 'supertrend_lower_14_2_10',
       'supertrend_in_uptrend_14_2_10', 'supertrend_upper_20_4_50',
       'supertrend_lower_20_4_50', 'supertrend_in_uptrend_20_4_50',
       'supertrend_upper_50_5_5', 'supertrend_lower_50_5_5',
       'supertrend_in_uptrend_50_5_5', 'max_return_60min', 'min_return_60min'],
      dtype='object')

######################

**추가 지표 처리**
######################

In [None]:
# 지표 계산 함수 정의

def calculate_indicators(df):
    # Parabolic SAR
    df['Parabolic_SAR_0.02'] = pata.psar(df['high'], df['low'], df['close'], af=0.02, max_af=0.2)['PSARl_0.02_0.2']
    df['Parabolic_SAR_0.04'] = pata.psar(df['high'], df['low'], df['close'], af=0.04, max_af=0.2)['PSARl_0.04_0.2']
    df['Parabolic_SAR_0.06'] = pata.psar(df['high'], df['low'], df['close'], af=0.06, max_af=0.2)['PSARl_0.06_0.2']
    df['Parabolic_SAR_0.08'] = pata.psar(df['high'], df['low'], df['close'], af=0.08, max_af=0.2)['PSARl_0.08_0.2']
    df['Parabolic_SAR_0.1'] = pata.psar(df['high'], df['low'], df['close'], af=0.1, max_af=0.2)['PSARl_0.1_0.2']

    # Williams %R
    df['Williams_%R_10'] = pata.willr(df['high'], df['low'], df['close'], length=10)
    df['Williams_%R_20'] = pata.willr(df['high'], df['low'], df['close'], length=20)
    df['Williams_%R_30'] = pata.willr(df['high'], df['low'], df['close'], length=30)
    df['Williams_%R_40'] = pata.willr(df['high'], df['low'], df['close'], length=40)
    df['Williams_%R_50'] = pata.willr(df['high'], df['low'], df['close'], length=50)

    # Momentum
    df['Momentum_10'] = pata.mom(df['close'], length=10)
    df['Momentum_20'] = pata.mom(df['close'], length=20)
    df['Momentum_30'] = pata.mom(df['close'], length=30)
    df['Momentum_40'] = pata.mom(df['close'], length=40)
    df['Momentum_50'] = pata.mom(df['close'], length=50)

    # Rate of Change (ROC)
    df['ROC_10'] = pata.roc(df['close'], length=10)
    df['ROC_20'] = pata.roc(df['close'], length=20)
    df['ROC_30'] = pata.roc(df['close'], length=30)
    df['ROC_40'] = pata.roc(df['close'], length=40)
    df['ROC_50'] = pata.roc(df['close'], length=50)

    # Chande Momentum Oscillator (CMO)
    df['CMO_10'] = pata.cmo(df['close'], length=10)
    df['CMO_20'] = pata.cmo(df['close'], length=20)
    df['CMO_30'] = pata.cmo(df['close'], length=30)
    df['CMO_40'] = pata.cmo(df['close'], length=40)
    df['CMO_50'] = pata.cmo(df['close'], length=50)

    # Money Flow Index (MFI)
    df['MFI_10'] = pata.mfi(df['high'], df['low'], df['close'], df['volume'], length=10)
    df['MFI_20'] = pata.mfi(df['high'], df['low'], df['close'], df['volume'], length=20)
    df['MFI_30'] = pata.mfi(df['high'], df['low'], df['close'], df['volume'], length=30)
    df['MFI_40'] = pata.mfi(df['high'], df['low'], df['close'], df['volume'], length=40)
    df['MFI_50'] = pata.mfi(df['high'], df['low'], df['close'], df['volume'], length=50)

    # Relative Strength Index (RSI)
    df['RSI_10'] = pata.rsi(df['close'], length=10)
    df['RSI_20'] = pata.rsi(df['close'], length=20)
    df['RSI_30'] = pata.rsi(df['close'], length=30)
    df['RSI_40'] = pata.rsi(df['close'], length=40)
    df['RSI_50'] = pata.rsi(df['close'], length=50)

    # Accumulation/Distribution Line (A/D Line)
    df['Accumulation_Distribution_Line'] = pata.ad(df['high'], df['low'], df['close'], df['volume'])

    # Elder's Force Index (EFI)
    df['Elder_Force_Index_2'] = pata.efi(df['close'], df['volume'], length=2)
    df['Elder_Force_Index_13'] = pata.efi(df['close'], df['volume'], length=13)
    df['Elder_Force_Index_5'] = pata.efi(df['close'], df['volume'], length=5)
    df['Elder_Force_Index_10'] = pata.efi(df['close'], df['volume'], length=10)
    df['Elder_Force_Index_25'] = pata.efi(df['close'], df['volume'], length=25)

    # Relative Vigor Index (RVI)
    df['Relative_Vigor_Index_10'] = pata.rsi(df['close'], length=10)  # pandas-ta는 RVI가 없으므로 RSI로 대체
    df['Relative_Vigor_Index_20'] = pata.rsi(df['close'], length=20)
    df['Relative_Vigor_Index_30'] = pata.rsi(df['close'], length=30)
    df['Relative_Vigor_Index_40'] = pata.rsi(df['close'], length=40)
    df['Relative_Vigor_Index_50'] = pata.rsi(df['close'], length=50)

    # Volume Ratio (VR)
    def volume_ratio(close, volume, period):
        vr = []
        for i in range(len(close)):
            if i < period:
                vr.append(None)
            else:
                vol_up = sum(volume[j] for j in range(i - period + 1, i + 1) if close[j] > close[j - 1])
                vol_down = sum(volume[j] for j in range(i - period + 1, i + 1) if close[j] < close[j - 1])
                vol_same = sum(volume[j] for j in range(i - period + 1, i + 1) if close[j] == close[j - 1])
                vr_value = (vol_up + vol_same / 2) / (vol_down + vol_same / 2) * 100
                vr.append(vr_value)
        return vr

    df['VR_10'] = volume_ratio(df['close'], df['volume'], period=10)
    df['VR_20'] = volume_ratio(df['close'], df['volume'], period=20)
    df['VR_30'] = volume_ratio(df['close'], df['volume'], period=30)
    df['VR_40'] = volume_ratio(df['close'], df['volume'], period=40)
    df['VR_50'] = volume_ratio(df['close'], df['volume'], period=50)

    return df

# 지표 계산
data = calculate_indicators(data)

  vr_value = (vol_up + vol_same / 2) / (vol_down + vol_same / 2) * 100
  vr_value = (vol_up + vol_same / 2) / (vol_down + vol_same / 2) * 100
  vr_value = (vol_up + vol_same / 2) / (vol_down + vol_same / 2) * 100
  vr_value = (vol_up + vol_same / 2) / (vol_down + vol_same / 2) * 100
  vr_value = (vol_up + vol_same / 2) / (vol_down + vol_same / 2) * 100
  vr_value = (vol_up + vol_same / 2) / (vol_down + vol_same / 2) * 100
  vr_value = (vol_up + vol_same / 2) / (vol_down + vol_same / 2) * 100
  vr_value = (vol_up + vol_same / 2) / (vol_down + vol_same / 2) * 100
  vr_value = (vol_up + vol_same / 2) / (vol_down + vol_same / 2) * 100
  vr_value = (vol_up + vol_same / 2) / (vol_down + vol_same / 2) * 100


In [None]:
data.head()

Unnamed: 0,open_time,open,high,low,close,volume,time,atr_5,atr_10,atr_14,...,Relative_Vigor_Index_10,Relative_Vigor_Index_20,Relative_Vigor_Index_30,Relative_Vigor_Index_40,Relative_Vigor_Index_50,VR_10,VR_20,VR_30,VR_40,VR_50
0,2020-12-31 15:00:00,1.4458,1.4458,1.4444,1.4456,161.34,15:00,0.0,0.0,0.0,...,,,,,,,,,,
1,2020-12-31 15:01:00,1.4477,1.4478,1.4463,1.4463,148.86,15:01,0.0,0.0,0.0,...,,,,,,,,,,
2,2020-12-31 15:02:00,1.4479,1.453,1.4466,1.4484,3208.13,15:02,0.0,0.0,0.0,...,,,,,,,,,,
3,2020-12-31 15:03:00,1.4503,1.4559,1.4503,1.4558,1639.76,15:03,0.0,0.0,0.0,...,,,,,,,,,,
4,2020-12-31 15:04:00,1.4558,1.4569,1.4545,1.4569,900.91,15:04,0.00404,0.0,0.0,...,,,,,,,,,,


In [None]:
data.columns

Index(['open_time', 'open', 'high', 'low', 'close', 'volume', 'time', 'atr_5',
       'atr_10', 'atr_14', 'atr_20', 'atr_50', 'vwap', 'stoch_%k_14_3',
       'stoch_%d_14_3', 'stoch_%k_21_5', 'stoch_%d_21_5', 'stoch_%k_9_3',
       'stoch_%d_9_3', 'stoch_%k_5_2', 'stoch_%d_5_2', 'stoch_%k_20_7',
       'stoch_%d_20_7', 'atr', 'upperband', 'lowerband', 'in_uptrend',
       'supertrend_upper_7_3_14', 'supertrend_lower_7_3_14',
       'supertrend_in_uptrend_7_3_14', 'supertrend_upper_10_3_20',
       'supertrend_lower_10_3_20', 'supertrend_in_uptrend_10_3_20',
       'supertrend_upper_14_2_10', 'supertrend_lower_14_2_10',
       'supertrend_in_uptrend_14_2_10', 'supertrend_upper_20_4_50',
       'supertrend_lower_20_4_50', 'supertrend_in_uptrend_20_4_50',
       'supertrend_upper_50_5_5', 'supertrend_lower_50_5_5',
       'supertrend_in_uptrend_50_5_5', 'max_return_60min', 'min_return_60min',
       'Parabolic_SAR_0.02', 'Parabolic_SAR_0.04', 'Parabolic_SAR_0.06',
       'Parabolic_SAR_0.

######################

**지표 저장**
######################

In [None]:
path_edit = "/content/drive/MyDrive/Data/SOL60_INDICATOR3_SMALL";
data.to_csv(path_edit, index=True);

NameError: name 'data' is not defined

######################

**지표 관계 파악(간단)**
######################

In [None]:
# max_return_60min이 1.1 이상인 데이터 필터링
filtered_df_relative = data[data['max_return_60min'] >= 1.1]

# 'time' 컬럼을 분 단위로 변환
filtered_df_relative['time'] = filtered_df_relative['time'].str.split(':').apply(lambda x: int(x[0]) * 60 + int(x[1]))

# 상관계수 계산 (open_time을 제외한 나머지 항목들에 대해서만 계산)
correlation_matrix = filtered_df_relative.drop(columns=['open_time']).corr()

# max_return_60min과 나머지 항목들 간의 상관계수 출력
max_return_correlations = correlation_matrix['max_return_60min'].drop('max_return_60min').sort_values(ascending=False)
max_return_correlations

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df_relative['time'] = filtered_df_relative['time'].str.split(':').apply(lambda x: int(x[0]) * 60 + int(x[1]))


volume                            0.290336
atr                               0.102573
atr_5                             0.102573
atr_10                            0.099949
atr_14                            0.097475
                                    ...   
supertrend_lower_7_3_14          -0.141304
supertrend_lower_20_4_50         -0.141515
supertrend_lower_50_5_5          -0.143015
lowerband                        -0.143015
Accumulation_Distribution_Line   -0.144703
Name: max_return_60min, Length: 93, dtype: float64

######################

**학습 데이터 처리(공통)**
######################

In [None]:
# open_time 열이 datetime 형식이 아닌 경우 변환
if not np.issubdtype(data['open_time'].dtype, np.datetime64):
    data['open_time'] = pd.to_datetime(data['open_time'])

# time 열을 분 단위로 변환
data['time'] = data['open_time'].dt.hour * 60 + data['open_time'].dt.minute

# 사용하지 않을 열 제외
data = data.drop(columns=['open_time'])

######################

**학습 데이터 처리(시계열)**
######################

In [None]:
# 목표 변수 생성 (max_return_60min이 1.1% 이상인 경우 1, 그렇지 않으면 0)
data['target'] = (data['max_return_60min'] >= 1.1).astype(int)

# 특성과 목표 변수 분리
X = data.drop(columns=['max_return_60min', 'min_return_60min', 'target'])
y = data['target']

# 무한대 값을 NaN으로 대체
X.replace([np.inf, -np.inf], np.nan, inplace=True)

# NaN 값을 평균으로 대체
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

# 데이터 정규화
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X_imputed)

# 시계열 데이터 형태로 변환 (LSTM용)
def create_sequences(data, target, sequence_length):
    sequences = []
    targets = []
    for i in range(len(data) - sequence_length):
        seq = data[i:i + sequence_length]
        label = target[i + sequence_length]
        sequences.append(seq)
        targets.append(label)
    return np.array(sequences), np.array(targets)

# 시퀀스 길이 설정
sequence_length = 10

# LSTM용 데이터 생성
X_seq, y_seq = create_sequences(X_scaled, y, sequence_length)

######################

**학습 데이터 처리(비시계열)**
######################

In [None]:
# 목표 변수 생성 (max_return_60min이 1.1% 이상인 경우 1, 그렇지 않으면 0)
data['target'] = (data['max_return_60min'] >= 1.1).astype(int)

# 특성과 목표 변수 분리
X = data.drop(columns=['max_return_60min', 'min_return_60min', 'target'])
y = data['target']

# 무한대 값을 NaN으로 대체
X.replace([np.inf, -np.inf], np.nan, inplace=True)

# NaN 값을 평균으로 대체
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

# 데이터 정규화
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X_imputed)

######################

**xgboost**
######################

In [None]:
pip install xgboost



In [None]:
import xgboost as xgb

# 학습 데이터와 테스트 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

# XGBoost 모델 정의
xgb_model = xgb.XGBClassifier(n_estimators=100, learning_rate=0.05, max_depth=6, random_state=42, use_label_encoder=False, eval_metric='logloss')

# 모델 학습
xgb_model.fit(X_train.reshape(X_train.shape[0], -1), y_train)

# 예측
y_pred = xgb_model.predict(X_test.reshape(X_test.shape[0], -1))

# 평가
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=1)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1 Score: {f1:.3f}")

X_train shape: (495071, 62)
X_test shape: (123768, 62)
y_train shape: (495071,)
y_test shape: (123768,)
Accuracy: 0.697
Precision: 0.670
Recall: 0.420
F1 Score: 0.516


In [None]:
# SHAP 값 계산 및 시각화
explainer = shap.Explainer(xgb_model)
shap_values = explainer(X_test.reshape(X_test.shape[0], -1))
shap.summary_plot(shap_values, X_test.reshape(X_test.shape[0], -1), feature_names=X.columns)

In [None]:
# 모델 저장
model_path = '/content/drive/MyDrive/Data/SO60_SMALL_xgb_model.joblib'
joblib.dump(xgb_model, model_path)

print(f"Model saved to {model_path}")

Model saved to /content/drive/MyDrive/Data/SO60_SMALL_xgb_model.joblib


In [None]:
# ExtraTreesClassifier 모델 로딩
model_path = '/content/drive/MyDrive/Data/SO60_SMALL_xgb_model.joblib'

xgb_model = joblib.load(model_path_load)

print("Model loaded successfully")

In [None]:
# 예측 수행
predictions = xgb_model.predict(data_test_predict_scaled)

# 예측 결과를 데이터프레임에 추가
data_test['prediction_xgb'] = predictions

In [None]:
data_test_non_nan = data_test.dropna(subset=['prediction_xgb'])

# max_return_60min 기준으로 내림차순 정렬
data_test_sorted = data_test_non_nan.sort_values(by='max_return_60min', ascending=False)

# 결과 출력
data_test_sorted

Unnamed: 0.1,Unnamed: 0,open_time,open,high,low,close,volume,time,atr_5,atr_10,...,supertrend_in_uptrend_14_2_10,supertrend_upper_20_4_50,supertrend_lower_20_4_50,supertrend_in_uptrend_20_4_50,supertrend_upper_50_5_5,supertrend_lower_50_5_5,supertrend_in_uptrend_50_5_5,max_return_60min,min_return_60min,prediction_xgb
7109,7109,2024-07-19 13:29:00,162.75,162.80,162.59,162.63,1289.717,809,0.214352,0.235240,...,False,163.590931,161.769140,False,163.766759,161.623241,False,3.818484,-0.215212,0
2880,2880,2024-07-16 15:00:00,155.91,155.93,155.69,155.69,1983.029,900,0.219039,0.222679,...,True,156.862755,154.977555,True,156.426560,154.714806,False,3.796005,-0.057807,1
2881,2881,2024-07-16 15:01:00,155.70,155.78,155.60,155.71,1419.077,901,0.211231,0.218411,...,True,156.736100,154.977555,True,156.426560,154.633844,False,3.782673,-0.070644,1
7106,7106,2024-07-19 13:26:00,162.82,162.85,162.69,162.70,1820.981,806,0.233500,0.250055,...,False,163.704624,161.835376,False,163.937498,161.602502,False,3.773817,-0.258144,0
7107,7107,2024-07-19 13:27:00,162.69,162.78,162.53,162.72,2155.233,807,0.236800,0.250049,...,False,163.590931,161.719069,False,163.838998,161.471002,False,3.761062,-0.270403,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8522,8522,2024-07-20 13:02:00,168.56,168.73,168.56,168.73,846.118,782,0.132566,0.128740,...,True,168.816198,168.145976,False,169.307830,168.249485,True,0.000000,-0.865288,0
3519,3519,2024-07-17 01:39:00,163.03,163.15,163.03,163.15,619.675,99,0.219354,0.251539,...,True,164.098585,162.673690,True,164.186768,162.223585,True,0.000000,-1.121667,0
1075,1075,2024-07-15 08:55:00,152.72,153.06,152.72,153.06,1988.077,535,0.266985,0.238983,...,False,153.399907,152.140127,False,153.993659,151.555073,False,0.000000,-0.672939,1
7594,7594,2024-07-19 21:34:00,169.14,169.29,169.14,169.29,1151.051,1294,0.174188,0.174318,...,False,169.778412,168.489909,False,170.016173,168.344062,False,0.000000,-0.886054,0


In [None]:
# 'max_return_60min' 값이 1 이상이고 'prediction' 값이 0인 데이터의 개수
count_max_return_ge_1_prediction_0 = len(data_test[(data_test['max_return_60min'] >= 1) & (data_test['prediction_xgb'] == 1)])

# 'max_return_60min' 값이 1 미만이고 'prediction' 값이 1인 데이터의 개수
count_max_return_lt_1_prediction_1 = len(data_test[(data_test['max_return_60min'] < 1) & (data_test['prediction_xgb'] == 0)])

print(f"max_return_60min이 1 이상인데 prediction이 0인 데이터의 비율: {count_max_return_ge_1_prediction_0/len(data_test)*100}")
print(f"max_return_60min이 1 미만인데 prediction이 1인 데이터의 비율: {count_max_return_lt_1_prediction_1/len(data_test)*100}")

max_return_60min이 1 이상인데 prediction이 0인 데이터의 비율: 16.734174285383634
max_return_60min이 1 미만인데 prediction이 1인 데이터의 비율: 25.911352852679087


######################

**catboost**
######################

In [None]:
pip install catboost

Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.5


In [None]:
from catboost import CatBoostClassifier

# 학습 데이터와 검증 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

# CatBoost 모델 정의
cat_model = CatBoostClassifier(iterations=100, learning_rate=0.05, depth=6, random_seed=42, verbose=0)

# 모델 학습
cat_model.fit(X_train.reshape(X_train.shape[0], -1), y_train)

# 예측
y_pred = cat_model.predict(X_test.reshape(X_test.shape[0], -1))

# 평가
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=1)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1 Score: {f1:.3f}")

X_train shape: (495071, 62)
X_test shape: (123768, 62)
y_train shape: (495071,)
y_test shape: (123768,)
Accuracy: 0.680
Precision: 0.641
Recall: 0.384
F1 Score: 0.480


In [None]:
# SHAP 값 계산 및 시각화
explainer = shap.Explainer(cat_model)
shap_values = explainer(X_test.reshape(X_test.shape[0], -1))
shap.summary_plot(shap_values, X_test.reshape(X_test.shape[0], -1), feature_names=X.columns)

In [None]:
# 모델 저장
model_path = '/content/drive/MyDrive/Data/SO60_SMALL_cat_model.joblib'
joblib.dump(cat_model, model_path)

print(f"Model saved to {model_path}")

Model saved to /content/drive/MyDrive/Data/SO60_SMALL_cat_model.joblib


In [None]:
# ExtraTreesClassifier 모델 로딩
model_path = '/content/drive/MyDrive/Data/SO60_SMALL_cat_model.joblib'

cat_model = joblib.load(model_path_load)

print("Model loaded successfully")

In [None]:
# 예측 수행
predictions = cat_model.predict(data_test_predict_scaled)

# 예측 결과를 데이터프레임에 추가
data_test['prediction_cat'] = predictions

In [None]:
data_test_non_nan = data_test.dropna(subset=['prediction_cat'])

# max_return_60min 기준으로 내림차순 정렬
data_test_sorted = data_test_non_nan.sort_values(by='max_return_60min', ascending=False)

# 결과 출력
data_test_sorted

Unnamed: 0.1,Unnamed: 0,open_time,open,high,low,close,volume,time,atr_5,atr_10,...,supertrend_upper_20_4_50,supertrend_lower_20_4_50,supertrend_in_uptrend_20_4_50,supertrend_upper_50_5_5,supertrend_lower_50_5_5,supertrend_in_uptrend_50_5_5,max_return_60min,min_return_60min,prediction_xgb,prediction_cat
7109,7109,2024-07-19 13:29:00,162.75,162.80,162.59,162.63,1289.717,809,0.214352,0.235240,...,163.590931,161.769140,False,163.766759,161.623241,False,3.818484,-0.215212,0,1
2880,2880,2024-07-16 15:00:00,155.91,155.93,155.69,155.69,1983.029,900,0.219039,0.222679,...,156.862755,154.977555,True,156.426560,154.714806,False,3.796005,-0.057807,1,1
2881,2881,2024-07-16 15:01:00,155.70,155.78,155.60,155.71,1419.077,901,0.211231,0.218411,...,156.736100,154.977555,True,156.426560,154.633844,False,3.782673,-0.070644,1,1
7106,7106,2024-07-19 13:26:00,162.82,162.85,162.69,162.70,1820.981,806,0.233500,0.250055,...,163.704624,161.835376,False,163.937498,161.602502,False,3.773817,-0.258144,0,1
7107,7107,2024-07-19 13:27:00,162.69,162.78,162.53,162.72,2155.233,807,0.236800,0.250049,...,163.590931,161.719069,False,163.838998,161.471002,False,3.761062,-0.270403,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8522,8522,2024-07-20 13:02:00,168.56,168.73,168.56,168.73,846.118,782,0.132566,0.128740,...,168.816198,168.145976,False,169.307830,168.249485,True,0.000000,-0.865288,0,0
3519,3519,2024-07-17 01:39:00,163.03,163.15,163.03,163.15,619.675,99,0.219354,0.251539,...,164.098585,162.673690,True,164.186768,162.223585,True,0.000000,-1.121667,0,1
1075,1075,2024-07-15 08:55:00,152.72,153.06,152.72,153.06,1988.077,535,0.266985,0.238983,...,153.399907,152.140127,False,153.993659,151.555073,False,0.000000,-0.672939,1,1
7594,7594,2024-07-19 21:34:00,169.14,169.29,169.14,169.29,1151.051,1294,0.174188,0.174318,...,169.778412,168.489909,False,170.016173,168.344062,False,0.000000,-0.886054,0,0


In [None]:
# 'max_return_60min' 값이 1 이상이고 'prediction' 값이 0인 데이터의 개수
count_max_return_ge_1_prediction_0 = len(data_test[(data_test['max_return_60min'] >= 1) & (data_test['prediction_cat'] == 1)])

# 'max_return_60min' 값이 1 미만이고 'prediction' 값이 1인 데이터의 개수
count_max_return_lt_1_prediction_1 = len(data_test[(data_test['max_return_60min'] < 1) & (data_test['prediction_cat'] == 0)])

print(f"max_return_60min이 1 이상인데 prediction이 0인 데이터의 비율: {count_max_return_ge_1_prediction_0/len(data_test)*100}")
print(f"max_return_60min이 1 미만인데 prediction이 1인 데이터의 비율: {count_max_return_lt_1_prediction_1/len(data_test)*100}")

max_return_60min이 1 이상인데 prediction이 0인 데이터의 비율: 15.542182617752577
max_return_60min이 1 미만인데 prediction이 1인 데이터의 비율: 28.503645411410716


######################

**lightgbm**
######################

In [None]:
pip install lightgbm



In [None]:
import lightgbm as lgb

# 학습 데이터와 테스트 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

# LightGBM 데이터셋 생성
lgb_train = lgb.Dataset(X_train.reshape(X_train.shape[0], -1), y_train, params={'max_bin': 255})
lgb_eval = lgb.Dataset(X_test.reshape(X_test.shape[0], -1), y_test, reference=lgb_train, params={'max_bin': 255})

# LightGBM 하이퍼파라미터 설정
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss', 'auc'},
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'verbose': -1
}

# 모델 학습
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=100,
    valid_sets=lgb_eval,
    callbacks=[lgb.early_stopping(stopping_rounds=10)]
)

# 예측
y_pred_prob = gbm.predict(X_test.reshape(X_test.shape[0], -1), num_iteration=gbm.best_iteration)
y_pred = (y_pred_prob > 0.5).astype(int)

# 성능 평가
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

X_train shape: (495071, 62)
X_test shape: (123768, 62)
y_train shape: (495071,)
y_test shape: (123768,)
Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's binary_logloss: 0.577246	valid_0's auc: 0.743145
Accuracy: 0.6941131794971237
Precision: 0.6591079399211547
Recall: 0.4247679838743543
F1 Score: 0.5166051660516605


In [None]:
# SHAP 값 계산 및 시각화
explainer = shap.Explainer(gbm)
shap_values = explainer(X_test.reshape(X_test.shape[0], -1))
shap.summary_plot(shap_values, X_test.reshape(X_test.shape[0], -1), feature_names=X.columns)

In [None]:
# 모델 저장
model_path = '/content/drive/MyDrive/Data/SO60_SMALL_LightGBM_model.joblib'
joblib.dump(gbm, model_path)

print(f"Model saved to {model_path}")

Model saved to /content/drive/MyDrive/Data/SO60_SMALL_LightGBM_model.joblib


In [None]:
# ExtraTreesClassifier 모델 로딩
model_path = '/content/drive/MyDrive/Data/SO60_SMALL_LightGBM_model.joblib'

gbm = joblib.load(model_path_load)

print("Model loaded successfully")

Model loaded successfully


In [None]:
# 예측 수행
predictions_prob = gbm.predict(data_test_predict_scaled)

# 확률을 0 또는 1로 변환 (기본 임계값 0.5)
predictions = (predictions_prob > 0.5).astype(int)

# 예측 결과를 데이터프레임에 추가
data_test['prediction_LightGBM'] = predictions

In [None]:
data_test_non_nan = data_test.dropna(subset=['prediction_LightGBM'])

# max_return_60min 기준으로 내림차순 정렬
data_test_sorted = data_test_non_nan.sort_values(by='max_return_60min', ascending=False)

# 결과 출력
data_test_sorted

Unnamed: 0.1,Unnamed: 0,open_time,open,high,low,close,volume,time,atr_5,atr_10,...,supertrend_lower_20_4_50,supertrend_in_uptrend_20_4_50,supertrend_upper_50_5_5,supertrend_lower_50_5_5,supertrend_in_uptrend_50_5_5,max_return_60min,min_return_60min,prediction_xgb,prediction_cat,prediction_LightGBM
7109,7109,2024-07-19 13:29:00,162.75,162.80,162.59,162.63,1289.717,809,0.214352,0.235240,...,161.769140,False,163.766759,161.623241,False,3.818484,-0.215212,0,1,1
2880,2880,2024-07-16 15:00:00,155.91,155.93,155.69,155.69,1983.029,900,0.219039,0.222679,...,154.977555,True,156.426560,154.714806,False,3.796005,-0.057807,1,1,1
2881,2881,2024-07-16 15:01:00,155.70,155.78,155.60,155.71,1419.077,901,0.211231,0.218411,...,154.977555,True,156.426560,154.633844,False,3.782673,-0.070644,1,1,1
7106,7106,2024-07-19 13:26:00,162.82,162.85,162.69,162.70,1820.981,806,0.233500,0.250055,...,161.835376,False,163.937498,161.602502,False,3.773817,-0.258144,0,1,1
7107,7107,2024-07-19 13:27:00,162.69,162.78,162.53,162.72,2155.233,807,0.236800,0.250049,...,161.719069,False,163.838998,161.471002,False,3.761062,-0.270403,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8522,8522,2024-07-20 13:02:00,168.56,168.73,168.56,168.73,846.118,782,0.132566,0.128740,...,168.145976,False,169.307830,168.249485,True,0.000000,-0.865288,0,0,0
3519,3519,2024-07-17 01:39:00,163.03,163.15,163.03,163.15,619.675,99,0.219354,0.251539,...,162.673690,True,164.186768,162.223585,True,0.000000,-1.121667,0,1,1
1075,1075,2024-07-15 08:55:00,152.72,153.06,152.72,153.06,1988.077,535,0.266985,0.238983,...,152.140127,False,153.993659,151.555073,False,0.000000,-0.672939,1,1,1
7594,7594,2024-07-19 21:34:00,169.14,169.29,169.14,169.29,1151.051,1294,0.174188,0.174318,...,168.489909,False,170.016173,168.344062,False,0.000000,-0.886054,0,0,0


In [None]:
# 'max_return_60min' 값이 1 이상이고 'prediction' 값이 0인 데이터의 개수
count_max_return_ge_1_prediction_0 = len(data_test[(data_test['max_return_60min'] >= 1) & (data_test['prediction_LightGBM'] == 1)])

# 'max_return_60min' 값이 1 미만이고 'prediction' 값이 1인 데이터의 개수
count_max_return_lt_1_prediction_1 = len(data_test[(data_test['max_return_60min'] < 1) & (data_test['prediction_LightGBM'] == 0)])

print(f"max_return_60min이 1 이상인데 prediction이 0인 데이터의 비율: {count_max_return_ge_1_prediction_0/len(data_test)*100}")
print(f"max_return_60min이 1 미만인데 prediction이 1인 데이터의 비율: {count_max_return_lt_1_prediction_1/len(data_test)*100}")

max_return_60min이 1 이상인데 prediction이 0인 데이터의 비율: 16.294410369170233
max_return_60min이 1 미만인데 prediction이 1인 데이터의 비율: 24.661497511862052


######################

**RandomForestClassifier**
######################

In [None]:
from sklearn.ensemble import RandomForestClassifier

# 학습 데이터와 검증 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

# # 모델 학습
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train.reshape(X_train.shape[0], -1), y_train)

# 예측
y_pred_rf = rf_model.predict(X_test.reshape(X_test.shape[0], -1))

# 평가
accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf, zero_division=1)
recall_rf = recall_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)

print(f"Random Forest - Accuracy: {accuracy_rf:.3f}")
print(f"Random Forest - Precision: {precision_rf:.3f}")
print(f"Random Forest - Recall: {recall_rf:.3f}")
print(f"Random Forest - F1 Score: {f1_rf:.3f}")

# 교차 검증 평가
accuracy_cv = cross_val_score(rf_model, X_train, y_train, cv=5, scoring='accuracy')
precision_cv = cross_val_score(rf_model, X_train, y_train, cv=5, scoring='precision')
recall_cv = cross_val_score(rf_model, X_train, y_train, cv=5, scoring='recall')
f1_cv = cross_val_score(rf_model, X_train, y_train, cv=5, scoring='f1')

print(f"Cross-Validated Accuracy: {np.mean(accuracy_cv):.3f}")
print(f"Cross-Validated Precision: {np.mean(precision_cv):.3f}")
print(f"Cross-Validated Recall: {np.mean(recall_cv):.3f}")
print(f"Cross-Validated F1 Score: {np.mean(f1_cv):.3f}")

X_train shape: (495071, 62)
X_test shape: (123768, 62)
y_train shape: (495071,)
y_test shape: (123768,)
Random Forest - Accuracy: 0.959
Random Forest - Precision: 0.956
Random Forest - Recall: 0.936
Random Forest - F1 Score: 0.946


In [None]:
# ExtraTreesClassifier 모델 저장
model_path = '/content/drive/MyDrive/Data/SO60_SMALL_RandomForestClassifier_model.joblib'
joblib.dump(rf_model, model_path)

print(f"Model saved to {model_path}")

In [None]:
#2번째 계정 전용
# ExtraTreesClassifier 모델 저장
model_path = '/content/drive/MyDrive/SO60_SMALL_RandomForestClassifierr_model2.joblib'
joblib.dump(rf_model, model_path)

print(f"Model saved to {model_path}")

In [None]:
# SHAP 값 계산 및 시각화
explainer = shap.Explainer(rf_model)
shap_values = explainer(X_test.reshape(X_test.shape[0], -1))
shap.summary_plot(shap_values, X_test.reshape(X_test.shape[0], -1), feature_names=X.columns)

In [None]:
# 모델 로딩
model_path_load = '/content/drive/MyDrive/Data/SO60_SMALL_RandomForestClassifierr_model2.joblib'

rf_model = joblib.load(model_path_load)

print("Model loaded successfully")

Model loaded successfully


In [None]:
# 예측 수행
predictions = rf_model.predict(data_test_predict_scaled)

# 예측 결과를 데이터프레임에 추가
data_test['prediction_rf'] = predictions

In [None]:
data_test_non_nan = data_test.dropna(subset=['prediction_rf'])

# max_return_60min 기준으로 내림차순 정렬
data_test_sorted = data_test_non_nan.sort_values(by='max_return_60min', ascending=False)

# 결과 출력
data_test_sorted

Unnamed: 0.1,Unnamed: 0,open_time,open,high,low,close,volume,time,atr_5,atr_10,...,supertrend_in_uptrend_20_4_50,supertrend_upper_50_5_5,supertrend_lower_50_5_5,supertrend_in_uptrend_50_5_5,max_return_60min,min_return_60min,prediction_xgb,prediction_cat,prediction_LightGBM,prediction_rf
7109,7109,2024-07-19 13:29:00,162.75,162.80,162.59,162.63,1289.717,809,0.214352,0.235240,...,False,163.766759,161.623241,False,3.818484,-0.215212,0,1,1,1
2880,2880,2024-07-16 15:00:00,155.91,155.93,155.69,155.69,1983.029,900,0.219039,0.222679,...,True,156.426560,154.714806,False,3.796005,-0.057807,1,1,1,1
2881,2881,2024-07-16 15:01:00,155.70,155.78,155.60,155.71,1419.077,901,0.211231,0.218411,...,True,156.426560,154.633844,False,3.782673,-0.070644,1,1,1,1
7106,7106,2024-07-19 13:26:00,162.82,162.85,162.69,162.70,1820.981,806,0.233500,0.250055,...,False,163.937498,161.602502,False,3.773817,-0.258144,0,1,1,1
7107,7107,2024-07-19 13:27:00,162.69,162.78,162.53,162.72,2155.233,807,0.236800,0.250049,...,False,163.838998,161.471002,False,3.761062,-0.270403,0,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8522,8522,2024-07-20 13:02:00,168.56,168.73,168.56,168.73,846.118,782,0.132566,0.128740,...,False,169.307830,168.249485,True,0.000000,-0.865288,0,0,0,0
3519,3519,2024-07-17 01:39:00,163.03,163.15,163.03,163.15,619.675,99,0.219354,0.251539,...,True,164.186768,162.223585,True,0.000000,-1.121667,0,1,1,1
1075,1075,2024-07-15 08:55:00,152.72,153.06,152.72,153.06,1988.077,535,0.266985,0.238983,...,False,153.993659,151.555073,False,0.000000,-0.672939,1,1,1,1
7594,7594,2024-07-19 21:34:00,169.14,169.29,169.14,169.29,1151.051,1294,0.174188,0.174318,...,False,170.016173,168.344062,False,0.000000,-0.886054,0,0,0,0


In [None]:
# 'max_return_60min' 값이 1 이상이고 'prediction' 값이 0인 데이터의 개수
count_max_return_ge_1_prediction_0 = len(data_test[(data_test['max_return_60min'] >= 1) & (data_test['prediction_rf'] == 1)])

# 'max_return_60min' 값이 1 미만이고 'prediction' 값이 1인 데이터의 개수
count_max_return_lt_1_prediction_1 = len(data_test[(data_test['max_return_60min'] < 1) & (data_test['prediction_rf'] == 0)])

print(f"max_return_60min이 1 이상인데 prediction이 0인 데이터의 비율: {count_max_return_ge_1_prediction_0/len(data_test)*100}")
print(f"max_return_60min이 1 미만인데 prediction이 1인 데이터의 비율: {count_max_return_lt_1_prediction_1/len(data_test)*100}")

max_return_60min이 1 이상인데 prediction이 0인 데이터의 비율: 16.294410369170233
max_return_60min이 1 미만인데 prediction이 1인 데이터의 비율: 24.661497511862052


######################

**GradientBoostingClassifier**
######################

In [None]:
from sklearn.ensemble import GradientBoostingClassifier


# 학습 데이터와 테스트 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Gradient Boosting 모델 정의
gb_model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.05, max_depth=6, random_state=42)

# 모델 학습
gb_model.fit(X_train.reshape(X_train.shape[0], -1), y_train)

# 예측
y_pred_gb = gb_model.predict(X_test.reshape(X_test.shape[0], -1))

# 평가
accuracy_gb = accuracy_score(y_test, y_pred_gb)
precision_gb = precision_score(y_test, y_pred_gb, zero_division=1)
recall_gb = recall_score(y_test, y_pred_gb)
f1_gb = f1_score(y_test, y_pred_gb)

print(f"Gradient Boosting - Accuracy: {accuracy_gb:.3f}")
print(f"Gradient Boosting - Precision: {precision_gb:.3f}")
print(f"Gradient Boosting - Recall: {recall_gb:.3f}")
print(f"Gradient Boosting - F1 Score: {f1_gb:.3f}")

# 교차 검증 평가
accuracy_cv = cross_val_score(gb_model, X_train, y_train, cv=5, scoring='accuracy')
precision_cv = cross_val_score(gb_model, X_train, y_train, cv=5, scoring='precision')
recall_cv = cross_val_score(gb_model, X_train, y_train, cv=5, scoring='recall')
f1_cv = cross_val_score(gb_model, X_train, y_train, cv=5, scoring='f1')

print(f"Cross-Validated Accuracy: {np.mean(accuracy_cv):.3f}")
print(f"Cross-Validated Precision: {np.mean(precision_cv):.3f}")
print(f"Cross-Validated Recall: {np.mean(recall_cv):.3f}")
print(f"Cross-Validated F1 Score: {np.mean(f1_cv):.3f}")

In [None]:
# ExtraTreesClassifier 모델 저장
model_path = '/content/drive/MyDrive/Data/SO60_SMALL_GradientBoostingClassifier_model.joblib'
joblib.dump(gb_model, model_path)

print(f"Model saved to {model_path}")

In [None]:
#2번째 계정 전용
# ExtraTreesClassifier 모델 저장
model_path = '/content/drive/MyDrive/SO60_SMALL_GradientBoostingClassifier_model2.joblib'
joblib.dump(gb_model, model_path)

print(f"Model saved to {model_path}")

In [None]:
# SHAP 값 계산 및 시각화
explainer = shap.Explainer(gb_model)
shap_values = explainer(X_test.reshape(X_test.shape[0], -1))
shap.summary_plot(shap_values, X_test.reshape(X_test.shape[0], -1), feature_names=X.columns)

In [None]:
# ExtraTreesClassifier 모델 로딩
model_path_load = '/content/drive/MyDrive/Data/SO60_SMALL_GradientBoostingClassifier_model.joblib'

gb_model = joblib.load(model_path_load)

print("Model loaded successfully")

Model loaded successfully


In [None]:
# 예측 수행
predictions = gb_model.predict(data_test_predict_scaled)

# 예측 결과를 데이터프레임에 추가
data_test['prediction_gb'] = predictions

In [None]:
data_test_non_nan = data_test.dropna(subset=['prediction_gb'])

# max_return_60min 기준으로 내림차순 정렬
data_test_sorted = data_test_non_nan.sort_values(by='max_return_60min', ascending=False)

# 결과 출력
data_test_sorted

Unnamed: 0.1,Unnamed: 0,open_time,open,high,low,close,volume,time,atr_5,atr_10,...,supertrend_in_uptrend_14_2_10,supertrend_upper_20_4_50,supertrend_lower_20_4_50,supertrend_in_uptrend_20_4_50,supertrend_upper_50_5_5,supertrend_lower_50_5_5,supertrend_in_uptrend_50_5_5,max_return_60min,min_return_60min,prediction_gb
1330056,1330056,2023-07-14 00:28:00,25.71,25.73,25.69,25.70,9232.62,28,0.093703,0.091990,...,False,25.953513,25.428957,False,26.178513,25.241487,False,25.019455,0.000000,1
1330054,1330054,2023-07-14 00:26:00,25.79,25.79,25.64,25.70,13705.60,26,0.098910,0.093075,...,False,25.992870,25.437130,False,26.209552,25.220448,False,25.019455,-0.389105,1
1330055,1330055,2023-07-14 00:27:00,25.70,25.74,25.60,25.72,17370.59,27,0.107128,0.097767,...,False,25.953513,25.386487,False,26.205642,25.134358,False,24.922240,-0.116641,1
1330053,1330053,2023-07-14 00:25:00,25.90,25.91,25.78,25.78,5433.67,25,0.086138,0.086750,...,False,26.116296,25.573704,False,26.275690,25.727413,True,24.631497,-0.698216,1
1330040,1330040,2023-07-14 00:12:00,25.78,25.82,25.77,25.78,4471.04,12,0.070943,0.070237,...,False,26.009573,25.561118,False,26.149717,25.727413,True,24.631497,-0.698216,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1335376,1335376,2023-07-17 17:08:00,26.63,26.69,26.62,26.66,14424.84,1028,0.074813,0.070265,...,False,26.913115,26.396885,False,26.991015,26.280934,False,-0.075019,-3.038260,1
1337589,1337589,2023-07-19 06:01:00,26.51,26.51,26.48,26.50,1205.26,361,0.035754,0.033494,...,True,26.633162,26.364355,True,26.673772,26.355043,True,-0.075472,-1.320755,1
1336954,1336954,2023-07-18 19:26:00,25.51,25.52,25.47,25.51,2907.05,1166,0.042682,0.045614,...,False,25.681383,25.308387,False,25.704786,25.281590,False,-0.078401,-1.019208,1
1346065,1346065,2023-07-25 03:17:00,23.46,23.47,23.46,23.46,281.27,197,0.017135,0.019364,...,False,23.553539,23.417285,True,23.550674,23.399324,True,-0.085251,-0.596760,1


In [None]:
# 'max_return_60min' 값이 1 이상이고 'prediction' 값이 0인 데이터의 개수
count_max_return_ge_1_prediction_0 = len(data_test[(data_test['max_return_60min'] >= 1) & (data_test['prediction_gb'] == 1)])

# 'max_return_60min' 값이 1 미만이고 'prediction' 값이 1인 데이터의 개수
count_max_return_lt_1_prediction_1 = len(data_test[(data_test['max_return_60min'] < 1) & (data_test['prediction_gb'] == 0)])

print(f"max_return_60min이 1 이상인데 prediction이 0인 데이터의 비율: {count_max_return_ge_1_prediction_0/len(data_test)*100}")
print(f"max_return_60min이 1 미만인데 prediction이 1인 데이터의 비율: {count_max_return_lt_1_prediction_1/len(data_test)*100}")

max_return_60min이 1 이상인데 prediction이 0인 데이터의 비율: 17.787710291624855
max_return_60min이 1 미만인데 prediction이 1인 데이터의 비율: 21.013892053396418


######################

**AdaBoostClassifier**
######################

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# 학습 데이터와 테스트 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# AdaBoost 모델 정의
ab_model = AdaBoostClassifier(n_estimators=100, learning_rate=0.05, random_state=42)

# 모델 학습
ab_model.fit(X_train.reshape(X_train.shape[0], -1), y_train)

# 예측
y_pred_ab = ab_model.predict(X_test.reshape(X_test.shape[0], -1))

# 평가
accuracy_ab = accuracy_score(y_test, y_pred_ab)
precision_ab = precision_score(y_test, y_pred_ab, zero_division=1)
recall_ab = recall_score(y_test, y_pred_ab)
f1_ab = f1_score(y_test, y_pred_ab)

print(f"AdaBoost - Accuracy: {accuracy_ab:.3f}")
print(f"AdaBoost - Precision: {precision_ab:.3f}")
print(f"AdaBoost - Recall: {recall_ab:.3f}")
print(f"AdaBoost - F1 Score: {f1_ab:.3f}")

# 교차 검증 평가
accuracy_cv = cross_val_score(ab_model, X_train, y_train, cv=5, scoring='accuracy')
precision_cv = cross_val_score(ab_model, X_train, y_train, cv=5, scoring='precision')
recall_cv = cross_val_score(ab_model, X_train, y_train, cv=5, scoring='recall')
f1_cv = cross_val_score(ab_model, X_train, y_train, cv=5, scoring='f1')

print(f"Cross-Validated Accuracy: {np.mean(accuracy_cv):.3f}")
print(f"Cross-Validated Precision: {np.mean(precision_cv):.3f}")
print(f"Cross-Validated Recall: {np.mean(recall_cv):.3f}")
print(f"Cross-Validated F1 Score: {np.mean(f1_cv):.3f}")

AdaBoost - Accuracy: 0.654
AdaBoost - Precision: 0.615
AdaBoost - Recall: 0.268
AdaBoost - F1 Score: 0.373


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-15-aa112f0fc05b>", line 27, in <cell line: 27>
    accuracy_cv = cross_val_score(ab_model, X_train, y_train, cv=5, scoring='accuracy')
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 515, in cross_val_score
    cv_results = cross_validate(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 266, in cross_validate
    results = parallel(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/parallel.py", line 63, in __call__
    return super().__call__(iterable_with_config)
  File "/usr/local/lib/python3.10/dist-packages/joblib/parallel.py", line 1918, in __call__
    return output if self.return_generator else list(output)
  File "/usr/local/lib/python3.10/dist-packages/joblib/

TypeError: object of type 'NoneType' has no len()

In [None]:
# ExtraTreesClassifier 모델 저장
model_path = '/content/drive/MyDrive/Data/SO60_SMALL_AdaBoostClassifier_model.joblib'
joblib.dump(ab_model, model_path)

print(f"Model saved to {model_path}")

In [None]:
#2번째 계정 전용
# ExtraTreesClassifier 모델 저장
model_path = '/content/drive/MyDrive/SO60_SMALL_AdaBoostClassifier_model2.joblib'
joblib.dump(ab_model, model_path)

print(f"Model saved to {model_path}")

Model saved to /content/drive/MyDrive/SO60_SMALL_AdaBoostClassifier_model2.joblib


In [None]:
# SHAP 값 계산 및 시각화
explainer = shap.Explainer(ab_model)
shap_values = explainer(X_test.reshape(X_test.shape[0], -1))
shap.summary_plot(shap_values, X_test.reshape(X_test.shape[0], -1), feature_names=X.columns)

In [None]:
# ExtraTreesClassifier 모델 로딩
model_path_load = '/content/drive/MyDrive/Data/SO60_SMALL_AdaBoostClassifier_model.joblib'

ab_model = joblib.load(model_path_load)

print("Model loaded successfully")

Model loaded successfully


In [None]:
# 예측 수행
predictions = ab_model.predict(data_test_predict_scaled)

# 예측 결과를 데이터프레임에 추가
data_test['prediction_ab'] = predictions

In [None]:
data_test_non_nan = data_test.dropna(subset=['prediction_ab'])

# max_return_60min 기준으로 내림차순 정렬
data_test_sorted = data_test_non_nan.sort_values(by='max_return_60min', ascending=False)

# 결과 출력
data_test_sorted

Unnamed: 0.1,Unnamed: 0,open_time,open,high,low,close,volume,time,atr_5,atr_10,...,supertrend_in_uptrend_14_2_10,supertrend_upper_20_4_50,supertrend_lower_20_4_50,supertrend_in_uptrend_20_4_50,supertrend_upper_50_5_5,supertrend_lower_50_5_5,supertrend_in_uptrend_50_5_5,max_return_60min,min_return_60min,prediction_ab
7109,7109,2024-07-19 13:29:00,162.75,162.80,162.59,162.63,1289.717,809,0.214352,0.235240,...,False,163.590931,161.769140,False,163.766759,161.623241,False,3.818484,-0.215212,0
2880,2880,2024-07-16 15:00:00,155.91,155.93,155.69,155.69,1983.029,900,0.219039,0.222679,...,True,156.862755,154.977555,True,156.426560,154.714806,False,3.796005,-0.057807,1
2881,2881,2024-07-16 15:01:00,155.70,155.78,155.60,155.71,1419.077,901,0.211231,0.218411,...,True,156.736100,154.977555,True,156.426560,154.633844,False,3.782673,-0.070644,1
7106,7106,2024-07-19 13:26:00,162.82,162.85,162.69,162.70,1820.981,806,0.233500,0.250055,...,False,163.704624,161.835376,False,163.937498,161.602502,False,3.773817,-0.258144,0
7107,7107,2024-07-19 13:27:00,162.69,162.78,162.53,162.72,2155.233,807,0.236800,0.250049,...,False,163.590931,161.719069,False,163.838998,161.471002,False,3.761062,-0.270403,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8522,8522,2024-07-20 13:02:00,168.56,168.73,168.56,168.73,846.118,782,0.132566,0.128740,...,True,168.816198,168.145976,False,169.307830,168.249485,True,0.000000,-0.865288,0
3519,3519,2024-07-17 01:39:00,163.03,163.15,163.03,163.15,619.675,99,0.219354,0.251539,...,True,164.098585,162.673690,True,164.186768,162.223585,True,0.000000,-1.121667,0
1075,1075,2024-07-15 08:55:00,152.72,153.06,152.72,153.06,1988.077,535,0.266985,0.238983,...,False,153.399907,152.140127,False,153.993659,151.555073,False,0.000000,-0.672939,0
7594,7594,2024-07-19 21:34:00,169.14,169.29,169.14,169.29,1151.051,1294,0.174188,0.174318,...,False,169.778412,168.489909,False,170.016173,168.344062,False,0.000000,-0.886054,0


In [None]:
# 'max_return_60min' 값이 1 이상이고 'prediction' 값이 0인 데이터의 개수
count_max_return_ge_1_prediction_0 = len(data_test[(data_test['max_return_60min'] >= 1) & (data_test['prediction_ab'] == 1)])

# 'max_return_60min' 값이 1 미만이고 'prediction' 값이 1인 데이터의 개수
count_max_return_lt_1_prediction_1 = len(data_test[(data_test['max_return_60min'] < 1) & (data_test['prediction_ab'] == 0)])

print(f"max_return_60min이 1 이상인데 prediction이 0인 데이터의 비율: {count_max_return_ge_1_prediction_0/len(data_test)*100}")
print(f"max_return_60min이 1 미만인데 prediction이 1인 데이터의 비율: {count_max_return_lt_1_prediction_1/len(data_test)*100}")

max_return_60min이 1 이상인데 prediction이 0인 데이터의 비율: 7.996759634301585
max_return_60min이 1 미만인데 prediction이 1인 데이터의 비율: 53.43131581992825


######################

**ExtraTreesClassifier**
######################

In [None]:
#신규 ExtraTreesClassifier 모델 학습 및 예측 및 결과 검증

from sklearn.ensemble import ExtraTreesClassifier

# 학습 데이터와 테스트 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# 모델 학습
et_model = ExtraTreesClassifier(random_state=42)
et_model.fit(X_train, y_train)

# 예측
y_pred_et = et_model.predict(X_test)

# 평가
accuracy_et = accuracy_score(y_test, y_pred_et)
precision_et = precision_score(y_test, y_pred_et, zero_division=1)
recall_et = recall_score(y_test, y_pred_et)
f1_et = f1_score(y_test, y_pred_et)

print(f"Extra Trees - Accuracy: {accuracy_et:.3f}")
print(f"Extra Trees - Precision: {precision_et:.3f}")
print(f"Extra Trees - Recall: {recall_et:.3f}")
print(f"Extra Trees - F1 Score: {f1_et:.3f}")

# 교차 검증 평가
accuracy_cv = cross_val_score(et_model, X_train, y_train, cv=5, scoring='accuracy')
precision_cv = cross_val_score(et_model, X_train, y_train, cv=5, scoring='precision')
recall_cv = cross_val_score(et_model, X_train, y_train, cv=5, scoring='recall')
f1_cv = cross_val_score(et_model, X_train, y_train, cv=5, scoring='f1')

print(f"Cross-Validated Accuracy: {np.mean(accuracy_cv):.3f}")
print(f"Cross-Validated Precision: {np.mean(precision_cv):.3f}")
print(f"Cross-Validated Recall: {np.mean(recall_cv):.3f}")
print(f"Cross-Validated F1 Score: {np.mean(f1_cv):.3f}")

Extra Trees - Accuracy: 0.963
Extra Trees - Precision: 0.957
Extra Trees - Recall: 0.945
Extra Trees - F1 Score: 0.951
Cross-Validated Accuracy: 0.958
Cross-Validated Precision: 0.953
Cross-Validated Recall: 0.938
Cross-Validated F1 Score: 0.945


In [None]:
# ExtraTreesClassifier 모델 저장
model_path = '/content/drive/MyDrive/Data/SO60_SMALL_ExtraTreesClassifier_model.joblib'
joblib.dump(et_model, model_path)

print(f"Model saved to {model_path}")

Model saved to /content/drive/MyDrive/Data/SO60_SMALL_ExtraTreesClassifier_model.joblib


In [None]:
#2번째 계정 전용
# ExtraTreesClassifier 모델 저장
model_path = '/content/drive/MyDrive/SO60_SMALL_ExtraTreesClassifier_model2.joblib'
joblib.dump(et_model, model_path)

print(f"Model saved to {model_path}")

Model saved to /content/drive/MyDrive/SO60_SMALL_ExtraTreesClassifier_model2.joblib


In [None]:
# 학습 데이터와 테스트 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# SHAP 값 계산 및 시각화
explainer = shap.Explainer(et_model)
shap_values = explainer(X_test.reshape(X_test.shape[0], -1))
shap.summary_plot(shap_values, X_test.reshape(X_test.shape[0], -1), feature_names=X.columns)

In [None]:
# ExtraTreesClassifier 모델 로딩
model_path_load = '/content/drive/MyDrive/Data/SO60_SMALL_ExtraTreesClassifier_model2.joblib'

et_model = joblib.load(model_path_load)

print("Model loaded successfully")


Model loaded successfully


In [None]:
#2번째 계정 전용
# ExtraTreesClassifier 모델 로딩
model_path_load = '/content/drive/MyDrive/SO60_SMALL_ExtraTreesClassifier_model2.joblib'

et_model = joblib.load(model_path_load)

print("Model loaded successfully")

In [None]:
# 로드된 ExtraTreesClassifier 모델 예측 및 결과 검증

# 학습 데이터와 테스트 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# 예측
y_pred_et = et_model.predict(X_test.reshape(X_test.shape[0], -1))

# 평가
accuracy_et = accuracy_score(y_test, y_pred_et)
precision_et = precision_score(y_test, y_pred_et, zero_division=1)
recall_et = recall_score(y_test, y_pred_et)
f1_et = f1_score(y_test, y_pred_et)

print(f"Extra Trees - Accuracy: {accuracy_et:.3f}")
print(f"Extra Trees - Precision: {precision_et:.3f}")
print(f"Extra Trees - Recall: {recall_et:.3f}")
print(f"Extra Trees - F1 Score: {f1_et:.3f}")

# 교차 검증 평가
X_reshaped = X_train.reshape(X_train.shape[0], -1)
accuracy_cv = cross_val_score(et_model, X_reshaped, y_train, cv=5, scoring='accuracy')
precision_cv = cross_val_score(et_model, X_reshaped, y_train, cv=5, scoring='precision')
recall_cv = cross_val_score(et_model, X_reshaped, y_train, cv=5, scoring='recall')
f1_cv = cross_val_score(et_model, X_reshaped, y_train, cv=5, scoring='f1')

print(f"Cross-Validated Accuracy: {np.mean(accuracy_cv):.3f}")
print(f"Cross-Validated Precision: {np.mean(precision_cv):.3f}")
print(f"Cross-Validated Recall: {np.mean(recall_cv):.3f}")
print(f"Cross-Validated F1 Score: {np.mean(f1_cv):.3f}")

In [None]:
# 예측 수행
predictions = et_model.predict(data_test_predict_scaled)

# 예측 결과를 데이터프레임에 추가
data_test['prediction'] = predictions

In [None]:
data_test_non_nan = data_test.dropna(subset=['prediction'])

# max_return_60min 기준으로 내림차순 정렬
data_test_sorted = data_test_non_nan.sort_values(by='max_return_60min', ascending=False)

# 결과 출력
data_test_sorted

Unnamed: 0.1,Unnamed: 0,open_time,open,high,low,close,volume,time,atr_5,atr_10,...,supertrend_lower_20_4_50,supertrend_in_uptrend_20_4_50,supertrend_upper_50_5_5,supertrend_lower_50_5_5,supertrend_in_uptrend_50_5_5,max_return_60min,min_return_60min,prediction_gb,prediction_ab,prediction
1330056,1330056,2023-07-14 00:28:00,25.71,25.73,25.69,25.70,9232.62,28,0.093703,0.091990,...,25.428957,False,26.178513,25.241487,False,25.019455,0.000000,1,1,1
1330054,1330054,2023-07-14 00:26:00,25.79,25.79,25.64,25.70,13705.60,26,0.098910,0.093075,...,25.437130,False,26.209552,25.220448,False,25.019455,-0.389105,1,1,1
1330055,1330055,2023-07-14 00:27:00,25.70,25.74,25.60,25.72,17370.59,27,0.107128,0.097767,...,25.386487,False,26.205642,25.134358,False,24.922240,-0.116641,1,1,1
1330053,1330053,2023-07-14 00:25:00,25.90,25.91,25.78,25.78,5433.67,25,0.086138,0.086750,...,25.573704,False,26.275690,25.727413,True,24.631497,-0.698216,1,1,1
1330040,1330040,2023-07-14 00:12:00,25.78,25.82,25.77,25.78,4471.04,12,0.070943,0.070237,...,25.561118,False,26.149717,25.727413,True,24.631497,-0.698216,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1335376,1335376,2023-07-17 17:08:00,26.63,26.69,26.62,26.66,14424.84,1028,0.074813,0.070265,...,26.396885,False,26.991015,26.280934,False,-0.075019,-3.038260,1,1,1
1337589,1337589,2023-07-19 06:01:00,26.51,26.51,26.48,26.50,1205.26,361,0.035754,0.033494,...,26.364355,True,26.673772,26.355043,True,-0.075472,-1.320755,1,0,1
1336954,1336954,2023-07-18 19:26:00,25.51,25.52,25.47,25.51,2907.05,1166,0.042682,0.045614,...,25.308387,False,25.704786,25.281590,False,-0.078401,-1.019208,1,0,1
1346065,1346065,2023-07-25 03:17:00,23.46,23.47,23.46,23.46,281.27,197,0.017135,0.019364,...,23.417285,True,23.550674,23.399324,True,-0.085251,-0.596760,1,0,1


In [None]:
# 'max_return_60min' 값이 1 이상이고 'prediction' 값이 0인 데이터의 개수
count_max_return_ge_1_prediction_0 = len(data_test[(data_test['max_return_60min'] >= 1) & (data_test['prediction'] == 1)])

# 'max_return_60min' 값이 1 미만이고 'prediction' 값이 1인 데이터의 개수
count_max_return_lt_1_prediction_1 = len(data_test[(data_test['max_return_60min'] < 1) & (data_test['prediction'] == 0)])

print(f"max_return_60min이 1 이상인데 prediction이 0인 데이터의 비율: {count_max_return_ge_1_prediction_0/len(data_test)*100}")
print(f"max_return_60min이 1 미만인데 prediction이 1인 데이터의 비율: {count_max_return_lt_1_prediction_1/len(data_test)*100}")

max_return_60min이 1 이상인데 prediction이 0인 데이터의 비율: 15.969963302485446
max_return_60min이 1 미만인데 prediction이 1인 데이터의 비율: 26.056851253064185


######################

**tensorflow**
######################

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
import tensorflow as tf

# 학습 데이터와 검증 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

# MLP 모델 정의
model = Sequential()
model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

# 모델 컴파일
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# 모델 학습
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test))

# 모델 평가
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test accuracy: {accuracy}")

# 로드된 모델로 예측
y_pred_loaded = (model.predict(X_test) > 0.5).astype("int32")

# 로드된 모델 평가
accuracy_loaded = accuracy_score(y_test, y_pred_loaded)
precision_loaded = precision_score(y_test, y_pred_loaded, zero_division=1)
recall_loaded = recall_score(y_test, y_pred_loaded)
f1_loaded = f1_score(y_test, y_pred_loaded)

print(f"Loaded MLP - Accuracy: {accuracy_loaded:.3f}")
print(f"Loaded MLP - Precision: {precision_loaded:.3f}")
print(f"Loaded MLP - Recall: {recall_loaded:.3f}")
print(f"Loaded MLP - F1 Score: {f1_loaded:.3f}")

X_train shape: (495071, 62)
X_test shape: (123768, 62)
y_train shape: (495071,)
y_test shape: (123768,)
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Test accuracy: 0.6775256991386414
Loaded MLP - Accuracy: 0.678
Loaded MLP - Precision: 0.644
Loaded MLP - Recall: 0.362
Loaded MLP - F1 Score: 0.463


In [None]:
# 교차 검증 평가
kf = KFold(n_splits=5, shuffle=True, random_state=42)
accuracy_cv = []
precision_cv = []
recall_cv = []
f1_cv = []

for train_index, test_index in kf.split(X_scaled):
    X_train_cv, X_test_cv = X_scaled[train_index], X_scaled[test_index]
    y_train_cv, y_test_cv = y[train_index], y[test_index]

    model_cv = Sequential()
    model_cv.add(Dense(64, input_dim=X_train_cv.shape[1], activation='relu'))
    model_cv.add(Dropout(0.5))
    model_cv.add(Dense(32, activation='relu'))
    model_cv.add(Dropout(0.5))
    model_cv.add(Dense(1, activation='sigmoid'))

    model_cv.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

    model_cv.fit(X_train_cv, y_train_cv, epochs=50, batch_size=32, verbose=0)

    y_pred_cv = (model_cv.predict(X_test_cv) > 0.5).astype("int32")

    accuracy_cv.append(accuracy_score(y_test_cv, y_pred_cv))
    precision_cv.append(precision_score(y_test_cv, y_pred_cv, zero_division=1))
    recall_cv.append(recall_score(y_test_cv, y_pred_cv))
    f1_cv.append(f1_score(y_test_cv, y_pred_cv))

print("MLP - 교차 검증 평가")
print(f"Cross-Validated Accuracy: {np.mean(accuracy_cv):.3f}")
print(f"Cross-Validated Precision: {np.mean(precision_cv):.3f}")
print(f"Cross-Validated Recall: {np.mean(recall_cv):.3f}")
print(f"Cross-Validated F1 Score: {np.mean(f1_cv):.3f}")

In [None]:
# ExtraTreesClassifier 모델 저장
model_path = '/content/drive/MyDrive/Data/SO60_SMALL_MLP_model.joblib'
joblib.dump(model, model_path)

print(f"Model saved to {model_path}")

Model saved to /content/drive/MyDrive/Data/SO60_SMALL_MLP_model.joblib


In [None]:
#2번째 계정 전용
# ExtraTreesClassifier 모델 저장
model_path = '/content/drive/MyDrive/SO60_SMALL_MLP_model.joblib'
joblib.dump(model, model_path)

print(f"Model saved to {model_path}")

In [None]:
# SHAP 값 계산 및 시각화
explainer = shap.Explainer(model)
shap_values = explainer(X_test.reshape(X_test.shape[0], -1))
shap.summary_plot(shap_values, X_test.reshape(X_test.shape[0], -1), feature_names=X.columns)

In [None]:
# 예측 수행
predictions = model.predict(data_test_predict_scaled)

# 예측 결과를 데이터프레임에 추가
data_test['prediction_MLP'] = predictions



In [None]:
data_test_non_nan = data_test.dropna(subset=['prediction_MLP'])

# max_return_60min 기준으로 내림차순 정렬
data_test_sorted = data_test_non_nan.sort_values(by='max_return_60min', ascending=False)

# 결과 출력
data_test_sorted

Unnamed: 0.1,Unnamed: 0,open_time,open,high,low,close,volume,time,atr_5,atr_10,...,supertrend_in_uptrend_14_2_10,supertrend_upper_20_4_50,supertrend_lower_20_4_50,supertrend_in_uptrend_20_4_50,supertrend_upper_50_5_5,supertrend_lower_50_5_5,supertrend_in_uptrend_50_5_5,max_return_60min,min_return_60min,prediction_MLP
1330056,1330056,2023-07-14 00:28:00,25.71,25.73,25.69,25.70,9232.62,28,0.093703,0.091990,...,False,25.953513,25.428957,False,26.178513,25.241487,False,25.019455,0.000000,1.000000e+00
1330054,1330054,2023-07-14 00:26:00,25.79,25.79,25.64,25.70,13705.60,26,0.098910,0.093075,...,False,25.992870,25.437130,False,26.209552,25.220448,False,25.019455,-0.389105,1.000000e+00
1330055,1330055,2023-07-14 00:27:00,25.70,25.74,25.60,25.72,17370.59,27,0.107128,0.097767,...,False,25.953513,25.386487,False,26.205642,25.134358,False,24.922240,-0.116641,1.000000e+00
1330053,1330053,2023-07-14 00:25:00,25.90,25.91,25.78,25.78,5433.67,25,0.086138,0.086750,...,False,26.116296,25.573704,False,26.275690,25.727413,True,24.631497,-0.698216,1.000000e+00
1330040,1330040,2023-07-14 00:12:00,25.78,25.82,25.77,25.78,4471.04,12,0.070943,0.070237,...,False,26.009573,25.561118,False,26.149717,25.727413,True,24.631497,-0.698216,9.999896e-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1335376,1335376,2023-07-17 17:08:00,26.63,26.69,26.62,26.66,14424.84,1028,0.074813,0.070265,...,False,26.913115,26.396885,False,26.991015,26.280934,False,-0.075019,-3.038260,1.000000e+00
1337589,1337589,2023-07-19 06:01:00,26.51,26.51,26.48,26.50,1205.26,361,0.035754,0.033494,...,True,26.633162,26.364355,True,26.673772,26.355043,True,-0.075472,-1.320755,5.923811e-07
1336954,1336954,2023-07-18 19:26:00,25.51,25.52,25.47,25.51,2907.05,1166,0.042682,0.045614,...,False,25.681383,25.308387,False,25.704786,25.281590,False,-0.078401,-1.019208,4.124725e-01
1346065,1346065,2023-07-25 03:17:00,23.46,23.47,23.46,23.46,281.27,197,0.017135,0.019364,...,False,23.553539,23.417285,True,23.550674,23.399324,True,-0.085251,-0.596760,8.284887e-23


In [None]:
# 'max_return_60min' 값이 1 이상이고 'prediction' 값이 0인 데이터의 개수
count_max_return_ge_1_prediction_0 = len(data_test[(data_test['max_return_60min'] >= 1.1) & (data_test['prediction_MLP'] == 1)])

# 'max_return_60min' 값이 1 미만이고 'prediction' 값이 1인 데이터의 개수
count_max_return_lt_1_prediction_1 = len(data_test[(data_test['max_return_60min'] < 1.1) & (data_test['prediction_MLP'] == 0)])

print(f"max_return_60min이 1 이상인데 prediction이 0인 데이터의 비율: {count_max_return_ge_1_prediction_0/len(data_test)*100}")
print(f"max_return_60min이 1 미만인데 prediction이 1인 데이터의 비율: {count_max_return_lt_1_prediction_1/len(data_test)*100}")

max_return_60min이 1 이상인데 prediction이 0인 데이터의 비율: 10.116620855370426
max_return_60min이 1 미만인데 prediction이 1인 데이터의 비율: 2.4413300303954317


######################

**Stochastic Gradient Boosting**
######################

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# 학습 데이터와 검증 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

# Stochastic Gradient Boosting 모델 생성 및 학습
sgb_model = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    subsample=0.8,  # 각 단계에서 80%의 샘플만 사용하여 무작위성 도입
    random_state=42
)
sgb_model.fit(X_train, y_train)

# 예측 수행
y_pred_prob = sgb_model.predict_proba(X_test)[:, 1]  # 클래스 1에 대한 확률
y_pred = (y_pred_prob > 0.5).astype(int)  # 임계값 0.5를 기준으로 0 또는 1로 변환

# 성능 평가
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')


X_train shape: (495071, 62)
X_test shape: (123768, 62)
y_train shape: (495071,)
y_test shape: (123768,)
Accuracy: 0.684094434748885
Precision: 0.6486109658754227
Recall: 0.39071095620039475
F1 Score: 0.48766297582388785


In [None]:
# ExtraTreesClassifier 모델 저장
model_path = '/content/drive/MyDrive/Data/SO60_SMALL_sgb_model.joblib'
joblib.dump(sgb_model, model_path)

print(f"Model saved to {model_path}")

Model saved to /content/drive/MyDrive/Data/SO60_SMALL_sgb_model.joblib


In [None]:
# 예측 수행
predictions = sgb_model.predict(data_test_predict_scaled)

# 예측 결과를 데이터프레임에 추가
data_test['prediction_sgb'] = predictions

In [None]:
data_test_non_nan = data_test.dropna(subset=['prediction_sgb'])

# max_return_60min 기준으로 내림차순 정렬
data_test_sorted = data_test_non_nan.sort_values(by='max_return_60min', ascending=False)

# 결과 출력
data_test_sorted

Unnamed: 0.1,Unnamed: 0,open_time,open,high,low,close,volume,time,atr_5,atr_10,...,supertrend_in_uptrend_14_2_10,supertrend_upper_20_4_50,supertrend_lower_20_4_50,supertrend_in_uptrend_20_4_50,supertrend_upper_50_5_5,supertrend_lower_50_5_5,supertrend_in_uptrend_50_5_5,max_return_60min,min_return_60min,prediction_sgb
7109,7109,2024-07-19 13:29:00,162.75,162.80,162.59,162.63,1289.717,809,0.214352,0.235240,...,False,163.590931,161.769140,False,163.766759,161.623241,False,3.818484,-0.215212,1
2880,2880,2024-07-16 15:00:00,155.91,155.93,155.69,155.69,1983.029,900,0.219039,0.222679,...,True,156.862755,154.977555,True,156.426560,154.714806,False,3.796005,-0.057807,1
2881,2881,2024-07-16 15:01:00,155.70,155.78,155.60,155.71,1419.077,901,0.211231,0.218411,...,True,156.736100,154.977555,True,156.426560,154.633844,False,3.782673,-0.070644,1
7106,7106,2024-07-19 13:26:00,162.82,162.85,162.69,162.70,1820.981,806,0.233500,0.250055,...,False,163.704624,161.835376,False,163.937498,161.602502,False,3.773817,-0.258144,1
7107,7107,2024-07-19 13:27:00,162.69,162.78,162.53,162.72,2155.233,807,0.236800,0.250049,...,False,163.590931,161.719069,False,163.838998,161.471002,False,3.761062,-0.270403,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8522,8522,2024-07-20 13:02:00,168.56,168.73,168.56,168.73,846.118,782,0.132566,0.128740,...,True,168.816198,168.145976,False,169.307830,168.249485,True,0.000000,-0.865288,0
3519,3519,2024-07-17 01:39:00,163.03,163.15,163.03,163.15,619.675,99,0.219354,0.251539,...,True,164.098585,162.673690,True,164.186768,162.223585,True,0.000000,-1.121667,1
1075,1075,2024-07-15 08:55:00,152.72,153.06,152.72,153.06,1988.077,535,0.266985,0.238983,...,False,153.399907,152.140127,False,153.993659,151.555073,False,0.000000,-0.672939,1
7594,7594,2024-07-19 21:34:00,169.14,169.29,169.14,169.29,1151.051,1294,0.174188,0.174318,...,False,169.778412,168.489909,False,170.016173,168.344062,False,0.000000,-0.886054,0


In [None]:
# 'max_return_60min' 값이 1 이상이고 'prediction' 값이 0인 데이터의 개수
count_max_return_ge_1_prediction_0 = len(data_test[(data_test['max_return_60min'] >= 1.1) & (data_test['prediction_sgb'] == 1)])

# 'max_return_60min' 값이 1 미만이고 'prediction' 값이 1인 데이터의 개수
count_max_return_lt_1_prediction_1 = len(data_test[(data_test['max_return_60min'] < 1.1) & (data_test['prediction_sgb'] == 0)])

print(f"max_return_60min이 1 이상인데 prediction이 0인 데이터의 비율: {count_max_return_ge_1_prediction_0/len(data_test)*100}")
print(f"max_return_60min이 1 미만인데 prediction이 1인 데이터의 비율: {count_max_return_lt_1_prediction_1/len(data_test)*100}")

max_return_60min이 1 이상인데 prediction이 0인 데이터의 비율: 13.435944913783127
max_return_60min이 1 미만인데 prediction이 1인 데이터의 비율: 25.47158893646569


######################

**Gradient Boosted Decision Trees (GBDT)**
######################

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# 학습 데이터와 검증 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

# Gradient Boosted Decision Trees (GBDT) 모델 생성 및 학습
gbdt_model = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42
)
gbdt_model.fit(X_train, y_train)

# 예측 수행
y_pred_prob = gbdt_model.predict_proba(X_test)[:, 1]  # 클래스 1에 대한 확률
y_pred = (y_pred_prob > 0.5).astype(int)  # 임계값 0.5를 기준으로 0 또는 1로 변환

# 성능 평가
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

X_train shape: (495071, 62)
X_test shape: (123768, 62)
y_train shape: (495071,)
y_test shape: (123768,)
Accuracy: 0.6843287441018681
Precision: 0.6498843809123397
Recall: 0.38947213706798806
F1 Score: 0.4870549312047054


In [None]:
# ExtraTreesClassifier 모델 저장
model_path = '/content/drive/MyDrive/Data/SO60_SMALL_gbdt_model.joblib'
joblib.dump(gbdt_model, model_path)

print(f"Model saved to {model_path}")

Model saved to /content/drive/MyDrive/Data/SO60_SMALL_gbdt_model.joblib


In [None]:
# 예측 수행
predictions = gbdt_model.predict(data_test_predict_scaled)

# 예측 결과를 데이터프레임에 추가
data_test['prediction_gbdt'] = predictions

In [None]:
data_test_non_nan = data_test.dropna(subset=['prediction_gbdt'])

# max_return_60min 기준으로 내림차순 정렬
data_test_sorted = data_test_non_nan.sort_values(by='max_return_60min', ascending=False)

# 결과 출력
data_test_sorted

Unnamed: 0.1,Unnamed: 0,open_time,open,high,low,close,volume,time,atr_5,atr_10,...,supertrend_upper_20_4_50,supertrend_lower_20_4_50,supertrend_in_uptrend_20_4_50,supertrend_upper_50_5_5,supertrend_lower_50_5_5,supertrend_in_uptrend_50_5_5,max_return_60min,min_return_60min,prediction_sgb,prediction_gbdt
7109,7109,2024-07-19 13:29:00,162.75,162.80,162.59,162.63,1289.717,809,0.214352,0.235240,...,163.590931,161.769140,False,163.766759,161.623241,False,3.818484,-0.215212,1,1
2880,2880,2024-07-16 15:00:00,155.91,155.93,155.69,155.69,1983.029,900,0.219039,0.222679,...,156.862755,154.977555,True,156.426560,154.714806,False,3.796005,-0.057807,1,1
2881,2881,2024-07-16 15:01:00,155.70,155.78,155.60,155.71,1419.077,901,0.211231,0.218411,...,156.736100,154.977555,True,156.426560,154.633844,False,3.782673,-0.070644,1,1
7106,7106,2024-07-19 13:26:00,162.82,162.85,162.69,162.70,1820.981,806,0.233500,0.250055,...,163.704624,161.835376,False,163.937498,161.602502,False,3.773817,-0.258144,1,1
7107,7107,2024-07-19 13:27:00,162.69,162.78,162.53,162.72,2155.233,807,0.236800,0.250049,...,163.590931,161.719069,False,163.838998,161.471002,False,3.761062,-0.270403,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8522,8522,2024-07-20 13:02:00,168.56,168.73,168.56,168.73,846.118,782,0.132566,0.128740,...,168.816198,168.145976,False,169.307830,168.249485,True,0.000000,-0.865288,0,0
3519,3519,2024-07-17 01:39:00,163.03,163.15,163.03,163.15,619.675,99,0.219354,0.251539,...,164.098585,162.673690,True,164.186768,162.223585,True,0.000000,-1.121667,1,1
1075,1075,2024-07-15 08:55:00,152.72,153.06,152.72,153.06,1988.077,535,0.266985,0.238983,...,153.399907,152.140127,False,153.993659,151.555073,False,0.000000,-0.672939,1,1
7594,7594,2024-07-19 21:34:00,169.14,169.29,169.14,169.29,1151.051,1294,0.174188,0.174318,...,169.778412,168.489909,False,170.016173,168.344062,False,0.000000,-0.886054,0,0


In [None]:
# 'max_return_60min' 값이 1 이상이고 'prediction' 값이 0인 데이터의 개수
count_max_return_ge_1_prediction_0 = len(data_test[(data_test['max_return_60min'] >= 1.1) & (data_test['prediction_gbdt'] == 1)])

# 'max_return_60min' 값이 1 미만이고 'prediction' 값이 1인 데이터의 개수
count_max_return_lt_1_prediction_1 = len(data_test[(data_test['max_return_60min'] < 1.1) & (data_test['prediction_gbdt'] == 0)])

print(f"max_return_60min이 1 이상인데 prediction이 0인 데이터의 비율: {count_max_return_ge_1_prediction_0/len(data_test)*100}")
print(f"max_return_60min이 1 미만인데 prediction이 1인 데이터의 비율: {count_max_return_lt_1_prediction_1/len(data_test)*100}")

max_return_60min이 1 이상인데 prediction이 0인 데이터의 비율: 13.459090383057518
max_return_60min이 1 미만인데 prediction이 1인 데이터의 비율: 24.04814257609073


######################

**DART (Dropouts meet Multiple Additive Regression Trees)**
######################

In [None]:
import lightgbm as lgb

# 학습 데이터와 검증 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

# LightGBM 데이터셋 생성
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

# LightGBM 하이퍼파라미터 설정 (DART 사용)
params = {
    'boosting_type': 'dart',  # DART 사용
    'objective': 'binary',
    'metric': ['binary_logloss', 'auc'],
    'num_leaves': 31,
    'learning_rate': 0.1,
    'feature_fraction': 0.9,
    'drop_rate': 0.1,  # Dropout rate
    'max_drop': 10,  # 최대 drop 수
    'skip_drop': 0.5,  # Dropout을 스킵할 확률
    'verbose': -1,
    'random_state': 42
}

# 모델 학습 (조기 종료 없이)
lgb_dart = lgb.train(
    params,
    lgb_train,
    num_boost_round=100,
    valid_sets=lgb_eval
)

# 예측 수행
y_pred_prob = lgb_dart.predict(X_test, num_iteration=lgb_dart.best_iteration)
y_pred = (y_pred_prob > 0.5).astype(int)

# 성능 평가
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

X_train shape: (495071, 62)
X_test shape: (123768, 62)
y_train shape: (495071,)
y_test shape: (123768,)
Accuracy: 0.6913337857927736
Precision: 0.6528070573735932
Recall: 0.42262629656070216
F1 Score: 0.5130832664194038


In [None]:
# ExtraTreesClassifier 모델 저장
model_path = '/content/drive/MyDrive/Data/SO60_SMALL_lgb_dart.joblib'
joblib.dump(lgb_dart, model_path)

print(f"Model saved to {model_path}")

Model saved to /content/drive/MyDrive/Data/SO60_SMALL_lgb_dart.joblib


In [None]:
# 예측 수행
predictions_prob = lgb_dart.predict(data_test_predict_scaled)

# 확률을 0 또는 1로 변환 (기본 임계값 0.5)
predictions = (predictions_prob > 0.5).astype(int)

# 예측 결과를 데이터프레임에 추가
data_test['prediction_lgb_dart'] = predictions

In [None]:
data_test_non_nan = data_test.dropna(subset=['prediction_lgb_dart'])

# max_return_60min 기준으로 내림차순 정렬
data_test_sorted = data_test_non_nan.sort_values(by='max_return_60min', ascending=False)

# 결과 출력
data_test_sorted

Unnamed: 0.1,Unnamed: 0,open_time,open,high,low,close,volume,time,atr_5,atr_10,...,supertrend_lower_20_4_50,supertrend_in_uptrend_20_4_50,supertrend_upper_50_5_5,supertrend_lower_50_5_5,supertrend_in_uptrend_50_5_5,max_return_60min,min_return_60min,prediction_sgb,prediction_gbdt,prediction_lgb_dart
7109,7109,2024-07-19 13:29:00,162.75,162.80,162.59,162.63,1289.717,809,0.214352,0.235240,...,161.769140,False,163.766759,161.623241,False,3.818484,-0.215212,1,1,0
2880,2880,2024-07-16 15:00:00,155.91,155.93,155.69,155.69,1983.029,900,0.219039,0.222679,...,154.977555,True,156.426560,154.714806,False,3.796005,-0.057807,1,1,1
2881,2881,2024-07-16 15:01:00,155.70,155.78,155.60,155.71,1419.077,901,0.211231,0.218411,...,154.977555,True,156.426560,154.633844,False,3.782673,-0.070644,1,1,1
7106,7106,2024-07-19 13:26:00,162.82,162.85,162.69,162.70,1820.981,806,0.233500,0.250055,...,161.835376,False,163.937498,161.602502,False,3.773817,-0.258144,1,1,0
7107,7107,2024-07-19 13:27:00,162.69,162.78,162.53,162.72,2155.233,807,0.236800,0.250049,...,161.719069,False,163.838998,161.471002,False,3.761062,-0.270403,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8522,8522,2024-07-20 13:02:00,168.56,168.73,168.56,168.73,846.118,782,0.132566,0.128740,...,168.145976,False,169.307830,168.249485,True,0.000000,-0.865288,0,0,0
3519,3519,2024-07-17 01:39:00,163.03,163.15,163.03,163.15,619.675,99,0.219354,0.251539,...,162.673690,True,164.186768,162.223585,True,0.000000,-1.121667,1,1,0
1075,1075,2024-07-15 08:55:00,152.72,153.06,152.72,153.06,1988.077,535,0.266985,0.238983,...,152.140127,False,153.993659,151.555073,False,0.000000,-0.672939,1,1,1
7594,7594,2024-07-19 21:34:00,169.14,169.29,169.14,169.29,1151.051,1294,0.174188,0.174318,...,168.489909,False,170.016173,168.344062,False,0.000000,-0.886054,0,0,0


In [None]:
# 'max_return_60min' 값이 1 이상이고 'prediction' 값이 0인 데이터의 개수
count_max_return_ge_1_prediction_0 = len(data_test[(data_test['max_return_60min'] >= 1.1) & (data_test['prediction_lgb_dart'] == 1)])

# 'max_return_60min' 값이 1 미만이고 'prediction' 값이 1인 데이터의 개수
count_max_return_lt_1_prediction_1 = len(data_test[(data_test['max_return_60min'] < 1.1) & (data_test['prediction_lgb_dart'] == 0)])

print(f"max_return_60min이 1 이상인데 prediction이 0인 데이터의 비율: {count_max_return_ge_1_prediction_0/len(data_test)*100}")
print(f"max_return_60min이 1 미만인데 prediction이 1인 데이터의 비율: {count_max_return_lt_1_prediction_1/len(data_test)*100}")

max_return_60min이 1 이상인데 prediction이 0인 데이터의 비율: 13.192917486402036
max_return_60min이 1 미만인데 prediction이 1인 데이터의 비율: 28.00601782201134


######################

**LSTM**
######################

In [None]:
# 학습 데이터와 테스트 데이터 분리 (LSTM용)
X_train_seq, X_test_seq, y_train_seq, y_test_seq = train_test_split(X_seq, y_seq, test_size=0.2, random_state=42)

# LSTM 모델 학습
lstm_model = Sequential()
lstm_model.add(LSTM(50, activation='relu', input_shape=(sequence_length, X_train_seq.shape[2])))
lstm_model.add(Dense(1, activation='sigmoid'))
lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

lstm_model.fit(X_train_seq, y_train_seq, epochs=20, batch_size=32, validation_data=(X_test_seq, y_test_seq))

# 예측 및 성능 평가
y_pred_prob_lstm = lstm_model.predict(X_test_seq)
y_pred_lstm = (y_pred_prob_lstm > 0.5).astype(int)
print("LSTM Accuracy:", accuracy_score(y_test_seq, y_pred_lstm))
print("LSTM Precision:", precision_score(y_test_seq, y_pred_lstm))
print("LSTM Recall:", recall_score(y_test_seq, y_pred_lstm))
print("LSTM F1 Score:", f1_score(y_test_seq, y_pred_lstm))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
LSTM Accuracy: 0.6628153127676422
LSTM Precision: 0.6639321697957385
LSTM Recall: 0.25287814289009586
LSTM F1 Score: 0.36625664388762336


In [None]:
# ExtraTreesClassifier 모델 저장
model_path = '/content/drive/MyDrive/Data/SO60_SMALL_lstm_model.joblib'
joblib.dump(lstm_model, model_path)

print(f"Model saved to {model_path}")

In [None]:
#2nd Account
# 모델 저장
model_path = '/content/drive/MyDrive/SO60_SMALL_lstm_model.joblib'
joblib.dump(lstm_model, model_path)

print(f"Model saved to {model_path}")

Model saved to /content/drive/MyDrive/SO60_SMALL_lstm_model.joblib


In [None]:
# 예측 수행
y_pred_prob = lstm_model.predict(X_test_seq)
y_pred = (y_pred_prob > 0.5).astype(int)

# 예측 결과를 원본 데이터에 매핑
start_index = sequence_length
end_index = start_index + len(y_pred)
data_test_comparison = data_test.iloc[start_index:end_index].copy()
data_test_comparison['prediction_LSTM'] = y_pred



In [None]:
data_test_non_nan = data_test_comparison.dropna(subset=['prediction_LSTM'])

# max_return_60min 기준으로 내림차순 정렬
data_test_sorted = data_test_non_nan.sort_values(by='max_return_60min', ascending=False)

# 결과 출력
data_test_sorted

Unnamed: 0,open_time,open,high,low,close,volume,time,atr_5,atr_10,atr_14,...,supertrend_in_uptrend_14_2_10,supertrend_upper_20_4_50,supertrend_lower_20_4_50,supertrend_in_uptrend_20_4_50,supertrend_upper_50_5_5,supertrend_lower_50_5_5,supertrend_in_uptrend_50_5_5,max_return_60min,min_return_60min,prediction_LSTM
11170,2021-01-08 09:10:00,2.4276,2.4322,2.4248,2.4299,7169.12,550,0.013243,0.012952,0.012702,...,True,2.482098,2.374902,True,2.494713,2.362287,True,39.923454,0.074077,0
11171,2021-01-08 09:11:00,2.4317,2.4500,2.4317,2.4467,21257.63,551,0.014614,0.013667,0.013230,...,True,2.494984,2.386716,True,2.513920,2.367780,True,38.962684,-0.069481,0
11172,2021-01-08 09:12:00,2.4450,2.4586,2.4450,2.4570,26436.11,552,0.014411,0.013660,0.013257,...,True,2.505939,2.397661,True,2.523856,2.379744,True,38.380138,-0.130240,0
11175,2021-01-08 09:15:00,2.4707,2.4796,2.4638,2.4699,21377.89,555,0.017796,0.015906,0.015022,...,True,2.527923,2.419420,True,2.560681,2.384024,True,37.657395,-0.194340,0
11174,2021-01-08 09:14:00,2.4795,2.4855,2.4655,2.4736,27511.41,554,0.018295,0.015918,0.014962,...,True,2.531580,2.419420,True,2.566976,2.384024,True,37.451488,-0.396184,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92704,2021-03-06 01:23:00,12.5773,12.5810,12.5693,12.5715,210.76,83,0.013927,0.016688,0.018003,...,False,12.638673,12.491662,False,12.637698,12.505516,False,-0.601360,-2.363282,0
92697,2021-03-06 01:16:00,12.5549,12.5775,12.5549,12.5726,354.44,76,0.019676,0.021268,0.021729,...,False,12.638673,12.477684,False,12.646756,12.467820,False,-0.610057,-2.371824,0
92681,2021-03-06 01:00:00,12.5761,12.5875,12.5656,12.5875,1189.75,60,0.024288,0.022930,0.022275,...,False,12.664131,12.567813,True,12.697989,12.571851,True,-0.727706,-1.275869,1
92683,2021-03-06 01:02:00,12.5927,12.6024,12.5724,12.5923,647.34,62,0.024296,0.023122,0.022490,...,False,12.675261,12.567813,True,12.708881,12.571851,True,-0.765547,-1.857484,1


In [None]:
# 'max_return_60min' 값이 1 이상이고 'prediction' 값이 0인 데이터의 개수
count_max_return_ge_1_prediction_0 = len(data_test_sorted[(data_test_sorted['max_return_60min'] >= 1.1) & (data_test_sorted['prediction_LSTM'] == 1)])

# 'max_return_60min' 값이 1 미만이고 'prediction' 값이 1인 데이터의 개수
count_max_return_lt_1_prediction_1 = len(data_test_sorted[(data_test_sorted['max_return_60min'] < 1.1) & (data_test_sorted['prediction_LSTM'] == 0)])

print(f"max_return_60min이 1 이상인데 prediction이 0인 데이터의 비율: {count_max_return_ge_1_prediction_0/len(data_test)*100}")
print(f"max_return_60min이 1 미만인데 prediction이 1인 데이터의 비율: {count_max_return_lt_1_prediction_1/len(data_test)*100}")

max_return_60min이 1 이상인데 prediction이 0인 데이터의 비율: 1.2315175411663948
max_return_60min이 1 미만인데 prediction이 1인 데이터의 비율: 9.887691288399076


######################

**TabNet**
######################

In [None]:
pip install pytorch-tabnet

Collecting pytorch-tabnet
  Downloading pytorch_tabnet-4.1.0-py3-none-any.whl (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.5/44.5 kB[0m [31m726.5 kB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.3->pytorch-tabnet)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.3->pytorch-tabnet)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.3->pytorch-tabnet)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.3->pytorch-tabnet)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.3->pytorch-tabnet)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinu

In [None]:
# GPU 설정 (가능한 경우)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

In [None]:
from pytorch_tabnet.tab_model import TabNetClassifier

# 학습 데이터와 검증 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# TabNet 모델 생성 및 학습
tabnet_model = TabNetClassifier()
tabnet_model.fit(
    X_train, y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    eval_name=['train', 'valid'],
    eval_metric=['accuracy'],
    max_epochs=100,
    patience=10,
    batch_size=1024,
    virtual_batch_size=128,
    num_workers=0,
    drop_last=False
)

# 예측 수행
y_pred = tabnet_model.predict(X_test)

# 성능 평가
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')



epoch 0  | loss: 0.62311 | train_accuracy: 0.55707 | valid_accuracy: 0.55762 |  0:01:42s
epoch 1  | loss: 0.60802 | train_accuracy: 0.65938 | valid_accuracy: 0.66158 |  0:02:42s
epoch 2  | loss: 0.60474 | train_accuracy: 0.67023 | valid_accuracy: 0.67137 |  0:03:40s
epoch 3  | loss: 0.60377 | train_accuracy: 0.66626 | valid_accuracy: 0.6668  |  0:04:37s
epoch 4  | loss: 0.60362 | train_accuracy: 0.67776 | valid_accuracy: 0.67875 |  0:05:36s
epoch 5  | loss: 0.60316 | train_accuracy: 0.66953 | valid_accuracy: 0.67166 |  0:06:57s
epoch 6  | loss: 0.60311 | train_accuracy: 0.66898 | valid_accuracy: 0.67039 |  0:08:01s
epoch 7  | loss: 0.60901 | train_accuracy: 0.66428 | valid_accuracy: 0.66553 |  0:09:01s
epoch 8  | loss: 0.60522 | train_accuracy: 0.66331 | valid_accuracy: 0.66406 |  0:09:56s
epoch 9  | loss: 0.60403 | train_accuracy: 0.67309 | valid_accuracy: 0.67374 |  0:10:54s
epoch 10 | loss: 0.60299 | train_accuracy: 0.67625 | valid_accuracy: 0.67818 |  0:11:51s
epoch 11 | loss: 0.60



Accuracy: 0.6787457177945834
Precision: 0.6234190126478988
Recall: 0.41708310586654346
F1 Score: 0.49979242410899616


In [None]:
# ExtraTreesClassifier 모델 저장
model_path = '/content/drive/MyDrive/Data/SO60_SMALL_tabnet_model.joblib'
joblib.dump(lstm_model, model_path)

print(f"Model saved to {model_path}")

In [None]:
#2nd Account
# 모델 저장
model_path = '/content/drive/MyDrive/SO60_SMALL_tabnet_model.joblib'
joblib.dump(tabnet_model, model_path)

print(f"Model saved to {model_path}")

Model saved to /content/drive/MyDrive/SO60_SMALL_tabnet_model.joblib


In [None]:
# 예측 수행
predictions = tabnet_model.predict(data_test_predict_scaled)

# 예측 결과를 데이터프레임에 추가
data_test['prediction_tabnet'] = predictions

In [None]:
data_test_non_nan = data_test.dropna(subset=['prediction_tabnet'])

# max_return_60min 기준으로 내림차순 정렬
data_test_sorted = data_test_non_nan.sort_values(by='max_return_60min', ascending=False)

# 결과 출력
data_test_sorted

Unnamed: 0.1,Unnamed: 0,open_time,open,high,low,close,volume,time,atr_5,atr_10,...,supertrend_in_uptrend_14_2_10,supertrend_upper_20_4_50,supertrend_lower_20_4_50,supertrend_in_uptrend_20_4_50,supertrend_upper_50_5_5,supertrend_lower_50_5_5,supertrend_in_uptrend_50_5_5,max_return_60min,min_return_60min,prediction_tabnet
7109,7109,2024-07-19 13:29:00,162.75,162.80,162.59,162.63,1289.717,809,0.214352,0.235240,...,False,163.590931,161.769140,False,163.766759,161.623241,False,3.818484,-0.215212,1
2880,2880,2024-07-16 15:00:00,155.91,155.93,155.69,155.69,1983.029,900,0.219039,0.222679,...,True,156.862755,154.977555,True,156.426560,154.714806,False,3.796005,-0.057807,1
2881,2881,2024-07-16 15:01:00,155.70,155.78,155.60,155.71,1419.077,901,0.211231,0.218411,...,True,156.736100,154.977555,True,156.426560,154.633844,False,3.782673,-0.070644,0
7106,7106,2024-07-19 13:26:00,162.82,162.85,162.69,162.70,1820.981,806,0.233500,0.250055,...,False,163.704624,161.835376,False,163.937498,161.602502,False,3.773817,-0.258144,1
7107,7107,2024-07-19 13:27:00,162.69,162.78,162.53,162.72,2155.233,807,0.236800,0.250049,...,False,163.590931,161.719069,False,163.838998,161.471002,False,3.761062,-0.270403,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8522,8522,2024-07-20 13:02:00,168.56,168.73,168.56,168.73,846.118,782,0.132566,0.128740,...,True,168.816198,168.145976,False,169.307830,168.249485,True,0.000000,-0.865288,0
3519,3519,2024-07-17 01:39:00,163.03,163.15,163.03,163.15,619.675,99,0.219354,0.251539,...,True,164.098585,162.673690,True,164.186768,162.223585,True,0.000000,-1.121667,1
1075,1075,2024-07-15 08:55:00,152.72,153.06,152.72,153.06,1988.077,535,0.266985,0.238983,...,False,153.399907,152.140127,False,153.993659,151.555073,False,0.000000,-0.672939,1
7594,7594,2024-07-19 21:34:00,169.14,169.29,169.14,169.29,1151.051,1294,0.174188,0.174318,...,False,169.778412,168.489909,False,170.016173,168.344062,False,0.000000,-0.886054,0


In [None]:
# 'max_return_60min' 값이 1 이상이고 'prediction' 값이 0인 데이터의 개수
count_max_return_ge_1_prediction_0 = len(data_test[(data_test['max_return_60min'] >= 1.1) & (data_test['prediction_tabnet'] == 1)])

# 'max_return_60min' 값이 1 미만이고 'prediction' 값이 1인 데이터의 개수
count_max_return_lt_1_prediction_1 = len(data_test[(data_test['max_return_60min'] < 1.1) & (data_test['prediction_tabnet'] == 0)])

print(f"max_return_60min이 1 이상인데 prediction이 0인 데이터의 비율: {count_max_return_ge_1_prediction_0/len(data_test)*100}")
print(f"max_return_60min이 1 미만인데 prediction이 1인 데이터의 비율: {count_max_return_lt_1_prediction_1/len(data_test)*100}")

max_return_60min이 1 이상인데 prediction이 0인 데이터의 비율: 8.54067816224974
max_return_60min이 1 미만인데 prediction이 1인 데이터의 비율: 45.48084712417545


######################

**AutoGluon**
######################

In [None]:
pip install autogluon

Collecting autogluon
  Downloading autogluon-1.1.1-py3-none-any.whl (9.7 kB)
Collecting autogluon.multimodal==1.1.1 (from autogluon)
  Downloading autogluon.multimodal-1.1.1-py3-none-any.whl (427 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m428.0/428.0 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting autogluon.timeseries[all]==1.1.1 (from autogluon)
  Downloading autogluon.timeseries-1.1.1-py3-none-any.whl (148 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m148.2/148.2 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
Collecting ray[default,tune]<2.11,>=2.10.0 (from autogluon.core[all]==1.1.1->autogluon)
  Downloading ray-2.10.0-cp310-cp310-manylinux2014_x86_64.whl (65.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.1/65.1 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Collecting Pillow<11,>=10.0.1 (from autogluon.multimodal==1.1.1->autogluon)
  Downloading pillow-10.4.0-cp310-cp310-manylinux_2_28_x86_64.whl

In [None]:
from autogluon.tabular import TabularPredictor

# 학습 데이터와 검증 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# 학습 데이터와 검증 데이터를 DataFrame으로 변환
train_data = pd.DataFrame(X_train, columns=X.columns)
train_data['target'] = y_train.values

test_data = pd.DataFrame(X_test, columns=X.columns)
test_data['target'] = y_test.values

# AutoGluon 모델 학습
predictor = TabularPredictor(label='target', eval_metric='accuracy').fit(train_data)

# 검증 데이터에 대한 예측 수행
y_pred = predictor.predict(test_data)

# 성능 평가
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

No path specified. Models will be saved in: "AutogluonModels/ag-20240723_030958"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.1.1
Python Version:     3.10.12
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP PREEMPT_DYNAMIC Thu Jun 27 21:05:47 UTC 2024
CPU Count:          2
Memory Avail:       8.43 GB / 12.67 GB (66.5%)
Disk Space Avail:   75.74 GB / 107.72 GB (70.3%)
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets.
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='best_quality'   : Maximize accuracy. Default time_limit=3600.
	presets='high_quality'   : Strong accuracy with fast inference speed. Default time_limit=3600.
	presets='good_quality'   : Good accuracy with very fast inference speed. Default time_limit=3600.
	presets='medium_quality' : Fast training time, ideal for initial prototyping.
	Co

[1000]	valid_set's binary_error: 0.272874
[2000]	valid_set's binary_error: 0.252272
[3000]	valid_set's binary_error: 0.229651
[4000]	valid_set's binary_error: 0.208847
[5000]	valid_set's binary_error: 0.19693
[6000]	valid_set's binary_error: 0.187033
[7000]	valid_set's binary_error: 0.17653
[8000]	valid_set's binary_error: 0.170067
[9000]	valid_set's binary_error: 0.165825
[10000]	valid_set's binary_error: 0.159362


	0.8408	 = Validation score   (accuracy)
	960.33s	 = Training   runtime
	6.7s	 = Validation runtime
Fitting model: LightGBM ...


[1000]	valid_set's binary_error: 0.210463
[2000]	valid_set's binary_error: 0.153504
[3000]	valid_set's binary_error: 0.122198
[4000]	valid_set's binary_error: 0.100586
[5000]	valid_set's binary_error: 0.0858412
[6000]	valid_set's binary_error: 0.078166
[7000]	valid_set's binary_error: 0.0737225
[8000]	valid_set's binary_error: 0.068269
[9000]	valid_set's binary_error: 0.0644314
[10000]	valid_set's binary_error: 0.0620077


	0.9384	 = Validation score   (accuracy)
	1042.07s	 = Training   runtime
	4.93s	 = Validation runtime
Fitting model: RandomForestGini ...
	0.9505	 = Validation score   (accuracy)
	2431.12s	 = Training   runtime
	1.33s	 = Validation runtime
Fitting model: RandomForestEntr ...
	0.955	 = Validation score   (accuracy)
	2860.52s	 = Training   runtime
	0.66s	 = Validation runtime
Fitting model: CatBoost ...
	0.8994	 = Validation score   (accuracy)
	2643.64s	 = Training   runtime
	0.07s	 = Validation runtime
Fitting model: ExtraTreesGini ...
	0.903	 = Validation score   (accuracy)
	390.52s	 = Training   runtime
	0.49s	 = Validation runtime
Fitting model: ExtraTreesEntr ...
	0.9061	 = Validation score   (accuracy)
	340.99s	 = Training   runtime
	0.46s	 = Validation runtime
Fitting model: NeuralNetFastAI ...
	0.6978	 = Validation score   (accuracy)
	464.15s	 = Training   runtime
	0.1s	 = Validation runtime
Fitting model: XGBoost ...
	0.9497	 = Validation score   (accuracy)
	1681.46s	 = Training

[1000]	valid_set's binary_error: 0.128257
[2000]	valid_set's binary_error: 0.0820036
[3000]	valid_set's binary_error: 0.0648354
[4000]	valid_set's binary_error: 0.0577661
[5000]	valid_set's binary_error: 0.0523127
[6000]	valid_set's binary_error: 0.049283
[7000]	valid_set's binary_error: 0.0470612
[8000]	valid_set's binary_error: 0.0452434
[9000]	valid_set's binary_error: 0.0434256
[10000]	valid_set's binary_error: 0.0420117


	0.9592	 = Validation score   (accuracy)
	1299.84s	 = Training   runtime
	13.18s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	Ensemble Weights: {'LightGBMLarge': 0.556, 'RandomForestEntr': 0.222, 'XGBoost': 0.222}
	0.962	 = Validation score   (accuracy)
	0.35s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 17989.73s ... Best model: WeightedEnsemble_L2 | Estimated inference throughput: 152.6 rows/s (4951 batch size)
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20240723_030958")


Accuracy: 0.9569678753797427
Precision: 0.9552107098273859
Recall: 0.9318649477176332
F1 Score: 0.943393418926962


In [None]:
# 최적의 모델 정보 출력
leaderboard = predictor.leaderboard(test_data, silent=True)
print("Leaderboard:")
print(leaderboard)

best_model = predictor.get_model_best()
print(f"Best Model: {best_model}")

best_model_params = predictor.info()['model_info'][best_model]['hyperparameters']
print("Best Model Hyperparameters:")
print(best_model_params)

Leaderboard:
                  model  score_test  score_val eval_metric  pred_time_test  \
0   WeightedEnsemble_L2    0.956968   0.962028    accuracy      642.607019   
1         LightGBMLarge    0.955707   0.959200    accuracy      247.774772   
2      RandomForestEntr    0.951684   0.954959    accuracy       17.344031   
3               XGBoost    0.946771   0.949707    accuracy      377.452070   
4      RandomForestGini    0.944832   0.950515    accuracy       21.752576   
5              LightGBM    0.933909   0.938396    accuracy      121.902113   
6        ExtraTreesEntr    0.904604   0.906080    accuracy       13.220259   
7        ExtraTreesGini    0.898980   0.903050    accuracy       14.699028   
8              CatBoost    0.892484   0.899414    accuracy        1.508024   
9            LightGBMXT    0.836452   0.840840    accuracy      141.282534   
10       NeuralNetTorch    0.776380   0.782872    accuracy        0.878536   
11      NeuralNetFastAI    0.696981   0.697839    a

  best_model = predictor.get_model_best()


Best Model Hyperparameters:
{'use_orig_features': False, 'max_base_models': 25, 'max_base_models_per_type': 5, 'save_bag_folds': True}


In [None]:
import os
# 특정 경로에 모델 저장
save_path = "/content/drive/MyDrive/Data/SO60_SMALL_AutoGluon"
os.makedirs(save_path, exist_ok=True)
predictor.save(directory=save_path)

In [None]:
import os
#2nd Account
# 특정 경로에서 모델 저장
save_path = "/content/drive/MyDrive/SO60_SMALL_AutoGluon"
os.makedirs(save_path, exist_ok=True)
predictor.save(save_path)

In [None]:
# 특정 경로에서 모델 로드
save_path = "/content/drive/MyDrive/SO60_SMALL_AutoGluon"
predictor = TabularPredictor.load(save_path)

In [None]:
# 예측 데이터를 DataFrame으로 변환
data_test_predict_df = pd.DataFrame(data_test_predict_scaled, columns=data_test_predict.columns)

# 예측 수행
predictions = predictor.predict(data_test_predict_df)

# 예측 결과를 데이터프레임에 추가
data_test['prediction_AutoGluon'] = predictions

In [None]:
data_test_non_nan = data_test.dropna(subset=['prediction_AutoGluon'])

# max_return_60min 기준으로 내림차순 정렬
data_test_sorted = data_test_non_nan.sort_values(by='max_return_60min', ascending=False)

# 결과 출력
data_test_sorted

In [None]:
# 'max_return_60min' 값이 1 이상이고 'prediction' 값이 0인 데이터의 개수
count_max_return_ge_1_prediction_0 = len(data_test[(data_test['max_return_60min'] >= 1.1) & (data_test['prediction_AutoGluon'] == 1)])

# 'max_return_60min' 값이 1 미만이고 'prediction' 값이 1인 데이터의 개수
count_max_return_lt_1_prediction_1 = len(data_test[(data_test['max_return_60min'] < 1.1) & (data_test['prediction_AutoGluon'] == 0)])

print(f"max_return_60min이 1 이상인데 prediction이 0인 데이터의 비율: {count_max_return_ge_1_prediction_0/len(data_test)*100}")
print(f"max_return_60min이 1 미만인데 prediction이 1인 데이터의 비율: {count_max_return_lt_1_prediction_1/len(data_test)*100}")

max_return_60min이 1 이상인데 prediction이 0인 데이터의 비율: 31.678029342042112
max_return_60min이 1 미만인데 prediction이 1인 데이터의 비율: 20.113470547266736


######################

**모델분리**
######################

In [None]:
#"WeightedEnsemble_L2", "LightGBMLarge", "RandomForestEntr", "XGBoost"
#"RandomForestGini", "LightGBM", "ExtraTreesEntr", "ExtraTreesGini", "CatBoost"
#"LightGBMXT", "NeuralNetTorch", "NeuralNetFastAI","KNeighborsDist", "KNeighborsUnif"

select = "LightGBMLarge"
predictor_sep = predictor._trainer.load_model(select)

#LightGBMLarge(autogluon) => ,

In [None]:
# 검증 데이터에 대한 예측 수행
y_pred = predictor_sep.predict(test_data)

# 성능 평가
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

Accuracy: 0.9557074526533514
Precision: 0.9528107271789582
Recall: 0.9310040734052828
F1 Score: 0.9417811856162783


In [None]:
select_model_params = predictor.info()['model_info'][select]['hyperparameters']
print("Best Model Hyperparameters:")
print(select_model_params)

In [None]:
# ExtraTreesClassifier 모델 저장
model_path = f'/content/drive/MyDrive/Data/SOL60_automl_{select}.joblib'
joblib.dump(predictor_sep, model_path)

print(f"Model saved to {model_path}")

In [None]:
#2nd Account
# 모델 저장
model_path = f'/content/drive/MyDrive/SOL60_automl_{select}.joblib'
joblib.dump(predictor_sep, model_path)

print(f"Model saved to {model_path}")

In [None]:
# 예측 데이터를 DataFrame으로 변환
data_test_predict_df = pd.DataFrame(data_test_predict_scaled, columns=data_test_predict.columns)

# 예측 수행
predictions = predictor_sep.predict(data_test_predict_df)

# 예측 결과를 데이터프레임에 동적으로 추가
column_name = f'prediction_AutoGluon_{select}'
data_test[column_name] = predictions

In [None]:
# 결측치가 없는 행만 선택하여 새로운 데이터프레임 생성
data_test_non_nan = data_test.dropna(subset=[column_name])

# max_return_60min 기준으로 내림차순 정렬
data_test_sorted = data_test_non_nan.sort_values(by='max_return_60min', ascending=False)

# 결과 출력
data_test_sorted

In [None]:
# 'max_return_60min' 값이 1 이상이고 'prediction' 값이 0인 데이터의 개수
count_max_return_ge_1_prediction_0 = len(data_test[(data_test['max_return_60min'] >= 1.1) & (data_test['column_name'] == 1)])

# 'max_return_60min' 값이 1 미만이고 'prediction' 값이 1인 데이터의 개수
count_max_return_lt_1_prediction_1 = len(data_test[(data_test['max_return_60min'] < 1.1) & (data_test['column_name'] == 0)])

print(f"max_return_60min이 1 이상인데 prediction이 0인 데이터의 비율: {count_max_return_ge_1_prediction_0/len(data_test)*100}")
print(f"max_return_60min이 1 미만인데 prediction이 1인 데이터의 비율: {count_max_return_lt_1_prediction_1/len(data_test)*100}")

######################

**H2O-3**
######################

In [None]:
pip install h2o

In [None]:
import h2o
from h2o.automl import H2OAutoML

# H2O 클러스터 시작
h2o.init()

# 학습 데이터와 검증 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# 학습 데이터와 검증 데이터를 DataFrame으로 변환
train_data = pd.DataFrame(X_train, columns=X.columns)
train_data['target'] = y_train.values

# H2O 프레임으로 변환
train_h2o = h2o.H2OFrame(train_data)
test_h2o = h2o.H2OFrame(test_data)

# H2O AutoML 학습
aml = H2OAutoML(max_models=20, seed=42)
aml.train(x=train_h2o.columns[:-1], y='target', training_frame=train_h2o)

# 리더보드 출력
lb = aml.leaderboard
print(lb)

# 최적의 모델
best_model_h2o = aml.leader
print(f"Best Model: {best_model_h2o}")

# 검증 데이터에 대한 예측 수행
pred_h2o = best_model_h2o.predict(test_h2o)
y_pred = h2o.as_list(pred_h2o['predict'], use_pandas=True).values.flatten()

# 성능 평가
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

In [None]:
# 최적의 모델 저장
save_path = "/content/drive/MyDrive/Data/H2OModels"
os.makedirs(save_path, exist_ok=True)
best_model_h2o = aml.leader
best_model_save_path = h2o.save_model(model=best_model_h2o, path=save_path, force=True)

In [None]:
# 최적의 모델 저장
save_path = "/content/drive/MyDrive/H2OModels"
os.makedirs(save_path, exist_ok=True)
best_model_h2o = aml.leader
best_model_save_path = h2o.save_model(model=best_model_h2o, path=save_path, force=True)

In [None]:
# H2O 클러스터 시작
h2o.init()

# 모델을 로드할 경로를 명시적으로 설정
load_path = "/content/drive/MyDrive/Data/H2OModels/StackedEnsemble_AllModels_AutoML_20210729_125123"  # 실제 모델 파일명을 사용해야 합니다
best_model_h2o = h2o.load_model(load_path)

In [None]:
# 예측 데이터를 DataFrame으로 변환
data_test_predict_df = pd.DataFrame(data_test_predict_scaled, columns=data_test_predict.columns)
data_test_h2o = h2o.H2OFrame(data_test_predict_df)

# 예측 수행
predictions_h2o = best_model_h2o.predict(data_test_h2o)
predictions = h2o.as_list(predictions_h2o['predict'], use_pandas=True).values.flatten()

# 예측 결과를 데이터프레임에 추가
data_test['prediction_h2o'] = predictions

# H2O 클러스터 종료
h2o.shutdown()

In [None]:
data_test_non_nan = data_test.dropna(subset=['prediction_h2o'])

# max_return_60min 기준으로 내림차순 정렬
data_test_sorted = data_test_non_nan.sort_values(by='max_return_60min', ascending=False)

# 결과 출력
data_test_sorted

In [None]:
# 'max_return_60min' 값이 1 이상이고 'prediction' 값이 0인 데이터의 개수
count_max_return_ge_1_prediction_0 = len(data_test[(data_test['max_return_60min'] >= 1.1) & (data_test['prediction_h2o'] == 1)])

# 'max_return_60min' 값이 1 미만이고 'prediction' 값이 1인 데이터의 개수
count_max_return_lt_1_prediction_1 = len(data_test[(data_test['max_return_60min'] < 1.1) & (data_test['prediction_h2o'] == 0)])

print(f"max_return_60min이 1 이상인데 prediction이 0인 데이터의 비율: {count_max_return_ge_1_prediction_0/len(data_test)*100}")
print(f"max_return_60min이 1 미만인데 prediction이 1인 데이터의 비율: {count_max_return_lt_1_prediction_1/len(data_test)*100}")

######################

**Auto-sklearn**
######################

In [None]:
pip install auto-sklearn

In [None]:
import autosklearn.classification
import autosklearn.regression

# 학습 데이터와 검증 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Auto-sklearn 분류 모델 생성
automl = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=3600,  # 전체 학습 시간 (초)
    per_run_time_limit=360,        # 각 모델 학습 시간 (초)
    n_jobs=-1                      # 모든 CPU 코어 사용
)

# 모델 학습
automl.fit(X_train, y_train)

# 검증 데이터에 대한 예측 수행
y_pred = automl.predict(X_test)

# 성능 평가
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

In [None]:
# ExtraTreesClassifier 모델 저장
model_path = '/content/drive/MyDrive/Data/SOL60_automl.joblib'
joblib.dump(automl, model_path)

print(f"Model saved to {model_path}")

In [None]:
#2nd Account
# 모델 저장
model_path = '/content/drive/MyDrive/SOL60_automl.joblib'
joblib.dump(automl, model_path)

print(f"Model saved to {model_path}")

In [None]:
# 모델 로드
model_path = '/content/drive/MyDrive/Data/SOL60_automl.joblib'
automl = joblib.load(model_path)

In [None]:
#2nd Account
# 모델 로드
model_path = '/content/drive/MyDrive/SOL60_automl.joblib'
automl = joblib.load(model_path)

In [None]:
# 예측 수행
predictions = automl.predict(data_test_predict_scaled)

# 예측 결과를 데이터프레임에 추가
data_test['prediction_automl'] = predictions

In [None]:
data_test_non_nan = data_test.dropna(subset=['prediction_automl'])

# max_return_60min 기준으로 내림차순 정렬
data_test_sorted = data_test_non_nan.sort_values(by='max_return_60min', ascending=False)

# 결과 출력
data_test_sorted

In [None]:
# 'max_return_60min' 값이 1 이상이고 'prediction' 값이 0인 데이터의 개수
count_max_return_ge_1_prediction_0 = len(data_test[(data_test['max_return_60min'] >= 1.1) & (data_test['prediction_automl'] == 1)])

# 'max_return_60min' 값이 1 미만이고 'prediction' 값이 1인 데이터의 개수
count_max_return_lt_1_prediction_1 = len(data_test[(data_test['max_return_60min'] < 1.1) & (data_test['prediction_automl'] == 0)])

print(f"max_return_60min이 1 이상인데 prediction이 0인 데이터의 비율: {count_max_return_ge_1_prediction_0/len(data_test)*100}")
print(f"max_return_60min이 1 미만인데 prediction이 1인 데이터의 비율: {count_max_return_lt_1_prediction_1/len(data_test)*100}")