In [1]:
# Install joblib into the notebook environment (imported in the imports cell)
pip install joblib



In [2]:
pip install tdqm

Collecting tdqm
  Downloading tdqm-0.0.1.tar.gz (1.4 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: tdqm
  Building wheel for tdqm (setup.py) ... [?25l[?25hdone
  Created wheel for tdqm: filename=tdqm-0.0.1-py3-none-any.whl size=1321 sha256=e59d3d61a9368d87b191978eceb840e1b1387aab5b60fd161ca21b64a59f5f41
  Stored in directory: /root/.cache/pip/wheels/37/31/b8/7b711038035720ba0df14376af06e5e76b9bd61759c861ad92
Successfully built tdqm
Installing collected packages: tdqm
Successfully installed tdqm-0.0.1


In [3]:
import numpy as np
import pandas as pd
import joblib
from tqdm import tqdm

# TensorFlow 및 TPU 설정
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.impute import SimpleImputer

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [4]:
# Mount Google Drive at /content/drive (requires the Google Colab runtime)
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# TCN-style model definition
class TCNModel(nn.Module):
    """Single causal 1-D convolution followed by a linear read-out.

    The convolution is padded on the left only, so the feature vector taken
    at the last time step summarizes the final ``kernel_size`` time steps of
    the input window. (With symmetric padding the last position would mix the
    final time step with right-hand zero padding and ignore earlier steps.)

    Args:
        input_size: Number of features per time step.
        num_channels: Number of convolution output channels.
        kernel_size: Width of the convolution kernel in time steps.
        dropout: Dropout probability applied after the ReLU.

    Input shape:  (batch_size, seq_len, input_size)
    Output shape: (batch_size, 1) — a raw logit (no sigmoid applied).
    """

    def __init__(self, input_size, num_channels, kernel_size=2, dropout=0.2):
        super(TCNModel, self).__init__()
        # Left-only zero padding keeps the output length equal to seq_len and
        # makes output[t] depend only on inputs at times <= t.
        self.causal_pad = nn.ConstantPad1d((kernel_size - 1, 0), 0.0)
        self.tcn = nn.Conv1d(input_size, num_channels, kernel_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(num_channels, 1)

    def forward(self, x):
        """Return one logit per sequence in ``x``."""
        x = x.transpose(1, 2)  # (batch_size, seq_len, input_size) -> (batch_size, input_size, seq_len)
        y1 = self.tcn(self.causal_pad(x))
        y1 = self.relu(y1)
        y1 = self.dropout(y1)
        y1 = y1[:, :, -1]  # features at the last time step
        o = self.fc(y1)
        return o

In [None]:
# Run label from the original notebook: 1.1 / 1-minute / 6 items
# (NOTE(review): what the "6" counts is not visible in this notebook)
# Main (training) data set
file_path1 = '/content/drive/MyDrive/Data/SOL_Data_1m_Micro_Indicator3.csv'
data = pd.read_csv(file_path1)

# Display the first few rows of the dataset to ensure it is loaded correctly
data.head()

In [None]:
# Time-feature conversion helper
def convert_time_features(data):
    """Replace the ``open_time`` column with a minute-of-day ``time`` column.

    The input frame is left untouched; a new frame is returned.

    Args:
        data: DataFrame with an ``open_time`` column holding datetimes or
            values that ``pd.to_datetime`` can parse.

    Returns:
        A new DataFrame without ``open_time`` and with ``time`` set to
        ``hour * 60 + minute`` of ``open_time``.

    Raises:
        KeyError: If ``data`` has no ``open_time`` column.
    """
    open_time = data['open_time']

    # Parse only when needed; this dtype check also accepts tz-aware columns,
    # which np.issubdtype cannot interpret.
    if not pd.api.types.is_datetime64_any_dtype(open_time):
        open_time = pd.to_datetime(open_time)

    # Minutes elapsed since midnight
    minute_of_day = open_time.dt.hour * 60 + open_time.dt.minute

    # Drop the raw timestamp and attach the derived feature on a new frame
    return data.drop(columns=['open_time']).assign(time=minute_of_day)

# Convert the time-related columns
data = convert_time_features(data)

# Create the binary target: 1 when max_return_60min is at least 1.1, otherwise 0
data['target'] = (data['max_return_60min'] >= 1.1).astype(int)

In [None]:
# Run label from the original notebook: 1.1 / 1-minute / 6 items
# (NOTE(review): what the "6" counts is not visible in this notebook)
# Test data set
file_path2 = '/content/drive/MyDrive/Data/SOL_Data_Test_1m_Indicator3.csv'
data_test = pd.read_csv(file_path2)

# Display the first few rows of the dataset to ensure it is loaded correctly
data_test.head()

In [None]:
# Convert the open_time column to datetime if it is not already
if not np.issubdtype(data_test['open_time'].dtype, np.datetime64):
    data_test['open_time'] = pd.to_datetime(data_test['open_time'])

# Derive `time` as minutes since midnight (hour * 60 + minute).
# Unlike convert_time_features, the open_time column is kept on data_test.
data_test['time'] = data_test['open_time'].dt.hour * 60 + data_test['open_time'].dt.minute

In [None]:
# Candidate feature columns. The training loop uses growing prefixes of this
# list (indicators_list[:idx]), so the order matters.
# NOTE(review): the order presumably reflects a prior feature ranking — confirm.
indicators_list = [
    "volume_ma_100", "volume_ma_5", "volume", "volume_ma_50", "obv", "volume_ma_20", "volume_ma_200", "atr_14", "disparity_index_50", "disparity_index_5",
    "atr_10", "bollinger_hband_100", "disparity_index_10", "supertrend_lower_14_2_10", "bollinger_lband_10", "bollinger_hband_200", "atr_5", "bollinger_lband_100", "bollinger_lband_200", "price_ma_100",
    "ichimoku_conversion_52", "disparity_index_20", "lowerband", "atr_50", "price_ma_200", "ichimoku_conversion_200", "atr_20", "ichimoku_conversion_100", "volume_ma_10", "supertrend_lower_20_4_50",
    "ichimoku_base_9", "supertrend_upper_10_3_20", "Accumulation_Distribution_Line", "bollinger_lband_50", "bollinger_hband_50", "bollinger_lband_20", "bollinger_hband_20", "disparity_index_200", "upperband", "price_ma_50",
    "low", "time", "disparity_index_100", "Parabolic_SAR_0.06", "supertrend_upper_20_4_50", "supertrend_upper_7_3_14", "supertrend_lower_10_3_20", "ichimoku_conversion_9", "price_ma_10", "supertrend_upper_14_2_10",
    "vwap", "Parabolic_SAR_0.1", "ROC_50", "supertrend_in_uptrend_10_3_20", "MFI_50", "supertrend_lower_7_3_14", "supertrend_in_uptrend_20_4_50", "close", "bollinger_hband_10", "Momentum_50",
    "VR_50", "Elder_Force_Index_25", "price_ma_20", "in_uptrend", "price_ma_5", "open", "supertrend_in_uptrend_7_3_14", "VR_40", "Momentum_30", "MFI_40",
    "stoch_%k_20_7", "high", "stoch_%k_9_3", "CMO_40", "ROC_30", "CMO_50", "Parabolic_SAR_0.02", "stoch_%d_21_5", "Williams_%R_40", "Parabolic_SAR_0.04",
    "CMO_20", "Williams_%R_50", "CMO_10", "ROC_20", "ROC_40", "MFI_30", "VR_30", "CCI_50", "Momentum_40", "Williams_%R_30",
    "MFI_20", "Momentum_10", "ROC_10", "MFI_10", "stoch_%d_20_7", "Elder_Force_Index_10", "Elder_Force_Index_13", "CCI_30", "VR_20", "CMO_30",
    "VR_10", "CCI_40", "CCI_10", "supertrend_in_uptrend_14_2_10", "Momentum_20", "stoch_%k_21_5", "stoch_%d_9_3", "stoch_%d_14_3", "CCI_20", "stoch_%k_14_3",
    "Elder_Force_Index_2", "Elder_Force_Index_5", "stoch_%d_5_2", "stoch_%k_5_2", "Parabolic_SAR_0.08", "Williams_%R_20", "supertrend_lower_50_5_5", "supertrend_in_uptrend_50_5_5", "Williams_%R_10", "supertrend_upper_50_5_5",
    "RSI_40", "RSI_30", "RSI_20", "RSI_10", "atr", "RSI_50", "Relative_Vigor_Index_10", "ichimoku_base_200", "Relative_Vigor_Index_50", "Relative_Vigor_Index_20",
    "Relative_Vigor_Index_30", "Relative_Vigor_Index_40", "ichimoku_base_52", "ichimoku_conversion_26", "ichimoku_base_26", "ichimoku_base_100"
]

In [None]:
# Sequence length: number of consecutive rows fed to the model per sample
sequence_length = 60

# Return threshold for the positive class; used for the test-set comparison
# and shown in the report labels.
# NOTE(review): must match the 1.1 used when data['target'] was built.
percent_point = 1.1

# First test-set row (0-based position) that gets scored.
# NOTE(review): 259200 == 180 days of 1-minute rows; presumably the earlier
# rows are a warm-up period — confirm against how the test CSV was produced.
EVAL_START_ROW = 259200

RANDOM_STATE = 42
TRAIN_BATCH_SIZE = 64
NUM_CHANNELS = 64
NUM_EPOCHS = 5   # maximum number of epochs
PATIENCE = 5     # NOTE: with NUM_EPOCHS == 5 early stopping can never trigger


def create_sequences_for_prediction(data, sequence_length):
    """Return every overlapping window of ``sequence_length`` rows.

    Args:
        data: 2-D array of shape (n_rows, n_features).
        sequence_length: Number of rows per window.

    Returns:
        Read-only view of shape (n_rows - sequence_length + 1,
        sequence_length, n_features); window ``i`` covers rows
        ``i .. i + sequence_length - 1``. No data is copied.
    """
    windows = np.lib.stride_tricks.sliding_window_view(data, sequence_length, axis=0)
    # sliding_window_view appends the window axis last: (n, features, length)
    return windows.transpose(0, 2, 1)


def create_sequences(features, targets, sequence_length):
    """Pair each window with the target of the window's last row.

    NOTE(review): assumes the label stored at row t (derived from
    max_return_60min) is the forward-looking label for the window ending at
    row t — confirm this is the intended pairing.

    Returns:
        (windows, labels) with equal length along axis 0.
    """
    windows = create_sequences_for_prediction(features, sequence_length)
    labels = np.asarray(targets)[sequence_length - 1:]
    return windows, labels


def fit_tcn(model, train_loader, val_loader, device, num_epochs=NUM_EPOCHS, patience=PATIENCE, lr=0.001):
    """Train ``model`` with BCE-with-logits loss and Adam.

    Stops early when the validation loss has not improved for ``patience``
    consecutive epochs.

    Returns:
        (train_losses, val_losses): per-epoch average batch losses.
    """
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    best_loss = float('inf')
    patience_counter = 0
    train_losses = []
    val_losses = []

    for epoch in range(num_epochs):
        # Training phase
        model.train()
        running_loss = 0.0
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            loss = criterion(model(X_batch), y_batch)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        train_losses.append(running_loss / len(train_loader))

        # Validation phase
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                val_loss += criterion(model(X_batch), y_batch).item()
        avg_val_loss = val_loss / len(val_loader)
        val_losses.append(avg_val_loss)

        # Early-stopping bookkeeping
        if avg_val_loss < best_loss:
            best_loss = avg_val_loss
            patience_counter = 0
        else:
            patience_counter += 1

        if patience_counter >= patience:
            print("조기 종료 조건 충족. 학습을 중지합니다.")
            break

    return train_losses, val_losses


def predict_labels(model, windows, device, batch_size=1024, threshold=0.5, show_progress=False):
    """Return a 0/1 prediction for every window, computed in mini-batches.

    Args:
        model: Module mapping (batch, seq_len, features) to (batch, 1) logits.
        windows: Array of shape (n, seq_len, features); may be a strided view.
        device: Device the model lives on.
        batch_size: Number of windows per forward pass.
        threshold: Probability above which the prediction is 1.
        show_progress: Wrap the batch loop in a tqdm progress bar.

    Returns:
        Integer array of shape (n,).
    """
    model.eval()
    batch_starts = range(0, len(windows), batch_size)
    if show_progress:
        batch_starts = tqdm(batch_starts)

    labels = []
    with torch.no_grad():
        for start in batch_starts:
            # Materialize one batch at a time so a strided view never has to
            # be expanded into a full copy.
            batch = np.ascontiguousarray(windows[start:start + batch_size])
            logits = model(torch.tensor(batch, dtype=torch.float32, device=device))
            # reshape(-1) keeps a 1-D result even for a batch of size 1
            labels.append((torch.sigmoid(logits).reshape(-1) > threshold).cpu().numpy())

    if not labels:
        return np.zeros(0, dtype=int)
    return np.concatenate(labels).astype(int)


# Use the GPU when available, for training as well as inference
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# The scored test rows do not depend on the feature subset. A row can only be
# scored once a full window ends on it.
first_eval_row = max(EVAL_START_ROW, sequence_length - 1)
if first_eval_row >= len(data_test):
    raise ValueError(
        f"data_test has {len(data_test)} rows; nothing to score from row {first_eval_row} onward"
    )
first_eval_window = first_eval_row - (sequence_length - 1)

for idx in range(20, len(indicators_list) + 1):

    indicator_tmp = indicators_list[:idx]
    last_indicator = indicator_tmp[-1]

    print(f"---------START({last_indicator})------------")

    # Make weight initialization and batch shuffling repeatable per subset
    torch.manual_seed(RANDOM_STATE)

    # Split features and target; replace infinities with NaN on a new frame
    # (no in-place edit of a slice of `data`)
    X = data[indicator_tmp].replace([np.inf, -np.inf], np.nan)
    y = data['target']

    # Impute NaN with the column mean, then scale to [0, 1]
    imputer = SimpleImputer(strategy='mean')
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(imputer.fit_transform(X))

    # Build the training sequences
    X_seq, y_seq = create_sequences(X_scaled, y.values, sequence_length)

    # Test features: reuse the imputer and scaler fitted on the training data
    # so both sets share the same statistics
    data_test_predict = data_test[indicator_tmp].replace([np.inf, -np.inf], np.nan)
    data_test_predict_scaled = scaler.transform(imputer.transform(data_test_predict))
    X_test_seq = create_sequences_for_prediction(data_test_predict_scaled, sequence_length)

    # Train / validation split.
    # NOTE(review): a shuffled split of overlapping windows places
    # near-identical windows in both sets, so validation metrics are
    # optimistic; consider a chronological split (shuffle=False).
    X_train, X_val, y_train, y_val = train_test_split(
        X_seq, y_seq, test_size=0.2, random_state=RANDOM_STATE
    )

    # Tensors and data loaders
    train_dataset = TensorDataset(
        torch.tensor(X_train, dtype=torch.float32),
        torch.tensor(y_train, dtype=torch.float32).unsqueeze(1),
    )
    val_dataset = TensorDataset(
        torch.tensor(X_val, dtype=torch.float32),
        torch.tensor(y_val, dtype=torch.float32).unsqueeze(1),
    )
    train_loader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=False)

    # Model
    input_size = X_train.shape[2]
    model = TCNModel(input_size, NUM_CHANNELS).to(device)

    # Training
    train_losses, val_losses = fit_tcn(model, train_loader, val_loader, device)

    # Validation metrics for the binary classifier
    y_true = np.asarray(y_val)
    y_pred = predict_labels(model, X_val, device)

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    print(f'Accuracy: {accuracy:.4f}')
    print(f'Precision: {precision:.4f}')
    print(f'Recall: {recall:.4f}')
    print(f'F1 Score: {f1:.4f}')

    # Test-set predictions: window i ends at row i + sequence_length - 1, so
    # the windows from first_eval_window onward line up one-to-one with the
    # rows from first_eval_row onward.
    test_predictions = predict_labels(
        model, X_test_seq[first_eval_window:], device, show_progress=True
    )

    # Scored rows with their predictions, on a new frame (data_test itself is
    # not modified)
    data_test_tmp = data_test.iloc[first_eval_row:].assign(Predictions=test_predictions)

    actual_return = data_test_tmp['max_return_60min']
    predicted = data_test_tmp['Predictions']

    # Rows with max_return_60min >= threshold that were predicted 1
    count_return_ge_threshold_pred_1 = int(((actual_return >= percent_point) & (predicted == 1)).sum())

    # Rows with max_return_60min < threshold that were predicted 0
    count_return_lt_threshold_pred_0 = int(((actual_return < percent_point) & (predicted == 0)).sum())

    # Both counts as a percentage of all scored rows; their sum is the share
    # of correctly classified rows
    tmp1 = count_return_ge_threshold_pred_1 / len(data_test_tmp) * 100
    tmp2 = count_return_lt_threshold_pred_0 / len(data_test_tmp) * 100

    # Report (the last label field is the window length)
    print(f"[max_return_60min/{last_indicator}/{percent_point}이상/1/{sequence_length}] : {tmp1}")
    print(f"[max_return_60min/{last_indicator}/{percent_point}미만/0/{sequence_length}] : {tmp2}")
    print(f"[확률/{last_indicator}/{percent_point}/{sequence_length}] : {tmp1 + tmp2}")

    print(f"---------END({last_indicator})------------")