In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from tqdm.auto import tqdm

In [None]:
# Mount Google Drive only when the notebook runs inside Colab.
# Every data/model path used later points at /kaggle/input, so outside Colab
# the google.colab package is missing; skip the mount instead of aborting a
# Run-All with ModuleNotFoundError.
try:
    from google.colab import drive
except ImportError:
    drive = None
    print("google.colab is not available; skipping Google Drive mount")
else:
    drive.mount('/content/drive')

In [None]:
# Load the test CSV.
TEST_CSV_PATH = "/kaggle/input/data-set-plus-test-24-08-07/SOL_Data_Test_Indicator3.csv"
data_test_tmp = pd.read_csv(TEST_CSV_PATH)

# Show the first rows as a quick check that the file was read correctly.
data_test_tmp.head()

In [None]:
# Bind `data_test` to the loaded frame. This is a second name for the same
# DataFrame object as `data_test_tmp`; no copy is made here.
data_test = data_test_tmp
# Display the column labels.
data_test.columns

In [None]:
# Drop the 'Unnamed: 0' column when it is present. errors='ignore' keeps the
# cell idempotent: re-running it (or loading a CSV without that column) no
# longer raises KeyError.
data_test = data_test.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Convert open_time to datetime unless it already is one.
# pd.api.types.is_datetime64_any_dtype accepts any pandas dtype (object,
# string extension dtype, tz-aware datetime), whereas np.issubdtype raises
# TypeError when handed a pandas extension dtype.
if not pd.api.types.is_datetime64_any_dtype(data_test['open_time']):
    data_test['open_time'] = pd.to_datetime(data_test['open_time'])

# Minutes since midnight (hour * 60 + minute), stored in the 'time' column.
data_test['time'] = data_test['open_time'].dt.hour * 60 + data_test['open_time'].dt.minute

In [None]:
# Exclude the columns that are not used as model inputs (v1).
# NOTE(review): the "v2" cell further down reassigns `data_test_predict` from an
# explicit feature list, so this result is overwritten when both cells are run.
data_test_predict = data_test.drop(columns=['open_time', 'max_return_60min', 'min_return_60min'])

In [None]:
# Column selection, v2: explicit list of the features to keep.
# The order is significant: it determines the column order of the matrix
# that is built from this list.
features_to_keep = [
    'ichimoku_conversion_9', 'ichimoku_conversion_200', 'supertrend_upper_14_2_10', 'supertrend_upper_10_3_20', 'bollinger_hband_200',
    'volume_ma_100', 'ROC_30', 'open', 'high', 'supertrend_lower_10_3_20',
    'obv', 'atr_50', 'volume_ma_200', 'Accumulation_Distribution_Line', 'bollinger_lband_20',
    'lowerband', 'volume_ma_20', 'supertrend_lower_7_3_14', 'atr_14', 'disparity_index_100',
    'price_ma_200', 'bollinger_lband_50', 'ichimoku_conversion_52', 'upperband', 'atr_20',
    'price_ma_20', 'disparity_index_20', 'time', 'vwap', 'bollinger_lband_200',
    'atr_10', 'MFI_40', 'volume_ma_10', 'supertrend_in_uptrend_7_3_14', 'Momentum_30',
    'Momentum_20', 'supertrend_upper_20_4_50', 'bollinger_hband_100', 'MFI_50', 'CMO_50',
    'close', 'Momentum_50', 'stoch_%k_21_5', 'supertrend_upper_7_3_14', 'bollinger_hband_50',
    'Parabolic_SAR_0.02', 'bollinger_lband_100', 'stoch_%k_9_3', 'Williams_%R_30', 'CMO_40',
]

# Keep only the listed feature columns, in the order given by `features_to_keep`.
data_test_predict = data_test[features_to_keep]

In [None]:
# Replace +/-inf with NaN so the imputer below treats them as missing values.
# A new frame is bound instead of using inplace=True: `data_test_predict` is
# derived from `data_test`, and mutating such a frame in place can trigger
# pandas' SettingWithCopyWarning. Rebinding also keeps the cell safe to re-run.
data_test_predict = data_test_predict.replace([np.inf, -np.inf], np.nan)

# Fill the remaining NaN values with the per-column mean.
# NOTE(review): fit_transform fits this imputer on the test data itself; it is
# not an imputer carried over from training. Confirm that this is intended.
imputer = SimpleImputer(strategy='mean')
data_test_predict_imputed = imputer.fit_transform(data_test_predict)

# Rescale the features with MinMaxScaler.
# NOTE(review): likewise fitted on the test data here rather than reusing a
# scaler fitted during training. Confirm that this is intended.
scaler = MinMaxScaler()
data_test_predict_scaled = scaler.fit_transform(data_test_predict_imputed)

######################

**Transformer 모델 로드**
######################

In [None]:
#Transformer model 호출
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Model definition (the architecture must exist before the saved weights can be loaded)
class TransformerEncoderModel(nn.Module):
    """Transformer-encoder network that maps a batch of sequences to `output_dim` values.

    The input goes through a linear layer of unchanged width, a stack of
    `num_layers` batch-first encoder layers, and a linear head that is applied
    to the last time step only.
    """

    def __init__(self, input_dim, nhead, num_layers, dim_feedforward, output_dim):
        super().__init__()
        self.embedding = nn.Linear(input_dim, input_dim)
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=input_dim,
                nhead=nhead,
                dim_feedforward=dim_feedforward,
                batch_first=True,
            ),
            num_layers=num_layers,
        )
        self.fc = nn.Linear(input_dim, output_dim)

    def forward(self, src):
        """Encode `src` and return the head output for the final position of each sequence."""
        encoded = self.transformer_encoder(self.embedding(src))
        last_step = encoded[:, -1, :]
        return self.fc(last_step)

In [None]:
# Transformer model setup, part 2
# Number of consecutive rows in one input sequence
sequence_length = 60


def create_sequences_for_prediction(data, sequence_length):
    """Return every window of `sequence_length` consecutive rows of `data`.

    Windows start at each index from 0 up to len(data) - sequence_length and
    are stacked into a single array with np.array.
    """
    n_windows = len(data) - sequence_length + 1
    return np.array([data[first:first + sequence_length] for first in range(n_windows)])

# Build the sequences used for prediction
X_test_seq = create_sequences_for_prediction(data_test_predict_scaled, sequence_length)

# The model input width is the size of the last axis of the sequence array
input_dim = X_test_seq.shape[2]
print(f"Input dimension: {input_dim}")

# Hyper-parameters used to rebuild the architecture before loading the weights
nhead = 2
num_layers = 2
dim_feedforward = 64
output_dim = 1

model = TransformerEncoderModel(input_dim, nhead, num_layers, dim_feedforward, output_dim)

# Load the saved weights.
# map_location='cpu' lets a checkpoint that was saved from a GPU load on a
# CPU-only host as well; the model is moved to the selected device later.
# Note: torch.load unpickles the file, so only load checkpoints you trust.
model_path = '/kaggle/input/lstm_indiactor3/pytorch/tcn_transformer/6/SOL60_SMALLL_INDICATOR3_Transformer_v3.pth'
model.load_state_dict(torch.load(model_path, map_location='cpu'))
model.eval()
print(f"Model loaded from {model_path}")

######################

TCN 모델 로드

######################

In [None]:
#TCN 모델 호출
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Model definition (needed before the saved weights can be loaded)
class TCNModel(nn.Module):
    """One Conv1d layer followed by ReLU, dropout and a linear head with a single output."""

    def __init__(self, input_channels, num_channels, kernel_size=2, dropout=0.2):
        super().__init__()
        self.tcn = nn.Conv1d(input_channels, num_channels, kernel_size, padding=kernel_size // 2)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(num_channels, 1)

    def forward(self, x):
        """Apply the convolution over the sequence axis and feed its last position to the head."""
        # Conv1d expects channels first: swap axes 1 and 2 of the input.
        channels_first = x.transpose(1, 2)
        activated = self.dropout(self.relu(self.tcn(channels_first)))
        # Keep only the final position along the last axis.
        return self.fc(activated[:, :, -1])

In [None]:
# TCN model setup, part 2
# Number of consecutive rows in one input sequence
sequence_length = 60


def create_sequences_for_prediction(data, sequence_length):
    """Stack all windows of `sequence_length` consecutive rows of `data` into one array."""
    last_start = len(data) - sequence_length
    windows = [data[start:start + sequence_length] for start in range(last_start + 1)]
    return np.array(windows)

# Build the sequences used for prediction
X_test_seq = create_sequences_for_prediction(data_test_predict_scaled, sequence_length)

# The number of input channels is the size of the last axis of the sequence array
input_channels = X_test_seq.shape[2]
print(f"Input channels: {input_channels}")

# Model configuration
num_channels = 64
model = TCNModel(input_channels, num_channels)

# Load the saved weights.
# map_location='cpu' lets a checkpoint that was saved from a GPU load on a
# CPU-only host as well; the model is moved to the selected device later.
# Note: torch.load unpickles the file, so only load checkpoints you trust.
model_path = '/kaggle/input/lstm_indiactor3/pytorch/tcn_transformer/9/SOL60_SMALLL_INDICATOR3_TCN_v4_180_11.pth'
model.load_state_dict(torch.load(model_path, map_location='cpu'))
model.eval()
print(f"Model loaded from {model_path}")

######################

예측 수행

######################

In [None]:
# Row count of the scaled feature matrix; the prediction loop starts its range from it
num_rows = data_test_predict_scaled.shape[0]

# Convert the sequence array to a float32 tensor
X_test_tensor = torch.tensor(X_test_seq, dtype=torch.float32)
print("Tensor Transform Complete")

In [None]:
# Use the GPU when one is available, otherwise stay on the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Move both the model and the input tensor to the selected device
model.to(device)
X_test_tensor = X_test_tensor.to(device)

In [None]:
# Collected predictions: one entry is appended per loop iteration
results = []

# Lower bound of the loop range (values tried earlier: 216000, 259200)
window_size = 302400

batch_size = 63  # loop step; each slice covers the batch_size - 1 entries before `end`

# NOTE(review): `start`/`end` are derived from `num_rows` (rows of the scaled
# matrix), but they index `X_test_tensor`, which holds one entry per sequence.
# Slicing silently clamps bounds past the end of the tensor, and only the last
# prediction of each slice is kept. Confirm the intended alignment between
# slices, sequences and rows.
model.eval()
with torch.no_grad():
    for end in tqdm(range(num_rows, window_size - 1, -batch_size)):
        start = max(end - batch_size + 1, 0)

        # Sequences for the current range
        window_batch = X_test_tensor[start:end]

        # Sigmoid of the model output, thresholded at 0.5 into 0/1 labels
        probabilities = torch.sigmoid(model(window_batch)).squeeze().cpu().numpy()
        labels = (probabilities > 0.5).astype(int)

        # squeeze() yields a 0-d array when the slice holds a single sequence;
        # otherwise keep the label of the last sequence in the slice
        results.append(labels[-1] if len(labels.shape) > 0 else labels)

# The loop walks backwards, so restore ascending order
results.reverse()

In [None]:
# Attach the predictions to the last len(results) rows of data_test and keep
# only the rows that received a prediction.
n_predictions = len(results)
if n_predictions == 0:
    # index[-0:] would select every row, so an empty result list must never
    # reach the assignment below.
    raise ValueError("no predictions were produced; check num_rows against window_size")

data_test['Predictions'] = np.nan
# NOTE(review): values are written to consecutive trailing rows; confirm this
# matches the spacing at which `results` was generated.
data_test.loc[data_test.index[-n_predictions:], 'Predictions'] = results
# .copy() yields an independent frame, so columns added to it later do not
# trigger SettingWithCopyWarning.
data_test = data_test.dropna(subset=['Predictions']).copy()

In [None]:
# Rows where max_return_60min >= 1.1 and Predictions == 1.
# (The variable name is kept as-is although its suffix does not match this condition.)
mask_high_return_pred_1 = (data_test['max_return_60min'] >= 1.1) & (data_test['Predictions'] == 1)
count_max_return_ge_1_prediction_0 = int(mask_high_return_pred_1.sum())

# Rows where max_return_60min < 1.1 and Predictions == 0.
# (The variable name is kept as-is although its suffix does not match this condition.)
mask_low_return_pred_0 = (data_test['max_return_60min'] < 1.1) & (data_test['Predictions'] == 0)
count_max_return_lt_1_prediction_1 = int(mask_low_return_pred_0.sum())

# Report each count as a percentage of all rows in data_test
print(f"max_return_60min이 1.1 이상인데 prediction이 1인 데이터의 비율: {count_max_return_ge_1_prediction_0/len(data_test)*100}")
print(f"max_return_60min이 1.1 미만인데 prediction이 0인 데이터의 비율: {count_max_return_lt_1_prediction_1/len(data_test)*100}")

######################

시간대 분석

######################

In [None]:
# Add a column holding the time of day of the 30-minute bucket each row falls in.
# '30min' replaces the 'T' offset alias, which pandas 2.2 deprecated.
data_test['half_hour'] = data_test['open_time'].dt.floor('30min').dt.time

# Group the rows by that 30-minute bucket
grouped = data_test.groupby('half_hour')

# Per-bucket counts and rates of rows matching the two conditions below
def calculate_counts(group):
    """Summarise one group as a 5-element Series.

    Elements, in order: the number of rows with max_return_60min >= 1.1 and
    Predictions == 1; the number of rows with max_return_60min < 1.1 and
    Predictions == 0; each of those as a percentage of the group size; and the
    sum of the two percentages. An empty group yields five zeros.
    """
    high_return = group['max_return_60min'] >= 1.1
    low_return = group['max_return_60min'] < 1.1
    n_high_pred_1 = int((high_return & (group['Predictions'] == 1)).sum())
    n_low_pred_0 = int((low_return & (group['Predictions'] == 0)).sum())

    n_rows = len(group)
    if n_rows == 0:
        return pd.Series([0, 0, 0, 0, 0])

    rate_high_pred_1 = (n_high_pred_1 / n_rows) * 100
    rate_low_pred_0 = (n_low_pred_0 / n_rows) * 100

    return pd.Series([
        n_high_pred_1,
        n_low_pred_0,
        rate_high_pred_1,
        rate_low_pred_0,
        rate_high_pred_1 + rate_low_pred_0,
    ])

# Run the summary on every half-hour group and label the resulting columns
summary_columns = ['Count_GE_1_Pred_0', 'Count_LT_1_Pred_1', 'Rate_GE_1_Pred_0', 'Rate_LT_1_Pred_1', 'Total_Rate']
results = grouped.apply(calculate_counts)
results.columns = summary_columns

# Print the per-bucket table
print(results)

In [None]:
# Label each bucket with its UTC time and the same time shifted by +9 hours (KST)
bucket_times_utc = pd.to_datetime(results.index.astype(str), format='%H:%M:%S')
results['half_hour_utc'] = bucket_times_utc.time
results['half_hour_kst'] = (bucket_times_utc + pd.Timedelta(hours=9)).time
results['half_hour_label'] = results.index.astype(str) + ' (' + results['half_hour_kst'].astype(str) + ' KST)'

# Line chart: bucket label on the x axis, Total_Rate on the y axis
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(results['half_hour_label'], results['Total_Rate'], marker='o')
ax.set_xlabel('Half Hour of the Day (UTC)')
ax.set_ylabel('Total Rate (%)')
ax.set_title('Total Rate of Conditions Met by Half Hour of the Day (UTC with KST)')
ax.grid(True)
plt.xticks(rotation=45, ha='right')  # rotate the labels and right-align them

# Dashed red reference line at y = 90
ax.axhline(90, color='red', linewidth=1.5, linestyle='--')

plt.show()

In [None]:
# Add a column holding the time of day of the 30-minute bucket each row falls in.
# '30min' replaces the 'T' offset alias, which pandas 2.2 deprecated.
data_test['half_hour'] = data_test['open_time'].dt.floor('30min').dt.time

# Group the rows by that 30-minute bucket
grouped = data_test.groupby('half_hour')

# Share of rows per bucket with max_return_60min >= 1.1 and Predictions == 1
def calculate_ratio(group):
    """Return the percentage of rows in `group` with max_return_60min >= 1.1 and Predictions == 1.

    Returns 0 for an empty group.
    """
    matches = (group['max_return_60min'] >= 1.1) & (group['Predictions'] == 1)
    n_matches = int(matches.sum())
    n_rows = len(group)
    return (n_matches / n_rows) * 100 if n_rows != 0 else 0

# Compute the ratio for every half-hour group and turn the result into a two-column frame
results = grouped.apply(calculate_ratio).reset_index()
results.columns = ['half_hour', 'Condition_Met_Ratio']

# Print the per-bucket table
print(results)

In [None]:
# Format each bucket as HH:MM in UTC and as the same time shifted by +9 hours (KST)
bucket_times_utc = pd.to_datetime(results['half_hour'].astype(str), format='%H:%M:%S')
results['half_hour'] = bucket_times_utc.dt.strftime('%H:%M')
results['half_hour_kst'] = (bucket_times_utc + pd.Timedelta(hours=9)).dt.strftime('%H:%M')
results['half_hour_label'] = results['half_hour'] + ' (' + results['half_hour_kst'] + ' KST)'

# Line chart: bucket label on the x axis, Condition_Met_Ratio on the y axis
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(results['half_hour_label'], results['Condition_Met_Ratio'], marker='o')
ax.set_xlabel('Half Hour of the Day (UTC)')
ax.set_ylabel('Condition Met Ratio (%)')
ax.set_title('Ratio of max_return_60min >= 1.1 and Predictions == 1 by Half Hour (UTC with KST)')
ax.grid(True)
plt.xticks(rotation=45, ha='right')  # rotate the labels and right-align them

plt.show()


######################

시간대 분석2

######################



In [None]:
# Make sure 'open_time' is a datetime column (unchanged if it already is one)
data_test['open_time'] = pd.to_datetime(data_test['open_time'])

# Add a column holding the time of day of the 30-minute bucket each row falls in.
# '30min' replaces the 'T' offset alias, which pandas 2.2 deprecated.
data_test['half_hour'] = data_test['open_time'].dt.floor('30min').dt.time

# Group the rows by that 30-minute bucket
grouped = data_test.groupby('half_hour')

# Per-group counts and rates for the two conditions below
def calculate_counts(group):
    """Return a 5-element Series describing one group.

    In order: rows with max_return_60min >= 1.1 and Predictions == 1, rows with
    max_return_60min < 1.1 and Predictions == 0, both counts as percentages of
    the group size, and the sum of those percentages. Five zeros are returned
    for an empty group.
    """
    is_high = group['max_return_60min'] >= 1.1
    is_low = group['max_return_60min'] < 1.1
    pred_is_1 = group['Predictions'] == 1
    pred_is_0 = group['Predictions'] == 0

    high_and_1 = int((is_high & pred_is_1).sum())
    low_and_0 = int((is_low & pred_is_0).sum())

    size = len(group)
    if size == 0:
        return pd.Series([0, 0, 0, 0, 0])

    high_and_1_rate = (high_and_1 / size) * 100
    low_and_0_rate = (low_and_0 / size) * 100
    combined_rate = high_and_1_rate + low_and_0_rate

    return pd.Series([high_and_1, low_and_0, high_and_1_rate, low_and_0_rate, combined_rate])

# Summarise every half-hour group and name the resulting columns
summary_columns = ['Count_GE_1_Pred_0', 'Count_LT_1_Pred_1', 'Rate_GE_1_Pred_0', 'Rate_LT_1_Pred_1', 'Total_Rate']
results = grouped.apply(calculate_counts)
results.columns = summary_columns

# Percentage of rows in a group that satisfy the condition below
def calculate_ratio(group):
    """Return the share (in percent) of rows with max_return_60min >= 1.1 and Predictions == 1.

    An empty group gives 0.
    """
    condition_met = (group['max_return_60min'] >= 1.1) & (group['Predictions'] == 1)
    hits = int(condition_met.sum())
    size = len(group)

    if size == 0:
        return 0
    return (hits / size) * 100

# Ratio per half-hour group, as a two-column frame
ratio_results = grouped.apply(calculate_ratio).reset_index()
ratio_results.columns = ['half_hour', 'Condition_Met_Ratio']

# Merge the count/rate summary and the ratio into one frame (both are in group order)
combined_results = results.reset_index()
combined_results['Condition_Met_Ratio'] = ratio_results['Condition_Met_Ratio']

# Format each bucket as HH:MM in UTC and as the same time shifted by +9 hours (KST)
bucket_times_utc = pd.to_datetime(combined_results['half_hour'].astype(str), format='%H:%M:%S')
combined_results['half_hour_utc'] = bucket_times_utc.dt.strftime('%H:%M')
combined_results['half_hour_kst'] = (bucket_times_utc + pd.Timedelta(hours=9)).dt.strftime('%H:%M')
combined_results['half_hour_label'] = combined_results['half_hour_utc'] + ' (' + combined_results['half_hour_kst'] + ' KST)'

In [None]:
# Plot Total_Rate (left axis) and Condition_Met_Ratio (right axis) per half-hour bucket
fig, ax1 = plt.subplots(figsize=(12, 6))

total_line, = ax1.plot(combined_results['half_hour_label'], combined_results['Total_Rate'], marker='o', color='b', label='Total Rate')
ax1.set_xlabel('Half Hour of the Day (UTC)')
ax1.set_ylabel('Total Rate (%)', color='b')
ax1.tick_params(axis='y', labelcolor='b')
ax1.grid(True)
plt.xticks(rotation=45, ha='right')  # applied while ax1 is still the current axes

ax2 = ax1.twinx()
ratio_line, = ax2.plot(combined_results['half_hour_label'], combined_results['Condition_Met_Ratio'], marker='o', color='g', label='Condition Met Ratio')
ax2.set_ylabel('Condition Met Ratio (%)', color='g')
ax2.tick_params(axis='y', labelcolor='g')

# Both lines carry a label, so show them in one legend. It is attached to
# ax2, the axes drawn on top, so the lines do not cover it.
ax2.legend(handles=[total_line, ratio_line], loc='best')

# Set the title before tight_layout so the layout reserves room for it.
ax1.set_title('Comparison of Total Rate and Condition Met Ratio by Half Hour (UTC with KST)')
fig.tight_layout()
plt.show()