In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer

In [None]:
# Mount Google Drive at /content/drive (only works inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Main dataset: location of the indicator file on the mounted Google Drive.
DATA_PATH = "/content/drive/MyDrive/Data/SOL60_INDICATOR3"
data = pd.read_csv(DATA_PATH)

# Preview the first rows to confirm the file was read as expected.
data.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,open_time,open,high,low,close,volume,time,atr_5,...,price_ma_10,volume_ma_10,price_ma_20,volume_ma_20,price_ma_50,volume_ma_50,price_ma_100,volume_ma_100,price_ma_200,volume_ma_200
0,0,0,2020-12-31 15:00:00,1.4458,1.4458,1.4444,1.4456,161.34,15:00,0.0,...,,,,,,,,,,
1,1,1,2020-12-31 15:01:00,1.4477,1.4478,1.4463,1.4463,148.86,15:01,0.0,...,,,,,,,,,,
2,2,2,2020-12-31 15:02:00,1.4479,1.453,1.4466,1.4484,3208.13,15:02,0.0,...,,,,,,,,,,
3,3,3,2020-12-31 15:03:00,1.4503,1.4559,1.4503,1.4558,1639.76,15:03,0.0,...,,,,,,,,,,
4,4,4,2020-12-31 15:04:00,1.4558,1.4569,1.4545,1.4569,900.91,15:04,0.00404,...,,,,,,,,,,


In [None]:
# Time-feature conversion helper
def convert_time_features(data):
    """Derive a minute-of-day ``time`` column and drop bookkeeping columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with an ``open_time`` column (already datetime, or parseable by
        ``pd.to_datetime``) and the columns ``Unnamed: 0`` and ``Unnamed: 0.1``.

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``time`` holds ``hour * 60 + minute`` of
        ``open_time`` and the columns ``open_time``, ``Unnamed: 0`` and
        ``Unnamed: 0.1`` have been removed. The input frame is not modified.

    Raises
    ------
    KeyError
        If ``open_time``, ``Unnamed: 0`` or ``Unnamed: 0.1`` is missing.
    """
    open_time = data['open_time']

    # Parse only when needed. The pandas helper also accepts tz-aware and other
    # extension dtypes, on which np.issubdtype raises TypeError.
    if not pd.api.types.is_datetime64_any_dtype(open_time):
        open_time = pd.to_datetime(open_time)

    # drop() returns a new frame, so the caller's frame is never written to.
    converted = data.drop(columns=['open_time', 'Unnamed: 0', 'Unnamed: 0.1'])

    # Minutes elapsed since 00:00 of the bar's own day.
    converted['time'] = open_time.dt.hour * 60 + open_time.dt.minute

    return converted
# Convert the time columns. The guard keeps this cell re-runnable: after the
# first run 'open_time' has been dropped, so calling the helper again would
# raise KeyError.
if 'open_time' in data.columns:
    data = convert_time_features(data)
data.head()

Unnamed: 0,open,high,low,close,volume,time,atr_5,atr_10,atr_14,atr_20,...,price_ma_10,volume_ma_10,price_ma_20,volume_ma_20,price_ma_50,volume_ma_50,price_ma_100,volume_ma_100,price_ma_200,volume_ma_200
0,1.4458,1.4458,1.4444,1.4456,161.34,900,0.0,0.0,0.0,0.0,...,,,,,,,,,,
1,1.4477,1.4478,1.4463,1.4463,148.86,901,0.0,0.0,0.0,0.0,...,,,,,,,,,,
2,1.4479,1.453,1.4466,1.4484,3208.13,902,0.0,0.0,0.0,0.0,...,,,,,,,,,,
3,1.4503,1.4559,1.4503,1.4558,1639.76,903,0.0,0.0,0.0,0.0,...,,,,,,,,,,
4,1.4558,1.4569,1.4545,1.4569,900.91,904,0.00404,0.0,0.0,0.0,...,,,,,,,,,,


In [None]:
# Replace +/- infinity with NaN so they are treated as missing values.
data_finite = data.replace([np.inf, -np.inf], np.nan)

# SimpleImputer(strategy='mean') discards columns that contain no observed
# value, which would make the output narrower than the column list. Impute only
# columns with at least one observed value and restore the others afterwards.
imputable_cols = data_finite.columns[data_finite.notna().any()]

# Fill remaining NaN values with the per-column mean.
# NOTE(review): this imputes every column, including 'time' and any target
# column present in the frame — confirm that is intended.
imputer = SimpleImputer(strategy='mean')
data_imputed = imputer.fit_transform(data_finite[imputable_cols])

# Back to a DataFrame: keep the original index, and reindex so that all-NaN
# columns reappear (still NaN) in their original position.
data = pd.DataFrame(
    data_imputed, columns=imputable_cols, index=data_finite.index
).reindex(columns=data_finite.columns)

data.head()

In [None]:
# Per minute-of-day: number of rows, and number of rows whose
# max_return_60min reaches at least 1.1.
returns_by_time = data.groupby('time')['max_return_60min']
results = returns_by_time.agg(
    total_rows='size',
    count_above_1_1=lambda x: (x >= 1.1).sum(),
)

# Share of rows (in percent) at or above 1.1.
results['percentage_above_1_1'] = (
    results['count_above_1_1'] / results['total_rows']
) * 100

# Turn the 'time' index back into a regular column.
results = results.reset_index()

# Line plot of the percentage against minute-of-day.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(results['time'], results['percentage_above_1_1'], marker='o', linestyle='-')
ax.set_title('Percentage of max_return_60min >= 1.1 over Time')
ax.set_xlabel('Time')
ax.set_ylabel('Percentage of max_return_60min >= 1.1')
plt.xticks(rotation=45)
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Helper: render a minute count as an hour:minute string
def convert_minutes_to_time(minutes):
    """Format a number of minutes as a zero-padded ``HH:MM`` string.

    ``minutes`` is split into whole hours and leftover minutes; both parts are
    truncated with ``int`` before formatting, so float input is accepted.
    """
    hours, mins = divmod(minutes, 60)
    return f"{int(hours):02}:{int(mins):02}"

# Per minute-of-day: number of rows, and number of rows whose
# max_return_60min reaches at least 1.1.
returns_by_time = data.groupby('time')['max_return_60min']
results = returns_by_time.agg(
    total_rows='size',
    count_above_1_1=lambda x: (x >= 1.1).sum(),
)

# Share of rows (in percent) at or above 1.1.
results['percentage_above_1_1'] = (
    results['count_above_1_1'] / results['total_rows']
) * 100

# Restore 'time' as a column and replace the minute count by its HH:MM string.
results = results.reset_index()
results['time'] = results['time'].map(convert_minutes_to_time)

In [None]:
# Per minute-of-day: number of rows, and number of rows whose
# max_return_60min reaches at least 1.1.
results = data.groupby('time').agg(
    total_rows=('max_return_60min', 'size'),
    count_above_1_1=('max_return_60min', lambda x: (x >= 1.1).sum())
)

# Share of rows (in percent) at or above 1.1.
results['percentage_above_1_1'] = (results['count_above_1_1'] / results['total_rows']) * 100

# Turn the 'time' index back into a regular column.
results = results.reset_index()

# Number of minutes in a day; used to move early-morning times past midnight.
MINUTES_PER_DAY = 1440

# Sliding-window size setting.
window_size = 60

# Rolling mean of the percentage over `window_size // 10` consecutive rows.
# NOTE(review): window_size is 60, but only window_size // 10 = 6 rows are
# averaged. If `results` has one row per minute, that is a 6-minute window, not
# a 60-minute one — confirm intent.
results['rolling_mean'] = results['percentage_above_1_1'].rolling(window=window_size // 10, min_periods=1).mean()

# Threshold (in percent) above which a time counts as "high probability".
threshold = 33

# Keep rows at or above the threshold. The .copy() gives an independent frame,
# so adding the 'group' column below does not write into a slice of `results`
# (which would trigger SettingWithCopyWarning).
high_prob_zones = results[results['rolling_mean'] >= threshold].copy()

# A new zone starts whenever consecutive kept times are more than 10 apart.
high_prob_zones['group'] = (high_prob_zones['time'].diff() > 10).cumsum()

# First and last time of every zone.
grouped_zones = high_prob_zones.groupby('group').agg(
    start_time=('time', 'first'),
    end_time=('time', 'last')
).reset_index()

# Shift the time axis: times before `shift_point` are moved one day forward so
# the plot runs from shift_point to shift_point + 1440.
shift_point = 200
before_shift = results[results['time'] < shift_point].copy()
after_shift = results[results['time'] >= shift_point].copy()

before_shift['shifted_time'] = before_shift['time'] + MINUTES_PER_DAY
after_shift['shifted_time'] = after_shift['time']

results_shifted = pd.concat([after_shift, before_shift]).reset_index(drop=True)


def _to_shifted_axis(minute):
    """Map a minute-of-day onto the shifted x axis used by the plot."""
    return minute + MINUTES_PER_DAY if minute < shift_point else minute


# Plot
plt.figure(figsize=(12, 6))
plt.plot(results_shifted['shifted_time'], results_shifted['percentage_above_1_1'], marker='o', linestyle='-', label='Percentage over Time')
plt.plot(results_shifted['shifted_time'], results_shifted['rolling_mean'], linestyle='-', color='orange', label='Rolling Mean')

# Horizontal threshold line.
plt.axhline(y=threshold, color='purple', linestyle='-', label=f'Threshold {threshold}%')

# Shade the area where the raw percentage is at or above the threshold.
plt.fill_between(results_shifted['shifted_time'], threshold, results_shifted['percentage_above_1_1'], where=(results_shifted['percentage_above_1_1'] >= threshold), interpolate=True, color='purple', alpha=0.3)

# Mark the start and end of every high-probability zone. Only the first zone
# gets legend labels so the legend shows each entry once.
for zone_idx, zone in enumerate(grouped_zones.itertuples(index=False)):
    is_first_zone = zone_idx == 0
    plt.axvline(x=_to_shifted_axis(zone.start_time), color='red', linestyle='--', label='Start Time' if is_first_zone else "")
    plt.axvline(x=_to_shifted_axis(zone.end_time), color='green', linestyle='--', label='End Time' if is_first_zone else "")

# Vertical line at midnight. On the shifted axis, time 0 sits at x = 1440
# (not at shift_point, which is just the left edge of the plot).
plt.axvline(x=MINUTES_PER_DAY, color='blue', linestyle=':', label='Midnight Transition')

plt.title('Percentage of max_return_60min >= 1.1 over Time')
plt.xlabel('Time (minutes)')
plt.ylabel('Percentage of max_return_60min >= 1.1')
plt.xlim(shift_point, MINUTES_PER_DAY + shift_point)

# Ticks every 100 minutes plus one at midnight (x = 1440, labelled '0').
# Labels show the real minute-of-day. Ticks, labels and rotation are set in a
# single call, so the rotation applies to the final labels and no tick falls
# outside the x limits (an out-of-range tick would expand them).
tick_positions = np.union1d(
    np.arange(shift_point, MINUTES_PER_DAY + shift_point + 1, 100),
    [MINUTES_PER_DAY],
)
tick_labels = [str(int(x) % MINUTES_PER_DAY) for x in tick_positions]
plt.xticks(tick_positions, labels=tick_labels, rotation=45)

plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()

# Helper: render a minute count as an hour:minute string.
# Note: an identical helper is already defined in an earlier cell of this notebook.
def convert_minutes_to_time(minutes):
    """Format a number of minutes as a zero-padded ``HH:MM`` string.

    ``minutes`` is split into whole hours and leftover minutes; both parts are
    truncated with ``int`` before formatting, so float input is accepted.
    """
    hours, mins = divmod(minutes, 60)
    return f"{int(hours):02}:{int(mins):02}"

# Helper: convert a UTC minute-of-day to Korean time
def convert_to_kst(utc_minutes):
    """Return the ``HH:MM`` string for ``utc_minutes`` shifted by +540 minutes.

    Korean time is UTC+9 (540 minutes); the result wraps around at 1440
    minutes before being formatted by ``convert_minutes_to_time``.
    """
    return convert_minutes_to_time((utc_minutes + 540) % 1440)

# Print the start and end of every high-probability zone, in UTC and in KST.
print(f"High probability zones above {threshold}% (start_time, end_time):")
for zone_start, zone_end in zip(grouped_zones['start_time'], grouped_zones['end_time']):
    utc_start = convert_minutes_to_time(zone_start)
    utc_end = convert_minutes_to_time(zone_end)
    kst_start = convert_to_kst(zone_start)
    kst_end = convert_to_kst(zone_end)
    print(f"Start: {utc_start} (KST: {kst_start}), End: {utc_end} (KST: {kst_end})")