In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import tensorflow as tf
from concurrent.futures import ProcessPoolExecutor, as_completed
from tqdm import tqdm
import warnings

In [None]:
# Load the dataset from Kaggle
file_path = '/content/sample_data/SOL60_Indicator.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataset to ensure it is loaded correctly
data.head()

Unnamed: 0,open_time,open,high,low,close,volume,RSI_10,RSI_20,RSI_60,MACD_Line,...,Envelope_MA_20,Envelope_Upper_20,Envelope_Lower_20,Envelope_MA_60,Envelope_Upper_60,Envelope_Lower_60,INDICATOR,high_ind,low_ind,diff
0,2023-07-10 15:00:00,21.17,21.19,21.13,21.14,11640.49,,,,0.0,...,,,,,,,,,,
1,2023-07-10 15:01:00,21.14,21.17,21.11,21.11,5304.66,0.0,0.0,0.0,-0.002393,...,,,,,,,,,,
2,2023-07-10 15:02:00,21.12,21.12,21.09,21.11,4088.15,0.0,0.0,0.0,-0.004241,...,,,,,,,,,,
3,2023-07-10 15:03:00,21.11,21.14,21.1,21.14,7810.1,50.0,50.0,50.0,-0.003247,...,,,,,,,,,,
4,2023-07-10 15:04:00,21.15,21.17,21.11,21.11,2510.23,33.333333,33.333333,33.333333,-0.004825,...,,,,,,,,,,


In [None]:
#1시간씩 간격을 나누어서 종가 기준으로 최대 상승 및 하락하는 %를 구하는

# 경고 무시 설정
warnings.simplefilter(action='ignore', category=FutureWarning)

# Helper function to calculate percentage change
def percentage_change(start, end):
    return ((end - start) / start) * 100

# Function to process each chunk of data
def process_chunk(chunk, data):
    results = []
    chunk = chunk.reset_index(drop=True)  # Reset index to ensure iloc works correctly
    for i in range(len(chunk)):
        start_time = pd.to_datetime(chunk.iloc[i]['open_time'])
        end_time = start_time + pd.Timedelta(minutes=60)

        # Filter data within the 60-minute range using numpy for better performance
        mask = (pd.to_datetime(data['open_time']) >= start_time) & (pd.to_datetime(data['open_time']) <= end_time)
        subset = data[mask]

        if len(subset) == 0:
            continue

        start_close = chunk.iloc[i]['close']

        # Find the highest and lowest points within the subset
        highest_row = subset.loc[subset['high'].idxmax()]
        lowest_row = subset.loc[subset['low'].idxmin()]

        highest_time = highest_row['open_time']
        lowest_time = lowest_row['open_time']

        highest_change = percentage_change(start_close, highest_row['high'])
        lowest_change = percentage_change(start_close, lowest_row['low'])

        results.append({
            'start_time': start_time,
            'end_time': end_time,
            'volume_at_start': chunk.iloc[i]['volume'],
            'highest_time': highest_time,
            'highest_change': highest_change,
            'highest_volume': highest_row['volume'],
            'lowest_time': lowest_time,
            'lowest_change': lowest_change,
            'lowest_volume': lowest_row['volume'],
            'start_indicators': chunk.iloc[i][['RSI_10', 'RSI_20', 'RSI_60', 'MACD_Line', 'Signal_Line', 'MACD_Histogram', 'CCI_5', 'CCI_10', 'CCI_20', 'RSI_Stoch_K_10', 'RSI_Stoch_D_10', 'RSI_Stoch_K_20', 'RSI_Stoch_D_20', 'RSI_Stoch_K_60', 'RSI_Stoch_D_60', 'MFI_10', 'MFI_20', 'MFI_60', 'Volume_Ratio_10', 'Volume_Ratio_20', 'Volume_Ratio_60', 'BB_Mean_10', 'BB_Upper_10', 'BB_Lower_10', 'BB_Mean_20', 'BB_Upper_20', 'BB_Lower_20', 'BB_Mean_60', 'BB_Upper_60', 'BB_Lower_60', 'Tenkan_Sen', 'Kijun_Sen', 'Senkou_Span_A', 'Senkou_Span_B', 'Chikou_Span', 'Envelope_MA_10', 'Envelope_Upper_10', 'Envelope_Lower_10', 'Envelope_MA_20', 'Envelope_Upper_20', 'Envelope_Lower_20', 'Envelope_MA_60', 'Envelope_Upper_60', 'Envelope_Lower_60', 'INDICATOR', 'high_ind', 'low_ind', 'diff']].values,
            'highest_indicators': highest_row[['RSI_10', 'RSI_20', 'RSI_60', 'MACD_Line', 'Signal_Line', 'MACD_Histogram', 'CCI_5', 'CCI_10', 'CCI_20', 'RSI_Stoch_K_10', 'RSI_Stoch_D_10', 'RSI_Stoch_K_20', 'RSI_Stoch_D_20', 'RSI_Stoch_K_60', 'RSI_Stoch_D_60', 'MFI_10', 'MFI_20', 'MFI_60', 'Volume_Ratio_10', 'Volume_Ratio_20', 'Volume_Ratio_60', 'BB_Mean_10', 'BB_Upper_10', 'BB_Lower_10', 'BB_Mean_20', 'BB_Upper_20', 'BB_Lower_20', 'BB_Mean_60', 'BB_Upper_60', 'BB_Lower_60', 'Tenkan_Sen', 'Kijun_Sen', 'Senkou_Span_A', 'Senkou_Span_B', 'Chikou_Span', 'Envelope_MA_10', 'Envelope_Upper_10', 'Envelope_Lower_10', 'Envelope_MA_20', 'Envelope_Upper_20', 'Envelope_Lower_20', 'Envelope_MA_60', 'Envelope_Upper_60', 'Envelope_Lower_60', 'INDICATOR', 'high_ind', 'low_ind', 'diff']].values,
            'lowest_indicators': lowest_row[['RSI_10', 'RSI_20', 'RSI_60', 'MACD_Line', 'Signal_Line', 'MACD_Histogram', 'CCI_5', 'CCI_10', 'CCI_20', 'RSI_Stoch_K_10', 'RSI_Stoch_D_10', 'RSI_Stoch_K_20', 'RSI_Stoch_D_20', 'RSI_Stoch_K_60', 'RSI_Stoch_D_60', 'MFI_10', 'MFI_20', 'MFI_60', 'Volume_Ratio_10', 'Volume_Ratio_20', 'Volume_Ratio_60', 'BB_Mean_10', 'BB_Upper_10', 'BB_Lower_10', 'BB_Mean_20', 'BB_Upper_20', 'BB_Lower_20', 'BB_Mean_60', 'BB_Upper_60', 'BB_Lower_60', 'Tenkan_Sen', 'Kijun_Sen', 'Senkou_Span_A', 'Senkou_Span_B', 'Chikou_Span', 'Envelope_MA_10', 'Envelope_Upper_10', 'Envelope_Lower_10', 'Envelope_MA_20', 'Envelope_Upper_20', 'Envelope_Lower_20', 'Envelope_MA_60', 'Envelope_Upper_60', 'Envelope_Lower_60', 'INDICATOR', 'high_ind', 'low_ind', 'diff']].values,
        })
    return results

# 데이터를 24개의 청크로 나누기
num_chunks = 24
data_chunks = np.array_split(data, num_chunks)

# Use ProcessPoolExecutor with 24 workers
results = []
with ProcessPoolExecutor(max_workers=24) as executor:
    futures = {executor.submit(process_chunk, chunk, data): chunk for chunk in data_chunks}

    for future in tqdm(as_completed(futures), total=num_chunks):
        try:
            result = future.result()
            if result:
                results.extend(result)
        except Exception as e:
            print(f"Exception occurred: {e}")

# Convert the results into a DataFrame
result_df = pd.DataFrame(results)

  self.pid = os.fork()
100%|██████████| 24/24 [03:33<00:00,  8.91s/it]


In [None]:
result_df.head()

Unnamed: 0,start_time,end_time,volume_at_start,highest_time,highest_change,highest_volume,lowest_time,lowest_change,lowest_volume,start_indicators,highest_indicators,lowest_indicators
0,2023-07-12 18:05:00,2023-07-12 19:05:00,1155.33,2023-07-12 18:23:00,0.45045,1936.44,2023-07-12 18:07:00,-0.225225,1953.28,"[54.54545454545367, 53.12499999999982, 43.6170...","[81.24999999999972, 61.76470588235298, 55.4347...","[33.33333333333333, 51.61290322580668, 41.2371..."
1,2023-07-12 18:06:00,2023-07-12 19:06:00,794.24,2023-07-12 18:23:00,0.631769,1936.44,2023-07-12 18:07:00,-0.045126,1953.28,"[30.769230769230347, 45.71428571428566, 41.237...","[81.24999999999972, 61.76470588235298, 55.4347...","[33.33333333333333, 51.61290322580668, 41.2371..."
2,2023-07-12 18:07:00,2023-07-12 19:07:00,1953.28,2023-07-12 18:23:00,0.631769,1936.44,2023-07-12 18:07:00,-0.045126,1953.28,"[33.33333333333333, 51.61290322580668, 41.2371...","[81.24999999999972, 61.76470588235298, 55.4347...","[33.33333333333333, 51.61290322580668, 41.2371..."
3,2023-07-12 18:08:00,2023-07-12 19:08:00,839.15,2023-07-12 18:23:00,0.586378,1936.44,2023-07-12 18:08:00,-0.090212,839.15,"[41.66666666666691, 50.0, 42.26804123711364, -...","[81.24999999999972, 61.76470588235298, 55.4347...","[41.66666666666691, 50.0, 42.26804123711364, -..."
4,2023-07-12 18:09:00,2023-07-12 19:09:00,1708.8,2023-07-12 18:23:00,0.541028,1936.44,2023-07-12 18:10:00,-0.135257,1178.92,"[46.15384615384553, 53.33333333333326, 42.8571...","[81.24999999999972, 61.76470588235298, 55.4347...","[30.769230769230347, 51.61290322580672, 43.750..."


In [9]:
len(result_df)

10494

In [10]:
# 저장할 경로 설정
save_path1 = '/content/sample_data/SOL60_Indicator_rise_fall_percentages.csv'

# CSV 파일로 저장
result_df.to_csv(save_path1, index=False)

In [11]:
file_path2 = '/content/sample_data/SOL60_Indicator_rise_fall_percentages.csv'
data2 = pd.read_csv(file_path2)

In [13]:
# start_time을 시-분 형식으로 변환
data2['time'] = pd.to_datetime(data2['start_time']).dt.strftime('%H:%M')

# 그룹화 및 통계 계산
grouped = data2.groupby('time').agg({
    'highest_change': ['max', 'mean'],
    'lowest_change': ['min', 'mean']
})

# 1.2% 이상 상승할 확률 계산
def calc_probability(group):
    return (group['highest_change'] >= 1.2).mean() * 100

probabilities = data2.groupby('time').apply(calc_probability)

# 결과 병합
results = grouped.reset_index()
results.columns = ['time', 'max_highest_change', 'mean_highest_change', 'max_lowest_change', 'mean_lowest_change']
results['probability_above_1.2'] = probabilities.values
results.head()

Unnamed: 0,time,max_highest_change,mean_highest_change,max_lowest_change,mean_lowest_change,probability_above_1.2
0,00:00,23.624471,4.948448,-1.500577,-0.741233,57.142857
1,00:01,23.672055,4.91712,-1.462664,-0.772656,57.142857
2,00:02,23.529412,4.886793,-1.576317,-0.797629,57.142857
3,00:03,23.576923,4.832701,-1.538462,-0.851792,57.142857
4,00:04,23.958333,4.82112,-1.621037,-0.872041,57.142857


In [14]:
results = results.sort_values(by='probability_above_1.2', ascending=False)
results.head()

Unnamed: 0,time,max_highest_change,mean_highest_change,max_lowest_change,mean_lowest_change,probability_above_1.2
1423,23:43,5.905056,2.348141,-1.196449,-0.542188,85.714286
1424,23:44,5.741811,2.271935,-1.348748,-0.614915,85.714286
1425,23:45,5.905056,2.355389,-1.196449,-0.522173,85.714286
1439,23:59,23.529412,4.913316,-1.576317,-0.771817,71.428571
1426,23:46,10.810811,3.066347,-1.158301,-0.507067,71.428571


In [18]:
# 저장할 경로 설정
save_path2 = '/content/sample_data/SOL60_1.2_probability.csv'

# CSV 파일로 저장
results.to_csv(save_path2, index=False)

In [16]:
# probability_above_1.2가 70 이상인 항목 필터링
filtered_results = results[results['probability_above_1.2'] >= 70]
filtered_results = filtered_results.sort_values(by='time', ascending=False)
filtered_results

Unnamed: 0,time,max_highest_change,mean_highest_change,max_lowest_change,mean_lowest_change,probability_above_1.2
1439,23:59,23.529412,4.913316,-1.576317,-0.771817,71.428571
1438,23:58,23.339731,4.869717,-1.727447,-0.809555,71.428571
1437,23:57,23.0563,4.803835,-1.953275,-0.866569,71.428571
1436,23:56,23.624471,4.779179,-1.500577,-0.7694,71.428571
1435,23:55,23.576923,4.817196,-1.538462,-0.730941,71.428571
1434,23:54,23.672055,4.837445,-1.462664,-0.729164,71.428571
1433,23:53,23.576923,4.848131,-1.538462,-0.716491,71.428571
1432,23:52,23.672055,4.847511,-1.462664,-0.719551,71.428571
1431,23:51,23.576923,4.865723,-1.538462,-0.673902,71.428571
1430,23:50,23.815029,4.922474,-1.348748,-0.624784,71.428571


In [17]:
len(filtered_results)

26