In [2]:
!pip install finance-datareader



In [26]:
import FinanceDataReader as fdr
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from datetime import datetime, timedelta

# Notebook-wide settings passed to convert_image():
#   num_images - upper bound of the window-offset scan per ticker
#   day_num    - number of trading days sliced for each chart window
num_images = 5_000
day_num = 32

In [27]:
def _alpha_blend(color1, color2, alpha):
    """Blend ``color1`` over ``color2`` with weight ``alpha``; the result is opaque.

    Each argument may be a single RGB(A) color or an array of colors whose last
    axis holds the channels; the two are broadcast against each other. Channel
    values are truncated toward zero and the alpha channel is forced to 255.
    """
    rgb1 = np.asarray(color1, dtype=float)[..., :3]
    rgb2 = np.asarray(color2, dtype=float)[..., :3]
    rgb = rgb1 * alpha + rgb2 * (1 - alpha)
    blended = np.empty(rgb.shape[:-1] + (4,), dtype=np.uint8)
    blended[..., :3] = rgb.astype(np.uint8)
    blended[..., 3] = 255
    return blended


def _render_chart_image(opens, highs, lows, closes, volumes, ma20, width, height):
    """Draw one candlestick + volume + MA20 chart as a (height, width, 4) uint8 RGBA array.

    All inputs are 1-D float arrays of equal length (one entry per day).
    Returns None when the window cannot be normalized (non-finite values,
    zero price range, or no traded volume).
    """
    day_count = len(closes)
    bar_width = width // day_count  # 3 px per day for 32 days on a 96 px canvas

    price_floor = np.min(lows)
    price_range = np.max(highs) - price_floor
    peak_volume = np.max(volumes)
    window_values = np.concatenate([opens, highs, lows, closes, volumes])
    if not np.all(np.isfinite(window_values)) or price_range <= 0 or peak_volume <= 0:
        return None

    def price_to_y(price):
        # Map a price onto the vertical pixel axis (0 = window low, height-1 = window high).
        # Clipped so that inconsistent rows (e.g. Open outside [Low, High]) stay on canvas.
        y = int((price - price_floor) / price_range * (height - 1))
        return min(max(y, 0), height - 1)

    img = np.zeros((height, width, 4), dtype=np.uint8)  # RGBA, transparent background

    for day in range(day_count):
        x0 = day * bar_width
        x1 = x0 + bar_width

        open_y, close_y = price_to_y(opens[day]), price_to_y(closes[day])
        low_y, high_y = price_to_y(lows[day]), price_to_y(highs[day])

        if closes[day] > opens[day]:
            body_color = [255, 0, 0, 255]   # up day: red
            tail_color = [200, 0, 0, 255]   # up-day wick: darker red
        else:
            body_color = [0, 0, 255, 255]   # down/flat day: blue
            tail_color = [0, 0, 200, 255]   # down-day wick: darker blue

        # Candle body spans open..close over the full bar width (row 0 is the top).
        body_bottom, body_top = sorted((open_y, close_y))
        img[height - 1 - body_top:height - body_bottom, x0:x1] = body_color

        # Wick spans low..high on the bar's middle column, without overwriting the body.
        wick = img[height - 1 - high_y:height - low_y, x0 + bar_width // 2]
        not_body = (wick[:, 0] != 255) & (wick[:, 2] != 255)
        wick[not_body] = tail_color

        # Volume bar: at most the lower half of the canvas, gray-blended over what is there.
        volume_px = int(height * volumes[day] / (2 * peak_volume))
        volume_top = height - volume_px
        if 0 <= volume_top < height:
            img[volume_top:, x0:x1] = _alpha_blend(
                img[volume_top:, x0:x1], [128, 128, 128, 255], 0.45
            )

        # 20-day moving average: a 3-px-thick yellow band blended over the chart.
        ma_norm = (ma20[day] - price_floor) / price_range
        if np.isfinite(ma_norm):
            ma_y = int(ma_norm * (height - 1))
            if 0 <= ma_y < height:
                band_low = max(ma_y - 1, 0)
                band_high = min(ma_y + 1, height - 1)
                rows = slice(height - 1 - band_high, height - band_low)
                img[rows, x0:x1] = _alpha_blend([255, 255, 0, 255], img[rows, x0:x1], 0.6)

    return img


def convert_image(data, num_images, day_num, change_std, save = False):
    """Turn a price history into labeled 96x96 RGBA chart images.

    The history is scanned backwards from its end. For every offset
    ``image_idx`` the mean close of the 5 rows ending just before the last
    ``image_idx`` rows is compared with the close of the row right before them.
    If the relative change lies in ``[change_std, 2 * change_std]`` in absolute
    value, the ``day_num`` rows preceding those 5 rows are rendered as a
    candlestick chart with volume bars and a 20-day moving average.

    Parameters
    ----------
    data : DataFrame with 'Open', 'High', 'Low', 'Close', 'Volume' columns,
        ordered oldest to newest.
    num_images : int
        Largest offset to scan; lowered to ``len(data) - 100`` when the history
        is shorter, so the oldest 100 rows are never used as prediction targets.
    day_num : int
        Number of days drawn per image; must be between 1 and 96 (image width).
    change_std : float
        Minimum absolute relative change for a window to be labeled; windows
        changing by more than twice this value are treated as outliers.
    save : bool
        When True, also write each image to
        ``samsung_stock_data_image_{image_idx}.png`` in the working directory
        (the name depends only on the offset, so successive calls overwrite).

    Returns
    -------
    list of (numpy.ndarray, int)
        ``(image, label)`` pairs; image has shape (96, 96, 4) and dtype uint8,
        label is 1 for a rise and 0 for a fall. Empty when ``len(data) <= 100``
        or no window qualifies.

    Raises
    ------
    ValueError
        If ``day_num`` does not fit on the 96-pixel-wide canvas.
    """
    min_history = 100
    prediction_day_num = 5
    ma_window = 20
    width, height = 96, 96

    # Too little data: nothing to extract.
    if len(data) <= min_history:
        return []

    if not 1 <= day_num <= width:
        raise ValueError(f"day_num must be between 1 and {width}, got {day_num}")

    # Never scan further back than len(data) - 100 rows (prices right after
    # listing can be very volatile, so the earliest rows are left out).
    num_images = min(num_images, len(data) - min_history)

    total_rows = len(data)
    close_series = data['Close']
    opens = data['Open'].to_numpy(dtype=float)
    highs = data['High'].to_numpy(dtype=float)
    lows = data['Low'].to_numpy(dtype=float)
    closes = close_series.to_numpy(dtype=float)
    volumes = data['Volume'].to_numpy(dtype=float)

    # Collected (image, label) pairs; label is 1 for a rise, 0 for a fall.
    return_datas = []

    image_idx = 1
    while image_idx <= num_images:
        # Positional layout of one sample (all ranges are half-open):
        #   [ma_start, chart_start)  extra history needed to warm up the MA20
        #   [chart_start, chart_end) days drawn in the image
        #   [chart_end, chart_end+5) future days used only for the label
        chart_end = total_rows - image_idx - prediction_day_num
        chart_start = chart_end - day_num
        ma_start = chart_start - (ma_window - 1)
        if ma_start < 0:
            break  # not enough history left for a full chart + MA warm-up

        now_close = close_series.iloc[chart_end - 1]
        predict_mean = close_series.iloc[chart_end:chart_end + prediction_day_num].mean()
        change = (predict_mean - now_close) / now_close

        # Keep only moves within [change_std, 2 * change_std]; smaller moves are
        # noise, larger ones are treated as outliers. NaN/inf changes fail the
        # comparison and are skipped as well.
        if not (change_std <= abs(change) <= change_std * 2):
            image_idx += 1
            continue

        # The MA20 is computed from data up to chart_end only, so the image
        # never contains information from the prediction window.
        ma20 = (
            close_series.iloc[ma_start:chart_end]
            .rolling(window=ma_window)
            .mean()
            .to_numpy()[ma_window - 1:]
        )

        img = _render_chart_image(
            opens[chart_start:chart_end],
            highs[chart_start:chart_end],
            lows[chart_start:chart_end],
            closes[chart_start:chart_end],
            volumes[chart_start:chart_end],
            ma20,
            width,
            height,
        )
        if img is None:
            image_idx += 1
            continue

        if save:
            # Save a copy as PNG; `img` itself stays a NumPy array so the
            # returned element type does not depend on `save`.
            Image.fromarray(img, 'RGBA').save(f'samsung_stock_data_image_{image_idx}.png')

        return_datas.append((img, 1 if change >= change_std else 0))

        # Jump by the prediction horizon so consecutive samples do not share labels.
        image_idx += prediction_day_num

    return return_datas

In [28]:
# Ticker codes whose price history is downloaded and converted to images.
# Kept as strings so that leading zeros are preserved.
top_50_tickers = """
    005930 051910 000660 207940 005935 051915 006400 005380 035420 363280
    247540 000270 086520 035720 068270 012330 105560 028260 055550 096770
    066570 047050 032830 323410 003550 015760 000810 033780 091990 009150
    086790 034730 336370 352820 018260 122630 017670 010130 024880 003490
    010950 086280 259960 024110 030200 316140 010140 090430 028050 233740
""".split()

In [29]:
def preprocess_data(data):
    """Return ``data`` without zero-volume rows and without rows containing NaN.

    The number of zero-volume rows and the rows themselves are printed first.
    The input frame is not modified; a filtered frame is returned.
    """
    zero_volume = data['Volume'] == 0

    # Report what is about to be discarded.
    print('거래량이 0인 데이터수 :', zero_volume.sum())
    print(data[zero_volume])

    # Drop zero-volume rows, then any row that still has a missing value.
    return data[~zero_volume].dropna(how='any')

In [30]:
# Build the dataset: download each ticker, clean it, and collect its
# (image, label) pairs into one flat list.
change_std = 0.05  # minimum absolute 5-day change for a window to be labeled

result = []
for i, ticker in enumerate(top_50_tickers):
    print(f"{i+1}번째 종목 추출중...   ")

    data = preprocess_data(fdr.DataReader(ticker))  # fetch this ticker's price history
    images = convert_image(data, num_images, day_num, change_std)
    # convert_image returns an empty list for short histories or when no window
    # qualifies, so only inspect the first sample when there is one.
    if images:
        print(type(images[0][0]))
    result += images
    print(f"{len(images)}개 추출완료")

1번째 종목 추출중...   
거래량이 0인 데이터수 : 3
            Open  High  Low  Close  Volume  Change
Date                                              
2018-04-30     0     0    0  53000       0     0.0
2018-05-02     0     0    0  53000       0     0.0
2018-05-03     0     0    0  53000       0     0.0
<class 'numpy.ndarray'>
168개 추출완료
2번째 종목 추출중...   
거래량이 0인 데이터수 : 15
            Open  High  Low   Close  Volume  Change
Date                                               
2009-03-30     0     0    0  128000       0     0.0
2009-03-31     0     0    0  128000       0     0.0
2009-04-01     0     0    0  128000       0     0.0
2009-04-02     0     0    0  128000       0     0.0
2009-04-03     0     0    0  128000       0     0.0
2009-04-06     0     0    0  128000       0     0.0
2009-04-07     0     0    0  128000       0     0.0
2009-04-08     0     0    0  128000       0     0.0
2009-04-09     0     0    0  128000       0     0.0
2009-04-10     0     0    0  128000       0     0.0
2009-04-13     0  

In [31]:
# Total number of (image, label) pairs collected across all tickers.
print(len(result))

10152


In [32]:
import numpy as np
from sklearn.model_selection import train_test_split

# Split the (image, label) pairs into two parallel sequences.
images, labels = zip(*result)

# Stack the images into one array and turn the labels into a 1-D array.
images = np.array([np.array(image) for image in images])
labels = np.array(labels)

# Hold out 20% of the samples for testing; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    images,
    labels,
    test_size=0.2,
    random_state=42,
)

# Persist each split as a .npy file in the working directory.
for npy_path, split_array in (
    ('X_train.npy', X_train),
    ('X_test.npy', X_test),
    ('y_train.npy', y_train),
    ('y_test.npy', y_test),
):
    np.save(npy_path, split_array)

In [None]:
# Reload the saved train/test splits from their .npy files.
X_train, X_test, y_train, y_test = (
    np.load(npy_path)
    for npy_path in ('X_train.npy', 'X_test.npy', 'y_train.npy', 'y_test.npy')
)