In [1]:
import pyupbit
import numpy as np
import pandas as pd
import time
import datetime
import tqdm
import argparse

class MakeDataset:
    """Download OHLCV candles for one ticker and label them with buy/sell signals.

    The pipeline run by ``get_dataset`` is:
    ``get_ohlcv_continue`` -> ``preprocess`` -> ``remove_noise`` -> ``add_variables``.

    Parameters
    ----------
    ticker : str
        Market code passed to ``pyupbit.get_ohlcv`` (e.g. ``'KRW-BTC'``).
    interval : str
        One of the keys of ``_CANDLES_PER_HOUR``.
    window : float
        Passed as the first argument (``com``, centre of mass) of ``Series.ewm``
        when smoothing the close price. Note: this is *not* a span.
    frm : str or datetime
        Start of the download range.
    to : str or datetime, optional
        End of the download range; defaults to "now" at download time.
    baseline : float
        Minimum relative profit a buy/sell pair must reach to be kept.
    """

    # Candles needed to cover one hour, per supported interval. Data is
    # requested in one-hour steps, so this is the `count` of each request.
    _CANDLES_PER_HOUR = {
        "minute1": 60,
        "minute3": 20,
        "minute5": 12,
        "minute10": 6,
        "minute15": 4,
        "minute30": 2,
        "minute60": 1,
    }

    def __init__(self, ticker, interval, window, frm, to = None, baseline = 0.003):
        self.ticker = ticker
        self.interval = interval
        self.window = window
        self.frm = frm
        self.to = to
        self.baseline = baseline

        # Filled in by get_dataset().
        self.dataset = None

    def get_dataset(self):
        """Run the full pipeline, store the result in ``self.dataset`` and return it."""
        self.dataset = self.get_ohlcv_continue(self.ticker, self.interval, self.frm, self.to)

        print('preprocessing..')
        self.dataset = self.preprocess(self.dataset, self.window)

        print('remove noise..')
        self.dataset = self.remove_noise(self.dataset, self.baseline)

        print('add variables..')
        # NOTE(review): `add_variables` is resolved as a module-level name; it is
        # not a method of this class and is not defined in the visible part of
        # the notebook. Confirm it exists before calling get_dataset().
        self.dataset = add_variables(self.dataset)

        print('done!')

        return self.dataset

    def get_ohlcv_continue(self, ticker, interval, frm, to = None):
        """Fetch candles between ``frm`` and ``to`` in one-hour requests.

        Parameters
        ----------
        ticker : str
            Market code, e.g. ``'KRW-BTC'``.
        interval : str
            minute1, minute3, minute5, minute10, minute15, minute30 or minute60.
        frm : str or datetime
            Start of the range (strings are parsed with ``pd.to_datetime``).
        to : str or datetime, optional
            End of the range; defaults to the current time (microseconds dropped).

        Returns
        -------
        pandas.DataFrame
            All fetched candles, de-duplicated on the index (first occurrence
            kept), sorted by index, with the index named ``'index'``.

        Raises
        ------
        ValueError
            If ``interval`` is not supported, or if no request returned data
            (for example when the range is shorter than one hour).
        """
        # Validate first: the original if/elif chain left `count` unbound for
        # any other interval and failed later with an UnboundLocalError.
        if interval not in MakeDataset._CANDLES_PER_HOUR:
            raise ValueError(
                "unsupported interval {!r}; expected one of {}".format(
                    interval, sorted(MakeDataset._CANDLES_PER_HOUR)
                )
            )
        count = MakeDataset._CANDLES_PER_HOUR[interval]

        if isinstance(frm, str):
            frm = pd.to_datetime(frm).to_pydatetime()

        if to is None:
            to = datetime.datetime.now().replace(microsecond=0)
        elif isinstance(to, str):
            to = pd.to_datetime(to).to_pydatetime()

        # A Timedelta step is used instead of the 'H' alias, which newer pandas
        # versions deprecate.
        hourly = pd.date_range(start = frm, end = to, freq = pd.Timedelta(hours = 1))
        date_list = list(hourly.to_pydatetime())

        dataframes = []
        skipped = []
        # Each request asks for the `count` candles ending at `date`, so the
        # first timestamp (the range start itself) is not used as an end point.
        for date in tqdm.tqdm(date_list[1:]):
            try:
                df = pyupbit.get_ohlcv(ticker, interval, count = count, to = date)
            except Exception as exc:
                # Best effort: keep going, but remember what was lost instead of
                # silently discarding it. KeyboardInterrupt is not caught here,
                # so a long download can still be interrupted.
                skipped.append((date, repr(exc)))
                continue

            if df is None or len(df) == 0:
                # presumably the API signals a failed request this way — verify
                skipped.append((date, 'no data returned'))
            else:
                dataframes.append(df)
            time.sleep(0.1)

        if skipped:
            print('skipped {} of {} requests (first: {} -> {})'.format(
                len(skipped), len(date_list) - 1, skipped[0][0], skipped[0][1]))

        if not dataframes:
            raise ValueError(
                'no OHLCV data fetched for {} {} between {} and {}'.format(ticker, interval, frm, to)
            )

        data = pd.concat(dataframes)
        # Remove duplicated index entries (adjacent requests can overlap);
        # this works whatever the fetched index is called.
        data = data[~data.index.duplicated(keep = 'first')]
        data = data.rename_axis('index').sort_index()

        return data

    def preprocess(self, data, window):
        """Derive trend-change signals from the smoothed close price.

        Adds three columns to a copy of ``data``:

        * ``close_ma5`` - exponentially weighted mean of ``close`` (``ewm(com=window)``)
        * ``change``    - 1.0 where ``close_ma5`` is above its previous value, else 0.0
        * ``signal``    - first difference of ``change``: 1 = buy, -1 = sell, 0 = hold

        Rows containing NaN are dropped. Then a sell that precedes the first buy
        and a buy that follows the last sell are removed (single rows), so the
        remaining signals form buy/sell pairs. The input frame is not modified.
        """
        data = data.copy()
        data['close_ma5'] = data['close'].ewm(com = window).mean()

        # Vectorised form of rolling(2).apply(lambda x: x[1] > x[0]):
        # NaN wherever either side of the comparison is missing.
        smoothed = data['close_ma5']
        previous = smoothed.shift(1)
        comparable = smoothed.notna() & previous.notna()
        data['change'] = (smoothed > previous).astype(float).where(comparable)

        data['signal'] = data['change'].diff()

        data = data.dropna()

        sell_indices = data.index[data['signal'] == -1]
        buy_indices = data.index[data['signal'] == 1]

        # Drop a leading sell (nothing was bought before it).
        if len(sell_indices) and (len(buy_indices) == 0 or sell_indices[0] < buy_indices[0]):
            data = data.drop(sell_indices[0])
            sell_indices = sell_indices[1:]
        # Drop a trailing buy (it is never closed by a sell).
        if len(buy_indices) and (len(sell_indices) == 0 or buy_indices[-1] > sell_indices[-1]):
            data = data.drop(buy_indices[-1])

        return data

    def remove_noise(self, data, baseline):
        """Zero out buy/sell pairs whose relative profit is below ``baseline``.

        The i-th buy (``signal == 1``) is paired with the i-th sell
        (``signal == -1``) in row order. For each pair the profit is
        ``(sell_close - buy_close) / buy_close``; when it is below ``baseline``
        both rows get ``signal = 0``. Works on a copy and returns it.
        """
        data = data.copy()

        signal = data['signal'].to_numpy()
        close = data['close'].to_numpy(dtype = float)

        buy_pos = np.flatnonzero(signal == 1)
        sell_pos = np.flatnonzero(signal == -1)

        # Same pairing as zip(): surplus entries on either side are ignored.
        n_pairs = min(len(buy_pos), len(sell_pos))
        buy_pos, sell_pos = buy_pos[:n_pairs], sell_pos[:n_pairs]

        profit_loss = (close[sell_pos] - close[buy_pos]) / close[buy_pos]
        noisy = profit_loss < baseline

        if noisy.any():
            rows = np.concatenate([buy_pos[noisy], sell_pos[noisy]])
            data.iloc[rows, data.columns.get_loc('signal')] = 0

        return data

In [2]:
# Download raw 1-minute candles for one ticker, starting at `frm`.
# No end time is given, so the download runs up to the moment this cell executes.
ticker = 'KRW-ETC'
interval = 'minute1'
frm = '2021-04-30 00:00:00'

mk = MakeDataset(ticker=ticker, interval=interval, window=3, frm=frm)

# Only the download step is run here; the values stored on `mk` are the same
# ones defined above.
data = mk.get_ohlcv_continue(mk.ticker, mk.interval, mk.frm)
print(data)

100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:01<00:00,  5.48it/s]

                        open     high      low    close       volume
index                                                               
2021-04-30 00:00:00  40820.0  40860.0  40820.0  40830.0  1964.172420
2021-04-30 00:01:00  40850.0  40850.0  40810.0  40820.0  3064.448297
2021-04-30 00:02:00  40810.0  40830.0  40810.0  40810.0  3290.192940
2021-04-30 00:03:00  40820.0  40820.0  40800.0  40800.0  4444.268969
2021-04-30 00:04:00  40800.0  40800.0  40770.0  40770.0  3936.190899
...                      ...      ...      ...      ...          ...
2021-04-30 05:55:00  40610.0  40790.0  40570.0  40670.0  4109.285165
2021-04-30 05:56:00  40650.0  40680.0  40650.0  40680.0   945.202356
2021-04-30 05:57:00  40680.0  40740.0  40650.0  40730.0  2485.543231
2021-04-30 05:58:00  40670.0  40730.0  40670.0  40680.0   947.240074
2021-04-30 05:59:00  40680.0  40700.0  40680.0  40680.0  1024.277679

[360 rows x 5 columns]





In [7]:
def preprocess(data, window):
    """Derive trend-change signals from the smoothed close price.

    Adds three columns to a copy of ``data`` (the input is left untouched):

    * ``close_ma`` - exponentially weighted mean of ``close``; ``window`` is
      passed as ``com`` (centre of mass) to ``Series.ewm``, not as a span
    * ``change``   - 1.0 where ``close_ma`` is above its previous value, else 0.0
    * ``signal``   - first difference of ``change``: 1 = buy, -1 = sell, 0 = hold

    Rows containing NaN are dropped. If the first signal is a sell, every row
    up to and including it is cut off; if the last signal is a buy, that row
    and everything after it is cut off.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``close`` column.
    window : float
        Centre of mass for the exponential smoothing.

    Returns
    -------
    pandas.DataFrame
        The labelled and trimmed frame. A frame with no buy or no sell signals
        is returned without raising.
    """
    data = data.copy()
    data['close_ma'] = data['close'].ewm(com = window).mean()

    # Vectorised form of rolling(2).apply(lambda x: x[1] > x[0]):
    # NaN wherever either side of the comparison is missing.
    smoothed = data['close_ma']
    previous = smoothed.shift(1)
    comparable = smoothed.notna() & previous.notna()
    data['change'] = (smoothed > previous).astype(float).where(comparable)

    data['signal'] = data['change'].diff()

    data = data.dropna()

    sell_indices = data.index[data['signal'] == -1]
    buy_indices = data.index[data['signal'] == 1]

    # If the series starts with a sell, discard everything up to and including it.
    if len(sell_indices) and (len(buy_indices) == 0 or sell_indices[0] < buy_indices[0]):
        data = data.iloc[data.index.get_loc(sell_indices[0]) + 1:]
    # If the series ends with a buy, discard that row and everything after it.
    if len(buy_indices) and (len(sell_indices) == 0 or buy_indices[-1] > sell_indices[-1]):
        data = data.iloc[:data.index.get_loc(buy_indices[-1])]

    return data

In [8]:
def cal_ichimoku(data, tenkan_period = 9, kijun_period = 26, senkou_period = 52, displacement = 26):
    """Add an ``ichimoku`` column: leading span 1 minus leading span 2.

    For a period ``p`` the midpoint line is
    ``(rolling max of high over p + rolling min of low over p) / 2``.

    * tenkan-sen    = midpoint over ``tenkan_period``
    * kijun-sen     = midpoint over ``kijun_period``
    * senkou span 1 = mean of tenkan-sen and kijun-sen, shifted by ``displacement`` rows
    * senkou span 2 = midpoint over ``senkou_period``, shifted by ``displacement`` rows

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``high`` and ``low`` columns. Not modified.
    tenkan_period, kijun_period, senkou_period : int
        Rolling window lengths (defaults 9, 26, 52).
    displacement : int
        Number of rows both spans are shifted forward (default 26).

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with the extra ``ichimoku`` column. The first
        ``senkou_period - 1 + displacement`` rows are NaN with the defaults'
        ordering (``senkou_period`` being the longest window).
    """
    data = data.copy()
    high = data['high']
    low = data['low']

    def midpoint(period):
        # Middle of the high/low range over the trailing `period` rows.
        return (high.rolling(period).max() + low.rolling(period).min()) / 2

    tenkan_sen = midpoint(tenkan_period)
    kijun_sen = midpoint(kijun_period)

    senkou_span1 = ((tenkan_sen + kijun_sen) / 2).shift(displacement)
    senkou_span2 = midpoint(senkou_period).shift(displacement)

    data['ichimoku'] = senkou_span1 - senkou_span2

    return data

In [10]:
# Preview the last 35 rows with the 'ichimoku' column added.
# cal_ichimoku works on a copy, so `data` itself is not changed by this cell.
cal_ichimoku(data).tail(35)

Unnamed: 0_level_0,open,high,low,close,volume,close_ma,change,signal,ichimoku
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2021-04-30 05:25:00,40800.0,40800.0,40770.0,40770.0,212.277205,40786.134868,0.0,-1.0,-10.0
2021-04-30 05:26:00,40770.0,40790.0,40740.0,40770.0,1171.405442,40782.101151,0.0,0.0,-2.5
2021-04-30 05:27:00,40790.0,40790.0,40730.0,40730.0,3141.677309,40769.075863,0.0,0.0,-2.5
2021-04-30 05:28:00,40730.0,40750.0,40730.0,40730.0,467.006271,40759.306897,0.0,0.0,-2.5
2021-04-30 05:29:00,40740.0,40750.0,40730.0,40730.0,624.170799,40751.980173,0.0,0.0,-2.5
2021-04-30 05:30:00,40730.0,40870.0,40710.0,40790.0,1058.287166,40761.48513,1.0,1.0,-2.5
2021-04-30 05:31:00,40750.0,40900.0,40730.0,40850.0,479.936703,40783.613847,1.0,0.0,-2.5
2021-04-30 05:32:00,40800.0,40920.0,40730.0,40850.0,1582.764008,40800.210385,1.0,0.0,-2.5
2021-04-30 05:33:00,40850.0,40900.0,40850.0,40860.0,623.558605,40815.157789,1.0,0.0,-2.5
2021-04-30 05:34:00,40880.0,40900.0,40860.0,40890.0,394.392517,40833.868342,1.0,0.0,-2.5


In [5]:
def cal_MACD(data, num_long = 12, num_short = 26, num_signal=9):
    """Add MACD-based feature columns to a copy of ``data``.

    The MACD line is ``EMA(close, span=num_long) - EMA(close, span=num_short)``.
    NOTE: the parameter names are inverted relative to the usual convention:
    with the defaults this is EMA12 - EMA26, i.e. ``num_long`` is the shorter
    span. The names are kept so existing keyword calls continue to work.

    The histogram is the MACD line minus its own EMA over ``num_signal``.
    Each EMA uses ``min_periods = span - 1``.

    Columns added
    -------------
    MACD_cross : int
        1 where the histogram is >= 0, otherwise -1 (NaN compares as -1).
    MACD_lastMACD_ratio : float
        Relative change of the histogram versus the previous row,
        ``(h[t] - h[t-1]) / h[t-1]``. The first row is 0.0; rows whose current
        or previous histogram is NaN are NaN, and a previous value of exactly 0
        yields inf/NaN.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``close`` column. Not modified.
    num_long, num_short, num_signal : int
        EMA spans as described above.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with the two extra columns.
    """
    data = data.copy()
    close = data['close']

    ema_long = close.ewm(span = num_long, min_periods = num_long - 1).mean()
    ema_short = close.ewm(span = num_short, min_periods = num_short - 1).mean()
    macd = ema_long - ema_short
    macd_signal = macd.ewm(span = num_signal, min_periods = num_signal - 1).mean()
    macd_diff = macd - macd_signal

    # MACD cross
    data['MACD_cross'] = np.where(macd_diff >= 0, 1, -1)

    # Ratio of the current histogram to the previous one. A positional shift is
    # used instead of `.loc[1:]`, which is a *label* slice and therefore fails
    # or misaligns on a DatetimeIndex or any index that is not 0..n-1.
    previous = macd_diff.shift(1)
    ratio = (macd_diff - previous) / previous
    if len(ratio):
        ratio.iloc[0] = 0.0  # no previous row to compare against
    data['MACD_lastMACD_ratio'] = ratio

    return data

In [39]:
# Preview the last 20 rows with the MACD feature columns added.
# cal_MACD works on a copy, so `data` itself is not changed by this cell.
cal_MACD(data).tail(20)

Unnamed: 0_level_0,open,high,low,close,volume,close_ma,change,signal,MACD_cross,MACD_lastMACD_ratio
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2021-04-25 11:38:00,36600.0,36640.0,36570.0,36610.0,1709.777027,36626.85782,0.0,0.0,-1,-0.083122
2021-04-25 11:39:00,36610.0,36620.0,36530.0,36560.0,3306.321364,36610.143365,0.0,0.0,-1,0.072291
2021-04-25 11:40:00,36560.0,36590.0,36560.0,36570.0,1889.665579,36600.107524,0.0,0.0,-1,-0.044574
2021-04-25 11:41:00,36560.0,36650.0,36560.0,36630.0,2321.134496,36607.580643,1.0,1.0,-1,-0.295642
2021-04-25 11:42:00,36610.0,36640.0,36610.0,36610.0,2588.992711,36608.185482,1.0,0.0,-1,-0.222491
2021-04-25 11:43:00,36630.0,36630.0,36610.0,36620.0,1727.772101,36611.139112,1.0,0.0,-1,-0.306914
2021-04-25 11:44:00,36610.0,36620.0,36600.0,36600.0,2559.709314,36608.354334,0.0,-1.0,-1,-0.162941
2021-04-25 11:45:00,36600.0,36660.0,36600.0,36630.0,4606.09549,36613.76575,1.0,1.0,-1,-0.530124
2021-04-25 11:46:00,36630.0,36640.0,36610.0,36620.0,1453.66572,36615.324313,1.0,0.0,-1,-0.577383
2021-04-25 11:47:00,36640.0,36640.0,36540.0,36540.0,4304.158057,36596.493235,0.0,-1.0,-1,3.418548
