In [1]:
import numpy as np
import pandas as pd
from src.data_loader import *
# from src.strategy import *
from src.index import *
from src.strategy2 import *
from tqdm import tqdm_notebook
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score,  classification_report
from fredapi import Fred
from keras.callbacks import EarlyStopping, ModelCheckpoint 
from keras.layers import Conv1D, LSTM, Dense, Dropout, Bidirectional, TimeDistributed
from keras.layers import MaxPooling1D, Flatten

from keras.models import Sequential # deep learning model
from keras.layers import LSTM, Dropout, Dense, Activation
from keras import optimizers # 옵티마이저 

import datetime
import tensorflow as tf
import pandas_datareader as pdr
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings(action='ignore')

In [2]:
data = DataGenerator(data_type='db').data_search(ticker='AAPL')
data.sort_index(ascending=True, inplace=True)
data3 = data.copy()
data3.sort_index(ascending=True, inplace=True)

기존 데이터 불러오기 성공


In [3]:
data

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1992-12-21,0.520089,0.535714,0.517857,0.532366,0.437059,256065600.0
1992-12-22,0.533482,0.546875,0.533482,0.541295,0.444389,280168000.0
1992-12-23,0.537946,0.540179,0.529018,0.533482,0.437975,112336000.0
1992-12-24,0.535714,0.535714,0.526786,0.526786,0.432478,45964800.0
1992-12-28,0.529018,0.533482,0.529018,0.531250,0.436143,70448000.0
...,...,...,...,...,...,...
2022-11-07,137.110001,139.149994,135.669998,138.919998,138.919998,83374600.0
2022-11-08,140.410004,141.429993,137.490005,139.500000,139.500000,89908500.0
2022-11-09,138.500000,138.550003,134.589996,134.869995,134.869995,74917800.0
2022-11-10,141.240005,146.869995,139.500000,146.869995,146.869995,118854000.0


In [4]:
def add_feature(df, **kwargs):
    
    fred = Fred(api_key='d929757b1ad9cd1d5115620a50badb0a')
    
    data = df.copy()

    data["Log_Close"]                              = np.log(data["Adj Close"])
    data["AC_pct"]                                 = data['Adj Close'].pct_change()
    data["V_pct"]                                  = data['Volume'].pct_change()
    data["sma(5)"]                                 = sma(data, 5)
    data["sma(20)"]                                = sma(data, 20)
    data["High_Yield"]                             = fred.get_series('BAMLH0A0HYM2', data.index[0]).fillna(method="ffill")
    data["volume_sma5"]                            = data["Volume"].rolling(5).mean()
    data["volume_sma20"]                           = data["Volume"].rolling(20).mean()
    data["T10Y2Y"]                                 = fred.get_series('T10Y2Y', data.index[0])
    data["VIX"]                                    = fred.get_series('VIXCLS', data.index[0])
    data["RSI"]                                    = rsi(data, **kwargs)
    data[["macd","macd_signal","macd_oscillator"]] = macd(data, **kwargs)
#-------------------------------------------------------------------------------------------------------------------------------------------
    
    OBV = []
    OBV.append(0)
    for i in range(1, len(data['Adj Close'])):
        if data['Adj Close'][i] > data['Adj Close'][i-1]: 
            OBV.append(OBV[-1] + data['Volume'][i]) 
        elif data['Adj Close'][i] < data['Adj Close'][i-1]:
            OBV.append( OBV[-1] - data['Volume'][i])
        else:
         OBV.append(OBV[-1])
    data['OBV'] = OBV
    data["OBV_mv20"] = data["OBV"].rolling(20).mean()
    
    #-------------------------------------------------------------------------------------------------------------------------------------------
    
    signal = []
    for idx in range(len(data['Adj Close'])):
        if idx == 0 or idx+1 == len(data['Adj Close']):
            signal.append(0)   
        else:
            if data['Adj Close'].iloc[idx-1]> data['Adj Close'].iloc[idx] and data['Adj Close'].iloc[idx+1]> data['Adj Close'].iloc[idx]:
                signal.append(1) #----- 상승
            elif data['Adj Close'].iloc[idx-1]< data['Adj Close'].iloc[idx] and data['Adj Close'].iloc[idx+1]< data['Adj Close'].iloc[idx]:
                signal.append(-1) #----- 하락
            else:
                signal.append(0) #----- 횡보추세
    data['trend'] = signal

    # data["diff"] = data["Log_Close"].diff()
    # data["trend"] = [1 if data.iloc[i]["diff"] > 0 else 0 for i in range(len(data))]

    
        
    
    # -------------------------------------------------------------------------------------------------------------------------------------------
    
    data['r_signal']    = [-1 if data.iloc[i]['RSI'] > 70 else 1 if data.iloc[i]['RSI'] < 30 else 0 for i in range(len(data))] 
    data['m_signal']    = [-1 if data.iloc[i]['macd'] < data.iloc[i]['macd_signal'] else 1 if data.iloc[i]['macd'] > data.iloc[i]['macd_signal'] else 0 for i in range(len(data))] 


    sma_signal = []
    for i in range(len(data["Adj Close"])):
        if data.iloc[i]["sma(5)"] < data.iloc[i]["Adj Close"] and data.iloc[i-1]["sma(5)"] > data.iloc[i-1]["Adj Close"]:
            sma_signal.append(1)
        elif data.iloc[i]["sma(5)"] > data.iloc[i]["Adj Close"] and data.iloc[i-1]["sma(5)"] < data.iloc[i-1]["Adj Close"]:
            sma_signal.append(-1)
        else:
            sma_signal.append(0)
    data["sma_signal"] = sma_signal

    # #-------------------------------------------------------------------------------------------------------------------------------------------
    
    target = []
    for i in range(1, len(data)+1):
        target.append(data.iloc[i-1:i, -4:].values.sum())
    data['tt'] = target

    data['position'] = 0
    data['position'].mask(data['tt'] >= 1, 'buy', inplace=True)
    data['position'].mask(data['tt'] == 0, 'hold', inplace=True)
    data['position'].mask(data['tt'] <= -1, 'sell', inplace=True)

    data['position'].mask(data['position'] == 'hold', 0, inplace=True)  # 0은 hold
    data['position'].mask(data['position'] == 'buy', 1, inplace=True)   # 1은 매수 
    data['position'].mask(data['position'] == 'sell', 2, inplace=True)  # 2는 매도
    data['position'] = data['position'].astype('int')

    # data["position"] = [2 if data.iloc[i]["AC_pct"] > 0 else 1 if data.iloc[i]["AC_pct"] < 0 else 0 for i in range(len(data))]
    
    #-------------------------------------------------------------------------------------------------------------------------------------------
    
    data.dropna(axis=0, inplace=True)
    data.drop(["Open","High","Low","Close","Adj Close","tt","m_signal","r_signal","trend","macd_oscillator","macd_signal","volume_sma5","sma_signal","OBV"], axis=1, inplace=True)
    # data.drop(["Open","High","Low","Close","volume_sma5","macd_oscillator","macd_signal","Log_Close"], axis=1, inplace=True)

    return data

In [5]:
data = add_feature(data)

In [6]:
data

Unnamed: 0_level_0,Volume,Log_Close,AC_pct,V_pct,sma(5),sma(20),High_Yield,volume_sma20,T10Y2Y,VIX,RSI,macd,OBV_mv20,position
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1996-12-31,3.837456e+08,-1.839625,-0.040228,0.465797,0.170293,0.178855,3.13,198625280.0,0.55,20.92,29.925120,-0.01,6.095760e+09,1
1997-01-02,1.431136e+08,-1.833655,0.005987,-0.627061,0.167059,0.177286,3.06,192004400.0,0.57,21.14,31.258422,-0.01,6.036964e+09,2
1997-01-03,1.197168e+08,-1.798565,0.035713,-0.163484,0.165156,0.176049,3.09,188448960.0,0.57,19.13,38.784893,-0.01,5.993694e+09,2
1997-01-06,1.882832e+09,-1.994775,-0.178160,14.727383,0.157164,0.173338,3.10,275483600.0,0.57,19.89,24.101423,-0.01,5.856283e+09,0
1997-01-07,9.769312e+08,-2.015977,-0.020978,-0.481137,0.150695,0.170436,3.10,312860800.0,0.59,19.35,23.186616,-0.01,5.658556e+09,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-11-04,1.407167e+08,4.930004,-0.001947,0.437080,145.061273,144.659009,4.77,92240910.0,-0.49,24.55,40.563604,-1.11,1.401518e+11,0
2022-11-07,8.337460e+07,4.933898,0.003902,-0.407500,142.228064,144.595637,4.71,92664690.0,-0.50,24.35,41.301851,-1.53,1.401437e+11,2
2022-11-08,8.990850e+07,4.938065,0.004175,0.078368,140.047964,144.633145,4.79,93308430.0,-0.53,25.54,42.133239,-1.80,1.401438e+11,2
2022-11-09,7.491780e+07,4.904311,-0.033190,-0.166733,138.064001,144.471101,4.99,93532635.0,-0.49,26.09,37.559833,-2.36,1.401438e+11,0


In [7]:
import matplotlib.pyplot as plt
from pylab import rcParams
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.feature_selection import RFECV, SelectFromModel, SelectKBest
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn import metrics

In [8]:
# 타겟값 분리
X = data.drop('position', axis=1)
y = data[['position']]

In [9]:
# 스탠다드 스케일러 사용 단, 타겟값은 적용하지 않음
ms = MinMaxScaler()
X  = ms.fit_transform(X)

In [10]:
X = pd.DataFrame(X, columns=data.columns[:-1], index=data.index)

In [11]:
# 학습비율
train_size = int(len(X) * 0.85)
X_train, X_test = X.iloc[:train_size, :], X.iloc[train_size:, :]
y_train, y_test = y.iloc[:train_size, :], y.iloc[train_size:, :]

In [12]:
# 강사님의 윈도우 코드 복붙(이건 우리 입맛에 나중에 바꿔도 상관없음)
def my_window_data(feature,target, window_size=20):
    data = pd.concat([feature,target], axis = 1)
    X_list = []
    y_list = []
    for i in range(len(data) - window_size-1):
        X = data.iloc[i:i+window_size]
        y = target.iloc[i+window_size]
        X_list.append(np.array(X))      #[[] ,[] ,[] ,[]]
        y_list.append(np.array(y))       #[]   
    return  np.array(X_list), np.array(y_list)

In [13]:
X_train, y_train = my_window_data(X_train, y_train)
X_test , y_test = my_window_data(X_test , y_test)

In [14]:
X_train.shape, X_test.shape

((5470, 20, 14), (948, 20, 14))

In [15]:
subsequences = 2
timesteps = X_train.shape[1]//subsequences
X_train = X_train.reshape((X_train.shape[0], subsequences, timesteps, X_train.shape[2]))
X_test = X_test.reshape((X_test.shape[0], subsequences, timesteps, X_test.shape[2]))
print('Train set shape', X_train.shape)
print('Validation set shape', X_test.shape)

Train set shape (5470, 2, 10, 14)
Validation set shape (948, 2, 10, 14)


In [16]:
with tf.device("/gpu:0"):
    model = Sequential()
    model.add(TimeDistributed(Conv1D(32, kernel_size=3, activation='relu'), input_shape=(None, X_train.shape[2], X_train.shape[3])))
    model.add(TimeDistributed(MaxPooling1D(2)))
    model.add(TimeDistributed(Conv1D(64, kernel_size=3, activation='relu')))
    model.add(TimeDistributed(MaxPooling1D(2)))
    # model.add(TimeDistributed(Conv1D(32, kernel_size=3, activation='relu')))
    # model.add(TimeDistributed(MaxPooling1D(2)))
    model.add(TimeDistributed(Flatten()))

    # LSTM layers
    model.add(LSTM(50, return_sequences=True))
    model.add(Dropout(0.5))
    model.add(LSTM(64, return_sequences=False))
    model.add(Dropout(0.5))

    #Final layers
    model.add(Dense(units = 3, activation='softmax'))

    adam = optimizers.Adam(lr = 0.001)

    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

    print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 time_distributed (TimeDistr  (None, None, 8, 32)      1376      
 ibuted)                                                         
                                                                 
 time_distributed_1 (TimeDis  (None, None, 4, 32)      0         
 tributed)                                                       
                                                                 
 time_distributed_2 (TimeDis  (None, None, 2, 64)      6208      
 tributed)                                                       
                                                                 
 time_distributed_3 (TimeDis  (None, None, 1, 64)      0         
 tributed)                                                       
                                                                 
 time_distributed_4 (TimeDis  (None, None, 64)         0

In [17]:
early_stopping = EarlyStopping(patience = 20)

fit_res = model.fit( X_train, y_train
                   , batch_size=10
                   , epochs=100 # 
                   , validation_data=(X_test, y_test)
                   , callbacks= [early_stopping]
                   )

loss, acc = model.evaluate(X_test, y_test)
print(f"loss: {loss}  accuracy : {acc}")

Epoch 1/100


: 

: 

In [None]:
pred = model.predict(X_test)
print(pred)
pp = np.argmax(pred, axis=-1)
pp.shape, y_test.shape

[[0.2942657  0.23499806 0.4707363 ]
 [0.06031467 0.10278109 0.8369041 ]
 [0.26051047 0.21964906 0.5198404 ]
 ...
 [0.502159   0.14810629 0.34973466]
 [0.33927932 0.05007213 0.6106485 ]
 [0.21194541 0.03614027 0.7519143 ]]


((660,), (660, 1))

In [None]:
f1 = f1_score(y_test, pp, average='macro')
print(f"F1 score:{f1:.4f}")

F1 score:0.4972


In [None]:
print(classification_report( y_test, pp, target_names=['holding','buy', 'sell']))

              precision    recall  f1-score   support

     holding       0.47      0.09      0.16       191
         buy       0.56      0.80      0.66       225
        sell       0.61      0.75      0.67       244

    accuracy                           0.58       660
   macro avg       0.55      0.55      0.50       660
weighted avg       0.55      0.58      0.52       660



In [None]:
from sklearn.metrics import confusion_matrix
res = confusion_matrix(y_test, pp)
print(res)  

[[ 18  93  80]
 [  7 180  38]
 [ 13  47 184]]


In [None]:
class backtest:
    def __init__(self, df ,position, result_show =False):
        self.df =df
        self.position = position
        self.result_show = result_show
        self.df = self.evaluate(self.df, cost=.001)
        self.performance(self.df)


    def __get_period(self, df):

        df.dropna(inplace=True)
        end_date = df['Datetime'].iloc[-1]
        start_date = df['Datetime'].iloc[0]
        days_between = (end_date - start_date).days
        return abs(days_between)
    def __annualize(self, rate, period):
        if period < 360:
            rate = ((rate-1) / period * 365) + 1
        elif period > 365:
            rate = rate ** (365 / period)
        else:
            rate = rate
        return round(rate, 4)


    def __get_sharpe_ratio(self, df, rf_rate):
        '''
        Calculate sharpe ratio
        :param df:
        :param rf_rate:
        :return: Sharpe ratio
        '''
        period = self.__get_period(df)
        rf_rate_daily = rf_rate / 365 + 1
        df['exs_rtn_daily'] = df['daily_rtn'] - rf_rate_daily
        exs_rtn_annual = (self.__annualize(df['acc_rtn'][-1:], period) - 1) - rf_rate
        exs_rtn_vol_annual = df['exs_rtn_daily'].std() * np.sqrt(365)
        sharpe_ratio = exs_rtn_annual / exs_rtn_vol_annual if exs_rtn_vol_annual>0 else 0
        return round(sharpe_ratio, 4)
    def evaluate(self, df, cost= .1):
        '''
        Calculate daily returns and MDDs of portfolio
        :param df: The dataframe containing trading position
        :param cost: Transaction cost when sell
        :return: Returns, MDD
        '''
        df['signal_price'] = np.nan
        df['signal_price'].mask(df[self.position]== 1, df['Adj Close'], inplace=True)
        df['signal_price'].mask(df[self.position]== 2, df['Adj Close'], inplace=True)
        record = df[[self.position,'signal_price']].dropna()
        record['rtn'] = 1
        record['rtn'].mask(record[self.position]== 2, (record['signal_price']*(1-cost))/record['signal_price'].shift(1), inplace=True)
        record['acc_rtn'] = record['rtn'].cumprod()

        df['signal_price'].mask(df[self.position]== 0, df['Adj Close'], inplace=True)
        df['rtn'] = record['rtn']
        df['rtn'].fillna(1, inplace=True)

        df['daily_rtn'] = 1
        df['daily_rtn'].mask(df[self.position] == 0, df['signal_price'] / df['signal_price'].shift(1), inplace=True)
        df['daily_rtn'].mask(df[self.position] == 2, (df['signal_price']*(1-cost)) / df['signal_price'].shift(1), inplace=True)
        df['daily_rtn'].fillna(1, inplace=True)
        df['acc_rtn'] = df['daily_rtn'].cumprod()
        df['acc_rtn_dp'] = ((df['acc_rtn']-1)*100).round(2)
        df['mdd'] = (df['acc_rtn'] / df['acc_rtn'].cummax()).round(4)
        df['bm_mdd'] = (df['Adj Close'] / df['Adj Close'].cummax()).round(4)
        df.drop(columns='signal_price', inplace=True)
        return df


    def performance(self, df, rf_rate=.01):
        '''
        Calculate additional information of portfolio
        :param df: The dataframe with daily returns
        :param rf_rate: Risk free interest rate
        :return: Number of trades, Number of wins, Hit ratio, Sharpe ratio, ...
        '''

        rst = {}
        rst['no_trades'] = (df[self.position]==1).sum()
        rst['no_win'] = (df['rtn']>1).sum()
        rst['acc_rtn'] = df['acc_rtn'][-1:].round(4)
        rst['hit_ratio'] = round((df['rtn']>1.0).sum() / rst['no_trades'], 4) if rst['no_trades']>0 else 0
        rst['avg_rtn'] = round(df[df['rtn']!=1.0]['rtn'].mean(), 4)
        rst['period'] = self.__get_period(df)
        rst['annual_rtn'] = self.__annualize(rst['acc_rtn'], rst['period'])
        rst['bm_rtn'] = round(df.iloc[-1,5]/df.iloc[0,5], 4)
        rst['sharpe_ratio'] = self.__get_sharpe_ratio(df, rf_rate)
        rst['mdd'] = df['mdd'].min()
        rst['bm_mdd'] = df['bm_mdd'].min()
        if self.result_show ==True:
            print('CAGR: ',round(rst['annual_rtn'].values[0] - 1,5))                       # 연간 수익
            print('Accumulated return:',round(rst['acc_rtn'].values[0] - 1,5))         # 
            print('Average return: ',round(rst['avg_rtn'] - 1,5))
            print('Benchmark return :',round(rst['bm_rtn']-1,5))
            print('Number of trades: ',(rst['no_trades']))
            print('Number of win:',(rst['no_win']))
            print('Hit ratio:',(rst['hit_ratio']))
            print('Investment period:',(rst['period']/365),'yrs')
            print('Sharpe ratio:',(rst['sharpe_ratio']))

            print('MDD:',(rst['mdd']-1)*100)
            print('Benchmark MDD:',(rst['bm_mdd']-1)*100)
            self.res = {'CAGR':(rst['annual_rtn'].values[0] - 1)*100,'Accumulated return':(rst['acc_rtn'].values[0] - 1)*100,'Average return': (rst['avg_rtn'] - 1)*100,'MDD':(rst['mdd']-1)*100}
        else:
            self.res = {'CAGR':(rst['annual_rtn'].values[0] - 1)*100,'Accumulated return':(rst['acc_rtn'].values[0] - 1)*100,'Average return': (rst['avg_rtn'] - 1)*100,'MDD':(rst['mdd']-1)*100}
            print('백테스팅 성공')

In [None]:
len(pp)

NameError: name 'pp' is not defined

In [None]:
data2 = pd.DataFrame({"actual":data.iloc[int(len(data) - len(pp)):]["position"], "position":pp})

In [None]:
data2["Adj Close"] = data3["Adj Close"][int(len(data)) - len(pp):]

In [None]:
data2.reset_index(inplace=True)

In [None]:
backtest(data2,'position',True)

CAGR:  0.0157
Accumulated return: 0.0417
Average return:  -0.0001
Benchmark return : -0.018
Number of trades:  320
Number of win: 155
Hit ratio: 0.4844
Investment period: 2.6301369863013697 yrs
Sharpe ratio: 659    0.02
Name: acc_rtn, dtype: float64
MDD: -35.58
Benchmark MDD: -44.599999999999994


<__main__.backtest at 0x7f8208707b90>