In [1]:
#できること一覧
#データ取得
#dataframeに格納
#時間可視化
#移動平均線
#ゴールデン・デッドクロス
#上下判定
#ボリンジャーバンド
#トレンドラインを作成（支持線・抵抗線・直近）
#機械学習（ランダムフォレスト）
#gfs
#グラフ

#クラスタリング
#ボリンジャーバンドで機械学習の層追加


In [2]:
#python_bitbankccのパッケージをインポート
#cloud9で起動するときのコマンド
#jupyter notebook --ip $IP --port $PORT --no-browser
import python_bitbankcc 
import datetime
import os 
import time
import numpy as np
import pandas as pd
import sys
from dateutil.relativedelta import relativedelta
#トレンドラインを引くため
from scipy.stats import linregress
#正規化
from sklearn.preprocessing import MinMaxScaler

In [3]:
#機械学習用のモジュール
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
from sklearn import linear_model
% matplotlib inline
from __future__ import print_function
import copy
import matplotlib
matplotlib.style.use('ggplot')

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [4]:
# Obtain an object of the public API class.
# NOTE(review): get_candle() creates its own client on every call, so this
# module-level `pub` looks unused by the visible cells — confirm before removing.
pub = python_bitbankcc.public()

In [5]:
#APIから基本データの取得、dataframeへ挿入
def get_candle(trade_name,span,back_day):
    """Fetch one candlestick request from the public API as a DataFrame.

    Parameters
    ----------
    trade_name : pair identifier passed straight to the API (e.g. 'xrp_jpy').
    span : candle type passed straight to the API (e.g. '30min', '1day').
    back_day : period string passed straight to the API; make_df() sends
        'YYYYMMDD' for intraday spans and 'YYYY' for the others.

    Returns
    -------
    pandas.DataFrame built from the 'ohlcv' rows of the first candlestick
    entry, with columns Open, High, Low, Close, Volume, Timestamp.
    """
    client = python_bitbankcc.public()
    response = client.get_candlestick(trade_name, span, back_day)
    ohlcv_rows = response['candlestick'][0]['ohlcv']
    column_names = ["Open", "High", "Low", "Close", "Volume", "Timestamp"]
    return pd.DataFrame(ohlcv_rows, columns=column_names)


In [6]:
#指定した日から今日までの基本データをdataframeにして取得
#back_day,todayはdatetime
# span = ['1min', '5min', '15min', '30min', '1hour', '4hour', '8hour', '12hour', '1day', '1week']
#trade_name = ['btc_jpy', 'xrp_jpy', 'ltc_btc', 'eth_btc', 'mona_jpy', 'mona_btc', 'bcc_jpy', 'bcc_btc']

def make_df(trade_name,span,back_day,today):
    """Download every candle from ``back_day`` up to ``today`` into one frame.

    Parameters
    ----------
    trade_name : pair identifier forwarded to get_candle (e.g. 'xrp_jpy').
    span : candle type. '1min', '5min', '15min', '30min' and '1hour' are
        requested one day at a time ('YYYYMMDD'); every other span is
        requested one calendar year at a time ('YYYY').
    back_day : datetime.datetime, first day/year to download.
    today : datetime.datetime, last day/year to download.

    Returns
    -------
    pandas.DataFrame: the per-request frames concatenated in chronological
    order. Each request keeps its own 0..k index, so callers normally call
    ``reset_index()`` afterwards.

    Raises
    ------
    ValueError if the range contains no period to request (back_day is
    after today).
    """
    intraday_spans = ('1min', '5min', '15min', '30min', '1hour')

    if span in intraday_spans:
        # The original author notes that the daily boundary is at 09:00, so
        # before 09:00 the newest daily file is still yesterday's.
        # NOTE(review): presumably local JST vs. the API's UTC day — confirm.
        lag_days = 1 if today.hour < 9 else 0
        last_day = today - datetime.timedelta(days=lag_days)

        periods = []
        cursor = back_day
        while cursor <= last_day:
            periods.append(cursor.strftime('%Y%m%d'))
            cursor = cursor + datetime.timedelta(days=1)
    else:
        # Iterate over calendar years. Stepping a date by one year and
        # comparing it with today skipped the final year whenever back_day's
        # month/day was later in the year than today's.
        periods = [str(year) for year in range(back_day.year, today.year + 1)]

    if not periods:
        raise ValueError(
            'no data to download: back_day ({}) is after today ({})'.format(back_day, today))

    # Collect the pieces first and concatenate once instead of re-copying the
    # accumulated frame on every iteration.
    frames = [get_candle(trade_name, span, period) for period in periods]
    return pd.concat(frames)



In [7]:
def read_date(x):
    """Convert an epoch timestamp in milliseconds to a naive local datetime."""
    epoch_seconds = x / 1000
    return datetime.datetime.fromtimestamp(epoch_seconds)


In [8]:

# 単純移動平均（SMA）を取得する関数
#上記のdfデータフレーム
#numいくつの平均を求めるか
def getMA( df,num ):
    """Return the simple moving average (SMA) of the 'Close' column.

    Parameters
    ----------
    df : DataFrame with a numeric 'Close' column.
    num : int, window length in rows (must be >= 1).

    Returns
    -------
    numpy.ndarray of length max(len(df) - num + 1, 0); element i is the mean
    of the Close values at row positions i .. i+num-1.

    Raises
    ------
    ValueError if num < 1.
    """
    if num < 1:
        raise ValueError('num must be a positive integer, got {!r}'.format(num))

    # Read Close by position: label lookups (df['Close'][i]) fail or return
    # the wrong rows when the frame's index is not exactly 0..n-1.
    closes = np.asarray(df['Close'], dtype=float)

    # Row i of window_positions holds the positions i .. i+num-1; it has zero
    # rows when the frame is shorter than the window.
    starts = np.arange(len(closes) - num + 1)
    window_positions = starts[:, np.newaxis] + np.arange(num)[np.newaxis, :]
    return closes[window_positions].mean(axis=1)

In [9]:
def getSTD( df,num ):
    """Return the rolling population standard deviation of the 'Close' column.

    Parameters
    ----------
    df : DataFrame with a numeric 'Close' column.
    num : int, window length in rows (must be >= 1).

    Returns
    -------
    numpy.ndarray of length max(len(df) - num + 1, 0); element i is the
    standard deviation (numpy default, ddof=0) of the Close values at row
    positions i .. i+num-1.

    Raises
    ------
    ValueError if num < 1.
    """
    if num < 1:
        raise ValueError('num must be a positive integer, got {!r}'.format(num))

    # Read Close by position: label lookups (df['Close'][i]) fail or return
    # the wrong rows when the frame's index is not exactly 0..n-1.
    closes = np.asarray(df['Close'], dtype=float)

    # Row i of window_positions holds the positions i .. i+num-1; it has zero
    # rows when the frame is shorter than the window.
    starts = np.arange(len(closes) - num + 1)
    window_positions = starts[:, np.newaxis] + np.arange(num)[np.newaxis, :]
    return closes[window_positions].std(axis=1)

In [10]:
#ゴールデン・デッドクロス判定
def golden_dead(data_df1):
    """Flag golden-cross and dead-cross rows from the 'Cross' column.

    'Cross' is read as a signed difference (the notebook stores MA5 - MA25).
    Row i+1 is a golden cross when Cross goes from strictly negative at row i
    to strictly positive at row i+1, and a dead cross for the opposite
    change. A pair that contains an exact zero is not counted as a cross.

    Parameters
    ----------
    data_df1 : DataFrame with a numeric 'Cross' column.

    Returns
    -------
    (golden, dead) : two int numpy arrays of len(data_df1), holding 1 on
    cross rows and 0 elsewhere. Row 0 is always 0.
    """
    # Read by position so the result does not depend on the index labels.
    cross = np.asarray(data_df1['Cross'])
    golden = np.zeros(len(cross), dtype=int)
    dead = np.zeros(len(cross), dtype=int)

    previous, current = cross[:-1], cross[1:]
    # One vectorized comparison replaces an insert/delete pair per crossing.
    golden[1:] = (previous < 0) & (current > 0)
    dead[1:] = (previous > 0) & (current < 0)
    return golden, dead
    

In [11]:
def make_onehot_label(df,MA5,STD5):
    """Classify each row's 'sub' value into five bands around MA5.

    Bands (s = df['sub']), in one-hot column order / integer label:
        0 long      : s >  MA5 + 2*STD5
        1 sem_long  : MA5 + STD5/2 <= s <= MA5 + 2*STD5
        2 range     : MA5 - STD5/2 <= s <  MA5 + STD5/2
        3 sem_short : MA5 - 2*STD5 <= s <  MA5 - STD5/2
        4 short     : s <  MA5 - 2*STD5
    Rows that match no band (e.g. NaN) get an all-zero one-hot row and the
    default label 2.

    Parameters
    ----------
    df : DataFrame with a numeric 'sub' column; it is not modified.
    MA5, STD5 : band centre and width, compared element-wise with df['sub'].

    Returns
    -------
    (onehot_label, labels) : a float array of shape (len(df), 5) and an int
    array of shape (len(df),).
    """
    sub = df['sub']
    upper_outer = MA5 + STD5*2
    upper_inner = MA5 + STD5/2
    lower_inner = MA5 - STD5/2
    lower_outer = MA5 - STD5*2

    zone_masks = [
        sub > upper_outer,
        (sub >= upper_inner) & (sub <= upper_outer),
        (sub >= lower_inner) & (sub < upper_inner),
        (sub >= lower_outer) & (sub < lower_inner),
        sub < lower_outer,
    ]
    zone_masks = [np.asarray(mask, dtype=bool) for mask in zone_masks]

    onehot_label = np.column_stack(zone_masks).astype(float)

    # The short band maps to label 4. Previously the value 4 was written into
    # the 'short' one-hot column, so short rows kept label 2 and the one-hot
    # array contained 4 instead of 1.
    labels = np.select(zone_masks, [0, 1, 2, 3, 4], default=2).astype(int)

    return onehot_label, labels

In [12]:
#highトレンドラインを作成する関数
#df データフレーム
#num どれだけ前から
def make_high_trend_line(df,num):
    """Fit a line through the highs of every rolling window.

    For each window of ``num`` consecutive rows, 'High' is regressed on
    'index'; only the points strictly above the fitted line are kept, and
    this repeats until at most three points remain. If two or more points
    survive they are fitted once more; otherwise the last fit is kept.

    Parameters
    ----------
    df : DataFrame with numeric 'index' (x) and 'High' (y) columns.
    num : int, window length in rows (must be >= 2).

    Returns
    -------
    (slope, intercept) : two float arrays. The first num-1 entries are 0.0
    padding; entry num-1+i belongs to the window starting at row i, so the
    arrays line up with df when len(df) >= num-1.

    Raises
    ------
    ValueError if num < 2 (a line cannot be fitted to a single point).
    """
    if num < 2:
        raise ValueError('num must be at least 2, got {!r}'.format(num))

    # Pull the two columns out once instead of copying the whole frame twice
    # for every window.
    positions = np.asarray(df['index'])
    highs = np.asarray(df['High'])

    window_count = max(len(df) - num + 1, 0)
    slope = np.zeros(num - 1 + window_count, dtype=float)
    intercept = np.zeros(num - 1 + window_count, dtype=float)

    for start in range(window_count):
        x = positions[start:start + num]
        y = highs[start:start + num]

        # Peel away the points on or below the fit until <= 3 remain.
        while len(x) > 3:
            fit = linregress(x=x, y=y)
            above = y > fit[0] * x + fit[1]
            x, y = x[above], y[above]

        # With fewer than two survivors the last fit from the loop is kept.
        if len(x) > 1:
            fit = linregress(x=x, y=y)

        slope[num - 1 + start] = fit[0]
        intercept[num - 1 + start] = fit[1]
    return slope,intercept

In [13]:
#lowトレンドラインを作成する関数
#df データフレーム
#num どれだけ前から
def make_low_trend_line(df,num):
    """Fit a line through the lows of every rolling window.

    For each window of ``num`` consecutive rows, 'Low' is regressed on
    'index'; only the points strictly below the fitted line are kept, and
    this repeats until at most three points remain. If two or more points
    survive they are fitted once more; otherwise the last fit is kept.

    Parameters
    ----------
    df : DataFrame with numeric 'index' (x) and 'Low' (y) columns.
    num : int, window length in rows (must be >= 2).

    Returns
    -------
    (slope, intercept) : two float arrays. The first num-1 entries are 0.0
    padding; entry num-1+i belongs to the window starting at row i, so the
    arrays line up with df when len(df) >= num-1.

    Raises
    ------
    ValueError if num < 2 (a line cannot be fitted to a single point).
    """
    if num < 2:
        raise ValueError('num must be at least 2, got {!r}'.format(num))

    # Pull the two columns out once instead of copying the whole frame twice
    # for every window.
    positions = np.asarray(df['index'])
    lows = np.asarray(df['Low'])

    window_count = max(len(df) - num + 1, 0)
    slope = np.zeros(num - 1 + window_count, dtype=float)
    intercept = np.zeros(num - 1 + window_count, dtype=float)

    for start in range(window_count):
        x = positions[start:start + num]
        y = lows[start:start + num]

        # Peel away the points on or above the fit until <= 3 remain.
        while len(x) > 3:
            fit = linregress(x=x, y=y)
            below = y < fit[0] * x + fit[1]
            x, y = x[below], y[below]

        # With fewer than two survivors the last fit from the loop is kept.
        if len(x) > 1:
            fit = linregress(x=x, y=y)

        slope[num - 1 + start] = fit[0]
        intercept[num - 1 + start] = fit[1]
    return slope,intercept

In [14]:
#slope,傾き　intercept,切片　df、データフレーム
def make_trend_value(slope,intercept,df):
    """Evaluate the trend line ``slope * x + intercept`` at x = df['index'].

    slope, intercept -- line coefficients (element-wise with df['index']).
    df -- DataFrame providing the 'index' column used as x.
    """
    positions = np.array(df['index'])
    scaled = slope * positions
    return scaled + intercept

In [15]:
# Usage example: parameters shared by the cells that follow.
today = datetime.datetime.today() 
# Download window: the last 60 days up to now.
back_day = today - datetime.timedelta(days=60)
trade_name = 'xrp_jpy'
span = '30min'
# Short window length in rows; the moving-average cell also uses num+20.
num = 5

In [16]:
# Fetch the base data and time how long the download takes.
old = time.time()
data_df1 =  make_df(trade_name,span,back_day,today)
# Convert the millisecond epoch timestamps to datetime objects.
data_df1['Timestamp'] = data_df1['Timestamp'].apply(read_date)
# Renumber the rows 0..n-1; the previous per-request index is kept as an
# 'index' column.
data_df1 = data_df1.reset_index()
# The values arrive as strings, so cast every column except Timestamp to float.
# Timestamp is listed here so that it is left out of the cast.
execlude = ['Timestamp']
cast = [col for col in data_df1.columns if col not in execlude]
for item in cast:
    data_df1 = data_df1.astype({item: float})

print(time.time()-old)
print(data_df1.tail())

16.110106945037842
      index    Open    High     Low   Close        Volume           Timestamp
2887    7.0  42.646  42.912  42.645  42.830  2.581997e+06 2018-11-30 12:30:00
2888    8.0  42.824  42.824  42.550  42.800  1.749971e+06 2018-11-30 13:00:00
2889    9.0  42.810  42.950  42.702  42.825  1.784809e+06 2018-11-30 13:30:00
2890   10.0  42.824  43.100  42.750  42.800  4.000222e+06 2018-11-30 14:00:00
2891   11.0  42.824  43.029  42.810  42.990  7.033542e+05 2018-11-30 14:30:00


In [17]:
# Moving averages as numpy arrays: num rows (short) and num+20 rows (long).
MA5 = getMA( data_df1,num)
MA25 = getMA( data_df1,num+20)
# getMA returns len(df)-window+1 values, so prepend window-1 zeros to make
# both arrays the same length as data_df1.
zero5 = np.zeros(num-1,dtype = float )
zero25 = np.zeros(num+20-1,dtype = float )
MA5 = np.insert(MA5, 0, zero5)
MA25 = np.insert(MA25, 0, zero25)
print(MA25,MA5)
print(MA25.shape,type(MA25),MA5.shape,type(MA5))

[ 0.       0.       0.      ... 42.66456 42.65576 42.65032] [ 0.      0.      0.     ... 42.7888 42.777  42.849 ]
(2892,) <class 'numpy.ndarray'> (2892,) <class 'numpy.ndarray'>


In [18]:
# Rolling standard deviation over num rows, zero-padded at the front so it
# has the same length as data_df1.
STD5 = getSTD( data_df1,num)
zero5 = np.zeros(num-1,dtype = float )
STD5 = np.insert(STD5, 0, zero5)
print(STD5,STD5.shape,type(STD5))

[0.         0.         0.         ... 0.08158284 0.07453858 0.07158212] (2892,) <class 'numpy.ndarray'>


In [19]:
# Build the hand-made feature columns.
# Candle range and candle body.
data_df1['High-Low']=data_df1['High']-data_df1['Low']
data_df1['Close-Open']=data_df1['Close']-data_df1['Open']
# Body relative to range; the small constant avoids division by zero when
# High == Low.
data_df1['Similarity'] = data_df1['Close-Open'] / (data_df1['High-Low'] + 0.000001)
# Volume-weighted range and body (the column names end with a space).
data_df1['(High-Low)*Volume '] = data_df1['High-Low']*data_df1['Volume']
data_df1['(Close-Open)*Volume '] = data_df1['Close-Open']*data_df1['Volume']
# Moving averages, standard deviation and their difference (short - long).
data_df1['MA5'] = MA5
data_df1['MA25'] = MA25
data_df1['STD5']  =  STD5
data_df1['Cross'] = MA5 - MA25
# 1-based row number; used as x by the trend-line functions. This replaces
# the 'index' column left behind by reset_index().
data_df1['index'] = data_df1.index + 1
# Bollinger bands: MA5 plus/minus 1, 2 and 3 standard deviations.
data_df1['bbd_p1']= MA5 + (STD5 * 1)
data_df1['bbd_p2']= MA5 + (STD5 * 2)
data_df1['bbd_p3']= MA5 + (STD5 * 3)
data_df1['bbd_m1']= MA5 - (STD5 * 1)
data_df1['bbd_m2']= MA5 - (STD5 * 2)
data_df1['bbd_m3']= MA5 - (STD5 * 3)

In [20]:
# Add the golden-cross / dead-cross flag columns computed from 'Cross'.
data_df1['golden'],data_df1['dead']  = golden_dead(data_df1)

In [21]:
# Fit the high trend line over rolling 32-row windows and print the elapsed
# seconds.
old1 = time.time()
slope1,intercept1 = make_high_trend_line(df=data_df1,num=32)
print(time.time()-old1)

41.21417260169983


In [22]:
# Fit the low trend line over rolling 32-row windows and print the elapsed
# seconds.
old1 = time.time()
slope2,intercept2 = make_low_trend_line(df=data_df1,num=32)
print(time.time()-old1)

42.69029712677002


In [23]:
# Evaluate each row's fitted line at that row's own 'index' value.
tr_high = make_trend_value(slope1,intercept1,data_df1)
tr_low  = make_trend_value(slope2,intercept2,data_df1)

In [24]:
# Store the trend-line slopes and values as feature columns.
data_df1['high_slope'] = slope1
data_df1['low_slope'] = slope2
data_df1['tr_high'] = tr_high
data_df1['tr_low'] = tr_low
# Midpoint between the high and low trend-line values.
data_df1['tr_mid'] = (tr_high + tr_low)/2

In [25]:
# NOTE(review): up_down is not defined anywhere in the visible notebook; the
# recorded run of this cell stopped with "NameError: name 'up_down' is not
# defined", so the 'up' and 'down' columns are never created. Restore the
# function's definition above this cell or remove the cell.
#data_df1['up'],data_df1['mid_up'],data_df1['min_down'],data_df1['down'] = up_down(data_df1)
# a,b = up_down(data_df1)
# print(len(a),len(b))
data_df1['up'],data_df1['down'] = up_down(data_df1)


NameError: name 'up_down' is not defined

In [None]:
# Seconds elapsed since `old` was set at the start of the download cell.
print(time.time()-old)
# Show the last rows of the frame, including the derived columns.
data_df1.loc[len(data_df1)-5:]

In [None]:
# Machine learning: predict the next Close.
# Columns that are not used as features.
exe_cols = ['index','Timestamp']
feature_cols = [col for col in data_df1.columns if col not in exe_cols]
print(feature_cols)
# Convert the feature columns to a numpy array for scikit-learn.
data_np= np.array(data_df1[feature_cols])
print(data_np.shape)

In [None]:
# Target: Close as a 1-D array with its first element dropped, so that Y[i]
# is the Close of row i+1 (the "next" Close for feature row i).
Y = np.array(data_df1['Close'])# 1-D
Y = np.delete(Y,0)
print(Y,Y.shape)
# Features: trimmed to len(Y) rows, so the last row (which has no next Close)
# is left out.
X = data_np[:len(Y),0:]
print(X[:],X.shape)

In [None]:
# Train on the first 199/200 of the rows (the divisor below is 200/199, not
# 29/30); L is the row where the test set starts.
L = int(len(X)//(200/199))
print(L)
# The first 50 rows are left out of training.
# NOTE(review): presumably to skip the warm-up rows where the zero-padded
# moving averages and trend lines are not meaningful — confirm.
train_x = X[50:L,:]
train_y = Y[50:L]

In [None]:
# Everything from row L onwards is the test set.
test_x = X[L:len(X),:]
test_y = Y[L:len(Y)]

In [None]:
# Base random-forest regressor with a fixed seed; used as the grid-search
# estimator in the next cell.
rf = RandomForestRegressor(random_state=1234)

In [None]:
# Grid-search n_estimators and max_depth with 3-fold cross-validation, scored
# by negative mean squared error.
params = {'n_estimators': [15,20,25], 'max_depth': [5,17,30]}
gscv = GridSearchCV(rf, param_grid=params, verbose=1,
                    cv=3, scoring='neg_mean_squared_error')
gscv.fit(train_x, train_y)

In [None]:
# Best parameter combination found by the grid search.
a = gscv.best_params_

In [None]:
# Unpack the tuned values for the final model.
print(a)
n_estimators = a.get('n_estimators')
max_depth = a.get('max_depth')

In [None]:
# Refit with the tuned parameters on the training rows and report the MSE on
# the test rows.
rf = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth,random_state=1234)
rf.fit(train_x, train_y)
y_pred_rf = rf.predict(test_x)
rf_mse = mean_squared_error(test_y, y_pred_rf)
print('RandomForest MSE: ', rf_mse)

In [None]:
# Print the random-forest predictions next to the actual values.
result = pd.DataFrame(y_pred_rf)
result.columns = ['y_pred_rf']
result['test_y'] = test_y
# Despite the column name, this is the plain difference (actual - predicted),
# not a root-mean-square error.
result['RMS'] = (result['test_y']-result['y_pred_rf'])
print(result.loc[:])

In [None]:
# Zoomed plot: the last (at most 15) test points, actual vs. predicted.
# The x range is sized from the data, so both series always have matching
# lengths even when there are fewer than 15 test rows.
tail_count = min(15, len(test_y))
tail_x = range(tail_count)
plt.plot(tail_x, test_y[len(test_y)-tail_count:], label='Actual price', color='blue', marker = 'o')
plt.plot(tail_x, y_pred_rf[len(test_y)-tail_count:], label='Predicted price', color='red', marker ='x')
plt.xlabel(span)
# Raw string: same text as before, without the invalid '\)' escape sequence.
plt.ylabel(r'Price (\)')
n3='{0} Price by RandomForest'.format(trade_name)
plt.title(n3)
plt.grid(True)
plt.legend()
n4='{0}2 by RandomForest.png'.format(trade_name)
plt.savefig(n4)
# Show before closing: calling plt.show() after plt.close() has nothing left
# to display.
plt.show()
plt.close()

In [None]:
# 変数増加法を実行する関数
def get_gfs_feature_indices(X, y, features, clf):
    """Greedy forward selection of feature columns by hold-out MSE.

    X and y are split 70/30 without shuffling. Starting from no features,
    each round fits ``clf`` once per remaining feature (added to the ones
    already chosen) and keeps the feature with the lowest test MSE. The
    search stops when the best candidate no longer lowers the MSE or when
    every feature has been chosen.

    Parameters
    ----------
    X : 2-D array; column i corresponds to features[i].
    y : 1-D target array.
    features : sequence of hashable, mutually comparable feature names.
    clf : estimator with fit(X, y) and predict(X); it is refitted in place.

    Returns
    -------
    list of int: column indices of the chosen features, in the order they
    were selected.
    """
    X_train_, X_test_, y_train_, y_test_ = \
        train_test_split(X, y, test_size=0.3, shuffle=False)

    feature_indices = {name: idx for idx, name in enumerate(features)}
    # Candidates are visited in column order and chosen features are kept in
    # selection order. Sets made the column order passed to clf, and the
    # order of the returned indices, depend on hash ordering.
    ordered_names = sorted(feature_indices, key=feature_indices.get)

    last_mse = np.inf
    chosen = []
    while len(chosen) < len(ordered_names):
        scored = []
        for candidate in ordered_names:
            if candidate in chosen:
                continue
            columns = [feature_indices[name] for name in chosen]
            columns.append(feature_indices[candidate])
            clf.fit(X_train_[:, columns], y_train_)
            y_pred = clf.predict(X_test_[:, columns])
            scored.append((mean_squared_error(y_test_, y_pred), candidate))

        # Lowest MSE wins; ties fall back to the smaller feature name.
        mse, best = min(scored)
        if mse >= last_mse:
            break
        last_mse = mse
        print('Newly Added Feature: {},\tMSE Score: {}'.format(best, mse))
        chosen.append(best)
    return [feature_indices[name] for name in chosen]

In [None]:
# Run forward selection with the function above and print the chosen columns.
# Use the real feature names, one per column of train_x. The 22 placeholder
# letters used before no longer matched the number of feature columns, so
# the later columns were never considered.
feature_cols = [col for col in data_df1.columns if col not in exe_cols]
assert len(feature_cols) == train_x.shape[1], \
    'feature_cols must name every column of train_x'
print(feature_cols,len(feature_cols))

# Fixed seed, as in the other cells, so the selection is repeatable.
selected_feature_index_by_RandomForestRegressor = get_gfs_feature_indices(X=train_x,y=train_y,features=feature_cols,clf= RandomForestRegressor(random_state=1234))
print(selected_feature_index_by_RandomForestRegressor)

In [None]:
# Random forest restricted to the selected feature columns, with default
# hyper-parameters; reports the MSE on the test rows.
rf = RandomForestRegressor(random_state=1234)
rf.fit(train_x[:, selected_feature_index_by_RandomForestRegressor], train_y)
y_pred_rf = rf.predict(test_x[:, selected_feature_index_by_RandomForestRegressor])
rf_mse = mean_squared_error(test_y, y_pred_rf)
print('RandomForest MSE: ', rf_mse)

In [None]:
# Grid-search n_estimators and max_depth on the selected columns with 3-fold
# cross-validation, scored by negative mean squared error.
params = {'n_estimators': [15,20,25], 'max_depth': [6,10,20]}
gscv = GridSearchCV(rf, param_grid=params, verbose=1,
                    cv=3, scoring='neg_mean_squared_error')
gscv.fit(train_x[:, selected_feature_index_by_RandomForestRegressor], train_y)

In [None]:
# Best parameter combination found by the grid search on the selected columns.
a = gscv.best_params_

In [None]:
# Unpack the tuned values for the final model on the selected columns.
print(a)
n_estimators = a.get('n_estimators')
max_depth = a.get('max_depth')

In [None]:
# Refit with the tuned parameters on the selected columns and report the MSE
# on the test rows.
rf = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth,random_state=1234)
rf.fit(train_x[:, selected_feature_index_by_RandomForestRegressor], train_y)
y_pred_rf_gfs = rf.predict(test_x[:, selected_feature_index_by_RandomForestRegressor])
rf_mse = mean_squared_error(test_y, y_pred_rf_gfs)
print('RandomForest MSE: ', rf_mse)

In [None]:
# Print the predictions of the feature-selected random forest next to the
# actual values.
result = pd.DataFrame(y_pred_rf_gfs)
result.columns = ['y_pred_rf_gfs']
result['test_y'] = test_y
# Despite the column name, this is the plain difference (actual - predicted),
# not a root-mean-square error.
result['RMS'] = (result['test_y']-result['y_pred_rf_gfs'])
print(result.loc[:])

In [None]:
# Zoomed plot: the last (at most 15) test points, actual vs. predicted by the
# feature-selected model.
# The x range is sized from the data, so both series always have matching
# lengths even when there are fewer than 15 test rows.
tail_count = min(15, len(test_y))
tail_x = range(tail_count)
plt.plot(tail_x, test_y[len(test_y)-tail_count:], label='Actual price', color='blue', marker = 'o')
plt.plot(tail_x, y_pred_rf_gfs[len(test_y)-tail_count:], label='Predicted price', color='red', marker ='x')
plt.xlabel(span)
# Raw string: same text as before, without the invalid '\)' escape sequence.
plt.ylabel(r'Price (\)')
n3='{0} Price by RandomForest gfs'.format(trade_name)
plt.title(n3)
plt.grid(True)
plt.legend()
n4='{0}2 by RandomForest gfs.png'.format(trade_name)
plt.savefig(n4)
# Show before closing: calling plt.show() after plt.close() has nothing left
# to display.
plt.show()
plt.close()