In [1]:
#できること一覧
#データ取得
#dataframeに格納
#時間可視化
#移動平均線
#ゴールデン・デッドクロス
#上下判定
#ボリンジャーバンド
#トレンドラインを作成（支持線・抵抗線・直近）
#機械学習（ランダムフォレスト）
#gfs（変数増加法による特徴量選択）
#グラフ



In [2]:
#python_bitbankccのパッケージをインポート
#cloud9で起動するときのコマンド
#jupyter notebook --ip $IP --port $PORT --no-browser
import python_bitbankcc 
import datetime
import os 
import time
import numpy as np
import pandas as pd
import sys
from dateutil.relativedelta import relativedelta
#トレンドラインを引くため
from scipy.stats import linregress
#正規化
from sklearn.preprocessing import MinMaxScaler

In [3]:
#機械学習用のモジュール
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
from sklearn import linear_model
% matplotlib inline
from __future__ import print_function
import copy
import matplotlib
matplotlib.style.use('ggplot')

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [4]:
# Create the bitbank public-API client object.
# NOTE(review): get_candle() builds its own client on every call, so none of
# the functions visible in this notebook read this module-level `pub`.
pub = python_bitbankcc.public()

In [5]:
# Fetch base candle data from the API and load it into a DataFrame
def get_candle(trade_name,span,back_day):
    """Request one batch of candlesticks and return it as a DataFrame.

    Parameters
    ----------
    trade_name : str
        Currency pair, forwarded unchanged to ``get_candlestick``.
    span : str
        Candle width, forwarded unchanged to ``get_candlestick``.
    back_day : str
        Period key, forwarded unchanged to ``get_candlestick``
        (make_df passes '%Y%m%d' or '%Y' formatted strings).

    Returns
    -------
    pandas.DataFrame
        One row per entry of the first candlestick series in the response,
        with columns Open, High, Low, Close, Volume, Timestamp.
    """
    ohlcv_columns = ["Open", "High","Low","Close","Volume","Timestamp"]
    # A fresh public-API client is created for every request.
    client = python_bitbankcc.public()
    response = client.get_candlestick(trade_name, span, back_day)
    first_series = response['candlestick'][0]
    return pd.DataFrame(first_series['ohlcv'], columns=ohlcv_columns)


In [6]:
# Fetch the base data from a given day up to today as a single DataFrame.
# back_day and today are datetime objects.
# span = ['1min', '5min', '15min', '30min', '1hour', '4hour', '8hour', '12hour', '1day', '1week']
# trade_name = ['btc_jpy', 'xrp_jpy', 'ltc_btc', 'eth_btc', 'mona_jpy', 'mona_btc', 'bcc_jpy', 'bcc_btc']

def make_df(trade_name,span,back_day,today):
    """Download every candle batch between back_day and today.

    Intraday spans ('1min' .. '1hour') are requested one calendar day at a
    time ('%Y%m%d' keys); every other span is requested one calendar year at
    a time ('%Y' keys).

    Parameters
    ----------
    trade_name : str
        Currency pair, forwarded to get_candle.
    span : str
        Candle width, forwarded to get_candle.
    back_day : datetime.datetime
        First day of the window (only its calendar date is used).
    today : datetime.datetime
        Last day of the window; its hour decides whether today's daily
        batch is requested.

    Returns
    -------
    pandas.DataFrame
        The per-request frames concatenated in chronological request order.
        Each frame keeps its own row index (no re-indexing is done here).

    Raises
    ------
    ValueError
        If the window contains no period to request (back_day after the
        last requestable day).
    """
    # Work on calendar dates so that the time of day of back_day can never
    # cause the final period to be skipped.
    first_day = datetime.date(back_day.year, back_day.month, back_day.day)
    last_day = datetime.date(today.year, today.month, today.day)

    if span in ('1min', '5min', '15min', '30min', '1hour'):
        # Original author's note: the daily reference time is 09:00, so
        # before 09:00 the newest requestable day is yesterday.
        if today.hour < 9:
            last_day = last_day - datetime.timedelta(days=1)
        n_days = (last_day - first_day).days + 1
        request_keys = [
            (first_day + datetime.timedelta(days=offset)).strftime('%Y%m%d')
            for offset in range(n_days)
        ]
    elif first_day <= last_day:
        # One request per calendar year touched by the window, including the
        # year of `today` even when the window is shorter than twelve months.
        request_keys = [str(year) for year in range(first_day.year, last_day.year + 1)]
    else:
        request_keys = []

    if not request_keys:
        raise ValueError(
            'no data to fetch: back_day ({0}) is after the last requestable day ({1})'.format(
                first_day, last_day))

    # Collect all frames first and concatenate once.
    frames = [get_candle(trade_name, span, key) for key in request_keys]
    return pd.concat(frames)



In [7]:
def read_date(x):
    """Convert a timestamp in milliseconds to a datetime.

    The value is divided by 1000 and handed to
    ``datetime.datetime.fromtimestamp`` without a timezone argument.
    """
    seconds = x/1000
    return datetime.datetime.fromtimestamp(seconds)


In [8]:

# Simple moving average (SMA)
# df: the candle DataFrame built above (needs a 'Close' column)
# num: how many consecutive rows are averaged
def getMA( df,num ):
    """Return the simple moving average of df['Close'].

    Rows are read by position, so the result does not depend on the
    DataFrame's index labels (non-zero-based or duplicated labels are fine).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a numeric 'Close' column.
    num : int
        Window length in rows.

    Returns
    -------
    numpy.ndarray
        Float array with one value per full window, i.e. of length
        ``len(df) - num + 1``; empty when df has fewer than num rows.
    """
    closes = np.asarray(df['Close'])
    n_windows = len(closes) - num + 1
    # One contiguous slice per window.
    return np.array(
        [np.average(closes[start:start + num]) for start in range(n_windows)],
        dtype=float,
    )

In [9]:
def getSTD( df,num ):
    """Return the rolling standard deviation of df['Close'].

    Each window is passed to ``np.std`` with its default arguments. Rows are
    read by position, so the result does not depend on the DataFrame's index
    labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a numeric 'Close' column.
    num : int
        Window length in rows.

    Returns
    -------
    numpy.ndarray
        Float array with one value per full window, i.e. of length
        ``len(df) - num + 1``; empty when df has fewer than num rows.
    """
    closes = np.asarray(df['Close'])
    n_windows = len(closes) - num + 1
    # Standard deviation (not the mean) of each contiguous window.
    return np.array(
        [np.std(closes[start:start + num]) for start in range(n_windows)],
        dtype=float,
    )

In [10]:
# Golden-cross / dead-cross detection
def golden_dead(data_df1):
    """Flag the rows where the 'Cross' column changes sign.

    Row i+1 is a golden cross when Cross[i] < 0 and Cross[i+1] > 0, and a
    dead cross when Cross[i] > 0 and Cross[i+1] < 0. A step that starts or
    ends exactly at 0 is not flagged, and row 0 is never flagged. Rows are
    read by position, so index labels do not matter.

    Parameters
    ----------
    data_df1 : pandas.DataFrame
        Must contain a numeric 'Cross' column.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Two integer 0/1 arrays (golden, dead), each as long as the column.
    """
    cross = np.asarray(data_df1['Cross'])
    golden = np.zeros(len(cross), dtype=int)
    dead = np.zeros(len(cross), dtype=int)

    # Compare every value with its successor in one vectorised pass.
    previous, current = cross[:-1], cross[1:]
    golden[1:] = (previous < 0) & (current > 0)
    dead[1:] = (previous > 0) & (current < 0)

    return golden,dead
    

In [11]:
# Build the trend line through the highs
# df: DataFrame with 'index' (x values) and 'High' columns
# num: how many rows each window looks back over
def make_high_trend_line(df,num):
    """Fit a line through the highs of every num-row window.

    For each window of num consecutive rows, a regression of 'High' on
    'index' is fitted and only the points strictly above the line are kept;
    this repeats until at most 3 points remain. If 2 or 3 points remain the
    line is refitted through them; if fewer remain, the most recent fit is
    kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain numeric 'index' and 'High' columns.
    num : int
        Window length in rows.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Slopes and intercepts. Each starts with ``num - 1`` zeros followed
        by one value per window, so the arrays are as long as df whenever
        df has at least num rows.

    Raises
    ------
    ValueError
        If a window leaves no fit available (for example num == 1).
    """
    padding = np.zeros(num-1,dtype=float)
    # Pull the two columns out once instead of copying the frame per window.
    x_all = np.asarray(df['index'])
    high_all = np.asarray(df['High'])

    slopes = []
    intercepts = []
    fit = None
    for start in range(len(df)-num+1):
        xs = x_all[start:start+num]
        ys = high_all[start:start+num]

        # Keep only the points above the current regression line.
        while len(xs) > 3:
            fit = linregress(x=xs, y=ys)
            above = ys > fit[0]*xs + fit[1]
            if above.all():
                # No point was removed, so another pass would never terminate.
                break
            xs, ys = xs[above], ys[above]

        if len(xs) > 1:
            fit = linregress(x=xs, y=ys)
        if fit is None:
            raise ValueError('num is too small to fit a trend line: {0}'.format(num))

        slopes.append(fit[0])
        intercepts.append(fit[1])

    slope = np.concatenate((padding, np.asarray(slopes, dtype=float)))
    intercept = np.concatenate((padding, np.asarray(intercepts, dtype=float)))
    return slope,intercept

In [12]:
# Build the trend line through the lows
# df: DataFrame with 'index' (x values) and 'Low' columns
# num: how many rows each window looks back over
def make_low_trend_line(df,num):
    """Fit a line through the lows of every num-row window.

    For each window of num consecutive rows, a regression of 'Low' on
    'index' is fitted and only the points strictly below the line are kept;
    this repeats until at most 3 points remain. If 2 or 3 points remain the
    line is refitted through them; if fewer remain, the most recent fit is
    kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain numeric 'index' and 'Low' columns.
    num : int
        Window length in rows.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Slopes and intercepts. Each starts with ``num - 1`` zeros followed
        by one value per window, so the arrays are as long as df whenever
        df has at least num rows.

    Raises
    ------
    ValueError
        If a window leaves no fit available (for example num == 1).
    """
    padding = np.zeros(num-1,dtype=float)
    # Pull the two columns out once instead of copying the frame per window.
    x_all = np.asarray(df['index'])
    low_all = np.asarray(df['Low'])

    slopes = []
    intercepts = []
    fit = None
    for start in range(len(df)-num+1):
        xs = x_all[start:start+num]
        ys = low_all[start:start+num]

        # Keep only the points below the current regression line.
        while len(xs) > 3:
            fit = linregress(x=xs, y=ys)
            below = ys < fit[0]*xs + fit[1]
            if below.all():
                # No point was removed, so another pass would never terminate.
                break
            xs, ys = xs[below], ys[below]

        if len(xs) > 1:
            fit = linregress(x=xs, y=ys)
        if fit is None:
            raise ValueError('num is too small to fit a trend line: {0}'.format(num))

        slopes.append(fit[0])
        intercepts.append(fit[1])

    slope = np.concatenate((padding, np.asarray(slopes, dtype=float)))
    intercept = np.concatenate((padding, np.asarray(intercepts, dtype=float)))
    return slope,intercept

In [13]:
# slope: gradient(s), intercept: offset(s), df: DataFrame with an 'index' column
def make_trend_value(slope,intercept,df):
    """Evaluate ``slope * x + intercept`` for every x in df['index']."""
    x_positions = np.array(df['index'])
    scaled = slope*x_positions
    return scaled + intercept

In [14]:
# How to use the functions above: parameters for this run
today = datetime.datetime.today() 
back_day = today - datetime.timedelta(days=90)  # fetch window starts 90 days before now
trade_name = 'xrp_jpy'  # currency pair passed to make_df
span = '30min'  # candle width passed to make_df
num = 5  # window length of the short moving average / standard deviation

In [15]:
# Fetch the base candle data
old = time.time()
data_df1 =  make_df(trade_name,span,back_day,today)
# read_date turns the millisecond timestamps into datetime objects
data_df1['Timestamp'] = data_df1['Timestamp'].apply(read_date)
# Assign a fresh 0..n-1 row index; the old per-request index becomes an 'index' column
data_df1 = data_df1.reset_index()
# Original author's note: the fetched values are all str, so every column
# except Timestamp is cast to float. Timestamp already holds datetimes and is excluded.
execlude = ['Timestamp']
cast = [col for col in data_df1.columns if col not in execlude]
# One astype call for all columns instead of re-copying the frame per column
data_df1 = data_df1.astype({col: float for col in cast})

print(time.time()-old)
print(data_df1.tail())

27.59193444252014
      index    Open    High     Low   Close        Volume           Timestamp
4343   23.0  33.693  33.850  33.600  33.780  2.907404e+06 2018-12-07 20:30:00
4344   24.0  33.780  34.127  33.551  33.943  5.991032e+06 2018-12-07 21:00:00
4345   25.0  33.942  34.115  33.800  34.051  3.486517e+06 2018-12-07 21:30:00
4346   26.0  34.011  34.130  33.900  34.050  4.512055e+06 2018-12-07 22:00:00
4347   27.0  34.040  34.198  33.981  34.083  1.420519e+06 2018-12-07 22:30:00


In [16]:
# Moving-average lines as NumPy arrays.
# getMA yields one value per full window, so the first (window - 1) positions
# are filled with 0.0 to make each array as long as data_df1.
zero5 = np.zeros(num-1,dtype = float )
zero25 = np.zeros(num+20-1,dtype = float )
MA5 = np.concatenate((zero5, getMA( data_df1,num)))
MA25 = np.concatenate((zero25, getMA( data_df1,num+20)))
print(MA25,MA5)
print(MA25.shape,type(MA25),MA5.shape,type(MA5))

[ 0.       0.       0.      ... 33.76788 33.7898  33.81844] [ 0.      0.      0.     ... 33.8002 33.9034 33.9814]
(4348,) <class 'numpy.ndarray'> (4348,) <class 'numpy.ndarray'>


In [17]:
# Rolling standard deviation of Close over `num` rows, front-padded with
# (num - 1) zeros so it lines up with the rows of data_df1
zero5 = np.zeros(num-1,dtype = float )
STD5 = np.concatenate((zero5, getSTD( data_df1,num)))
print(STD5,STD5.shape,type(STD5))

[0.         0.         0.         ... 0.18223874 0.14445844 0.11127372] (4348,) <class 'numpy.ndarray'>


In [18]:
# Build the hand-made feature columns
# Candle geometry
data_df1['High-Low']=data_df1['High']-data_df1['Low']
data_df1['Close-Open']=data_df1['Close']-data_df1['Open']
# Body-to-range ratio; the small constant avoids division by zero when High == Low
data_df1['Similarity'] = data_df1['Close-Open'] / (data_df1['High-Low'] + 0.000001)
# Volume-weighted variants (the trailing space in the column names is part of the key)
data_df1['(High-Low)*Volume '] = data_df1['High-Low']*data_df1['Volume']
data_df1['(Close-Open)*Volume '] = data_df1['Close-Open']*data_df1['Volume']
# Moving averages and their difference (input to golden_dead)
data_df1['MA5'] = MA5
data_df1['MA25'] = MA25
data_df1['Cross'] = MA5 - MA25
# Overwrite 'index' with a 1-based running row number (x axis of the trend lines)
data_df1['index'] = data_df1.index + 1
# Bollinger bands: MA5 plus / minus 1, 2 and 3 rolling standard deviations
data_df1['bbd_p1']= MA5 + (STD5 * 1)
data_df1['bbd_p2']= MA5 + (STD5 * 2)
data_df1['bbd_p3']= MA5 + (STD5 * 3)
data_df1['bbd_m1']= MA5 - (STD5 * 1)
data_df1['bbd_m2']= MA5 - (STD5 * 2)
data_df1['bbd_m3']= MA5 - (STD5 * 3)

In [19]:
# Store the golden-cross / dead-cross flags returned by golden_dead as two new columns
data_df1['golden'],data_df1['dead']  = golden_dead(data_df1)

In [20]:
# High-side trend line over 32-row windows; the elapsed seconds are printed
old1 = time.time()
slope1,intercept1 = make_high_trend_line(df=data_df1,num=32)
print(time.time()-old1)

46.2927520275116


In [21]:
# Low-side trend line over 32-row windows; the elapsed seconds are printed
old1 = time.time()
slope2,intercept2 = make_low_trend_line(df=data_df1,num=32)
print(time.time()-old1)

48.361074924468994


In [22]:
# Evaluate each row's trend line (slope * index + intercept) at that row's own index
tr_high = make_trend_value(slope1,intercept1,data_df1)
tr_low  = make_trend_value(slope2,intercept2,data_df1)

In [23]:
# Attach the trend-line slopes and values as feature columns
data_df1['high_slope'] = slope1
data_df1['low_slope'] = slope2
data_df1['tr_high'] = tr_high
data_df1['tr_low'] = tr_low
# Midpoint between the high-side and low-side trend values
data_df1['tr_mid'] = (tr_high + tr_low)/2

In [24]:
# Seconds elapsed since the data fetch started
print(time.time()-old)
# Display the derived columns for the last 9 rows
data_df1.tail(9)

126.5589292049408


Unnamed: 0,index,Open,High,Low,Close,Volume,Timestamp,High-Low,Close-Open,Similarity,...,bbd_m1,bbd_m2,bbd_m3,golden,dead,high_slope,low_slope,tr_high,tr_low,tr_mid
4339,4340,33.55,33.802,33.398,33.4,3744676.0,2018-12-07 18:30:00,0.404,-0.15,-0.371286,...,33.387344,33.074287,32.761231,0,0,-0.099731,-0.066214,34.309167,32.177,33.243083
4340,4341,33.4,33.72,33.371,33.49,3240997.0,2018-12-07 19:00:00,0.349,0.09,0.257879,...,33.348348,33.129096,32.909844,0,0,-0.099731,-0.066214,34.209436,32.110786,33.160111
4341,4342,33.495,33.679,33.32,33.534,2867224.0,2018-12-07 19:30:00,0.359,0.039,0.108635,...,33.412249,33.348298,33.284346,0,0,-0.099731,-0.066214,34.109705,32.044571,33.077138
4342,4343,33.566,34.0,33.5,33.693,6162830.0,2018-12-07 20:00:00,0.5,0.127,0.253999,...,33.43857,33.343141,33.247711,0,0,-0.099297,-0.066214,34.047899,31.978357,33.013128
4343,4344,33.693,33.85,33.6,33.78,2907404.0,2018-12-07 20:30:00,0.25,0.087,0.347999,...,33.441282,33.303164,33.165045,0,0,-0.101284,-0.066214,33.903589,31.912143,32.907866
4344,4345,33.78,34.127,33.551,33.943,5991032.0,2018-12-07 21:00:00,0.576,0.163,0.282986,...,33.522813,33.357626,33.192439,0,0,-0.088723,-0.066214,34.105882,31.845929,32.975905
4345,4346,33.942,34.115,33.8,34.051,3486517.0,2018-12-07 21:30:00,0.315,0.109,0.346031,...,33.617961,33.435723,33.253484,1,0,-0.086033,-0.066214,34.115,31.779714,32.947357
4346,4347,34.011,34.13,33.9,34.05,4512055.0,2018-12-07 22:00:00,0.23,0.039,0.169564,...,33.758942,33.614483,33.470025,0,0,-0.080248,0.0025,34.127299,33.050167,33.588733
4347,4348,34.04,34.198,33.981,34.083,1420519.0,2018-12-07 22:30:00,0.217,0.043,0.198156,...,33.870126,33.758853,33.647579,0,0,-0.072677,0.005,34.198,33.101,33.6495


In [25]:
# Machine learning: predict the next Close
# Columns that are not used as features
exe_cols = ['index','Timestamp']
feature_cols = list(filter(lambda name: name not in exe_cols, data_df1.columns))
print(feature_cols)
# Convert the feature columns to a NumPy array for scikit-learn
data_np= np.array(data_df1.loc[:, feature_cols])

# Shape of the feature matrix (rows, features)
print(data_np.shape)

['Open', 'High', 'Low', 'Close', 'Volume', 'High-Low', 'Close-Open', 'Similarity', '(High-Low)*Volume ', '(Close-Open)*Volume ', 'MA5', 'MA25', 'Cross', 'bbd_p1', 'bbd_p2', 'bbd_p3', 'bbd_m1', 'bbd_m2', 'bbd_m3', 'golden', 'dead', 'high_slope', 'low_slope', 'tr_high', 'tr_low', 'tr_mid']
(4348, 26)


In [26]:
# Target (1-D): the Close of the following row, obtained by dropping the first Close
Y = np.array(data_df1['Close'])[1:]
print(Y,Y.shape)
# Features: the first len(Y) rows, i.e. every row that has a following Close
X = data_np[:len(Y)]
print(X,X.shape)

[32.19  32.193 32.196 ... 34.051 34.05  34.083] (4347,)
[[31.93       32.352      31.9        ...  0.          0.
   0.        ]
 [32.346      32.48       32.19       ...  0.          0.
   0.        ]
 [32.2        32.428      32.101      ...  0.          0.
   0.        ]
 ...
 [33.78       34.127      33.551      ... 34.10588152 31.84592857
  32.97590504]
 [33.942      34.115      33.8        ... 34.115      31.77971429
  32.94735714]
 [34.011      34.13       33.9        ... 34.12729914 33.05016667
  33.5887329 ]] (4347, 26)


In [27]:
# Train on the first 199/200 of the rows; the remainder is held out for testing.
# Integer arithmetic keeps the split point exact (the float form
# len(X)//(200/199) is sensitive to rounding).
L = len(X) * 199 // 200
print(L)
# The first 50 rows are left out of training.
# NOTE(review): presumably because the zero-padded MA / trend-line columns
# are not meaningful there — confirm.
train_x = X[50:L,:]
train_y = Y[50:L]

4325


In [28]:
# Everything from row L onwards is the test set
test_x = X[L:]
test_y = Y[L:]

In [29]:
# Random-forest regressor with a fixed seed (no hyper-parameters chosen at this point)
rf = RandomForestRegressor(random_state=1234)

In [30]:
# Grid search over forest size and tree depth on all features,
# using 3-fold cross-validation scored by negative mean squared error
params = {'n_estimators': [15,20,25], 'max_depth': [5,17,30]}
gscv = GridSearchCV(rf, param_grid=params, verbose=1,
                    cv=3, scoring='neg_mean_squared_error')
gscv.fit(train_x, train_y)

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:   37.8s finished


GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=1234, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [15, 20, 25], 'max_depth': [5, 17, 30]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=1)

In [31]:
# Best parameter combination found by the grid search
a = gscv.best_params_

In [32]:
# Show the winning parameters and unpack them for the final fit
print(a)
n_estimators = a.get('n_estimators')
max_depth = a.get('max_depth')

{'max_depth': 17, 'n_estimators': 15}


In [33]:
# Refit a forest with the grid-search winners and score it on the held-out rows
rf = RandomForestRegressor(
    random_state=1234,
    max_depth=max_depth,
    n_estimators=n_estimators,
).fit(train_x, train_y)
y_pred_rf = rf.predict(test_x)
rf_mse = mean_squared_error(y_true=test_y, y_pred=y_pred_rf)
print('予測精度（低いほど良い）')
print('RandomForest MSE: ', rf_mse)

予測精度（低いほど良い）
RandomForest MSE:  0.25529998080808175


In [34]:
# Print the prediction results of the random forest next to the actual values
result = pd.DataFrame(y_pred_rf)
result.columns = ['y_pred_rf']
result['test_y'] = test_y
# Despite the column name 'RMS', this is the plain signed difference
# (actual - predicted), not a root-mean-square error
result['RMS'] = (result['test_y']-result['y_pred_rf'])
# NOTE(review): the first message below says "one hour"; the features of one
# row are paired with the Close of the next row, which with span = '30min'
# is a 30-minute step — confirm and reword.
print('１時間前の特徴量データを元に１時間後のCloseを予測・実際のCloseと比較')
print('実際の価格と予測した価格の誤差')
print(result.loc[:])

１時間前の特徴量データを元に１時間後のCloseを予測・実際のCloseと比較
実際の価格と予測した価格の誤差
    y_pred_rf  test_y       RMS
0   34.041600  33.319 -0.722600
1   34.288133  33.076 -1.212133
2   33.875533  33.503 -0.372533
3   33.889600  33.650 -0.239600
4   33.889600  34.200  0.310400
5   33.535133  34.886  1.350867
6   34.390600  34.202 -0.188600
7   33.816867  33.828  0.011133
8   33.823000  34.300  0.477000
9   33.920467  34.154  0.233533
10  33.823867  33.991  0.167133
11  33.830133  33.404 -0.426133
12  33.768667  33.553 -0.215667
13  33.813200  33.400 -0.413200
14  33.803400  33.490 -0.313400
15  33.803400  33.534 -0.269400
16  33.550133  33.693  0.142867
17  33.566067  33.780  0.213933
18  33.579000  33.943  0.364000
19  33.699467  34.051  0.351533
20  33.699467  34.050  0.350533
21  33.699133  34.083  0.383867


In [35]:
# Zoomed plot: the most recent (up to) 15 test points, actual vs predicted
actual_tail = test_y[-15:]
predicted_tail = y_pred_rf[-15:]
# The x range is derived from each plotted slice, so it also works with fewer than 15 test rows
plt.plot(range(len(actual_tail)), actual_tail, label='Actual price', color='blue', marker = 'o')
plt.plot(range(len(predicted_tail)), predicted_tail, label='Predicted price', color='red', marker ='x')
plt.xlabel(span)
# The original label held a bare backslash (an invalid escape sequence),
# presumably a mis-encoded yen sign — TODO confirm the intended unit
plt.ylabel('Price (JPY)')
n3='{0} Price by RandomForest'.format(trade_name)
plt.title(n3)
plt.grid(True)
plt.legend()
n4='{0}2 by RandomForest.png'.format(trade_name)
plt.savefig(n4)
# Show before closing; closing first leaves nothing to display
plt.show()
plt.close()

In [36]:
# Greedy forward selection (add one feature at a time)
def get_gfs_feature_indices(X, y, features, clf):
    """Select feature columns by greedy forward selection.

    The data is split 70 % / 30 % without shuffling. In every round each
    not-yet-chosen feature is tried together with the already chosen ones;
    the candidate with the lowest hold-out MSE is added. Selection stops
    when the best candidate does not lower the MSE any further or when all
    features are chosen.

    Parameters
    ----------
    X : numpy.ndarray
        2-D feature matrix; column i belongs to features[i].
    y : array-like
        Target values, one per row of X.
    features : sequence of str
        Feature names in column order.
    clf : estimator
        Object with fit(X, y) and predict(X); it is refitted for every candidate.

    Returns
    -------
    list of int
        Column indices of the chosen features, in the order they were added.
    """
    X_train_, X_test_, y_train_, y_test_ = \
        train_test_split(X, y, test_size=0.3, shuffle=False)
    feature_indices = {name: idx for idx, name in enumerate(features)}
    # Unique names in column order, so candidates are always evaluated in the
    # same order from run to run (plain set iteration is not stable for strings).
    remaining = sorted(feature_indices, key=feature_indices.get)
    chosen = []
    last_mse = np.inf
    while remaining:
        scored = []
        for candidate in remaining:
            indices = [feature_indices[name] for name in chosen + [candidate]]
            clf.fit(X_train_[:, indices], y_train_)
            y_pred = clf.predict(X_test_[:, indices])
            scored.append((mean_squared_error(y_test_, y_pred), candidate))
        mse, best = min(scored)
        if mse >= last_mse:
            # The best remaining feature no longer improves the score
            break
        last_mse = mse
        print('Newly Added Feature: {},\tMSE Score: {}'.format(best, mse))
        chosen.append(best)
        remaining.remove(best)
    return [feature_indices[name] for name in chosen]

In [37]:
# Run greedy forward selection with the function above and report the MSE per added feature
print(feature_cols,len(feature_cols))
# The estimator is seeded like every other forest in this notebook so the selection is repeatable
selected_feature_index_by_RandomForestRegressor = get_gfs_feature_indices(X=train_x,y=train_y,features=feature_cols,clf= RandomForestRegressor(random_state=1234))
print(selected_feature_index_by_RandomForestRegressor)

['Open', 'High', 'Low', 'Close', 'Volume', 'High-Low', 'Close-Open', 'Similarity', '(High-Low)*Volume ', '(Close-Open)*Volume ', 'MA5', 'MA25', 'Cross', 'bbd_p1', 'bbd_p2', 'bbd_p3', 'bbd_m1', 'bbd_m2', 'bbd_m3', 'golden', 'dead', 'high_slope', 'low_slope', 'tr_high', 'tr_low', 'tr_mid'] 26
Newly Added Feature: Close,	MSE Score: 0.6922420954848981
Newly Added Feature: Close-Open,	MSE Score: 0.45682590143414004
[3, 6]


In [38]:
# Random forest with default hyper-parameters on the selected feature columns only
rf = RandomForestRegressor(random_state=1234).fit(
    train_x[:, selected_feature_index_by_RandomForestRegressor], train_y)
y_pred_rf = rf.predict(test_x[:, selected_feature_index_by_RandomForestRegressor])
rf_mse = mean_squared_error(y_true=test_y, y_pred=y_pred_rf)
print('予測精度（低いほど良い）')
print('RandomForest MSE: ', rf_mse)

予測精度（低いほど良い）
RandomForest MSE:  0.4854597763636374


In [39]:
# Grid search over forest size and tree depth on the selected feature columns,
# using 3-fold cross-validation scored by negative mean squared error
params = {'n_estimators': [15,20,25], 'max_depth': [6,10,20]}
gscv = GridSearchCV(rf, param_grid=params, verbose=1,
                    cv=3, scoring='neg_mean_squared_error')
gscv.fit(train_x[:, selected_feature_index_by_RandomForestRegressor], train_y)

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:    4.7s finished


GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=1234, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [15, 20, 25], 'max_depth': [6, 10, 20]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=1)

In [40]:
# Best parameter combination found by the grid search on the selected features
a = gscv.best_params_

In [41]:
# Show the winning parameters and unpack them for the final fit
print(a)
n_estimators = a.get('n_estimators')
max_depth = a.get('max_depth')

{'max_depth': 10, 'n_estimators': 15}


In [42]:
# Refit on the selected feature columns with the grid-search winners and score on the held-out rows
rf = RandomForestRegressor(
    random_state=1234,
    max_depth=max_depth,
    n_estimators=n_estimators,
).fit(train_x[:, selected_feature_index_by_RandomForestRegressor], train_y)
y_pred_rf_gfs = rf.predict(test_x[:, selected_feature_index_by_RandomForestRegressor])
rf_mse = mean_squared_error(y_true=test_y, y_pred=y_pred_rf_gfs)
print('予測精度（低いほど良い）')
print('RandomForest MSE: ', rf_mse)

予測精度（低いほど良い）
RandomForest MSE:  0.4288436440404046


In [43]:
# Print the prediction results of the random forest (selected features) next to the actual values
result = pd.DataFrame(y_pred_rf_gfs)
result.columns = ['y_pred_rf_gfs']
result['test_y'] = test_y
# Despite the column name 'RMS', this is the plain signed difference
# (actual - predicted), not a root-mean-square error
result['RMS'] = (result['test_y']-result['y_pred_rf_gfs'])
# NOTE(review): the first message below says "one hour"; the features of one
# row are paired with the Close of the next row, which with span = '30min'
# is a 30-minute step — confirm and reword.
print('１時間前の特徴量データを元に１時間後のCloseを予測・実際のCloseと比較')
print('実際の価格と予測した価格の誤差')
print(result.loc[:])

１時間前の特徴量データを元に１時間後のCloseを予測・実際のCloseと比較
実際の価格と予測した価格の誤差
    y_pred_rf_gfs  test_y       RMS
0       33.928733  33.319 -0.609733
1       33.949200  33.076 -0.873200
2       33.949200  33.503 -0.446200
3       34.489067  33.650 -0.839067
4       34.668400  34.200 -0.468400
5       33.759200  34.886  1.126800
6       34.505133  34.202 -0.303133
7       33.578267  33.828  0.249733
8       33.648600  34.300  0.651400
9       34.009133  34.154  0.144867
10      33.650267  33.991  0.340733
11      33.650267  33.404 -0.246267
12      33.934667  33.553 -0.381667
13      34.675133  33.400 -1.275133
14      34.010133  33.490 -0.520133
15      34.675133  33.534 -1.141133
16      34.666000  33.693 -0.973000
17      34.668400  33.780 -0.888400
18      33.964600  33.943 -0.021600
19      33.936733  34.051  0.114267
20      33.933333  34.050  0.116667
21      33.927067  34.083  0.155933


In [44]:
# Zoomed plot: the most recent (up to) 15 test points, actual vs predicted (selected features)
actual_tail = test_y[-15:]
predicted_tail = y_pred_rf_gfs[-15:]
# The x range is derived from each plotted slice, so it also works with fewer than 15 test rows
plt.plot(range(len(actual_tail)), actual_tail, label='Actual price', color='blue', marker = 'o')
plt.plot(range(len(predicted_tail)), predicted_tail, label='Predicted price', color='red', marker ='x')
plt.xlabel(span)
# The original label held a bare backslash (an invalid escape sequence),
# presumably a mis-encoded yen sign — TODO confirm the intended unit
plt.ylabel('Price (JPY)')
n3='{0} Price by RandomForest gfs'.format(trade_name)
plt.title(n3)
plt.grid(True)
plt.legend()
n4='{0}2 by RandomForest gfs.png'.format(trade_name)
plt.savefig(n4)
# Show before closing; closing first leaves nothing to display
plt.show()
plt.close()