In [1]:
#python_bitbankccのパッケージをインポート
#cloud9で起動するときのコマンド
#jupyter notebook --ip $IP --port $PORT --no-browser
import python_bitbankcc 
import datetime
import os 
import time
import numpy as np
import pandas as pd
import sys
from dateutil.relativedelta import relativedelta

In [2]:
#機械学習用のモジュール
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
from sklearn import linear_model
% matplotlib inline
from __future__ import print_function
import copy
import matplotlib
matplotlib.style.use('ggplot')

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [3]:
# public API classのオブジェクトを取得
pub = python_bitbankcc.public()

In [4]:
#APIから基本データの取得、dataframeへ挿入
def get_candle(trade_name,span,back_day):
    # ロウソク足データを取得
    pub = python_bitbankcc.public()
    value = pub.get_candlestick( trade_name,span, back_day )
    candle = value['candlestick'][0]
    #print(candle['ohlcv'][0])
    dataframe = pd.DataFrame(candle['ohlcv'],columns=["Open", "High","Low","Close","Volume","Timestamp"])
    return dataframe


In [5]:
#指定した日から今日までの基本データをdataframeにして取得
#back_day,todayはdatetime
# span = ['1min', '5min', '15min', '30min', '1hour', '4hour', '8hour', '12hour', '1day', '1week']
#trade_name = ['btc_jpy', 'xrp_jpy', 'ltc_btc', 'eth_btc', 'mona_jpy', 'mona_btc', 'bcc_jpy', 'bcc_btc']

def make_df(trade_name,span,back_day,today):
    i = 1
    if (span =='1min')or(span== '5min')or(span== '15min')or(span== '30min')or(span== '1hour'):
        #基準が９時なので、そこを合わせてあげる
        if 0 <= today.hour < 9:
            k = 1
        elif 9 <= today.hour <= 23:
            k = 0

        while back_day <= today - datetime.timedelta(days=k):
            if i == 1:
                df1 = get_candle(trade_name,span,datetime.datetime.strftime(back_day, '%Y%m%d'))
                back_day = back_day + datetime.timedelta(days=1)
                #print(back_day,len(df1))
                i += 1
            else:
                df2 = get_candle(trade_name,span,datetime.datetime.strftime(back_day, '%Y%m%d'))
                df1 = pd.concat([df1, df2])
                back_day = back_day + datetime.timedelta(days=1)
                #print(back_day,len(df1))
                i += 1
    else:
        
        today = datetime.date(today.year, today.month, today.day)
        back_day = datetime.date(back_day.year, back_day.month, back_day.day)


        while back_day <= today:
            if i == 1:
                df1 = get_candle(trade_name,span,datetime.datetime.strftime(back_day, '%Y'))
                back_day = back_day + relativedelta(years=1)
                i += 1
            else:
                df2 = get_candle(trade_name,span,datetime.datetime.strftime(back_day, '%Y'))
                df1 = pd.concat([df1, df2])
                back_day = back_day +  relativedelta(years=1)
                i += 1
            

    return df1



In [6]:
def read_date(x):
    return datetime.datetime.fromtimestamp(x/1000)


In [7]:

# 単純移動平均（SMA）を取得する関数
#上記のdfデータフレーム
#numいくつの平均を求めるか
def getMA( df,num ):
 
    tmp = []
    avg = np.array([])
    for i in range(len(df) - num + 1):
        for j in range(num):
            #print(df['Close'][i+j])
            tmp.append( df['Close'][i+j])
            
         # 平均値計算
        value = np.average(tmp)
        avg = np.append(avg,value)
        tmp = []
    
    return avg

In [8]:
def getSTD( df,num ):
 
    tmp = []
    std = np.array([])
    for i in range(len(df) - num + 1):
        for j in range(num):
            #print(df['Close'][i+j])
            tmp.append( df['Close'][i+j])
            
         # 平均値計算
        value = np.std(tmp)
        std = np.append(std,value)
        tmp = []
    
    return std

In [9]:
#正解かどうかを可視化する ゴールデン・デッドクロス判定
def golden_dead(data_df1):
    golden=np.zeros(len(data_df1['Cross']),dtype=int)
    dead=np.zeros(len(data_df1['Cross']),dtype=int)
    for i in range(len(data_df1['Cross'])-1):
        x = data_df1['Cross'][i]
        y = data_df1['Cross'][i+1]
        if ((x <= 0) & (y<=0)) |((x >= 0) & (y>=0)):
            pass
        elif ((x <= 0) & (y>=0)) :
            golden=np.insert(golden,i+1,1)
            golden=np.delete(golden,i+2)
        elif ((x >= 0) & (y<=0)) :
            dead=np.insert(dead,i+1,1)
            dead=np.delete(dead,i+2)

    return golden,dead
    

In [10]:
#関数の使い方
today = datetime.datetime.today() 
back_day = today - datetime.timedelta(days=200)
trade_name = 'xrp_jpy'
span = '1hour'
num = 5

In [11]:
#基本データの取得
old = time.time()
data_df1 =  make_df(trade_name,span,back_day,today)
data_df1['Timestamp'] = data_df1['Timestamp'].apply(read_date)
#indexの割り振り
data_df1 = data_df1.reset_index()
#df1の値は全てstrなのでTimestamp以外floatにキャストする
#Timestampは文字列のままでいいので削除することを指定
execlude = ['Timestamp']
cast = [col for col in data_df1.columns if col not in execlude]
for item in cast:
    data_df1 = data_df1.astype({item: float})
print(data_df1)

      index    Open    High     Low   Close        Volume           Timestamp
0       0.0  53.065  54.700  53.000  54.400  3.840467e+06 2018-03-31 09:00:00
1       1.0  54.332  55.802  53.812  55.704  4.568674e+06 2018-03-31 10:00:00
2       2.0  55.704  56.200  54.650  55.402  6.129927e+06 2018-03-31 11:00:00
3       3.0  55.402  55.498  54.136  54.221  2.934375e+06 2018-03-31 12:00:00
4       4.0  54.221  55.350  54.130  54.900  2.495468e+06 2018-03-31 13:00:00
5       5.0  54.840  55.005  53.600  53.750  2.507258e+06 2018-03-31 14:00:00
6       6.0  53.786  54.200  53.100  53.200  2.625376e+06 2018-03-31 15:00:00
7       7.0  53.102  53.742  53.100  53.350  1.513584e+06 2018-03-31 16:00:00
8       8.0  53.350  54.200  53.350  53.628  2.148779e+06 2018-03-31 17:00:00
9       9.0  53.601  54.200  53.200  54.022  2.086983e+06 2018-03-31 18:00:00
10     10.0  54.100  54.988  53.850  54.451  4.349989e+06 2018-03-31 19:00:00
11     11.0  54.451  54.960  54.200  54.413  3.083111e+06 2018-0

In [12]:
#移動平均線np
MA5 = getMA( data_df1,num)
MA25 = getMA( data_df1,num+20)
zero5 = np.zeros(num-1,dtype = float )
zero25 = np.zeros(num+20-1,dtype = float )
MA5 = np.insert(MA5, 0, zero5)
MA25 = np.insert(MA25, 0, zero25)
print(MA25,MA5)
print(MA25.shape,type(MA25),MA5.shape,type(MA5))

[ 0.       0.       0.      ... 50.9214  51.05336 51.11464] [ 0.      0.      0.     ... 51.9408 52.0304 52.1274]
(4807,) <class 'numpy.ndarray'> (4807,) <class 'numpy.ndarray'>


In [13]:
#標準偏差計算
STD5 = getSTD( data_df1,num)
zero5 = np.zeros(num-1,dtype = float )
STD5 = np.insert(STD5, 0, zero5)
print(STD5,STD5.shape,type(STD5))

[0.         0.         0.         ... 0.22679277 0.32111717 0.32370456] (4807,) <class 'numpy.ndarray'>


In [14]:
#自作変数を作成する
data_df1['High-Low']=data_df1['High']-data_df1['Low']
data_df1['Close-Open']=data_df1['Close']-data_df1['Open']
data_df1['Similarity'] = data_df1['Close-Open'] / (data_df1['High-Low'] + 0.000001)
data_df1['(High-Low)*Volume '] = data_df1['High-Low']*data_df1['Volume']
data_df1['(Close-Open)*Volume '] = data_df1['Close-Open']*data_df1['Volume']
data_df1['High-Low']
data_df1['MA5'] = MA5
data_df1['MA25'] = MA25
data_df1['Cross'] = MA5 - MA25
data_df1['bbd_p1']= MA5 + (STD5 * 1)
data_df1['bbd_p2']= MA5 + (STD5 * 2)
data_df1['bbd_p3']= MA5 + (STD5 * 3)
data_df1['bbd_m1']= MA5 - (STD5 * 1)
data_df1['bbd_m2']= MA5 - (STD5 * 2)
data_df1['bbd_m3']= MA5 - (STD5 * 3)

In [15]:
data_df1['golden'],data_df1['dead']  = golden_dead(data_df1)

In [16]:
#data作成までの時間
print(time.time()-old)
#作成変数の表示
data_df1.loc[len(data_df1)-5:]

47.64020872116089


Unnamed: 0,index,Open,High,Low,Close,Volume,Timestamp,High-Low,Close-Open,Similarity,...,MA25,Cross,bbd_p1,bbd_p2,bbd_p3,bbd_m1,bbd_m2,bbd_m3,golden,dead
4802,2.0,51.845,51.893,51.429,51.595,7347193.0,2018-10-17 11:00:00,0.464,-0.25,-0.538792,...,50.68792,1.27188,52.235017,52.510235,52.785452,51.684583,51.409365,51.134148,0,0
4803,3.0,51.579,52.16,51.569,51.932,7300044.0,2018-10-17 12:00:00,0.591,0.353,0.597292,...,50.79208,1.17772,52.242909,52.516018,52.789127,51.696691,51.423582,51.150473,0,0
4804,4.0,51.938,52.345,51.825,52.282,6028683.0,2018-10-17 13:00:00,0.52,0.344,0.661537,...,50.9214,1.0194,52.167593,52.394386,52.621178,51.714007,51.487214,51.260422,0,0
4805,5.0,52.28,53.197,52.272,52.498,20302740.0,2018-10-17 14:00:00,0.925,0.218,0.235675,...,51.05336,0.97704,52.351517,52.672634,52.993752,51.709283,51.388166,51.067048,0,0
4806,6.0,52.497,52.747,52.25,52.33,5395514.0,2018-10-17 15:00:00,0.497,-0.167,-0.336015,...,51.11464,1.01276,52.451105,52.774809,53.098514,51.803695,51.479991,51.156286,0,0


In [17]:
data_df1[['Close', 'bbd_p2', 'bbd_m2', 'MA5']][len(data_df1)-20:].plot()
plt.title('Bollinger Band')
plt.ylabel('XRP_JPY')
n4='Bollinger Band.png'
plt.savefig(n4)
plt.close()
plt.show()

In [18]:
#次のCloseを予測する機械学習
#使わない変数を除外
exe_cols = ['index','Timestamp']
feature_cols = [col for col in data_df1.columns if col not in exe_cols]
print(feature_cols)
#機械学習用にnp配列に変換
data_np= np.array(data_df1[feature_cols])
print(data_np.shape)

['Open', 'High', 'Low', 'Close', 'Volume', 'High-Low', 'Close-Open', 'Similarity', '(High-Low)*Volume ', '(Close-Open)*Volume ', 'MA5', 'MA25', 'Cross', 'bbd_p1', 'bbd_p2', 'bbd_p3', 'bbd_m1', 'bbd_m2', 'bbd_m3', 'golden', 'dead']
(4807, 21)


In [19]:
Y = np.array(data_df1['Close'])#1次元
Y = np.delete(Y,0)
print(Y,Y.shape)
X = data_np[:len(Y),0:]
print(X[:],X.shape)

[55.704 55.402 54.221 ... 52.282 52.498 52.33 ] (4806,)
[[53.065      54.7        53.         ...  0.          0.
   0.        ]
 [54.332      55.802      53.812      ...  0.          0.
   0.        ]
 [55.704      56.2        54.65       ...  0.          0.
   0.        ]
 ...
 [51.579      52.16       51.569      ... 51.15047281  0.
   0.        ]
 [51.938      52.345      51.825      ... 51.26042169  0.
   0.        ]
 [52.28       53.197      52.272      ... 51.06704848  0.
   0.        ]] (4806, 21)


In [20]:
# 学習では、29/30を使うものとします。これは情報の偏りを防ぐためのものであり、全体でも構いません
L = int(len(X)//(200/199))
print(L)
train_x = X[10:L,:]
train_y = Y[10:L]

4781


In [21]:
# 残りの全てをテストデータとします
test_x = X[L:len(X),:]
test_y = Y[L:len(Y)]

In [22]:
# LinearRegression を使用して線形回帰モデルで学習させよう。
model = linear_model.LinearRegression(normalize=True)
model.fit(train_x,train_y)
# pred_y に対して、テストデータを使用して学習結果を代入
pred_y = model.predict(test_x)
lm_mse = mean_squared_error(test_y,pred_y)
print('LinearRegression MSE',lm_mse)

LinearRegression MSE 0.09073685832472485


In [23]:
#決定木を実行し、MSEを算出
dtr = DecisionTreeRegressor()
dtr.fit(train_x,train_y)
dtr_y_pred2 = dtr.predict(test_x)
dtr_mse = mean_squared_error(test_y,dtr_y_pred2)
print(' DecisionTreeRegressor MSE:',dtr_mse)

 DecisionTreeRegressor MSE: 0.21372019999999953


In [24]:
# ランダムフォレストを実行し、MSEを算出
rf = RandomForestRegressor()
rf.fit(train_x,train_y)
rf_y_pred2 = rf.predict(test_x)
rf_mse = mean_squared_error(test_y,rf_y_pred2)
print(' RandomForestRegressor MSE:',rf_mse)
pred_y = rf_y_pred2
# 予測結果を出力します
result = pd.DataFrame(pred_y)
result.columns = ['pred_y']
result['test_y'] = test_y
#２乗平均誤差ではなく、ただの差
result['RMS'] = (result['test_y']-result['pred_y'])#*(result['test_y']-result['pred_y'])
print(result.loc[:])

 RandomForestRegressor MSE: 0.15094442360000135
     pred_y  test_y     RMS
0   50.3437  50.282 -0.0617
1   50.5569  50.363 -0.1939
2   50.6494  49.977 -0.6724
3   50.0507  49.998 -0.0527
4   49.9533  50.427  0.4737
5   50.4296  50.468  0.0384
6   50.4405  50.150 -0.2905
7   50.1168  50.250  0.1332
8   50.2512  50.139 -0.1122
9   49.9948  50.286  0.2912
10  50.2634  50.242 -0.0214
11  50.1772  50.700  0.5228
12  50.5595  50.915  0.3555
13  50.9236  51.080  0.1564
14  50.9207  51.798  0.8773
15  51.6602  51.950  0.2898
16  51.9727  51.882 -0.0907
17  51.6732  52.427  0.7538
18  52.6170  52.050 -0.5670
19  52.1522  51.845 -0.3072
20  51.8493  51.595 -0.2543
21  51.3431  51.932  0.5889
22  52.0774  52.282  0.2046
23  52.2229  52.498  0.2751
24  52.5588  52.330 -0.2288


In [25]:
# 変数増加法を実行する関数
def get_gfs_feature_indices(X, y, features, clf):
    X_train_, X_test_, y_train_, y_test_ = \
        train_test_split(X, y, test_size=0.3, shuffle=False)
    feature_indices = {feature: idx for idx, feature in enumerate(features)}
    features = set(features)
    last_mse = np.inf
    chosen_features = set()
    while len(chosen_features) < len(features):
        mse_features = []
        for feature in (features - chosen_features):
            candidates = chosen_features.union(set([feature]))
            indices = [feature_indices[feature] for feature in candidates]
            clf.fit(X_train_[:, indices], y_train_)
            y_pred = clf.predict(X_test_[:, indices])
            mse = mean_squared_error(y_test_, y_pred)
            mse_features += [(mse, feature)]
        mse, feature = min(mse_features)
        if mse >= last_mse:
            break
        last_mse = mse
        print('Newly Added Feature: {},\tMSE Score: {}'.format(feature, mse))
        chosen_features.add(feature)
    return [feature_indices[feature] for feature in chosen_features]

In [26]:
# 上記関数を使用して変数増加法を実行し、MSEを算出
#feature_cols = list('abcdefghij')
feature_cols = list('abcdefghijklm')
print(feature_cols,len(feature_cols))

selected_feature_index_by_LinearRegression = get_gfs_feature_indices(X=train_x,y=train_y,features=feature_cols,clf=LinearRegression())
print(selected_feature_index_by_LinearRegression)
selected_feature_index_by_DecisionTreeRegressor = get_gfs_feature_indices(X=train_x,y=train_y,features=feature_cols,clf=DecisionTreeRegressor())
print(selected_feature_index_by_DecisionTreeRegressor)
selected_feature_index_by_RandomForestRegressor = get_gfs_feature_indices(X=train_x,y=train_y,features=feature_cols,clf= RandomForestRegressor())
print(selected_feature_index_by_RandomForestRegressor)

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm'] 13
Newly Added Feature: d,	MSE Score: 0.7382765596299639
Newly Added Feature: k,	MSE Score: 0.7343757592455821
Newly Added Feature: h,	MSE Score: 0.7329322504359049
Newly Added Feature: m,	MSE Score: 0.7326109128007104
Newly Added Feature: i,	MSE Score: 0.732544962297537
Newly Added Feature: b,	MSE Score: 0.7324974626692677
Newly Added Feature: l,	MSE Score: 0.7324974626615219
[10, 3, 7, 8, 12, 1, 11]
Newly Added Feature: d,	MSE Score: 1.2698282475011644
[3]
Newly Added Feature: d,	MSE Score: 0.9899024312677054
Newly Added Feature: j,	MSE Score: 0.9231550032908529
[3, 9]


In [27]:
#線形回帰　特徴量選択済
lr = linear_model.LinearRegression(normalize=True)
print( selected_feature_index_by_LinearRegression)
lr.fit(train_x[:, selected_feature_index_by_LinearRegression], train_y)
y_pred_lr = lr.predict(test_x[:, selected_feature_index_by_LinearRegression])
lr_mse = mean_squared_error(test_y, y_pred_lr)
print('LinearRegression MSE: ', lr_mse)

[10, 3, 7, 8, 12, 1, 11]
LinearRegression MSE:  0.09482096758340303


In [28]:
#決定木
dtr = DecisionTreeRegressor(random_state=1234)
dtr.fit(train_x,train_y)
dtr.fit(train_x[:, selected_feature_index_by_DecisionTreeRegressor], train_y)
y_pred_dtr = dtr.predict(test_x[:, selected_feature_index_by_DecisionTreeRegressor])
dtr_mse = mean_squared_error(test_y, y_pred_dtr)
print('DecisionTreeRegressor MSE: ', dtr_mse)

DecisionTreeRegressor MSE:  0.19930213000000002


In [29]:
#ランダムフォレスト
rf = RandomForestRegressor(random_state=1234)
rf.fit(train_x[:, selected_feature_index_by_RandomForestRegressor], train_y)
y_pred_rf = rf.predict(test_x[:, selected_feature_index_by_RandomForestRegressor])
rf_mse = mean_squared_error(test_y, y_pred_rf)
print('RandomForest MSE: ', rf_mse)

RandomForest MSE:  0.15971193599999786


In [30]:
params = {'n_estimators': [15,20,25], 'max_depth': [6,10,20]}
gscv = GridSearchCV(rf, param_grid=params, verbose=1,
                    cv=3, scoring='neg_mean_squared_error')
gscv.fit(train_x[:, selected_feature_index_by_RandomForestRegressor], train_y)

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:    4.5s finished


GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=1234, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [15, 20, 25], 'max_depth': [6, 10, 20]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=1)

In [31]:
a = gscv.best_params_

In [32]:
print(a)
n_estimators = a.get('n_estimators')
max_depth = a.get('max_depth')

{'max_depth': 10, 'n_estimators': 25}


In [33]:
rf = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth,random_state=1234)
rf.fit(train_x[:, selected_feature_index_by_RandomForestRegressor], train_y)
y_pred_rf = rf.predict(test_x[:, selected_feature_index_by_RandomForestRegressor])
rf_mse = mean_squared_error(test_y, y_pred_rf)
print('RandomForest MSE: ', rf_mse)

RandomForest MSE:  0.11019492038502461


In [34]:
# 予測結果を出力します。これはランダムフォレスト
result = pd.DataFrame(y_pred_rf)
result.columns = ['y_pred_rf']
result['test_y'] = test_y
#２乗平均誤差ではなく、ただの差
result['RMS'] = (result['test_y']-result['y_pred_rf'])#*(result['test_y']-result['pred_y'])
print(result.loc[:])

    y_pred_rf  test_y       RMS
0   50.945491  50.282 -0.663491
1   50.296314  50.363  0.066686
2   50.289499  49.977 -0.312499
3   49.884836  49.998  0.113164
4   49.897330  50.427  0.529670
5   50.311011  50.468  0.156989
6   50.390606  50.150 -0.240606
7   50.222463  50.250  0.027537
8   50.275176  50.139 -0.136176
9   50.122635  50.286  0.163365
10  50.289499  50.242 -0.047499
11  50.283971  50.700  0.416029
12  50.956479  50.915 -0.041479
13  50.952500  51.080  0.127500
14  50.993744  51.798  0.804256
15  52.144092  51.950 -0.194092
16  52.078172  51.882 -0.196172
17  51.915406  52.427  0.511594
18  52.375971  52.050 -0.325971
19  52.036495  51.845 -0.191495
20  51.876303  51.595 -0.281303
21  51.488036  51.932  0.443964
22  52.037006  52.282  0.244994
23  52.150139  52.498  0.347861
24  52.441583  52.330 -0.111583


In [35]:
# Plot
plt.plot(range(0,len(result)), test_y, label='Actual price', color='blue')
plt.plot(range(0,len(result)), y_pred_rf, label='Predicted price', color='red')
plt.xlabel(span)
plt.ylabel('Price (\)')
n1 = '{0} Price by RandomForest'.format(trade_name)
plt.title(n1)
plt.grid(True)
plt.legend()
n2='{0}1 by RandomForest.png'.format(trade_name)
plt.savefig(n2)
plt.close()
plt.show()

In [36]:
# Plot拡大
plt.plot(range(0,len(result[0:15])), test_y[len(test_y)-15:], label='Actual price', color='blue', marker = 'o')
plt.plot(range(0,len(result[0:15])), y_pred_rf[len(test_y)-15:], label='Predicted price', color='red', marker ='x')
plt.xlabel(span)
plt.ylabel('Price (\)')
n3='{0} Price by RandomForest'.format(trade_name)
plt.title(n3)
plt.grid(True)
plt.legend()
n4='{0}2 by RandomForest.png'.format(trade_name)
plt.savefig(n4)
plt.close()
plt.show()

In [37]:
# 予測結果を出力します。これは線形回帰
result = pd.DataFrame(y_pred_lr)
result.columns = ['y_pred_lr']
result['test_y'] = test_y
#２乗平均誤差ではなく、ただの差
result['RMS'] = (result['test_y']-result['y_pred_lr'])
print(result.loc[:])

    y_pred_lr  test_y       RMS
0   50.774088  50.282 -0.492088
1   50.219992  50.363  0.143008
2   50.345578  49.977 -0.368578
3   49.992678  49.998  0.005322
4   50.036911  50.427  0.390089
5   50.468189  50.468 -0.000189
6   50.470005  50.150 -0.320005
7   50.139338  50.250  0.110662
8   50.271452  50.139 -0.132452
9   50.148678  50.286  0.137322
10  50.324027  50.242 -0.082027
11  50.243841  50.700  0.456159
12  50.704549  50.915  0.210451
13  50.900258  51.080  0.179742
14  51.069024  51.798  0.728976
15  51.781643  51.950  0.168357
16  51.916803  51.882 -0.034803
17  51.856774  52.427  0.570226
18  52.421303  52.050 -0.371303
19  52.024281  51.845 -0.179281
20  51.842168  51.595 -0.247168
21  51.608526  51.932  0.323474
22  51.977166  52.282  0.304834
23  52.300878  52.498  0.197122
24  52.484218  52.330 -0.154218


In [38]:
# 予測結果を出力します。これはランダムフォレスト
result = pd.DataFrame(y_pred_lr)
result.columns = ['y_pred_lr']
result['test_y'] = test_y
#２乗平均誤差ではなく、ただの差
result['RMS'] = (result['test_y']-result['y_pred_lr'])
print(result.loc[:])

    y_pred_lr  test_y       RMS
0   50.774088  50.282 -0.492088
1   50.219992  50.363  0.143008
2   50.345578  49.977 -0.368578
3   49.992678  49.998  0.005322
4   50.036911  50.427  0.390089
5   50.468189  50.468 -0.000189
6   50.470005  50.150 -0.320005
7   50.139338  50.250  0.110662
8   50.271452  50.139 -0.132452
9   50.148678  50.286  0.137322
10  50.324027  50.242 -0.082027
11  50.243841  50.700  0.456159
12  50.704549  50.915  0.210451
13  50.900258  51.080  0.179742
14  51.069024  51.798  0.728976
15  51.781643  51.950  0.168357
16  51.916803  51.882 -0.034803
17  51.856774  52.427  0.570226
18  52.421303  52.050 -0.371303
19  52.024281  51.845 -0.179281
20  51.842168  51.595 -0.247168
21  51.608526  51.932  0.323474
22  51.977166  52.282  0.304834
23  52.300878  52.498  0.197122
24  52.484218  52.330 -0.154218


In [39]:
# Plot
plt.plot(range(0,len(result)), test_y, label='Actual price', color='blue')
plt.plot(range(0,len(result)), y_pred_lr, label='Predicted price', color='red')
plt.xlabel(span)
plt.ylabel('Price (\)')
n1 = '{0} Price by LinearRegression'.format(trade_name)
plt.title(n1)
plt.grid(True)
plt.legend()
n2='{0}1LinearRegression by .png'.format(trade_name)
plt.savefig(n2)
plt.close()
plt.show()

In [40]:
# Plot拡大
plt.plot(range(0,len(result[0:15])), test_y[len(test_y)-15:], label='Actual price', color='blue', marker = 'o')
plt.plot(range(0,len(result[0:15])), y_pred_lr[len(test_y)-15:], label='Predicted price', color='red', marker ='x')
plt.xlabel(span)
plt.ylabel('Price (\)')
n3='{0} Price by LinearRegression'.format(trade_name)
plt.title(n3)
plt.grid(True)
plt.legend()
n4='{0}2LinearRegression by .png'.format(trade_name)
plt.savefig(n4)
plt.close()
plt.show()