In [643]:
import pandas as pd
import math
from math import sqrt
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# read in data predictions for BTC

In [590]:
predictions_df = pd.read_csv("training_configs/btc_all_predictions.csv", parse_dates=True)
true_price_df = pd.read_csv("../tmp/historic_crypto_prices - bitcoin_jan_2017_sep_4_2021 copy.csv")

In [591]:
predictions_df.head()

Unnamed: 0,date,nbeats_btc_lookback_15_window_5_std_1.25_num_add_dfs_2,tcn_btc_lookback_15_window_5_std_1.25_num_add_dfs_2,nbeats_btc_lookback_30_window_5_std_1.25_num_add_dfs_2,tcn_btc_lookback_30_window_5_std_1.25_num_add_dfs_2,nbeats_btc_lookback_45_window_5_std_1.25_num_add_dfs_2,tcn_btc_lookback_45_window_5_std_1.25_num_add_dfs_2,date_prediction_for,test_model_lookback_1
0,2010-01-01,0.0,0.0,0.0,0.0,0.0,0.0,2010-01-09,0.0
1,2019-04-02,4752.149527,253.537135,4730.976374,654.310204,3469.734359,795.20143,2019-04-09,0.0
2,2019-04-03,4546.248123,407.206977,4373.560445,255.627627,4437.900633,1413.509939,2019-04-10,0.0
3,2019-04-04,4550.99859,726.168584,4045.591506,1861.981408,4012.01123,523.09301,2019-04-11,0.0
4,2019-04-05,5003.264197,353.606543,4819.668698,817.215937,3977.238773,553.057301,2019-04-12,0.0


In [592]:
predictions_df.date.min()

'2010-01-01'

In [593]:
predictions_df.date.max()

'2020-08-04'

In [594]:
predictions_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 492 entries, 0 to 491
Data columns (total 9 columns):
 #   Column                                                  Non-Null Count  Dtype  
---  ------                                                  --------------  -----  
 0   date                                                    492 non-null    object 
 1   nbeats_btc_lookback_15_window_5_std_1.25_num_add_dfs_2  492 non-null    float64
 2   tcn_btc_lookback_15_window_5_std_1.25_num_add_dfs_2     492 non-null    float64
 3   nbeats_btc_lookback_30_window_5_std_1.25_num_add_dfs_2  492 non-null    float64
 4   tcn_btc_lookback_30_window_5_std_1.25_num_add_dfs_2     492 non-null    float64
 5   nbeats_btc_lookback_45_window_5_std_1.25_num_add_dfs_2  492 non-null    float64
 6   tcn_btc_lookback_45_window_5_std_1.25_num_add_dfs_2     492 non-null    float64
 7   date_prediction_for                                     492 non-null    object 
 8   test_model_lookback_1                    

In [595]:
true_close_df = true_price_df[['date','close']]

### Map the predictions for date to the true price date

In [596]:
predictions_df.date_prediction_for

0       2010-01-09
1       2019-04-09
2       2019-04-10
3       2019-04-11
4       2019-04-12
          ...     
487     2020-08-07
488     2020-08-08
489     2020-08-09
490     2020-08-10
491     2020-08-11
Name: date_prediction_for, Length: 492, dtype: object

In [597]:
merged_df = pd.merge(predictions_df, true_close_df, left_on='date_prediction_for',
                     right_on ='date', suffixes=['_pred','_true'])

In [598]:
merged_df['day'] = [t.day for t in pd.to_datetime(merged_df.date_prediction_for)]
merged_df['month'] =  [t.month for t in pd.to_datetime(merged_df.date_prediction_for)]
merged_df['quarter'] = [t.quarter for t in pd.to_datetime(merged_df.date_prediction_for)]

In [599]:
merged_df.drop(['date_pred', 'date_true', 'date_prediction_for'], inplace=True, axis=1)

In [600]:
merged_df.tail()

Unnamed: 0,nbeats_btc_lookback_15_window_5_std_1.25_num_add_dfs_2,tcn_btc_lookback_15_window_5_std_1.25_num_add_dfs_2,nbeats_btc_lookback_30_window_5_std_1.25_num_add_dfs_2,tcn_btc_lookback_30_window_5_std_1.25_num_add_dfs_2,nbeats_btc_lookback_45_window_5_std_1.25_num_add_dfs_2,tcn_btc_lookback_45_window_5_std_1.25_num_add_dfs_2,test_model_lookback_1,close,day,month,quarter
486,10584.307512,9961.906642,10649.915556,10202.599075,10650.126787,9736.913534,0.0,11601.47,7,8,3
487,10931.541496,9998.582844,12069.739522,10378.56136,10056.941156,10158.936073,0.0,11754.05,8,8,3
488,11495.76143,9630.692792,10574.792448,10048.738012,10654.639726,10654.99659,0.0,11675.74,9,8,3
489,10859.135558,10158.411734,11663.520801,10715.364345,10696.597976,10204.908968,0.0,11878.11,10,8,3
490,11184.236557,10360.02295,11112.723987,10307.32378,10081.569439,10199.958559,0.0,11410.53,11,8,3


In [601]:
merged_df.describe()

Unnamed: 0,nbeats_btc_lookback_15_window_5_std_1.25_num_add_dfs_2,tcn_btc_lookback_15_window_5_std_1.25_num_add_dfs_2,nbeats_btc_lookback_30_window_5_std_1.25_num_add_dfs_2,tcn_btc_lookback_30_window_5_std_1.25_num_add_dfs_2,nbeats_btc_lookback_45_window_5_std_1.25_num_add_dfs_2,tcn_btc_lookback_45_window_5_std_1.25_num_add_dfs_2,test_model_lookback_1,close,day,month,quarter
count,491.0,491.0,491.0,491.0,491.0,491.0,491.0,491.0,491.0,491.0,491.0
mean,8844.34558,8746.783719,8894.054241,8911.115659,8969.656057,8620.635048,0.0,8723.286721,15.720978,6.338086,2.462322
std,1797.747268,4104.531706,1980.121643,3951.81768,2099.310735,4118.7205,0.0,1642.467316,8.804078,3.061905,0.998011
min,4546.248123,139.87784,3666.543647,-289.448271,3261.251466,-280.599843,0.0,4970.79,1.0,1.0,1.0
25%,7463.266147,6658.51801,7532.412004,6859.012971,7651.020014,6878.503926,0.0,7553.57,8.0,4.0,2.0
50%,9052.784116,8824.585663,9030.426251,8714.065258,9024.592279,8637.863913,0.0,8897.47,16.0,6.0,2.0
75%,10018.505735,10029.18287,10035.180268,10246.668849,10228.135508,9704.617379,0.0,9731.76,23.0,8.0,3.0
max,13331.237899,23654.633486,14332.008433,23157.885335,14228.875403,23173.074402,0.0,13016.23,31.0,12.0,4.0


### train test split

In [689]:
split_pct = .95
l_merged = len(merged_df)
merged_df_train, merged_df_test = merged_df.iloc[:int(split_pct*l_merged),:],  merged_df.iloc[int(l_merged*split_pct):,:]
                                                                                                  

In [690]:
print(f"train rows = {len(merged_df_train)}")

train rows = 392


In [691]:
print(f"test rows = {len(merged_df_test)}")

test rows = 99


In [692]:
merged_df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 392 entries, 0 to 391
Data columns (total 11 columns):
 #   Column                                                  Non-Null Count  Dtype  
---  ------                                                  --------------  -----  
 0   nbeats_btc_lookback_15_window_5_std_1.25_num_add_dfs_2  392 non-null    float64
 1   tcn_btc_lookback_15_window_5_std_1.25_num_add_dfs_2     392 non-null    float64
 2   nbeats_btc_lookback_30_window_5_std_1.25_num_add_dfs_2  392 non-null    float64
 3   tcn_btc_lookback_30_window_5_std_1.25_num_add_dfs_2     392 non-null    float64
 4   nbeats_btc_lookback_45_window_5_std_1.25_num_add_dfs_2  392 non-null    float64
 5   tcn_btc_lookback_45_window_5_std_1.25_num_add_dfs_2     392 non-null    float64
 6   test_model_lookback_1                                   392 non-null    float64
 7   close                                                   392 non-null    float64
 8   day                                     

In [693]:
merged_df_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 99 entries, 392 to 490
Data columns (total 11 columns):
 #   Column                                                  Non-Null Count  Dtype  
---  ------                                                  --------------  -----  
 0   nbeats_btc_lookback_15_window_5_std_1.25_num_add_dfs_2  99 non-null     float64
 1   tcn_btc_lookback_15_window_5_std_1.25_num_add_dfs_2     99 non-null     float64
 2   nbeats_btc_lookback_30_window_5_std_1.25_num_add_dfs_2  99 non-null     float64
 3   tcn_btc_lookback_30_window_5_std_1.25_num_add_dfs_2     99 non-null     float64
 4   nbeats_btc_lookback_45_window_5_std_1.25_num_add_dfs_2  99 non-null     float64
 5   tcn_btc_lookback_45_window_5_std_1.25_num_add_dfs_2     99 non-null     float64
 6   test_model_lookback_1                                   99 non-null     float64
 7   close                                                   99 non-null     float64
 8   day                                    

In [694]:
merged_df_train.iloc[:, ~merged_df.columns.isin(['close'])]

Unnamed: 0,nbeats_btc_lookback_15_window_5_std_1.25_num_add_dfs_2,tcn_btc_lookback_15_window_5_std_1.25_num_add_dfs_2,nbeats_btc_lookback_30_window_5_std_1.25_num_add_dfs_2,tcn_btc_lookback_30_window_5_std_1.25_num_add_dfs_2,nbeats_btc_lookback_45_window_5_std_1.25_num_add_dfs_2,tcn_btc_lookback_45_window_5_std_1.25_num_add_dfs_2,test_model_lookback_1,day,month,quarter
0,4752.149527,253.537135,4730.976374,654.310204,3469.734359,795.201430,0.0,9,4,2
1,4546.248123,407.206977,4373.560445,255.627627,4437.900633,1413.509939,0.0,10,4,2
2,4550.998590,726.168584,4045.591506,1861.981408,4012.011230,523.093010,0.0,11,4,2
3,5003.264197,353.606543,4819.668698,817.215937,3977.238773,553.057301,0.0,12,4,2
4,5184.268504,197.768508,5123.185573,1596.333494,4093.602236,920.106602,0.0,13,4,2
...,...,...,...,...,...,...,...,...,...,...
387,7136.626083,6786.350395,6859.325242,6887.064658,7051.176119,6796.392386,0.0,30,4,2
388,7391.039505,6869.404160,7604.083136,6825.547912,6771.428244,6503.281720,0.0,1,5,2
389,7025.184867,6636.068825,6968.908588,6761.649883,7198.364165,7073.219600,0.0,2,5,2
390,7224.796220,7262.025130,6896.162267,7100.355215,7458.973494,6665.303715,0.0,3,5,2


In [695]:


merged_df_x_train, merged_df_y_train = merged_df_train.iloc[:, ~merged_df_train.columns.isin(['close'])], merged_df_train.iloc[:, merged_df_train.columns.isin(['close'])]
merged_df_x_test, merged_df_y_test =  merged_df_test.iloc[:, ~merged_df_test.columns.isin(['close'])], merged_df_test.iloc[:, merged_df_test.columns.isin(['close'])]
                                                                                                                                                 

In [696]:
merged_df_x_train

Unnamed: 0,nbeats_btc_lookback_15_window_5_std_1.25_num_add_dfs_2,tcn_btc_lookback_15_window_5_std_1.25_num_add_dfs_2,nbeats_btc_lookback_30_window_5_std_1.25_num_add_dfs_2,tcn_btc_lookback_30_window_5_std_1.25_num_add_dfs_2,nbeats_btc_lookback_45_window_5_std_1.25_num_add_dfs_2,tcn_btc_lookback_45_window_5_std_1.25_num_add_dfs_2,test_model_lookback_1,day,month,quarter
0,4752.149527,253.537135,4730.976374,654.310204,3469.734359,795.201430,0.0,9,4,2
1,4546.248123,407.206977,4373.560445,255.627627,4437.900633,1413.509939,0.0,10,4,2
2,4550.998590,726.168584,4045.591506,1861.981408,4012.011230,523.093010,0.0,11,4,2
3,5003.264197,353.606543,4819.668698,817.215937,3977.238773,553.057301,0.0,12,4,2
4,5184.268504,197.768508,5123.185573,1596.333494,4093.602236,920.106602,0.0,13,4,2
...,...,...,...,...,...,...,...,...,...,...
387,7136.626083,6786.350395,6859.325242,6887.064658,7051.176119,6796.392386,0.0,30,4,2
388,7391.039505,6869.404160,7604.083136,6825.547912,6771.428244,6503.281720,0.0,1,5,2
389,7025.184867,6636.068825,6968.908588,6761.649883,7198.364165,7073.219600,0.0,2,5,2
390,7224.796220,7262.025130,6896.162267,7100.355215,7458.973494,6665.303715,0.0,3,5,2


In [697]:
merged_df_y_train

Unnamed: 0,close
0,5204.96
1,5324.55
2,5064.49
3,5089.54
4,5096.59
...,...
387,8658.55
388,8864.77
389,8988.60
390,8897.47


### performance, rmse , on average prediction

In [698]:
average_predictions_train = np.mean(merged_df_x_train.iloc[:,:6], axis=1)
average_predictions_test = np.mean(merged_df_x_test.iloc[:,:6], axis=1)

In [699]:
average_predictions_train[:10]

0    2442.651505
1    2572.342291
2    2619.974055
3    2587.341908
4    2852.544153
5    2721.518844
6    3153.856190
7    2727.367343
8    3387.347636
9    2872.964842
dtype: float64

In [700]:
merged_df_y_train.head(10)

Unnamed: 0,close
0,5204.96
1,5324.55
2,5064.49
3,5089.54
4,5096.59
5,5167.72
6,5067.11
7,5235.56
8,5251.94
9,5298.39


In [701]:
average_rmse_train = sqrt(mean_squared_error(average_predictions_train, merged_df_y_train))
average_rmse_test = sqrt(mean_squared_error(average_predictions_test, merged_df_y_test))

In [702]:
print(f" The average training rmse is {average_rmse_train}")
print(f" The average testing rmse is {average_rmse_test}")

 The average training rmse is 1981.9978035024974
 The average testing rmse is 916.6777760514856


# train linear regression on ALL predictions

In [703]:
lregr = LinearRegression()

In [704]:
lregr.fit(merged_df_x_train, merged_df_y_train)

LinearRegression()

In [705]:
train_predictions_lrgr = lregr.predict(merged_df_x_train)

In [706]:
test_predictions_lrgr = lregr.predict(merged_df_x_test)

In [707]:
test_predictions_lrgr[:15]


array([[7568.55224316],
       [7584.00854377],
       [7948.46186642],
       [8404.1591101 ],
       [8279.63414651],
       [8358.30072111],
       [8237.55613694],
       [8390.22459791],
       [8457.68203229],
       [8528.82972838],
       [8650.65196811],
       [8833.13420606],
       [8920.31804059],
       [8862.30674872],
       [8920.60043074]])

In [708]:
merged_df_y_test.head(15)

Unnamed: 0,close
392,9003.07
393,9268.76
394,9951.52
395,9842.67
396,9593.9
397,8756.43
398,8601.8
399,8804.48
400,9269.99
401,9733.72


In [709]:
average_rmse_train_lrgr = sqrt(mean_squared_error(train_predictions_lrgr, merged_df_y_train))
average_rmse_test_lrgr = sqrt(mean_squared_error(test_predictions_lrgr, merged_df_y_test))

In [710]:
print(f" The average training rmse with linear stacking is {average_rmse_train_lrgr}")
print(f" The average testing rmse with linear stacking is {average_rmse_test_lrgr}")

 The average training rmse with linear stacking is 999.4434238577142
 The average testing rmse with linear stacking is 1052.4786798010348




#### feature importance

In [711]:
lregr.coef_.ravel()

array([ 3.20343697e-01,  1.32943274e-02,  2.87279434e-01,  3.63110406e-02,
        3.79222840e-02, -2.03051844e-02, -5.68434189e-14, -1.03310970e+01,
       -1.25992016e+02,  2.53243573e+02])

In [712]:
lregr.coef_.ravel()[np.argsort(lregr.coef_.ravel())[::-1]]

array([ 2.53243573e+02,  3.20343697e-01,  2.87279434e-01,  3.79222840e-02,
        3.63110406e-02,  1.32943274e-02, -5.68434189e-14, -2.03051844e-02,
       -1.03310970e+01, -1.25992016e+02])

In [713]:
merged_df_x_train.columns[np.argsort(lregr.coef_)[::-1]]

array([['month', 'day',
        'tcn_btc_lookback_45_window_5_std_1.25_num_add_dfs_2',
        'test_model_lookback_1',
        'tcn_btc_lookback_15_window_5_std_1.25_num_add_dfs_2',
        'tcn_btc_lookback_30_window_5_std_1.25_num_add_dfs_2',
        'nbeats_btc_lookback_45_window_5_std_1.25_num_add_dfs_2',
        'nbeats_btc_lookback_30_window_5_std_1.25_num_add_dfs_2',
        'nbeats_btc_lookback_15_window_5_std_1.25_num_add_dfs_2',
        'quarter']], dtype=object)

## Test Random Forest

In [714]:

rf = RandomForestRegressor(n_estimators=500)

In [715]:
rf.fit(merged_df_x_train, merged_df_y_train)

  rf.fit(merged_df_x_train, merged_df_y_train)


RandomForestRegressor(n_estimators=500)

In [716]:
train_predictions_rf = rf.predict(merged_df_x_train)
test_predictions_rf = rf.predict(merged_df_x_test)

In [717]:
test_predictions_rf[:15].reshape(-1,1)


array([[8313.6438 ],
       [8257.456  ],
       [8385.04022],
       [8162.89974],
       [8319.19166],
       [8136.29086],
       [8305.28176],
       [8215.0161 ],
       [8003.75748],
       [8484.74446],
       [8049.43618],
       [8546.12122],
       [8687.3264 ],
       [8214.67814],
       [8572.97314]])

In [718]:
merged_df_y_test.head(15)

Unnamed: 0,close
392,9003.07
393,9268.76
394,9951.52
395,9842.67
396,9593.9
397,8756.43
398,8601.8
399,8804.48
400,9269.99
401,9733.72


In [719]:
average_rmse_train_rf = sqrt(mean_squared_error(train_predictions_rf, merged_df_y_train))
average_rmse_test_rf = sqrt(mean_squared_error(test_predictions_rf, merged_df_y_test))

In [720]:
print(f" The average training rmse with RF stacking is {average_rmse_train_rf}")
print(f" The average testing rmse with RF stacking is {average_rmse_test_rf}")

 The average training rmse with RF stacking is 244.6569081628251
 The average testing rmse with RF stacking is 1027.9417466305345


In [721]:
rf.feature_importances_.ravel()[np.argsort(rf.feature_importances_.ravel())[::-1]]

array([0.4434473 , 0.23379606, 0.08744208, 0.06693397, 0.05106938,
       0.04309471, 0.02745418, 0.02565979, 0.02110252, 0.        ])

In [722]:
merged_df_x_train.columns[np.argsort(rf.feature_importances_)[::-1]]

Index(['nbeats_btc_lookback_15_window_5_std_1.25_num_add_dfs_2',
       'nbeats_btc_lookback_30_window_5_std_1.25_num_add_dfs_2', 'month',
       'day', 'nbeats_btc_lookback_45_window_5_std_1.25_num_add_dfs_2',
       'tcn_btc_lookback_30_window_5_std_1.25_num_add_dfs_2',
       'tcn_btc_lookback_15_window_5_std_1.25_num_add_dfs_2', 'quarter',
       'tcn_btc_lookback_45_window_5_std_1.25_num_add_dfs_2',
       'test_model_lookback_1'],
      dtype='object')

## Test gradient boosting

In [723]:
gb = GradientBoostingRegressor(n_estimators=200)

In [724]:
gb.fit(merged_df_x_train, merged_df_y_train)

  y = column_or_1d(y, warn=True)


GradientBoostingRegressor(n_estimators=200)

In [725]:
train_predictions_gb = gb.predict(merged_df_x_train)
test_predictions_gb = gb.predict(merged_df_x_test)

In [726]:
test_predictions_gb[:15].reshape(-1,1)


array([[8302.28497628],
       [8215.99940819],
       [8126.40403291],
       [8586.09231464],
       [8122.61441211],
       [8316.97581591],
       [8047.35339621],
       [8226.44290667],
       [8149.45017645],
       [8568.4965611 ],
       [8152.50435377],
       [8658.59481321],
       [8356.86897605],
       [8302.17441817],
       [8418.46439233]])

In [727]:
merged_df_y_test.head(15)

Unnamed: 0,close
392,9003.07
393,9268.76
394,9951.52
395,9842.67
396,9593.9
397,8756.43
398,8601.8
399,8804.48
400,9269.99
401,9733.72


In [728]:
average_rmse_train_gb = sqrt(mean_squared_error(train_predictions_gb, merged_df_y_train))
average_rmse_test_gb = sqrt(mean_squared_error(test_predictions_gb, merged_df_y_test))

In [729]:
print(f" The average training rmse with GB stacking is {average_rmse_train_gb}")
print(f" The average testing rmse with GB stacking is {average_rmse_test_gb}")

 The average training rmse with GB stacking is 224.28141597606398
 The average testing rmse with GB stacking is 895.6150789872025


In [730]:
gb.feature_importances_.ravel()[np.argsort(gb.feature_importances_.ravel())[::-1]]

array([0.44858195, 0.1912788 , 0.10202117, 0.07974801, 0.05352178,
       0.04020817, 0.0316567 , 0.03047284, 0.02251059, 0.        ])

In [731]:
merged_df_x_train.columns[np.argsort(gb.feature_importances_)[::-1]]

Index(['nbeats_btc_lookback_15_window_5_std_1.25_num_add_dfs_2',
       'nbeats_btc_lookback_30_window_5_std_1.25_num_add_dfs_2', 'month',
       'tcn_btc_lookback_30_window_5_std_1.25_num_add_dfs_2', 'day',
       'nbeats_btc_lookback_45_window_5_std_1.25_num_add_dfs_2', 'quarter',
       'tcn_btc_lookback_15_window_5_std_1.25_num_add_dfs_2',
       'tcn_btc_lookback_45_window_5_std_1.25_num_add_dfs_2',
       'test_model_lookback_1'],
      dtype='object')