In [960]:
import pandas as pd
import math
from math import sqrt
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# read in data predictions for BTC

In [961]:
predictions_df = pd.read_csv("training_configs/btc_all_predictions.csv", parse_dates=True)
true_price_df = pd.read_csv("../tmp/historic_crypto_prices - bitcoin_jan_2017_sep_4_2021 copy.csv")

In [962]:
predictions_df.head()

Unnamed: 0,date,nbeats_btc_lookback_15_window_5_std_1.25_num_add_dfs_2,tcn_btc_lookback_15_window_5_std_1.25_num_add_dfs_2,nbeats_btc_lookback_30_window_5_std_1.25_num_add_dfs_2,tcn_btc_lookback_30_window_5_std_1.25_num_add_dfs_2,nbeats_btc_lookback_45_window_5_std_1.25_num_add_dfs_2,tcn_btc_lookback_45_window_5_std_1.25_num_add_dfs_2,date_prediction_for,test_model_lookback_1
0,2010-01-01,0.0,0.0,0.0,0.0,0.0,0.0,2010-01-09,0.0
1,2019-04-02,4752.149527,253.537135,4730.976374,654.310204,3469.734359,795.20143,2019-04-09,0.0
2,2019-04-03,4546.248123,407.206977,4373.560445,255.627627,4437.900633,1413.509939,2019-04-10,0.0
3,2019-04-04,4550.99859,726.168584,4045.591506,1861.981408,4012.01123,523.09301,2019-04-11,0.0
4,2019-04-05,5003.264197,353.606543,4819.668698,817.215937,3977.238773,553.057301,2019-04-12,0.0


In [963]:
predictions_df.date.min()

'2010-01-01'

In [964]:
predictions_df.date.max()

'2021-06-06'

In [823]:
predictions_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541 entries, 0 to 540
Data columns (total 9 columns):
 #   Column                                                  Non-Null Count  Dtype  
---  ------                                                  --------------  -----  
 0   date                                                    541 non-null    object 
 1   nbeats_btc_lookback_15_window_5_std_1.25_num_add_dfs_2  541 non-null    float64
 2   tcn_btc_lookback_15_window_5_std_1.25_num_add_dfs_2     541 non-null    float64
 3   nbeats_btc_lookback_30_window_5_std_1.25_num_add_dfs_2  541 non-null    float64
 4   tcn_btc_lookback_30_window_5_std_1.25_num_add_dfs_2     541 non-null    float64
 5   nbeats_btc_lookback_45_window_5_std_1.25_num_add_dfs_2  541 non-null    float64
 6   tcn_btc_lookback_45_window_5_std_1.25_num_add_dfs_2     541 non-null    float64
 7   date_prediction_for                                     541 non-null    object 
 8   test_model_lookback_1                    

In [824]:
true_close_df = true_price_df[['date','close']]

### Map the predictions for date to the true price date

In [825]:
predictions_df.date_prediction_for

0       2010-01-09
1       2019-04-09
2       2019-04-10
3       2019-04-11
4       2019-04-12
          ...     
536     2020-09-25
537     2020-09-26
538     2020-09-27
539     2020-09-28
540     2020-09-29
Name: date_prediction_for, Length: 541, dtype: object

In [826]:
merged_df = pd.merge(predictions_df, true_close_df, left_on='date_prediction_for',
                     right_on ='date', suffixes=['_pred','_true'])

In [827]:
merged_df['day'] = [t.day for t in pd.to_datetime(merged_df.date_prediction_for)]
merged_df['month'] =  [t.month for t in pd.to_datetime(merged_df.date_prediction_for)]
merged_df['quarter'] = [t.quarter for t in pd.to_datetime(merged_df.date_prediction_for)]

In [828]:
merged_df.drop(['date_pred', 'date_true', 'date_prediction_for'], inplace=True, axis=1)

In [829]:
merged_df.tail()

Unnamed: 0,nbeats_btc_lookback_15_window_5_std_1.25_num_add_dfs_2,tcn_btc_lookback_15_window_5_std_1.25_num_add_dfs_2,nbeats_btc_lookback_30_window_5_std_1.25_num_add_dfs_2,tcn_btc_lookback_30_window_5_std_1.25_num_add_dfs_2,nbeats_btc_lookback_45_window_5_std_1.25_num_add_dfs_2,tcn_btc_lookback_45_window_5_std_1.25_num_add_dfs_2,test_model_lookback_1,close,day,month,quarter
535,10826.426376,10362.220019,10633.704014,10749.163259,10391.53049,9856.715457,0.0,10692.72,25,9,3
536,11039.142639,10271.400729,10473.443642,10075.427808,11436.436547,10930.332111,0.0,10750.72,26,9,3
537,10960.423892,10149.405898,10941.914878,9838.207279,11449.616505,10617.881368,0.0,10775.27,27,9,3
538,11310.868536,10489.588384,10356.597065,10323.434719,9966.598961,10787.260297,0.0,10709.65,28,9,3
539,10720.133819,9823.123286,11386.870104,10544.204764,11732.128077,10902.035585,0.0,10844.64,29,9,3


In [830]:
merged_df.describe()

Unnamed: 0,nbeats_btc_lookback_15_window_5_std_1.25_num_add_dfs_2,tcn_btc_lookback_15_window_5_std_1.25_num_add_dfs_2,nbeats_btc_lookback_30_window_5_std_1.25_num_add_dfs_2,tcn_btc_lookback_30_window_5_std_1.25_num_add_dfs_2,nbeats_btc_lookback_45_window_5_std_1.25_num_add_dfs_2,tcn_btc_lookback_45_window_5_std_1.25_num_add_dfs_2,test_model_lookback_1,close,day,month,quarter
count,540.0,540.0,540.0,540.0,540.0,540.0,540.0,540.0,540.0,540.0,540.0
mean,9072.691803,8923.196339,9097.661222,9070.248362,9192.142836,8796.675216,0.0,8937.536463,15.896296,6.542593,2.511111
std,1871.651118,3957.600653,2011.094979,3804.239117,2134.066788,3969.814904,0.0,1716.854959,8.757948,2.994125,0.964041
min,4546.248123,139.87784,3666.543647,-289.448271,3261.251466,-280.599843,0.0,4970.79,1.0,1.0,1.0
25%,7678.110747,6849.250236,7784.452043,7022.245954,7830.086539,7064.032121,0.0,7804.195,8.0,4.0,2.0
50%,9248.095649,8977.055439,9201.514996,8923.334721,9223.061306,8842.15174,0.0,9160.98,16.0,7.0,3.0
75%,10369.751803,10499.238135,10364.739354,10549.849168,10699.17203,10155.465484,0.0,10179.1875,23.0,9.0,3.0
max,13331.237899,23654.633486,14332.008433,23157.885335,14228.875403,23173.074402,0.0,13016.23,31.0,12.0,4.0


### train test split

In [917]:
split_pct = .90
l_merged = len(merged_df)
merged_df_train, merged_df_test = merged_df.iloc[:int(split_pct*l_merged),:],  merged_df.iloc[int(l_merged*split_pct):,:]
                                                                                                  

In [918]:
print(f"train rows = {len(merged_df_train)}")

train rows = 486


In [919]:
print(f"test rows = {len(merged_df_test)}")

test rows = 54


In [920]:
merged_df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 486 entries, 0 to 485
Data columns (total 11 columns):
 #   Column                                                  Non-Null Count  Dtype  
---  ------                                                  --------------  -----  
 0   nbeats_btc_lookback_15_window_5_std_1.25_num_add_dfs_2  486 non-null    float64
 1   tcn_btc_lookback_15_window_5_std_1.25_num_add_dfs_2     486 non-null    float64
 2   nbeats_btc_lookback_30_window_5_std_1.25_num_add_dfs_2  486 non-null    float64
 3   tcn_btc_lookback_30_window_5_std_1.25_num_add_dfs_2     486 non-null    float64
 4   nbeats_btc_lookback_45_window_5_std_1.25_num_add_dfs_2  486 non-null    float64
 5   tcn_btc_lookback_45_window_5_std_1.25_num_add_dfs_2     486 non-null    float64
 6   test_model_lookback_1                                   486 non-null    float64
 7   close                                                   486 non-null    float64
 8   day                                     

In [921]:
merged_df_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 54 entries, 486 to 539
Data columns (total 11 columns):
 #   Column                                                  Non-Null Count  Dtype  
---  ------                                                  --------------  -----  
 0   nbeats_btc_lookback_15_window_5_std_1.25_num_add_dfs_2  54 non-null     float64
 1   tcn_btc_lookback_15_window_5_std_1.25_num_add_dfs_2     54 non-null     float64
 2   nbeats_btc_lookback_30_window_5_std_1.25_num_add_dfs_2  54 non-null     float64
 3   tcn_btc_lookback_30_window_5_std_1.25_num_add_dfs_2     54 non-null     float64
 4   nbeats_btc_lookback_45_window_5_std_1.25_num_add_dfs_2  54 non-null     float64
 5   tcn_btc_lookback_45_window_5_std_1.25_num_add_dfs_2     54 non-null     float64
 6   test_model_lookback_1                                   54 non-null     float64
 7   close                                                   54 non-null     float64
 8   day                                    

In [922]:
merged_df_train.iloc[:, ~merged_df.columns.isin(['close'])]

Unnamed: 0,nbeats_btc_lookback_15_window_5_std_1.25_num_add_dfs_2,tcn_btc_lookback_15_window_5_std_1.25_num_add_dfs_2,nbeats_btc_lookback_30_window_5_std_1.25_num_add_dfs_2,tcn_btc_lookback_30_window_5_std_1.25_num_add_dfs_2,nbeats_btc_lookback_45_window_5_std_1.25_num_add_dfs_2,tcn_btc_lookback_45_window_5_std_1.25_num_add_dfs_2,test_model_lookback_1,day,month,quarter
0,4752.149527,253.537135,4730.976374,654.310204,3469.734359,795.201430,0.0,9,4,2
1,4546.248123,407.206977,4373.560445,255.627627,4437.900633,1413.509939,0.0,10,4,2
2,4550.998590,726.168584,4045.591506,1861.981408,4012.011230,523.093010,0.0,11,4,2
3,5003.264197,353.606543,4819.668698,817.215937,3977.238773,553.057301,0.0,12,4,2
4,5184.268504,197.768508,5123.185573,1596.333494,4093.602236,920.106602,0.0,13,4,2
...,...,...,...,...,...,...,...,...,...,...
481,9474.582586,8937.307830,9412.221795,9763.418871,9596.541157,9605.129715,0.0,2,8,3
482,9620.547643,10027.948410,9655.541195,9408.660579,10571.252397,9565.245379,0.0,3,8,3
483,10105.403866,10505.595563,9235.231388,8725.041784,10110.418434,8921.825783,0.0,4,8,3
484,10749.375335,9294.040252,9867.545794,9935.776604,10468.456160,9248.010080,0.0,5,8,3


In [923]:


merged_df_x_train, merged_df_y_train = merged_df_train.iloc[:, ~merged_df_train.columns.isin(['close'])], merged_df_train.iloc[:, merged_df_train.columns.isin(['close'])]
merged_df_x_test, merged_df_y_test =  merged_df_test.iloc[:, ~merged_df_test.columns.isin(['close'])], merged_df_test.iloc[:, merged_df_test.columns.isin(['close'])]
                                                                                                                                                 

In [924]:
merged_df_x_train

Unnamed: 0,nbeats_btc_lookback_15_window_5_std_1.25_num_add_dfs_2,tcn_btc_lookback_15_window_5_std_1.25_num_add_dfs_2,nbeats_btc_lookback_30_window_5_std_1.25_num_add_dfs_2,tcn_btc_lookback_30_window_5_std_1.25_num_add_dfs_2,nbeats_btc_lookback_45_window_5_std_1.25_num_add_dfs_2,tcn_btc_lookback_45_window_5_std_1.25_num_add_dfs_2,test_model_lookback_1,day,month,quarter
0,4752.149527,253.537135,4730.976374,654.310204,3469.734359,795.201430,0.0,9,4,2
1,4546.248123,407.206977,4373.560445,255.627627,4437.900633,1413.509939,0.0,10,4,2
2,4550.998590,726.168584,4045.591506,1861.981408,4012.011230,523.093010,0.0,11,4,2
3,5003.264197,353.606543,4819.668698,817.215937,3977.238773,553.057301,0.0,12,4,2
4,5184.268504,197.768508,5123.185573,1596.333494,4093.602236,920.106602,0.0,13,4,2
...,...,...,...,...,...,...,...,...,...,...
481,9474.582586,8937.307830,9412.221795,9763.418871,9596.541157,9605.129715,0.0,2,8,3
482,9620.547643,10027.948410,9655.541195,9408.660579,10571.252397,9565.245379,0.0,3,8,3
483,10105.403866,10505.595563,9235.231388,8725.041784,10110.418434,8921.825783,0.0,4,8,3
484,10749.375335,9294.040252,9867.545794,9935.776604,10468.456160,9248.010080,0.0,5,8,3


In [925]:
merged_df_y_train

Unnamed: 0,close
0,5204.96
1,5324.55
2,5064.49
3,5089.54
4,5096.59
...,...
481,11053.61
482,11246.35
483,11205.89
484,11747.02


# performance, rmse , on average prediction

In [926]:
average_predictions_train = np.mean(merged_df_x_train.iloc[:,:6], axis=1)
average_predictions_test = np.mean(merged_df_x_test.iloc[:,:6], axis=1)

In [927]:
average_predictions_train[:10]

0    2442.651505
1    2572.342291
2    2619.974055
3    2587.341908
4    2852.544153
5    2721.518844
6    3153.856190
7    2727.367343
8    3387.347636
9    2872.964842
dtype: float64

In [928]:
merged_df_y_train.head(10)

Unnamed: 0,close
0,5204.96
1,5324.55
2,5064.49
3,5089.54
4,5096.59
5,5167.72
6,5067.11
7,5235.56
8,5251.94
9,5298.39


In [929]:
average_rmse_train = sqrt(mean_squared_error(average_predictions_train, merged_df_y_train))
average_rmse_test = sqrt(mean_squared_error(average_predictions_test, merged_df_y_test))

In [930]:
print(f" The average training rmse is {average_rmse_train}")
print(f" The average testing rmse is {average_rmse_test}")

 The average training rmse is 1823.8193700892346
 The average testing rmse is 721.6392994016616


# train linear regression on ALL predictions

In [931]:
lregr = LinearRegression()

In [932]:
lregr.fit(merged_df_x_train, merged_df_y_train)

LinearRegression()

In [933]:
train_predictions_lrgr = lregr.predict(merged_df_x_train)

In [934]:
test_predictions_lrgr = lregr.predict(merged_df_x_test)

In [935]:
test_predictions_lrgr[:15]


array([[ 9995.86631283],
       [10472.61564195],
       [10321.24064166],
       [10337.27263209],
       [10304.93251317],
       [10121.44008054],
       [10080.41354908],
       [10379.50726677],
       [10410.58715221],
       [10550.24989192],
       [10686.90086298],
       [10447.38390401],
       [10316.15160927],
       [10633.96055763],
       [10578.82374457]])

In [936]:
merged_df_y_test.head(15)

Unnamed: 0,close
486,11601.47
487,11754.05
488,11675.74
489,11878.11
490,11410.53
491,11584.93
492,11784.14
493,11768.87
494,11865.7
495,11892.8


In [937]:
average_rmse_train_lrgr = sqrt(mean_squared_error(train_predictions_lrgr, merged_df_y_train))
average_rmse_test_lrgr = sqrt(mean_squared_error(test_predictions_lrgr, merged_df_y_test))

In [938]:
print(f" The average training rmse with linear stacking is {average_rmse_train_lrgr}")
print(f" The average testing rmse with linear stacking is {average_rmse_test_lrgr}")

 The average training rmse with linear stacking is 986.6867914377826
 The average testing rmse with linear stacking is 1088.6741686309315




#### feature importance

In [939]:
lregr.coef_.ravel()

array([ 4.34260091e-01,  6.33775434e-03,  2.52707704e-01,  2.11201128e-02,
        2.45796369e-02, -2.42143141e-02,  1.84741111e-13, -1.19905019e+01,
       -1.41275545e+02,  3.12935214e+02])

In [940]:
lregr.coef_.ravel()[np.argsort(lregr.coef_.ravel())[::-1]]

array([ 3.12935214e+02,  4.34260091e-01,  2.52707704e-01,  2.45796369e-02,
        2.11201128e-02,  6.33775434e-03,  1.84741111e-13, -2.42143141e-02,
       -1.19905019e+01, -1.41275545e+02])

In [941]:
merged_df_x_train.columns[np.argsort(lregr.coef_)[::-1]]

array([['month', 'day',
        'tcn_btc_lookback_45_window_5_std_1.25_num_add_dfs_2',
        'test_model_lookback_1',
        'tcn_btc_lookback_15_window_5_std_1.25_num_add_dfs_2',
        'tcn_btc_lookback_30_window_5_std_1.25_num_add_dfs_2',
        'nbeats_btc_lookback_45_window_5_std_1.25_num_add_dfs_2',
        'nbeats_btc_lookback_30_window_5_std_1.25_num_add_dfs_2',
        'nbeats_btc_lookback_15_window_5_std_1.25_num_add_dfs_2',
        'quarter']], dtype=object)

## Test Random Forest

In [942]:

rf = RandomForestRegressor(n_estimators=500)

In [943]:
rf.fit(merged_df_x_train, merged_df_y_train)

  rf.fit(merged_df_x_train, merged_df_y_train)


RandomForestRegressor(n_estimators=500)

In [944]:
train_predictions_rf = rf.predict(merged_df_x_train)
test_predictions_rf = rf.predict(merged_df_x_test)

In [945]:
test_predictions_rf[:15].reshape(-1,1)


array([[11388.88658],
       [11408.46958],
       [11392.66194],
       [11300.0202 ],
       [11266.9458 ],
       [11212.31272],
       [10980.11056],
       [10415.13368],
       [10355.74526],
       [10305.09446],
       [10354.18302],
       [10338.12866],
       [10289.00302],
       [10291.79834],
       [10351.26662]])

In [946]:
merged_df_y_test.head(15)

Unnamed: 0,close
486,11601.47
487,11754.05
488,11675.74
489,11878.11
490,11410.53
491,11584.93
492,11784.14
493,11768.87
494,11865.7
495,11892.8


In [947]:
average_rmse_train_rf = sqrt(mean_squared_error(train_predictions_rf, merged_df_y_train))
average_rmse_test_rf = sqrt(mean_squared_error(test_predictions_rf, merged_df_y_test))

In [948]:
print(f" The average training rmse with RF stacking is {average_rmse_train_rf}")
print(f" The average testing rmse with RF stacking is {average_rmse_test_rf}")

 The average training rmse with RF stacking is 268.99419763916507
 The average testing rmse with RF stacking is 1221.3785771976902


In [949]:
rf.feature_importances_.ravel()[np.argsort(rf.feature_importances_.ravel())[::-1]]

array([0.54775669, 0.12920538, 0.0757591 , 0.05930464, 0.0567762 ,
       0.03963325, 0.03862842, 0.02795125, 0.02498508, 0.        ])

In [950]:
merged_df_x_train.columns[np.argsort(rf.feature_importances_)[::-1]]

Index(['nbeats_btc_lookback_15_window_5_std_1.25_num_add_dfs_2',
       'nbeats_btc_lookback_30_window_5_std_1.25_num_add_dfs_2', 'month',
       'day', 'nbeats_btc_lookback_45_window_5_std_1.25_num_add_dfs_2',
       'tcn_btc_lookback_30_window_5_std_1.25_num_add_dfs_2',
       'tcn_btc_lookback_15_window_5_std_1.25_num_add_dfs_2', 'quarter',
       'tcn_btc_lookback_45_window_5_std_1.25_num_add_dfs_2',
       'test_model_lookback_1'],
      dtype='object')

## Test gradient boosting

In [951]:
gb = GradientBoostingRegressor(n_estimators=200)

In [952]:
gb.fit(merged_df_x_train, merged_df_y_train)

  y = column_or_1d(y, warn=True)


GradientBoostingRegressor(n_estimators=200)

In [953]:
train_predictions_gb = gb.predict(merged_df_x_train)
test_predictions_gb = gb.predict(merged_df_x_test)

In [954]:
test_predictions_gb[:15].reshape(-1,1)


array([[11236.4888797 ],
       [11434.1483725 ],
       [11391.59780092],
       [11253.82545603],
       [10896.57534784],
       [10806.34783648],
       [10685.08938925],
       [10451.68375825],
       [10657.24776663],
       [10674.60784759],
       [10626.25804594],
       [10470.6246272 ],
       [10394.2225457 ],
       [10572.2052481 ],
       [10605.65022014]])

In [955]:
merged_df_y_test.head(15)

Unnamed: 0,close
486,11601.47
487,11754.05
488,11675.74
489,11878.11
490,11410.53
491,11584.93
492,11784.14
493,11768.87
494,11865.7
495,11892.8


In [956]:
average_rmse_train_gb = sqrt(mean_squared_error(train_predictions_gb, merged_df_y_train))
average_rmse_test_gb = sqrt(mean_squared_error(test_predictions_gb, merged_df_y_test))

In [957]:
print(f" The average training rmse with GB stacking is {average_rmse_train_gb}")
print(f" The average testing rmse with GB stacking is {average_rmse_test_gb}")

 The average training rmse with GB stacking is 260.2303361794148
 The average testing rmse with GB stacking is 1166.5582090252108


In [958]:
gb.feature_importances_.ravel()[np.argsort(gb.feature_importances_.ravel())[::-1]]

array([0.5279611 , 0.1311332 , 0.119377  , 0.05095145, 0.04548332,
       0.03957752, 0.03815806, 0.02749728, 0.01986108, 0.        ])

In [959]:
merged_df_x_train.columns[np.argsort(gb.feature_importances_)[::-1]]

Index(['nbeats_btc_lookback_15_window_5_std_1.25_num_add_dfs_2',
       'nbeats_btc_lookback_30_window_5_std_1.25_num_add_dfs_2', 'month',
       'tcn_btc_lookback_30_window_5_std_1.25_num_add_dfs_2', 'day',
       'nbeats_btc_lookback_45_window_5_std_1.25_num_add_dfs_2', 'quarter',
       'tcn_btc_lookback_15_window_5_std_1.25_num_add_dfs_2',
       'tcn_btc_lookback_45_window_5_std_1.25_num_add_dfs_2',
       'test_model_lookback_1'],
      dtype='object')