In [2]:
from sklearn.metrics import mean_squared_error
from sklearn.dummy import DummyRegressor
import numpy as np
import pandas as pd

In [4]:
# Load the dataset from 'daily_combined.json', which is stored in pandas'
# 'split' orientation; compression is inferred by pandas.
df_master = pd.read_json(
    'daily_combined.json',
    orient='split',
    compression='infer',
)

In [5]:
# Count how many rows fall in each calendar year of the index, to decide where
# to place the train/test split.
df_master.index.year.value_counts()
# The latest year present is 2022; it is held out as the test dataset.

2020    366
2018    365
2019    365
2021    365
2022    202
2017    136
dtype: int64

In [9]:
# Chronological hold-out split: rows up to and including 2021-12-31 form the
# training set, rows from 2022-01-01 onward form the test set. Both are
# label-based slices on the index.
train = df_master.loc[:'2021-12-31']
test = df_master.loc['2022-01-01':]

# Report the (rows, columns) shape of each partition.
print('Train Dataset:', train.shape)
print('Test Dataset:', test.shape)

Train Dataset: (1597, 106)
Test Dataset: (202, 106)


## Naive Baseline

In [10]:
# Maps each column name to the MSE obtained by the naive baseline.
baseline_dict = dict()
# Columns with no data in the training set; these also need to be removed from
# the test set.
skipped_crypto = list()

In [11]:
for crypto in train.columns:
    # Naive baseline: predict the training-period mean of this column for
    # every test observation. pandas' mean() skips NaN by default, so the
    # result is NaN only when the column has no training observations at all.
    mean_return = train[crypto].mean()

    # Only score against test observations that actually exist.
    y_true = test[crypto].dropna()

    # A column cannot be scored without training data (there is no mean to
    # predict) or without test data (mean_squared_error raises on empty input).
    if np.isnan(mean_return) or y_true.empty:
        # The membership check keeps the list free of duplicates when this
        # cell is re-run without re-running the cell that creates the list.
        if crypto not in skipped_crypto:
            skipped_crypto.append(crypto)
        continue

    # Constant prediction: one copy of the training mean per test observation.
    y_pred = np.full(len(y_true), mean_return)

    # compute MSE
    baseline_dict[crypto] = mean_squared_error(y_true, y_pred)

In [12]:
baseline_dict

{'ZILUSDT': 0.01226379825662325,
 'CVXUSDT': 0.007142192016403001,
 'DCRUSDT': 0.0029354713399354316,
 'RUNEUSDT': 0.007875569671741614,
 'THETAUSDT': 0.004309841306901933,
 'BTCUSDT': 0.0013694831627563031,
 'MANAUSDT': 0.0054629864871445,
 'TRXUSDT': 0.0019476703710793815,
 'STORJUSDT': 0.0059764258787936815,
 'STXUSDT': 0.004028741410024981,
 'LRCUSDT': 0.006306420475160082,
 'AVAXUSDT': 0.004570315427457969,
 'SOLUSDT': 0.00409411244845121,
 'FTTUSDT': 0.0019104286716948547,
 'BALUSDT': 0.003697496521275288,
 'ENJUSDT': 0.0040077990697338535,
 'SNXUSDT': 0.007839814368872005,
 'IOTAUSDT': 0.002742513213817964,
 'MATICUSDT': 0.004530090255711201,
 'XMRUSDT': 0.0031952256406547917,
 'SRMUSDT': 0.0039049056295866534,
 'TWTUSDT': 0.00457225690253909,
 'HBARUSDT': 0.002711983242664424,
 'AMPUSDT': 0.0022701722016371744,
 'QNTUSDT': 0.0033093762278626044,
 'LTCUSDT': 0.002314690097055744,
 'HIVEUSDT': 0.004497928350064133,
 'MINAUSDT': 0.004342639459648053,
 'XRPUSDT': 0.0020858238932844

In [None]:
# Same naive mean baseline as above, but computed with scikit-learn's
# DummyRegressor. Results are written to baseline_dict under the same keys,
# so existing entries for a column are overwritten.
for crypto in train.columns:
    # Drop missing observations up front: the regressor is fitted on the
    # targets directly, so NaN values must not reach fit().
    y_train = train[crypto].dropna()
    y_true = test[crypto].dropna()

    # A column cannot be scored without both training and test observations.
    if y_train.empty or y_true.empty:
        # The membership check keeps the list free of duplicates on re-runs.
        if crypto not in skipped_crypto:
            skipped_crypto.append(crypto)
        continue

    # The 'mean' strategy ignores feature values; X only has to supply the
    # right number of rows, so zero-filled placeholders are used.
    X_train = np.zeros((len(y_train), 1))
    X_test = np.zeros((len(y_true), 1))

    # Fit on the training targets and predict the learned constant for each
    # test observation.
    mean_baseline = DummyRegressor(strategy='mean')
    mean_baseline.fit(X_train, y_train)
    y_pred = mean_baseline.predict(X_test)

    # compute MSE
    mse = mean_squared_error(y_true, y_pred)

    baseline_dict[crypto] = mse