# First Baseline 

In [1]:
import os
import time
import pickle
from datetime import datetime

import numpy as np
import pandas as pd
import lightgbm as lgb
import scipy.stats as stats
from scipy.stats import pearsonr
from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error

import talib

import xgboost as xgb

import matplotlib.pyplot as plt
plt.style.use("seaborn-darkgrid")

## Load Dataset

In [2]:
data_dir = "../data"

crypto_df = pd.read_csv(os.path.join(data_dir, "train.csv"))
asset_details = pd.read_csv(os.path.join(data_dir, "asset_details.csv"))

In [3]:
crypto_df.head()

Unnamed: 0,timestamp,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,Target
0,1514764860,2,40.0,2376.58,2399.5,2357.14,2374.59,19.233005,2373.116392,-0.004218
1,1514764860,0,5.0,8.53,8.53,8.53,8.53,78.38,8.53,-0.014399
2,1514764860,1,229.0,13835.194,14013.8,13666.11,13850.176,31.550062,13827.062093,-0.014643
3,1514764860,5,32.0,7.6596,7.6596,7.6567,7.6576,6626.71337,7.657713,-0.013922
4,1514764860,7,5.0,25.92,25.92,25.874,25.877,121.08731,25.891363,-0.008264


In [4]:
asset_details

Unnamed: 0,Asset_ID,Weight,Asset_Name
0,2,2.397895,Bitcoin Cash
1,0,4.304065,Binance Coin
2,1,6.779922,Bitcoin
3,5,1.386294,EOS.IO
4,7,2.079442,Ethereum Classic
5,6,5.894403,Ethereum
6,9,2.397895,Litecoin
7,11,1.609438,Monero
8,13,1.791759,TRON
9,12,2.079442,Stellar


In [5]:
for idx, row in asset_details.sort_values(by="Asset_ID").iterrows():
    _df = crypto_df[crypto_df["Asset_ID"]==row["Asset_ID"]].set_index("timestamp")
    
    beg = _df.index[0].astype("datetime64[s]")
    end = _df.index[-1].astype("datetime64[s]")

    print(f"{row.Asset_Name:16} data gose from {beg} to {end}")

Binance Coin     data gose from 2018-01-01T00:01:00 to 2021-09-21T00:00:00
Bitcoin          data gose from 2018-01-01T00:01:00 to 2021-09-21T00:00:00
Bitcoin Cash     data gose from 2018-01-01T00:01:00 to 2021-09-21T00:00:00
Cardano          data gose from 2018-04-17T09:11:00 to 2021-09-21T00:00:00
Dogecoin         data gose from 2019-04-12T14:34:00 to 2021-09-21T00:00:00
EOS.IO           data gose from 2018-01-01T00:01:00 to 2021-09-21T00:00:00
Ethereum         data gose from 2018-01-01T00:01:00 to 2021-09-21T00:00:00
Ethereum Classic data gose from 2018-01-01T00:01:00 to 2021-09-21T00:00:00
IOTA             data gose from 2018-05-09T08:07:00 to 2021-09-21T00:00:00
Litecoin         data gose from 2018-01-01T00:01:00 to 2021-09-21T00:00:00
Maker            data gose from 2018-05-10T15:21:00 to 2021-09-21T00:00:00
Monero           data gose from 2018-01-01T00:01:00 to 2021-09-21T00:00:00
Stellar          data gose from 2018-02-16T23:53:00 to 2021-09-21T00:00:00
TRON             data gos

## Preprocessing

In [6]:
totimestamp = lambda s: np.int32(time.mktime(datetime.strptime(s, "%d/%m/%Y").timetuple()))

In [7]:
btc = crypto_df[crypto_df["Asset_ID"]==1].set_index("timestamp") # Asset_ID=1 for Bitcoin
eth = crypto_df[crypto_df["Asset_ID"]==6].set_index("timestamp") # Asset_ID=6 for Ethereum
eth.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1956200 entries, 1514764860 to 1632182400
Data columns (total 9 columns):
 #   Column    Non-Null Count    Dtype  
---  ------    --------------    -----  
 0   Asset_ID  1956200 non-null  int64  
 1   Count     1956200 non-null  float64
 2   Open      1956200 non-null  float64
 3   High      1956200 non-null  float64
 4   Low       1956200 non-null  float64
 5   Close     1956200 non-null  float64
 6   Volume    1956200 non-null  float64
 7   VWAP      1956200 non-null  float64
 8   Target    1955860 non-null  float64
dtypes: float64(8), int64(1)
memory usage: 149.2 MB


In [8]:
eth.isna().sum()

Asset_ID      0
Count         0
Open          0
High          0
Low           0
Close         0
Volume        0
VWAP          0
Target      340
dtype: int64

In [9]:
# Reindex to fill nan missing data(timestamp).
eth = eth.reindex(range(eth.index[0],eth.index[-1]+60,60),method='pad')

In [10]:
def log_return(series, periods=1):
    """Calculate the difference after log transformation from the previous timestamp.
    """
    return np.log(series).diff(periods=periods)


def upper_shadow(asset):
    return asset.High - np.maximum(asset.Close, asset.Open)


def lower_shadow(asset):
    return np.minimum(asset.Close, asset.Open) - asset.Low

In [11]:
feature_cols = [
    'volume_adi',
    'volume_obv',
    'volume_cmf',
    'volume_fi',
    'volume_em',
    'volume_sma_em',
    'volume_vpt',
    'volume_vwap',
    'volume_mfi',
    'volume_nvi',
    """
    'volatility_bbm',
    'volatility_bbh',
    'volatility_bbl',
    'volatility_bbw',
    'volatility_bbp',
    'volatility_bbhi',
    'volatility_bbli',
    'volatility_kcc',
    'volatility_kch',
    'volatility_kcl',
    'volatility_kcw',
    'volatility_kcp',
    'volatility_kchi',
    'volatility_kcli',
    'volatility_dcl',
    'volatility_dch',
    'volatility_dcm',
    'volatility_dcw',
    'volatility_dcp',
    'volatility_atr',
    'volatility_ui',
    'trend_macd',
    'trend_macd_signal',
    'trend_macd_diff',
    'trend_sma_fast',
    'trend_sma_slow',
    'trend_ema_fast',
    'trend_ema_slow',
    'trend_vortex_ind_pos',
    'trend_vortex_ind_neg',
    'trend_vortex_ind_diff',
    'trend_trix',
    'trend_mass_index',
    'trend_dpo',
    'trend_kst',
    'trend_kst_sig',
    'trend_kst_diff',
    'trend_ichimoku_conv',
    'trend_ichimoku_base',
    'trend_ichimoku_a',
    'trend_ichimoku_b',
    'trend_stc',
    'trend_adx',
    'trend_adx_pos',
    'trend_adx_neg',
    'trend_cci',
    'trend_visual_ichimoku_a',
    'trend_visual_ichimoku_b',
    'trend_aroon_up',
    'trend_aroon_down',
    'trend_aroon_ind',
    'trend_psar_up',
    'trend_psar_down',
    'trend_psar_up_indicator',
    'trend_psar_down_indicator',
    'momentum_rsi',
    'momentum_stoch_rsi',
    'momentum_stoch_rsi_k',
    'momentum_stoch_rsi_d',
    'momentum_tsi',
    'momentum_uo',
    'momentum_stoch',
    'momentum_stoch_signal',
    'momentum_wr',
    'momentum_ao',
    'momentum_roc',
    'momentum_ppo',
    'momentum_ppo_signal',
    'momentum_ppo_hist',
    'momentum_pvo',
    'momentum_pvo_signal',
    'momentum_pvo_hist',
    'momentum_kama',
    'others_dr',
    'others_dlr',
    'others_cr'
    """
]

class CryptoFeature():
    
    def __init__(self, asset, start_date: str, end_date: str):
        self.asset = asset
        self.start_date = totimestamp(start_date)
        self.end_date = totimestamp(end_date)
    
    def get_features(self):
        log5_return = log_return(self.asset["VWAP"], periods=5)
        log1_return = log_return(self.asset["VWAP"], periods=1).abs()
        upper = upper_shadow(self.asset)
        lower = lower_shadow(self.asset)
        
        dst_df = add_all_ta_features(
            self.asset,
            open="Open",
            high="High",
            low="Low",
            close="Close",
            volume="Volume",
        )
        
        features =  pd.concat([
            self.asset["Open"],
            self.asset["High"],
            self.asset["Low"],
            self.asset["Close"],
            self.asset["Volume"],
            self.asset["VWAP"],
            dst_df[feature_cols],
            log5_return,
            log1_return,
            upper,
            lower
        ], axis=1)
        
        features = features.loc[self.start_date:self.end_date].fillna(0)
        return features

    def get_target(self):
        return self.asset.loc[self.start_date:self.end_date, "Target"].fillna(0)

In [12]:
def dump_pickle(filename, data):
    with open(filename, "wb") as f:
        pickle.dump(data, f)        

In [13]:
params = dict(
    objective="regression",
    metric="rmse",
    learning_rate=5e-2,
    max_depth=7,
    subsample_freq=3,
    subsample=0.9,
    min_child_samples=20,
    reg_alpha=0.1,
    reg_lambda=0.1,
    random_state=42,
    verbosity=1,
)

In [15]:
train_start_date, train_end_date = "01/04/2021", "30/04/2021"
test_start_date, test_end_date = "01/05/2021", "15/05/2021"

all_pred = []
all_true = []
assets = []

all_train_pred = []
all_train_true = []

results = pd.DataFrame(columns=["Asset_Name", "mse", "pearson_corr"])
for idx, row in asset_details.sort_values(by="Asset_ID").iterrows():
    print(f"\nTraining {row.Asset_Name} model")
    
    data = crypto_df[crypto_df["Asset_ID"]==row["Asset_ID"]].set_index("timestamp")
    data = data.reindex(range(data.index[0], data.index[-1]+60, 60),method='pad')
    
    train_feature = CryptoFeature(data, start_date=train_start_date, end_date=train_end_date)
    X_train = train_feature.get_features().to_numpy()
    y_train= train_feature.get_target().to_numpy()

    test_feature = CryptoFeature(data, start_date=test_start_date, end_date=test_end_date)
    X_test= test_feature.get_features().to_numpy()
    y_test = test_feature.get_target().to_numpy()

    train_set = lgb.Dataset(X_train, y_train)
    valid_set = lgb.Dataset(X_test, y_test)
    
    model = lgb.train(
        params, 
        train_set, 
        valid_sets=[train_set, valid_set],
        num_boost_round=100,
        callbacks=[
            lgb.log_evaluation(period=1),
            lgb.early_stopping(stopping_rounds=20)
        ]
    )

    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    
    all_pred.append(y_pred_test)
    all_true.append(y_test)
    assets.append([row["Asset_ID"] for _ in range(y_test.shape[0])])

    all_train_pred.append(y_pred_train)
    all_train_true.append(y_train)
    
    error = mean_squared_error(y_test, y_pred_test, squared=False)
    metric = pearsonr(y_test, y_pred_test)[0]
    
    results.loc[idx] = (row.Asset_Name, error, metric)
    
    
    # Save model and scaler.
#     os.makedirs(f"./{row.Asset_ID}/", exist_ok=True)
#     dump_pickle(f"./{row.Asset_ID}/model.pkl", model)


Training Binance Coin model


KeyError: '["\\n    \'volatility_bbm\',\\n    \'volatility_bbh\',\\n    \'volatility_bbl\',\\n    \'volatility_bbw\',\\n    \'volatility_bbp\',\\n    \'volatility_bbhi\',\\n    \'volatility_bbli\',\\n    \'volatility_kcc\',\\n    \'volatility_kch\',\\n    \'volatility_kcl\',\\n    \'volatility_kcw\',\\n    \'volatility_kcp\',\\n    \'volatility_kchi\',\\n    \'volatility_kcli\',\\n    \'volatility_dcl\',\\n    \'volatility_dch\',\\n    \'volatility_dcm\',\\n    \'volatility_dcw\',\\n    \'volatility_dcp\',\\n    \'volatility_atr\',\\n    \'volatility_ui\',\\n    \'trend_macd\',\\n    \'trend_macd_signal\',\\n    \'trend_macd_diff\',\\n    \'trend_sma_fast\',\\n    \'trend_sma_slow\',\\n    \'trend_ema_fast\',\\n    \'trend_ema_slow\',\\n    \'trend_vortex_ind_pos\',\\n    \'trend_vortex_ind_neg\',\\n    \'trend_vortex_ind_diff\',\\n    \'trend_trix\',\\n    \'trend_mass_index\',\\n    \'trend_dpo\',\\n    \'trend_kst\',\\n    \'trend_kst_sig\',\\n    \'trend_kst_diff\',\\n    \'trend_ichimoku_conv\',\\n    \'trend_ichimoku_base\',\\n    \'trend_ichimoku_a\',\\n    \'trend_ichimoku_b\',\\n    \'trend_stc\',\\n    \'trend_adx\',\\n    \'trend_adx_pos\',\\n    \'trend_adx_neg\',\\n    \'trend_cci\',\\n    \'trend_visual_ichimoku_a\',\\n    \'trend_visual_ichimoku_b\',\\n    \'trend_aroon_up\',\\n    \'trend_aroon_down\',\\n    \'trend_aroon_ind\',\\n    \'trend_psar_up\',\\n    \'trend_psar_down\',\\n    \'trend_psar_up_indicator\',\\n    \'trend_psar_down_indicator\',\\n    \'momentum_rsi\',\\n    \'momentum_stoch_rsi\',\\n    \'momentum_stoch_rsi_k\',\\n    \'momentum_stoch_rsi_d\',\\n    \'momentum_tsi\',\\n    \'momentum_uo\',\\n    \'momentum_stoch\',\\n    \'momentum_stoch_signal\',\\n    \'momentum_wr\',\\n    \'momentum_ao\',\\n    \'momentum_roc\',\\n    \'momentum_ppo\',\\n    \'momentum_ppo_signal\',\\n    \'momentum_ppo_hist\',\\n    \'momentum_pvo\',\\n    \'momentum_pvo_signal\',\\n    \'momentum_pvo_hist\',\\n    \'momentum_kama\',\\n    \'others_dr\',\\n    \'others_dlr\',\\n    \'others_cr\'\\n    "] not in index'

In [None]:
all_true = np.concatenate(all_true, axis=0)
all_pred = np.concatenate(all_pred, axis=0)
assets = np.concatenate(assets, axis=0)

all_train_pred = np.concatenate(all_train_pred, axis=0)
all_train_true = np.concatenate(all_train_true, axis=0)

In [None]:
error = mean_squared_error(all_true, all_pred, squared=False)
metric = pearsonr(all_true, all_pred)[0]

train_error = mean_squared_error(all_train_true, all_train_pred, squared=False)
train_metric = pearsonr(all_train_true, all_train_pred)[0]

print("train", train_error, train_metric)
print("test", error, metric)

In [None]:
results

In [None]:
results.plot(kind="bar", x="Asset_Name", y="pearson_corr")
plt.ylim(-0.2, 0.7)
plt.show()

In [None]:
plt.scatter(
    all_pred, 
    all_true, 
#     c=assets, 
    alpha=0.6
)
plt.show()