In [1]:
# All competition files live in one Kaggle input directory; keeping it in a
# single constant means the notebook can be pointed at another location by
# editing one line.
INPUT_DIR = '/kaggle/input/g-research-crypto-forecasting'

data_train = f'{INPUT_DIR}/train.csv'
data_asset_details = f'{INPUT_DIR}/asset_details.csv'
data_supplemental_train = f'{INPUT_DIR}/supplemental_train.csv'

In [2]:
import datetime
import random
import time
import traceback

import lightgbm as lgb
import numpy as np
import pandas as pd
import plotly.graph_objects as go

In [3]:
%%time
df_train = pd.read_csv(data_train, 
                       dtype={'Asset_ID': 'int8', 'Count': 'int32', 'row_id': 'int32', 'Count': 'int32', 
                              'Open': 'float64', 'High': 'float64', 'Low': 'float64', 'Close': 'float64', 
                              'Volume': 'float64', 'VWAP': 'float64'
                             }
                      )
df_train.head()

CPU times: user 30.6 s, sys: 2.52 s, total: 33.1 s
Wall time: 55 s


Unnamed: 0,timestamp,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,Target
0,1514764860,2,40,2376.58,2399.5,2357.14,2374.59,19.233005,2373.116392,-0.004218
1,1514764860,0,5,8.53,8.53,8.53,8.53,78.38,8.53,-0.014399
2,1514764860,1,229,13835.194,14013.8,13666.11,13850.176,31.550062,13827.062093,-0.014643
3,1514764860,5,32,7.6596,7.6596,7.6567,7.6576,6626.71337,7.657713,-0.013922
4,1514764860,7,5,25.92,25.92,25.874,25.877,121.08731,25.891363,-0.008264


In [4]:
import gresearch_crypto

In [5]:
# Asset metadata, ordered by Asset_ID so later loops visit assets in ID order.
df_assets = pd.read_csv(data_asset_details)
df_assets = df_assets.sort_values(by='Asset_ID')
df_assets.head()

Unnamed: 0,Asset_ID,Weight,Asset_Name
1,0,4.304065,Binance Coin
2,1,6.779922,Bitcoin
0,2,2.397895,Bitcoin Cash
10,3,4.406719,Cardano
13,4,3.555348,Dogecoin


In [6]:
# Share of rows with a usable (non-NaN) Target, one line per asset.
# Asset IDs are read from the data instead of assuming exactly 14 assets
# numbered 0..13, so the report stays correct if the asset set changes and
# never divides by zero on an ID that has no rows.
for asset_id in sorted(df_train['Asset_ID'].unique()):
    asset_rows = df_train[df_train['Asset_ID'] == asset_id]
    n_missing = asset_rows['Target'].isnull().sum()
    print('Percentage of values not nan', (1 - (n_missing / asset_rows.shape[0])) * 100)
  

Percentage of values not nan 99.3094374141301
Percentage of values not nan 99.9844603180932
Percentage of values not nan 99.75116928934543
Percentage of values not nan 98.95466572016784
Percentage of values not nan 87.46838441098623
Percentage of values not nan 99.88225907096167
Percentage of values not nan 99.9826193640732
Percentage of values not nan 99.52201983776557
Percentage of values not nan 87.87076707006158
Percentage of values not nan 99.97336441670117
Percentage of values not nan 94.03054748939965
Percentage of values not nan 85.91615278314144
Percentage of values not nan 96.52474857329506
Percentage of values not nan 98.87899026971662


In [7]:
import xgboost as xgb

def upper_shadow(df):
    """Return High minus the larger of Close and Open (the upper candle wick).

    `df` needs 'High', 'Close' and 'Open' entries; the arithmetic is
    element-wise, and NaN in either Close or Open propagates to the result.
    """
    body_top = np.maximum(df['Close'], df['Open'])
    return df['High'] - body_top

def lower_shadow(df):
    """Return the smaller of Close and Open minus Low (the lower candle wick).

    `df` needs 'Low', 'Close' and 'Open' entries; the arithmetic is
    element-wise, and NaN in either Close or Open propagates to the result.
    """
    body_bottom = np.minimum(df['Close'], df['Open'])
    return body_bottom - df['Low']


def get_features(df):
    """Build the model feature frame from raw candle columns.

    Copies the seven raw columns (Count, Open, High, Low, Close, Volume, VWAP)
    and appends, in this order: Upper_Shadow, Lower_Shadow, Close/Open,
    Close-Open, High-Low, High/Low, Mean and Median (the last two taken
    row-wise over Open/High/Low/Close).

    The same function is applied when fitting and when predicting, so the
    column order produced here should be kept stable.

    Note that the ratio features are plain divisions, so a zero Open or Low
    yields inf/NaN; callers are expected to deal with such rows.
    """
    raw_columns = ['Count', 'Open', 'High', 'Low', 'Close', 'Volume', 'VWAP']
    ohlc_columns = ['Open', 'High', 'Low', 'Close']

    feats = df[raw_columns].copy()

    # Candle wicks.
    feats['Upper_Shadow'] = upper_shadow(feats)
    feats['Lower_Shadow'] = lower_shadow(feats)

    # Intra-candle moves, expressed both as ratios and as differences.
    open_px, high_px, low_px, close_px = (feats[col] for col in ohlc_columns)
    feats["Close/Open"] = close_px / open_px
    feats["Close-Open"] = close_px - open_px
    feats["High-Low"] = high_px - low_px
    feats["High/Low"] = high_px / low_px

    # Row-wise summary statistics over the four price columns only.
    ohlc = feats[ohlc_columns]
    feats['Mean'] = ohlc.mean(axis=1)
    feats["Median"] = ohlc.median(axis=1)

    return feats

In [8]:
from sklearn.preprocessing import StandardScaler

In [9]:
# Hyperparameters shared by every per-asset regressor.
_XGB_COMMON_PARAMS = dict(
    missing=-999,
    random_state=2022,
    tree_method='gpu_hist',
)

# Per-asset tuned hyperparameters, keyed by Asset_ID.
_XGB_PARAMS_BY_ASSET = {
    0: dict(n_estimators=317, max_depth=8, learning_rate=0.008967159857886885,
            subsample=0.8074685834714562, colsample_bytree=0.6156249507619749),
    1: dict(n_estimators=637, max_depth=13, learning_rate=0.09253396014321574,
            subsample=0.700624738784116, colsample_bytree=0.73896289605807),
    2: dict(n_estimators=452, max_depth=8, learning_rate=0.4167642609563461,
            subsample=0.582006239504628, colsample_bytree=0.6829989973347863),
    3: dict(n_estimators=452, max_depth=8, learning_rate=0.4167642609563461,
            subsample=0.5820062395046286, colsample_bytree=0.6829989973347863),
    4: dict(n_estimators=471, max_depth=9, learning_rate=0.05918488024797159,
            subsample=0.7693819367697938, colsample_bytree=0.7266084230952958),
    5: dict(n_estimators=452, max_depth=8, learning_rate=0.4167642609563461,
            subsample=0.582006239504628, colsample_bytree=0.6829989973347863),
    6: dict(n_estimators=476, max_depth=8, learning_rate=0.4202769113980745,
            subsample=0.5563315209270074, colsample_bytree=0.6951993738259458),
    7: dict(n_estimators=520, max_depth=15, learning_rate=0.19184853364231427,
            subsample=0.8869731830313443, colsample_bytree=0.6855158027999262),
    8: dict(n_estimators=471, max_depth=9, learning_rate=0.05918488024797159,
            subsample=0.7693819367697938, colsample_bytree=0.7266084230952958),
    9: dict(n_estimators=229, max_depth=9, learning_rate=0.23016519709096778,
            subsample=0.7928998128269837, colsample_bytree=0.5299924747454009),
    10: dict(n_estimators=229, max_depth=9, learning_rate=0.23016519709096778,
             subsample=0.7928998128269837, colsample_bytree=0.5299924747454009),
    11: dict(n_estimators=229, max_depth=9, learning_rate=0.23016519709096778,
             subsample=0.7928998128269837, colsample_bytree=0.5299924747454009),
    12: dict(n_estimators=229, max_depth=9, learning_rate=0.23016519709096778,
             subsample=0.7928998128269837, colsample_bytree=0.5299924747454009),
    13: dict(n_estimators=121, max_depth=6, learning_rate=0.3062149270836522,
             subsample=0.7361485971162751, colsample_bytree=0.685244452888315),
}


def get_Xy_and_model_for_asset(df_train, asset_id):
    """Fit and return an XGBoost regressor for a single asset.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training frame containing 'Asset_ID', 'Target' and the raw columns
        consumed by `get_features`.
    asset_id : int
        Asset to train on; must be a key of `_XGB_PARAMS_BY_ASSET`.

    Returns
    -------
    xgboost.XGBRegressor
        Model fitted on the asset's rows after removing every row that
        contains an infinite or missing value in any feature or in the target.

    Raises
    ------
    ValueError
        If no hyperparameters are configured for `asset_id`. (Previously this
        surfaced as an unbound-local error at fit time.)
    """
    # Fail fast, before any expensive feature engineering.
    try:
        asset_params = _XGB_PARAMS_BY_ASSET[asset_id]
    except KeyError:
        raise ValueError(
            f"No XGBoost hyperparameters configured for asset_id={asset_id!r}"
        ) from None

    df = df_train[df_train["Asset_ID"] == asset_id]

    df_proc = get_features(df)
    df_proc['y'] = df['Target']

    # Remove rows holding +/-inf with a boolean row mask. The filtered frame
    # keeps df_train's original index labels, so the row *positions* returned
    # by np.where must not be handed to DataFrame.drop (which expects labels):
    # doing so drops unrelated rows or raises KeyError.
    has_inf = np.isinf(df_proc).any(axis=1)
    df_proc = df_proc[~has_inf]
    df_proc = df_proc.dropna(how="any")

    X = df_proc.drop("y", axis=1)
    y = df_proc["y"]

    model = xgb.XGBRegressor(**asset_params, **_XGB_COMMON_PARAMS)
    model.fit(X, y)
    return model

In [10]:
def get_Xy_and_model_for_asset_with_reset_index(df_train, asset_id):
    """Fit and return an XGBoost regressor for one asset using a 0..n-1 index.

    The asset's rows are turned into features with `get_features`, the index
    is reset so that row positions and index labels coincide, rows containing
    an infinite or missing value are removed, and a regressor with a fixed set
    of hyperparameters is fitted on what remains.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training frame containing 'Asset_ID', 'Target' and the raw columns
        consumed by `get_features`.
    asset_id : int
        Asset whose rows are used for training.

    Returns
    -------
    xgboost.XGBRegressor
        The fitted model.
    """
    asset_rows = df_train[df_train["Asset_ID"] == asset_id]

    frame = get_features(asset_rows)
    frame['y'] = asset_rows['Target']

    # After this reset, the positions reported by np.where are valid labels
    # for DataFrame.drop.
    frame = frame.reset_index(drop=True)
    rows_with_inf, _ = np.where(np.isinf(frame))
    frame = (
        frame
        .drop(labels=rows_with_inf, axis=0)
        .dropna(how="any")
        .reset_index(drop=True)
    )

    target = frame["y"]
    features = frame.drop("y", axis=1)

    hyperparams = {
        'n_estimators': 229,
        'max_depth': 9,
        'learning_rate': 0.23016519709096778,
        'subsample': 0.7928998128269837,
        'colsample_bytree': 0.5299924747454009,
        'missing': -999,
        'random_state': 2022,
        'tree_method': 'gpu_hist',
    }
    regressor = xgb.XGBRegressor(**hyperparams)
    regressor.fit(features, target)
    return regressor

In [11]:
%%time
Xs = {}
ys = {}
models = {}

for asset_id, asset_name in zip(df_assets['Asset_ID'], df_assets['Asset_Name']):
    print(f"Training model for {asset_name:<16} (ID={asset_id:<2})")
    try:
        if asset_id == 10:
            models[asset_id] = get_Xy_and_model_for_asset_with_reset_index(df_train, asset_id)
            continue
        #X, y, model = get_Xy_and_model_for_asset(df_train, asset_id) 
        model = get_Xy_and_model_for_asset(df_train, asset_id)
        models[asset_id] = model
        #Xs[asset_id], ys[asset_id], models[asset_id] = X, y, model
    except:         
        models[asset_id] = None
        #Xs[asset_id], ys[asset_id], models[asset_id] = None, None, None    

Training model for Binance Coin     (ID=0 )
Training model for Bitcoin          (ID=1 )
Training model for Bitcoin Cash     (ID=2 )
Training model for Cardano          (ID=3 )
Training model for Dogecoin         (ID=4 )
Training model for EOS.IO           (ID=5 )
Training model for Ethereum         (ID=6 )
Training model for Ethereum Classic (ID=7 )
Training model for IOTA             (ID=8 )
Training model for Litecoin         (ID=9 )
Training model for Maker            (ID=10)
Training model for Monero           (ID=11)
Training model for Stellar          (ID=12)
Training model for TRON             (ID=13)
CPU times: user 6min 9s, sys: 4.45 s, total: 6min 14s
Wall time: 6min 12s


In [12]:
import traceback

In [13]:
# Show the per-asset models; an entry is None when training that asset raised.
models

{0: XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.6156249507619749,
              enable_categorical=False, gamma=0, gpu_id=0, importance_type=None,
              interaction_constraints='', learning_rate=0.008967159857886885,
              max_delta_step=0, max_depth=8, min_child_weight=1, missing=-999,
              monotone_constraints='()', n_estimators=317, n_jobs=2,
              num_parallel_tree=1, predictor='auto', random_state=2022,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              subsample=0.8074685834714562, tree_method='gpu_hist',
              validate_parameters=1, verbosity=None),
 1: XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.73896289605807,
              enable_categorical=False, gamma=0, gpu_id=0, importance_type=None,
              interaction_constraints='', learning_rate=0.092533960143215

In [14]:
# Print the fitted feature importances per asset. The training loop stores
# None for an asset whose training failed, so guard against that instead of
# crashing with AttributeError on the first missing model.
for asset_id, asset_name in zip(df_assets['Asset_ID'], df_assets['Asset_Name']):
    asset_model = models.get(asset_id)
    if asset_model is None:
        print(f"Feature importances for {asset_name:<16} (ID={asset_id:<2}) - unavailable (no trained model)")
        continue
    print(f"Feature importances for {asset_name:<16} (ID={asset_id:<2}) - {asset_model.feature_importances_}")

Feature importances for Binance Coin     (ID=0 ) - [0.05203707 0.06857431 0.08979764 0.06545305 0.06713746 0.04220316
 0.04459746 0.08334668 0.07215391 0.07205423 0.07178457 0.10233574
 0.02911036 0.07913464 0.06027972]
Feature importances for Bitcoin          (ID=1 ) - [0.05124256 0.05651923 0.06632137 0.07028487 0.08365922 0.05181173
 0.09342599 0.056445   0.05776067 0.05288294 0.05817401 0.05740716
 0.06619805 0.07964682 0.09822045]
Feature importances for Bitcoin Cash     (ID=2 ) - [0.06705978 0.07331704 0.08093957 0.08869611 0.09350222 0.06511884
 0.09165307 0.03162963 0.03342003 0.06485193 0.05708555 0.04677017
 0.04606028 0.0869408  0.07295498]
Feature importances for Cardano          (ID=3 ) - [0.05822699 0.05618514 0.06563041 0.071776   0.07851136 0.05691079
 0.08338644 0.06187809 0.06367917 0.05862213 0.06207085 0.06274657
 0.05692858 0.07552432 0.08792312]
Feature importances for Dogecoin         (ID=4 ) - [0.05197159 0.05391194 0.06225967 0.0671471  0.083698   0.04336471
 0

In [15]:
%%time
env = gresearch_crypto.make_env()
iter_test = env.iter_test()
#scaler = StandardScaler()
for i, (df_test, df_pred) in enumerate(iter_test):
    for j , row in df_test.iterrows():
        if models[row['Asset_ID']] is not None:
            try:
                model = models[row['Asset_ID']]
                x_test = get_features(pd.DataFrame([row]))
                #x_test = scaler.fit_transform(x_test)
                y_pred = model.predict(x_test)[0]
                df_pred.loc[df_pred['row_id'] == row['row_id'], 'Target'] = y_pred
            except:
                df_pred.loc[df_pred['row_id'] == row['row_id'], 'Target'] = 0
                traceback.print_exc()
        else: 
            df_pred.loc[df_pred['row_id'] == row['row_id'], 'Target'] = 0
    env.predict(df_pred)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
CPU times: user 1.1 s, sys: 2.99 ms, total: 1.1 s
Wall time: 570 ms
