## Setting up model

In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from numpy import mean
from numpy import std
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from xgboost import XGBRFRegressor

import scipy
from sklearn.model_selection import train_test_split
import ipywidgets as widgets
from sklearn.preprocessing import StandardScaler

import pyarrow.feather as feather

import pickle

In [22]:
# Show every column when a wide DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

## Submission Example
example of what final submission should look like

In [23]:
# Sample submission file; its columns are group_num, row_id and Target.
example_submit = pd.read_csv('data/example_sample_submission.csv')

In [24]:
example_submit.head()

Unnamed: 0,group_num,row_id,Target
0,0,0,0
1,0,1,0
2,0,2,0
3,0,3,0
4,0,4,0


## Test Example
example of what the test set will look like

In [25]:
# Sample test set in long format: one row per (timestamp, Asset_ID).
example_test = pd.read_csv('data/example_test.csv')

In [26]:
example_test.head()

Unnamed: 0,timestamp,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,group_num,row_id
0,1623542400,3,1201.0,1.478556,1.48603,1.478,1.483681,654799.561103,1.481439,0,0
1,1623542400,2,1020.0,580.306667,583.89,579.91,582.276667,1227.988328,581.697038,0,1
2,1623542400,0,626.0,343.7895,345.108,343.64,344.598,1718.832569,344.441729,0,2
3,1623542400,1,2888.0,35554.289632,35652.46465,35502.67,35602.004286,163.811537,35583.469303,0,3
4,1623542400,4,433.0,0.312167,0.3126,0.31192,0.312208,585577.410442,0.312154,0,4


## Coin Combo Dictionary
dictionary to be used by models

In [27]:
# Per-coin feature selection, keyed by the coin whose model is being run.
#   'DEFAULT' -> that coin's model is fed the full master dataframe unchanged.
#   a list    -> names of other coins whose columns are removed (through
#                custom_dataframe) before that coin's model is applied.
coin_dict = {'Bitcoin Cash': 'DEFAULT',
             'Binance Coin': ['Bitcoin', 'EOS.IO', 'Dogecoin'],
             'Bitcoin': ['Dogecoin', 'Ethereum Classic', 'Cardano'],
             'EOS.IO': ['Bitcoin Cash', 'Bitcoin', 'Ethereum'],
             'Ethereum Classic': ['Bitcoin Cash'],
             'Ethereum': ['Stellar', 'Ethereum Classic'],
             'Litecoin': ['Dogecoin', 'Bitcoin Cash'],
             'Monero': ['TRON', 'Cardano'],
             'TRON': ['Stellar'],
             'Stellar': ['TRON'],
             'Cardano': ['IOTA', 'Bitcoin Cash', 'Monero'],
             'IOTA': 'DEFAULT',
             'Maker': ['Dogecoin', 'Stellar'],
             'Dogecoin': ['Ethereum Classic'],
            }

## Default Drops
features dropped for prediction

In [28]:
# Columns removed from each per-asset slice inside master_dataframe before the
# slices are merged on timestamp.
default_drops = ['Asset_ID', 'group_num', 'row_id']

## Load in asset details with coin names

In [29]:
# Asset metadata, plus lookup tables in both directions between the numeric
# Asset_ID and the human-readable Asset_Name.
asset_details = pd.read_csv('data/asset_details.csv')
asset_info = {
    asset_id: asset_name
    for asset_id, asset_name in zip(asset_details['Asset_ID'], asset_details['Asset_Name'])
}
asset_info_rev = {
    asset_name: asset_id
    for asset_name, asset_id in zip(asset_details['Asset_Name'], asset_details['Asset_ID'])
}

In [30]:
asset_details

Unnamed: 0,Asset_ID,Weight,Asset_Name
0,2,2.397895,Bitcoin Cash
1,0,4.304065,Binance Coin
2,1,6.779922,Bitcoin
3,5,1.386294,EOS.IO
4,7,2.079442,Ethereum Classic
5,6,5.894403,Ethereum
6,9,2.397895,Litecoin
7,11,1.609438,Monero
8,13,1.791759,TRON
9,12,2.079442,Stellar


## Rename Assets to coin names

In [31]:
# Translate numeric asset IDs into coin names.  Values that have no entry in
# asset_info (including names already produced by an earlier run of this cell)
# are kept as they are, so re-running the cell no longer turns the whole
# column into NaN.
example_test['Asset_ID'] = example_test['Asset_ID'].map(
    lambda asset: asset_info.get(asset, asset)
)

example_test.head()

Unnamed: 0,timestamp,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,group_num,row_id
0,1623542400,Cardano,1201.0,1.478556,1.48603,1.478,1.483681,654799.561103,1.481439,0,0
1,1623542400,Bitcoin Cash,1020.0,580.306667,583.89,579.91,582.276667,1227.988328,581.697038,0,1
2,1623542400,Binance Coin,626.0,343.7895,345.108,343.64,344.598,1718.832569,344.441729,0,2
3,1623542400,Bitcoin,2888.0,35554.289632,35652.46465,35502.67,35602.004286,163.811537,35583.469303,0,3
4,1623542400,Dogecoin,433.0,0.312167,0.3126,0.31192,0.312208,585577.410442,0.312154,0,4


## Master Dataframe function
Makes a main dataframe for all models


In [81]:
def master_dataframe(df, assets=None, drops=None):
    """Pivot long per-asset rows into one wide row per timestamp.

    For every asset, the matching rows of ``df`` are selected, the ``drops``
    columns are removed, and every remaining column except ``timestamp`` is
    renamed to ``<column>_<asset>``.  The per-asset frames are outer-merged on
    ``timestamp`` and calendar features derived from the timestamp are
    appended.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format data containing ``timestamp``, ``Asset_ID`` and every
        column named in ``drops``.  ``timestamp`` is read as seconds since
        the Unix epoch.
    assets : iterable, optional
        Values to look up in ``df['Asset_ID']``, in the order their columns
        should appear.  Defaults to ``asset_info.values()``.
    drops : list of str, optional
        Columns removed from each per-asset frame before merging.  Defaults
        to the notebook-level ``default_drops``.

    Returns
    -------
    pandas.DataFrame
        ``timestamp``, then the renamed columns of each asset in ``assets``
        order, then ``week``, ``year``, ``month``, ``minute``, ``dayofweek``
        and ``day``.
    """
    if assets is None:
        assets = list(asset_info.values())
    if drops is None:
        drops = default_drops

    # One row per distinct timestamp; every asset is merged onto this frame.
    all_time = df.drop_duplicates(subset=['timestamp'])[['timestamp']]

    for asset in assets:
        asset_rows = df[df['Asset_ID'] == asset].drop(drops, axis=1)

        # Suffix every feature column with the asset so columns stay unique
        # after merging; 'timestamp' is the join key and keeps its name.
        suffixed = asset_rows.rename(columns={
            col: col + "_{}".format(asset)
            for col in asset_rows.columns
            if col != 'timestamp'
        })

        all_time = pd.merge(all_time, suffixed, how="outer", on='timestamp')

    # Calendar features.  The datetimes live in a local Series instead of a
    # scratch column that would have to be dropped again afterwards.
    moments = pd.to_datetime(all_time['timestamp'], unit='s')
    if hasattr(moments.dt, 'isocalendar'):
        # Series.dt.week was deprecated and then removed in pandas 2.0;
        # isocalendar().week is its replacement.  Cast back to int64, the
        # dtype .dt.week used to return.
        all_time['week'] = moments.dt.isocalendar().week.astype('int64')
    else:
        # pandas versions that predate isocalendar().
        all_time['week'] = moments.dt.week
    all_time['year'] = moments.dt.year
    all_time['month'] = moments.dt.month
    all_time['minute'] = moments.dt.minute
    all_time['dayofweek'] = moments.dt.dayofweek
    all_time['day'] = moments.dt.day
    return all_time

## Make Master Dataframe

In [33]:
# Wide frame with one row per timestamp; the prediction loop derives every
# coin's input from it.
refined_dataset = master_dataframe(example_test)
refined_dataset.head()

Unnamed: 0,timestamp,Count_Bitcoin Cash,Open_Bitcoin Cash,High_Bitcoin Cash,Low_Bitcoin Cash,Close_Bitcoin Cash,Volume_Bitcoin Cash,VWAP_Bitcoin Cash,Count_Binance Coin,Open_Binance Coin,High_Binance Coin,Low_Binance Coin,Close_Binance Coin,Volume_Binance Coin,VWAP_Binance Coin,Count_Bitcoin,Open_Bitcoin,High_Bitcoin,Low_Bitcoin,Close_Bitcoin,Volume_Bitcoin,VWAP_Bitcoin,Count_EOS.IO,Open_EOS.IO,High_EOS.IO,Low_EOS.IO,Close_EOS.IO,Volume_EOS.IO,VWAP_EOS.IO,Count_Ethereum Classic,Open_Ethereum Classic,High_Ethereum Classic,Low_Ethereum Classic,Close_Ethereum Classic,Volume_Ethereum Classic,VWAP_Ethereum Classic,Count_Ethereum,Open_Ethereum,High_Ethereum,Low_Ethereum,Close_Ethereum,Volume_Ethereum,VWAP_Ethereum,Count_Litecoin,Open_Litecoin,High_Litecoin,Low_Litecoin,Close_Litecoin,Volume_Litecoin,VWAP_Litecoin,Count_Monero,Open_Monero,High_Monero,Low_Monero,Close_Monero,Volume_Monero,VWAP_Monero,Count_TRON,Open_TRON,High_TRON,Low_TRON,Close_TRON,Volume_TRON,VWAP_TRON,Count_Stellar,Open_Stellar,High_Stellar,Low_Stellar,Close_Stellar,Volume_Stellar,VWAP_Stellar,Count_Cardano,Open_Cardano,High_Cardano,Low_Cardano,Close_Cardano,Volume_Cardano,VWAP_Cardano,Count_IOTA,Open_IOTA,High_IOTA,Low_IOTA,Close_IOTA,Volume_IOTA,VWAP_IOTA,Count_Maker,Open_Maker,High_Maker,Low_Maker,Close_Maker,Volume_Maker,VWAP_Maker,Count_Dogecoin,Open_Dogecoin,High_Dogecoin,Low_Dogecoin,Close_Dogecoin,Volume_Dogecoin,VWAP_Dogecoin,week,year,month,minute,dayofweek,day
0,1623542400,1020.0,580.306667,583.89,579.91,582.276667,1227.988328,581.697038,626.0,343.7895,345.108,343.64,344.598,1718.832569,344.441729,2888.0,35554.289632,35652.46465,35502.67,35602.004286,163.811537,35583.469303,359.0,4.83255,4.8459,4.8229,4.837583,47143.548386,4.836607,541.0,55.22308,55.494,55.182,55.34468,6625.201803,55.298816,2186.0,2371.194286,2379.2,2369.67,2374.380714,1214.128692,2374.335307,560.0,161.933429,162.48,161.73,162.214714,1485.009496,162.23131,123.0,243.1375,243.81,242.96,243.5325,307.958853,243.452697,229.0,0.068132,0.06824,0.068038,0.068158,3046438.0,0.068158,383.0,0.327973,0.329272,0.32765,0.328829,536491.101687,0.328582,1201.0,1.478556,1.48603,1.478,1.483681,654799.561103,1.481439,35.0,1.00315,1.0198,0.9873,1.0033,7061.9276,1.002936,61.0,2939.86275,2952.16,2936.23,2947.078025,9.584785,2945.110614,433.0,0.312167,0.3126,0.31192,0.312208,585577.4,0.312154,23,2021,6,0,6,13
1,1623542460,1251.0,581.8,585.59,580.38,582.358333,1405.285079,583.451389,458.0,344.3535,344.79,343.62,344.0895,1217.352439,344.188716,2006.0,35596.771429,35621.0,35533.38,35555.397143,93.363659,35584.861196,644.0,4.83455,4.8495,4.8185,4.8383,86783.074424,4.83743,530.0,55.3209,55.4266,55.149,55.2864,6929.457933,55.265491,1261.0,2373.970101,2375.35,2369.37,2371.79,786.738453,2372.80983,381.0,162.141857,162.38,161.8,162.193,1344.809595,162.12131,62.0,243.368,243.5,242.94,243.33,143.50532,243.337398,395.0,0.068144,0.068334,0.06804,0.068217,4981365.0,0.068201,287.0,0.328656,0.32928,0.32778,0.328563,449638.981788,0.328548,672.0,1.48241,1.483759,1.4792,1.482043,285828.559844,1.481495,98.0,1.00305,1.0196,0.9841,1.00155,47719.510738,1.001078,35.0,2946.370567,2949.72,2943.71,2945.7119,2.794134,2946.633163,573.0,0.312274,0.3124,0.31154,0.311847,939643.3,0.31193,23,2021,6,1,6,13
2,1623542520,540.0,582.1,582.67,579.62,580.54,472.373005,580.850017,535.0,343.778,344.06,342.5,343.0095,975.797047,343.139652,3531.0,35550.27125,35576.59,35402.87,35488.2875,220.535164,35480.068897,380.0,4.83344,4.8357,4.8143,4.82208,41288.47955,4.824327,825.0,55.2257,55.3096,54.91,55.02605,12315.492361,55.09578,1856.0,2370.880011,2371.950076,2363.0,2365.59,764.080469,2367.128372,439.0,161.876143,162.03,161.46,161.829143,1265.96872,161.737656,49.0,242.745,243.0,242.23,242.3475,54.20834,242.45107,338.0,0.068159,0.06817,0.06795,0.068025,2790134.0,0.068057,264.0,0.328061,0.32837,0.326973,0.327497,742486.816952,0.327676,849.0,1.481492,1.482896,1.477801,1.479259,486854.589883,1.48064,53.0,1.00045,1.0164,0.9817,0.9975,12162.514865,0.998949,56.0,2939.582925,2941.8083,2930.03,2933.232,6.08464,2937.153112,1667.0,0.31167,0.312,0.31043,0.311006,3416122.0,0.311131,23,2021,6,2,6,13
3,1623542580,409.0,580.49,580.69,578.31,578.912,204.52097,579.470144,614.0,343.238,343.406,341.94,342.325,1295.198518,342.525876,2901.0,35478.867162,35503.460134,35381.01,35423.49,118.802511,35438.243466,465.0,4.819883,4.8221,4.8083,4.81345,34767.541544,4.815294,347.0,54.997567,55.0442,54.9168,54.94095,2735.095874,54.974253,2624.0,2365.769427,2367.5,2359.01,2360.505714,2253.662759,2362.394059,431.0,161.862333,161.93,161.27,161.485,1156.635873,161.595349,107.0,242.36,242.55,241.7,242.074,139.309081,242.123168,240.0,0.068015,0.068055,0.067866,0.067936,2572088.0,0.067958,276.0,0.327267,0.32741,0.32613,0.326406,257177.767601,0.326834,1023.0,1.479075,1.479399,1.4726,1.473527,328684.942928,1.476372,169.0,0.9966,1.0129,0.9751,0.991,62518.202934,0.992974,34.0,2930.15025,2931.52,2921.66,2925.0737,1.187095,2926.427322,1094.0,0.310923,0.3114,0.3104,0.310676,2403980.0,0.310894,23,2021,6,3,6,13


## Custom Dataframe Function
For each coin, make a custom dataframe using coin_dict

In [65]:
def custom_dataframe(df, coin, param_list):
    """Return ``df`` without the columns that belong to the listed coins.

    A column belongs to a coin when its name ends with ``_<coin name>``,
    which is the naming scheme produced by ``master_dataframe``.  Matching on
    the exact suffix keeps 'Bitcoin' from also hitting 'Bitcoin Cash' (and
    'Ethereum' from hitting 'Ethereum Classic') without special-casing those
    two names.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide frame whose per-coin columns are named ``<feature>_<coin>``.
    coin : str
        Coin the frame is being built for.  Not used by the function; kept
        so existing call sites stay valid.
    param_list : list of str or str
        Coin names whose columns are removed.  A bare string is treated as a
        single coin name rather than iterated character by character, so a
        sentinel such as 'DEFAULT' removes nothing.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself when ``param_list`` is empty, otherwise a new frame
        without the matching columns.  ``df`` is never modified.
    """
    if isinstance(param_list, str):
        param_list = [param_list]

    suffixes = tuple("_{}".format(name) for name in param_list)
    if not suffixes:
        return df

    dropped_coin = [
        col for col in df.columns
        if isinstance(col, str) and col.endswith(suffixes)
    ]
    return df.drop(dropped_coin, axis=1)

## Make Predictions for all models
For each coin, load its scaler and model, then use them to make predictions

In [66]:
%%time

#stores_datasets with predictions & timestamps
predict_dict = {}

#for each coin
for i in asset_details.Asset_Name:

    
    ######MAYBE DROP AT SOME POINT???
    if coin_dict[i] == "DEFAULT":
        #use default dataframe
        dataframe_coin = refined_dataset.copy()
    else:
        #generate custom dataframe
        dataframe_coin = custom_dataframe(refined_dataset.copy(), i, coin_dict[i])
    ######MAYBE DROP AT SOME POINT???
    
    
    #load in scalar    
    sc = pickle.load(open('scalers/scaler_{}.pkl'.format(i),'rb'))
    
    X_values = dataframe_coin
    X_values.replace([np.inf, -np.inf], 0, inplace=True)
    #fill any missing values with median
    
    #load in model
    model = XGBRFRegressor()

    model.load_model("models/model_{}.json".format(i))
    
    #scale the data
    x_testScaled = sc.transform(X_values)
    
    #make a prediction
    y_pred = model.predict(x_testScaled)

    
    #make predictions
    X_values['Target'] = y_pred.tolist() #added in to list
    
    #set the asset it to the correct coin
    X_values['Asset_ID'] = i
    
    #add to the dictionary
    predict_dict[i] = X_values[['timestamp', 'Asset_ID', 'Target']]
    
    
    #delete the model and scaler
    del model
    del sc
    del dataframe_coin
    del x_testScaled
    del X_values

CPU times: user 12min 51s, sys: 6min 1s, total: 18min 53s
Wall time: 21min 58s


## Match each coin's prediction to its timestamp and save the submission

In [69]:

def target_finder(time, coin, predictions=None):
    """Return the predicted Target for one (timestamp, coin) pair.

    Parameters
    ----------
    time : int
        Timestamp to look up in the coin's prediction frame.
    coin : str
        Coin name; used both as the key into ``predictions`` and as the
        value matched against the frame's ``Asset_ID`` column.
    predictions : dict, optional
        Mapping of coin name to a DataFrame with ``timestamp``, ``Asset_ID``
        and ``Target`` columns.  Defaults to the notebook-level
        ``predict_dict``.

    Returns
    -------
    float
        The single matching Target value.

    Raises
    ------
    KeyError
        If ``coin`` has no entry in ``predictions``.
    ValueError
        If the pair matches zero rows or more than one row.
    """
    if predictions is None:
        predictions = predict_dict

    coin_preds = predictions[coin]
    is_match = (coin_preds['timestamp'] == time) & (coin_preds['Asset_ID'] == coin)
    matches = coin_preds.loc[is_match, 'Target']

    if len(matches) != 1:
        raise ValueError(
            "expected exactly one prediction for coin {!r} at timestamp {!r}, found {}".format(
                coin, time, len(matches)
            )
        )

    # float(Series) is deprecated in recent pandas; take the scalar explicitly.
    return float(matches.iloc[0])


#
# Stack every coin's predictions and attach them to the test rows with one
# merge instead of one boolean scan of the prediction frame per test row.
all_predictions = pd.concat(list(predict_dict.values()), ignore_index=True)

# Build a new frame rather than aliasing example_test, so example_test is not
# mutated: a leftover 'Target' column would otherwise leak into
# master_dataframe if the notebook were re-run.  The drop() covers an
# example_test that already carries 'Target' from an earlier run.
r = (
    example_test
    .drop(columns=['Target'], errors='ignore')
    .merge(
        all_predictions,
        on=['timestamp', 'Asset_ID'],
        how='left',
        validate='many_to_one',  # each (timestamp, coin) must have one prediction
        indicator=True,
    )
)

# Fail loudly if any test row has no prediction instead of writing NaN.
unmatched = r['_merge'] != 'both'
if unmatched.any():
    raise ValueError(
        "{} test rows have no matching prediction".format(int(unmatched.sum()))
    )
r = r.drop(columns='_merge')

submission = r[['group_num', 'row_id', 'Target']]
# index=False: the example submission has only group_num, row_id and Target;
# writing the index would add an extra unnamed column.
submission.to_csv('submission.csv', index=False)

In [76]:
submission.head()

Unnamed: 0,group_num,row_id,Target
0,0,0,0.499833
1,0,1,0.499833
2,0,2,0.499833
3,0,3,0.499833
4,0,4,0.499833


In [77]:
example_submit.head()

Unnamed: 0,group_num,row_id,Target
0,0,0,0
1,0,1,0
2,0,2,0
3,0,3,0
4,0,4,0
