In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import polars as pl
import lightgbm as lgb
import joblib
from lightgbm import LGBMRegressor

# pandas display: no row/column truncation, floats rendered with two decimals.
pd.set_option(
    'display.max_rows', None,
    'display.max_columns', None,
    'display.float_format', '{:.2f}'.format,
)

# polars display: a negative value means "show every row / column".
# Each setter returns the Config class, so the calls can be chained; the
# chained expression is still the cell's last value.
pl.Config.set_tbl_rows(-1).set_tbl_cols(-1)

polars.config.Config

In [2]:
# Identifier columns selected alongside the feature / responder columns.
multi_index = ['date_id', 'time_id', 'symbol_id']

# feature_00 .. feature_78 (zero-padded to two digits).
feature_col = ['feature_{:02d}'.format(n) for n in range(79)]

# responder_0 .. responder_8; these are the columns turned into lag inputs.
feature_lag = ['responder_' + str(n) for n in range(9)]

# Single-element lists so they concatenate directly with the lists above.
weight = ['weight']
target = ['responder_6']

# Identifiers, then features, then weight, then target.
interest_col = [*multi_index, *feature_col, *weight, *target]

In [3]:
# Lazily scan the training data, discard the partition column, keep only
# rows with date_id >= 1099, order by the identifier columns, then
# materialise the result into memory.
df = (
    pl.scan_parquet('data/train.parquet')
    .drop('partition_id')
    .filter(pl.col('date_id') >= 1099)
    .sort(multi_index)  # multi_index == ['date_id', 'time_id', 'symbol_id']
    .collect()
)

df.head()

date_id,time_id,symbol_id,weight,feature_00,feature_01,feature_02,feature_03,feature_04,feature_05,feature_06,feature_07,feature_08,feature_09,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,feature_20,feature_21,feature_22,feature_23,feature_24,feature_25,feature_26,feature_27,feature_28,feature_29,feature_30,feature_31,feature_32,feature_33,feature_34,feature_35,feature_36,feature_37,feature_38,feature_39,feature_40,feature_41,feature_42,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49,feature_50,feature_51,feature_52,feature_53,feature_54,feature_55,feature_56,feature_57,feature_58,feature_59,feature_60,feature_61,feature_62,feature_63,feature_64,feature_65,feature_66,feature_67,feature_68,feature_69,feature_70,feature_71,feature_72,feature_73,feature_74,feature_75,feature_76,feature_77,feature_78,responder_0,responder_1,responder_2,responder_3,responder_4,responder_5,responder_6,responder_7,responder_8
i16,i16,i8,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,i8,i8,i16,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32
1099,0,0,2.051927,-0.248599,1.621542,-0.068824,-0.170314,1.43715,1.342398,1.531021,1.552279,-0.431288,11,7,76,-0.816879,1.157901,-0.307176,,1.271211,,-2.367924,-1.330201,0.849704,-0.186959,0.543052,0.992313,-1.379053,-0.933983,0.896124,1.100239,1.063759,0.047145,0.680417,-0.204366,,,-0.28419,0.085606,1.091508,-0.045187,-0.817954,,1.476158,,,0.669223,,-0.928933,1.719562,0.695733,4.563862,1.555399,,2.621678,,,-0.900057,,0.003544,1.151609,,3.867216,2.603795,0.755867,0.028564,0.182134,0.097254,-2.010324,-1.480906,-0.695279,0.084257,-0.529734,-0.788087,1.556511,-0.186578,,,-0.209222,-0.185264,-0.282284,-0.138465,-0.399422,-0.187836,-0.509768,-0.253608,0.341263,-0.140591,-0.023567,0.489448,0.356649
1099,0,1,3.392164,0.259097,1.392621,-0.503887,-0.241108,1.967715,1.591068,0.99859,1.799989,-0.289841,11,7,76,-0.787299,0.214656,-0.680841,,0.111953,,-1.31486,-1.232028,0.200484,0.041846,1.333064,1.03162,-0.782376,-0.239917,0.493164,2.318372,1.298617,-0.287292,-0.076574,0.038831,,,-0.278329,0.16223,-1.256653,-0.183401,-0.395968,,-0.086898,,,-0.301956,,-1.109824,1.467594,0.318831,0.938119,0.455269,,1.82838,,,0.588878,,-0.204858,2.419592,,3.387692,1.725406,0.755867,-0.198133,-0.039612,-0.276892,-1.136595,-1.583077,-0.713564,-0.069048,-0.604924,-0.835939,0.375211,-0.47134,,,-0.173509,-0.201918,-0.294918,-0.245487,-0.244696,-0.071257,-0.271139,-1.296458,-1.358795,-1.214743,-1.22101,-1.724037,-1.059506
1099,0,2,1.343134,0.517202,1.117549,0.080841,0.024981,1.712892,1.431327,1.80543,1.789339,-0.347833,81,2,59,-1.261471,0.12572,-0.531379,,-0.29918,,-1.335486,-1.386557,-0.914476,-0.340757,-0.192995,-0.2289,-1.687286,-0.952715,0.524215,1.776975,1.171555,-0.810213,-0.327307,-0.261107,,,-0.18068,0.005577,1.21581,0.089545,0.18772,,1.615333,,,-0.645962,,-1.377578,1.68653,1.274393,1.092009,0.898805,,-0.503623,,,-1.518755,,-2.224362,0.6409,,-0.612116,-0.056746,0.755867,-0.012848,0.492039,0.120181,-2.451575,-1.706964,-0.977077,0.035544,-0.466492,-0.983709,0.199564,-0.744791,,,-0.21554,-0.19324,-0.287705,-0.301047,-0.274953,-0.271465,-0.073639,0.889502,-3.223725,-0.810447,2.024764,-2.335526,-1.089055
1099,0,3,1.685052,0.145567,1.067854,-0.402879,0.202988,2.047291,0.905845,0.907996,1.029375,-0.244637,4,3,11,-0.380218,6.779099,1.204091,,-0.146815,,-0.701783,-1.612676,-0.157345,0.053005,-0.246823,-0.637094,0.226551,0.573292,-0.471628,-1.227767,-0.705375,-0.70922,-0.657256,0.032702,,,0.000239,-0.126731,1.269189,0.615831,1.192095,,-0.516175,,,-1.803262,,-1.641229,-0.304523,-1.825786,-5.618533,-3.320896,,-1.924352,,,-2.535444,,-2.457625,-0.304245,,-13.360477,-4.950114,0.755867,-0.280215,-0.137594,-0.242161,-1.060082,-1.280717,-0.376444,7.835725,0.979697,-0.389943,4.812237,1.215333,,,1.028849,0.9311,0.101705,-0.015961,0.586604,0.348421,-1.31232,-4.162784,-0.299012,-3.295336,-5.0,-0.680722,-4.535375
1099,0,4,1.272961,0.669465,1.049268,-0.477954,-0.350755,2.222908,0.862832,0.749962,1.494779,-0.229945,15,1,9,-0.95937,1.773412,-0.167947,,0.62956,,-1.270064,-1.316301,1.130272,-0.077613,-0.82394,-0.719687,-0.147642,0.799066,-1.279402,-1.77414,-0.461905,-0.526829,-0.385669,-0.102031,,,-0.783607,-0.629812,-0.240504,-0.330898,-0.72534,,0.457317,,,-1.908987,,-0.585512,1.084562,2.207275,-1.812357,0.121488,,1.762509,,,-2.293728,,-1.310177,0.338667,,0.875598,1.401105,0.755867,-0.041865,0.152188,-0.038114,-1.706901,-2.038764,-0.928739,0.996509,-0.286176,-1.100722,2.852242,0.018002,,,5.769077,5.607573,4.803722,3.580811,0.564022,0.230518,-0.548043,-2.166123,-1.690722,-1.073069,-3.175938,-1.134714,-1.686271


In [4]:
# Build one row of lagged responders per (date_id, symbol_id):
#   1. keep the identifier and responder columns,
#   2. suffix every responder column with "_lag_1",
#   3. shift date_id forward by one so the values line up with the next day,
#   4. keep the last row of each (date_id, symbol_id) group -- df was sorted
#      by date_id, time_id, symbol_id and group order is maintained, so this
#      is the final record of the previous date,
#   5. drop time_id, which no longer identifies anything after aggregation.
lags = (
    df.select(multi_index + feature_lag)
    .rename({name: f"{name}_lag_1" for name in feature_lag})
    .with_columns(date_id=pl.col('date_id') + 1)
    .group_by(["date_id", "symbol_id"], maintain_order=True)
    .last()
    .drop("time_id")
)
lags.head()

date_id,symbol_id,responder_0_lag_1,responder_1_lag_1,responder_2_lag_1,responder_3_lag_1,responder_4_lag_1,responder_5_lag_1,responder_6_lag_1,responder_7_lag_1,responder_8_lag_1
i16,i8,f32,f32,f32,f32,f32,f32,f32,f32,f32
1100,0,-0.808547,-0.073002,1.378468,-1.702946,-0.630515,-0.150434,-0.337349,-0.135302,-0.976292
1100,1,-0.331117,0.026089,1.487575,0.798801,0.468176,0.920555,0.082525,0.080933,0.213214
1100,2,0.430136,-0.134319,2.250511,0.234775,0.181649,1.050369,-0.023226,0.015239,-0.070631
1100,3,-2.764422,-1.040809,-1.086835,-0.778257,-0.429215,0.033297,0.252173,0.087934,0.444123
1100,4,0.524972,0.20783,-2.304975,1.288136,0.803507,-0.599111,0.094728,0.086081,0.19587


In [5]:
# Select identifiers + features + weight + target (interest_col holds exactly
# that concatenation), attach the lagged responders by (date_id, symbol_id),
# then keep date_id >= 1100 only.
train = (
    df.select(interest_col)
    .join(lags, on=["date_id", "symbol_id"], how="left")
    .filter(pl.col('date_id') >= 1100)
)
train.head()

date_id,time_id,symbol_id,feature_00,feature_01,feature_02,feature_03,feature_04,feature_05,feature_06,feature_07,feature_08,feature_09,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,feature_20,feature_21,feature_22,feature_23,feature_24,feature_25,feature_26,feature_27,feature_28,feature_29,feature_30,feature_31,feature_32,feature_33,feature_34,feature_35,feature_36,feature_37,feature_38,feature_39,feature_40,feature_41,feature_42,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49,feature_50,feature_51,feature_52,feature_53,feature_54,feature_55,feature_56,feature_57,feature_58,feature_59,feature_60,feature_61,feature_62,feature_63,feature_64,feature_65,feature_66,feature_67,feature_68,feature_69,feature_70,feature_71,feature_72,feature_73,feature_74,feature_75,feature_76,feature_77,feature_78,weight,responder_6,responder_0_lag_1,responder_1_lag_1,responder_2_lag_1,responder_3_lag_1,responder_4_lag_1,responder_5_lag_1,responder_6_lag_1,responder_7_lag_1,responder_8_lag_1
i16,i16,i8,f32,f32,f32,f32,f32,f32,f32,f32,f32,i8,i8,i16,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32
1100,0,0,0.347117,0.496246,-0.252778,0.300495,2.283821,-0.358535,1.499545,0.163361,0.522466,11,7,76,-1.018732,0.160037,-0.695186,,0.966189,,-1.041664,-1.200563,1.08635,-0.198352,0.497893,1.451918,-1.182287,-1.02238,0.891338,0.824079,0.987703,0.071278,0.675787,-0.179226,,,-0.432981,-0.387139,-1.033333,0.115463,0.144211,,-0.104414,,,-1.438618,,-2.239299,1.204056,-2.054071,-0.987095,-0.350367,,0.554282,,,-0.963888,,-2.11783,0.946514,,-0.378454,-0.231735,0.747711,-0.366339,-0.179765,-0.334015,-1.798993,-1.743881,-0.904236,0.557061,-0.345155,-0.721083,-0.315582,-0.879035,,,-0.435363,-0.307379,-0.185087,-0.343539,2.135293,0.814512,-0.808547,-0.073002,1.378468,-1.702946,-0.630515,-0.150434,-0.337349,-0.135302,-0.976292
1100,0,1,0.277247,1.012361,-0.094676,0.564339,2.562398,-0.420024,0.901861,0.168404,0.394671,11,7,76,-1.23418,0.84047,-0.258402,,-0.099234,,-1.50468,-1.788404,0.251073,0.056533,2.704134,1.489169,-1.148062,-0.183026,0.500423,1.205928,1.353426,-0.289504,-0.123262,0.030237,,,0.155032,0.1049,-0.954238,-0.04302,0.291611,,1.576069,,,1.017872,,-1.499961,2.011301,-0.478377,0.518929,0.060907,,1.888145,,,-0.340385,,-1.430165,1.854096,,0.254856,0.194542,0.747711,-0.242813,-0.213022,-0.279367,-1.828314,-1.445002,-0.939389,0.616925,-0.37009,-0.922807,1.492145,-0.220311,,,-0.239445,-0.2666,-0.263515,-0.376316,3.733894,0.584708,-0.331117,0.026089,1.487575,0.798801,0.468176,0.920555,0.082525,0.080933,0.213214
1100,0,2,-0.089896,1.077906,-0.554281,-0.120926,2.378587,-0.745054,1.448174,0.270152,0.809487,81,2,59,-0.912496,0.234643,-0.491267,,-0.222222,,-1.399685,-1.02082,-1.143791,-0.298319,-0.579372,-0.292496,-2.362143,-0.959014,0.905534,0.986316,1.334443,-0.847811,-0.424774,-0.288184,,,-0.202237,-0.460284,-1.253408,0.057346,-0.04258,,-0.6247,,,-0.51776,,-1.877281,0.993348,-0.710742,-0.272629,-0.382265,,0.977554,,,-0.297651,,-1.440637,2.45237,,0.353313,0.166703,0.747711,-0.24225,-0.378225,-0.267732,-1.667135,-1.733826,-0.793462,0.376235,-0.330867,-0.807339,-0.173345,-0.760232,,,-0.156225,-0.185842,-0.316094,-0.27062,1.488208,-2.109449,0.430136,-0.134319,2.250511,0.234775,0.181649,1.050369,-0.023226,0.015239,-0.070631
1100,0,3,-0.084937,0.441306,0.315186,0.524206,2.865959,-0.307868,0.639991,0.11417,0.274875,4,3,11,-0.904443,1.436964,-0.12921,,-0.463431,,-1.208118,-1.646518,-0.242091,0.046803,-0.224398,-0.457751,0.169098,0.46646,-0.425726,-1.016724,-0.478909,-0.62777,-0.642077,0.049566,,,-0.320838,0.08193,0.642198,0.053884,0.245737,,0.124157,,,-0.940643,,-2.078327,1.609066,-1.081323,0.156615,0.012454,,1.099446,,,-0.142181,,-1.948668,1.802449,,-0.8854,-0.345768,0.747711,-0.407633,-0.324136,-0.284354,-1.711711,-1.934527,-0.588162,2.146163,0.093359,-1.187405,1.05717,-0.300235,,,0.008673,-0.025044,-0.205968,-0.267327,1.529191,3.265174,-2.764422,-1.040809,-1.086835,-0.778257,-0.429215,0.033297,0.252173,0.087934,0.444123
1100,0,4,0.800764,0.936516,0.295581,0.63778,2.551722,-0.363379,0.706162,0.071007,0.330625,15,1,9,-0.678711,1.007621,-0.355546,,1.670957,,-1.988257,-2.415015,1.620926,-0.088764,-0.802655,-0.868845,-0.218735,0.645259,-1.960116,-1.310783,-0.417979,-0.317807,-0.371853,-0.063114,,,0.065471,0.054314,-0.43509,0.171743,0.460601,,0.614084,,,-0.305562,,-0.973221,0.4941,-0.915549,-1.20239,-1.055334,,2.470784,,,-1.193521,,-0.649144,0.848738,,-2.881672,-1.259184,0.747711,-0.396621,-0.12172,-0.182512,-1.713176,-2.141696,-0.558004,1.443734,-0.073243,-1.06991,-0.117371,-0.506948,,,4.950911,3.307678,0.740726,1.032403,1.302325,-0.576642,0.524972,0.20783,-2.304975,1.288136,0.803507,-0.599111,0.094728,0.086081,0.19587


In [6]:
# Persist the lag-augmented training frame to disk.
# Because partition_by is given, polars writes a dataset partitioned on
# date_id under this path rather than a single file (polars documents this
# parameter as unstable).
# The path is a plain string literal: it has no placeholders, so no f-prefix.
train.write_parquet(
    "./data/training.parquet",
    partition_by="date_id",
)