In [1]:
import pandas as pd
import polars as pl
import numpy as np
import os
from tqdm.auto import tqdm
from matplotlib import pyplot as plt
import pickle

from sklearn.metrics import r2_score
from lightgbm import LGBMRegressor
import lightgbm as lgb
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import VotingRegressor

import warnings
warnings.filterwarnings('ignore')
pd.options.display.max_columns = None

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the preprocessed training and validation splits through polars and
# convert each one to a pandas DataFrame.
train, valid = (
    pl.scan_parquet(parquet_path).collect().to_pandas()
    for parquet_path in (
        "./preprocessed_dataset/training_null_nolags.parquet",
        "./preprocessed_dataset/validation_null_nolags.parquet",
    )
)
(train.shape, valid.shape)

((11731192, 94), (2900128, 94))

In [3]:
# Preview the first five rows of the training split.
train.head(n=5)

Unnamed: 0,id,date_id,time_id,symbol_id,weight,feature_00,feature_01,feature_02,feature_03,feature_04,feature_05,feature_06,feature_07,feature_08,feature_09,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,feature_20,feature_21,feature_22,feature_23,feature_24,feature_25,feature_26,feature_27,feature_28,feature_29,feature_30,feature_31,feature_32,feature_33,feature_34,feature_35,feature_36,feature_37,feature_38,feature_39,feature_40,feature_41,feature_42,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49,feature_50,feature_51,feature_52,feature_53,feature_54,feature_55,feature_56,feature_57,feature_58,feature_59,feature_60,feature_61,feature_62,feature_63,feature_64,feature_65,feature_66,feature_67,feature_68,feature_69,feature_70,feature_71,feature_72,feature_73,feature_74,feature_75,feature_76,feature_77,feature_78,responder_0,responder_1,responder_2,responder_3,responder_4,responder_5,responder_6,responder_7,responder_8,partition_id
0,32496018,1301,0,0,4.534279,-0.626509,1.168237,0.841137,-0.164749,-2.617933,0.747355,0.742167,0.70665,-0.529501,11,7,76,-0.830131,1.07705,-0.343497,,-0.336991,,-1.35938,-1.810651,0.278623,0.10855,1.462774,1.040918,0.357053,0.191162,1.282461,0.990893,0.054168,-0.439425,-0.690112,0.077168,,,0.267566,0.544378,0.806811,-0.158676,-0.521487,,0.807561,,,-0.636782,,-1.488725,1.106105,0.710943,1.472678,1.39684,,0.814154,,,-0.637685,,-0.97244,1.921358,,1.513703,0.942349,0.266662,-0.409555,-0.455251,-0.267179,-2.112979,-1.745011,-0.753951,0.736045,-0.394136,-0.835504,1.251009,-0.33446,,,-0.262358,-0.216814,-0.251791,-0.183686,-0.301571,-0.062928,-0.250865,0.023525,-0.1359,0.07287,0.20301,-0.05056,0.440378,7
1,32496019,1301,0,1,5.797275,-0.356928,1.140389,-0.342267,-0.235083,-2.594887,1.120063,1.246772,0.816427,-0.621949,11,7,76,-0.733625,-0.157744,-0.970641,,-0.426202,,-1.394881,-1.095173,-0.914965,0.404173,1.313849,1.457173,-1.106613,-1.251292,0.487547,1.170769,1.105058,-0.802217,-0.551586,0.34285,,,0.097886,0.223581,0.572655,-0.0558,-0.42355,,0.202044,,,-0.018392,,-1.858036,0.969326,1.016119,0.025655,0.396607,,0.825236,,,0.39417,,-1.532965,1.740133,,0.916378,0.525666,0.266662,-0.386195,-0.353637,-0.243203,-1.556582,-1.406703,-1.031861,-0.16505,-0.216324,-0.872458,-0.092202,-0.468728,,,-0.333664,-0.222176,-0.306501,-0.313284,-0.091463,-0.012453,-0.391784,1.248448,0.501747,1.153719,1.473158,0.47282,1.175669,7
2,32496020,1301,0,2,1.530522,0.117436,1.086291,0.145496,-0.653965,-2.838544,1.39474,1.086029,0.75445,-0.640794,81,2,59,-1.151384,0.391417,-0.455426,,0.45713,,-2.589605,-1.367003,0.165818,-0.140906,-0.641746,-0.858845,-0.267364,-1.141551,0.864914,0.401389,0.111483,-0.528512,-0.489764,-0.294506,,,0.730083,0.1292,1.140404,-0.26318,-0.545856,,1.261461,,,-0.136209,,-0.25403,1.605022,1.374636,5.794483,2.691928,,2.346244,,,0.080502,,-0.221826,2.661624,,8.443212,3.469574,0.266662,-0.431326,-0.194226,-0.348176,-2.158284,-1.227555,-0.763014,-0.286154,-0.532978,-0.783536,1.160471,-0.337264,,,2.17335,1.761504,0.409557,0.420885,-0.559386,-0.43486,-0.54861,0.811398,-0.110127,1.060903,1.101417,0.122875,2.584967,7
3,32496021,1301,0,3,1.400832,-0.599904,1.460633,-0.225356,-1.264623,-2.860769,0.837587,0.797671,0.567392,-0.520822,4,3,11,-1.291897,0.024346,-0.573059,,-0.138923,,-2.093665,-1.051413,0.270792,0.099605,-0.804024,-0.971089,0.986695,1.050429,-0.412731,-0.795515,-0.922918,-0.547456,-0.788562,0.096073,,,-0.788471,-0.381188,1.830147,-0.217963,-0.450954,,1.429685,,,-0.688811,,-0.909992,0.708367,1.422506,-0.205077,0.524089,,2.403196,,,-0.282071,,-0.617205,1.993816,,1.470458,1.039777,0.266662,-0.294487,0.058985,-0.099771,-2.152787,-1.895801,-0.979656,-0.077958,-0.635146,-0.858337,0.078776,-0.548794,,,5.577743,4.346803,9.644039,10.049657,0.432914,0.240288,0.03095,2.152129,0.074429,0.970721,2.728106,-0.058052,1.917751,7
4,32496022,1301,0,4,1.517937,-0.519632,1.492868,-0.246293,-0.243128,-2.675119,0.912146,0.758439,0.83121,-0.547962,15,1,9,-0.766486,3.078034,0.185537,,0.234047,,-1.37149,-1.406495,-0.457296,0.35833,-0.736065,-1.35742,-0.275346,-0.715867,-1.859262,-1.589148,-0.929442,-0.717562,-0.671324,0.268874,,,-0.31161,-0.598788,-2.388349,-0.094777,-1.165328,,1.545321,,,-2.204641,,-0.971156,0.489344,0.58017,-0.580584,-0.872218,,2.203508,,,-1.812004,,-0.780027,0.947135,,4.041653,1.927541,0.266662,-0.359124,-0.199684,-0.329408,-1.489783,-1.972752,-0.505794,2.086797,-0.000828,-0.884418,2.77767,0.279086,,,4.978641,4.493278,4.664584,3.073956,-0.483422,-0.320952,-2.589452,0.146033,-0.910223,-0.059057,0.960657,-0.687864,2.251244,7


In [4]:
# Preview the first five rows of the validation split.
valid.head(n=5)

Unnamed: 0,id,date_id,time_id,symbol_id,weight,feature_00,feature_01,feature_02,feature_03,feature_04,feature_05,feature_06,feature_07,feature_08,feature_09,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,feature_20,feature_21,feature_22,feature_23,feature_24,feature_25,feature_26,feature_27,feature_28,feature_29,feature_30,feature_31,feature_32,feature_33,feature_34,feature_35,feature_36,feature_37,feature_38,feature_39,feature_40,feature_41,feature_42,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49,feature_50,feature_51,feature_52,feature_53,feature_54,feature_55,feature_56,feature_57,feature_58,feature_59,feature_60,feature_61,feature_62,feature_63,feature_64,feature_65,feature_66,feature_67,feature_68,feature_69,feature_70,feature_71,feature_72,feature_73,feature_74,feature_75,feature_76,feature_77,feature_78,responder_0,responder_1,responder_2,responder_3,responder_4,responder_5,responder_6,responder_7,responder_8,partition_id
0,44227210,1621,0,0,3.279283,1.506968,1.886322,1.204109,1.2835,1.993611,-0.119326,-0.73207,-0.544571,-0.050266,11,7,76,-1.005612,0.828183,-0.425319,,-0.450916,,-1.063633,-2.203139,0.909179,-0.082804,1.278199,0.802219,2.626316,0.556909,0.755489,0.815104,0.597854,-0.826289,-0.825035,-0.064405,,,3.363106,3.52789,0.978193,0.026967,-0.072644,,0.330491,,,-0.145386,,-1.394958,1.678235,0.207068,-0.00339,0.152003,,-0.91844,,,-1.103922,,-2.575061,1.956364,,-0.44249,-0.108841,-1.910669,-0.223153,-0.264467,-0.302468,-1.625005,-1.629949,-1.011127,0.340772,-0.403617,-0.611603,0.623061,-0.563458,,,1.963911,1.804299,0.27889,0.319928,-0.092826,0.290921,1.601523,0.177724,0.607428,1.411111,0.370267,0.604243,1.676913,9
1,44227211,1621,0,1,3.519362,1.080509,1.353212,1.231068,1.981492,1.93726,-0.096809,-0.699783,-0.589898,-0.058192,11,7,76,-1.014564,2.291786,-0.073622,,-0.29735,,-2.090523,-2.497252,0.860195,0.155892,1.345179,1.086259,0.932155,0.231282,-1.421177,-0.026356,1.207051,-0.46718,-0.76292,0.220812,,,3.028423,2.522097,0.07735,0.117742,0.139471,,-0.503535,,,-1.189929,,-1.891631,1.385381,-1.947943,-0.839158,-0.546516,,0.680622,,,-1.937119,,-1.466941,0.903946,,-3.040524,-1.017308,-1.910669,-0.342794,-0.271864,-0.424236,-1.120066,-2.130798,-0.746923,1.996203,0.068269,-0.932582,0.889698,-0.322114,,,2.291338,2.456638,1.013229,0.991755,0.437928,-0.031521,0.858263,-0.190589,-0.326594,0.359683,-0.335365,-0.437546,0.155353,9
2,44227212,1621,0,2,2.865369,1.431129,1.906845,1.251957,2.056888,2.161489,-0.116489,-0.540817,-0.801407,-0.084442,81,2,59,-0.828069,0.687909,-0.33179,,0.04958,,-1.466411,-3.237322,0.283093,-0.181977,0.189309,0.539015,1.297813,-0.578895,0.730895,0.316338,-0.070835,-0.508568,-0.597638,-0.149536,,,3.13624,2.865167,1.934235,0.305,0.204532,,-1.827593,,,-2.056912,,-1.827173,0.574696,-1.161,-3.13798,-1.508759,,1.450387,,,-2.342139,,-1.139273,0.715682,,-3.815763,-2.25089,-1.910669,-0.40756,-0.341402,-0.335776,-1.740976,-1.810217,-0.957957,1.04179,-0.230586,-1.004894,0.206521,-0.401714,,,2.096583,2.095502,0.397592,0.378487,-0.532696,-0.530257,-0.172048,-0.926282,-2.444038,1.650112,-0.866584,-2.139081,1.666349,9
3,44227213,1621,0,3,2.417043,1.212606,1.871241,1.675459,0.935077,2.558348,-0.089674,-0.716721,-0.846766,-0.066391,4,3,11,-1.080315,0.235914,-0.64679,,-0.430875,,-1.222756,-2.841733,0.047951,-0.047061,0.173616,-0.957923,1.055894,-0.614808,-0.010027,0.053201,-0.292934,-1.106994,-1.192976,-0.028615,,,3.708019,3.312305,1.872287,0.210258,0.175181,,-1.395719,,,-1.029715,,-1.935152,1.434398,-0.517241,-1.366289,-0.906912,,0.052459,,,-2.000222,,-1.867128,0.991682,,-3.108264,-1.663144,-1.910669,-0.394961,-0.331666,-0.390299,-1.550434,-1.924489,-0.859927,0.30992,-0.346754,-0.66418,-0.044835,-0.869291,,,10.670677,10.121597,4.939338,4.218601,-0.490462,0.114698,0.36166,0.959814,1.619152,-1.343138,1.958878,1.64776,-3.527275,9
4,44227214,1621,0,5,2.715028,1.356251,2.251082,1.792402,1.619921,2.791031,-0.08917,-0.533984,-0.676304,-0.039363,2,10,171,-0.605385,0.014668,-0.817356,,-0.404603,,-1.362202,-2.237646,0.528122,-0.002447,0.985116,0.40872,0.961938,0.103223,-1.078316,0.140855,0.839164,-0.735289,-0.930754,-0.003219,,,2.650637,2.207669,2.430025,0.095843,-0.130812,,0.655777,,,1.54149,,-0.660573,1.659949,0.319181,-0.003787,0.369545,,0.979726,,,-0.483153,,-1.399185,0.861608,,0.10244,-0.343908,-1.910669,-0.306933,-0.41748,-0.347875,-1.336699,-2.254019,-0.894134,-0.070118,-0.670262,-1.328004,0.106209,-0.452772,,,3.592537,3.18273,1.094062,0.768024,2.592642,0.655411,3.16818,0.945571,0.226843,1.35388,0.362312,-0.10568,0.072814,9


In [5]:
class CONFIG:
    """Run-wide settings: random seed, target column and model input columns."""

    # Seed handed to the model factories.
    seed = 42
    # Column the regressor is trained to predict.
    target_col = "responder_6"
    # "weight" followed by feature_00 ... feature_78 (80 columns in total).
    cont_ftrs = ["weight"] + list(map("feature_{:02d}".format, range(79)))

    # The model consumes exactly the continuous-feature list (same list object).
    feature_cols = cont_ftrs

In [6]:
# Slice each split into model inputs, target and per-row sample weights.
# Note: "weight" is both the first entry of CONFIG.feature_cols and the
# sample-weight column, so it is used in both roles.
X_train, y_train, w_train = (
    train[CONFIG.feature_cols],
    train[CONFIG.target_col],
    train["weight"],
)
X_valid, y_valid, w_valid = (
    valid[CONFIG.feature_cols],
    valid[CONFIG.target_col],
    valid["weight"],
)

# Show the shape of every piece as a sanity check.
tuple(
    piece.shape
    for piece in (X_train, y_train, w_train, X_valid, y_valid, w_valid)
)

((11731192, 80),
 (11731192,),
 (11731192,),
 (2900128, 80),
 (2900128,),
 (2900128,))

In [8]:
# Preview the first five rows of the training feature matrix.
X_train.head(n=5)

Unnamed: 0,weight,feature_00,feature_01,feature_02,feature_03,feature_04,feature_05,feature_06,feature_07,feature_08,feature_09,feature_10,feature_11,feature_12,feature_13,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,feature_20,feature_21,feature_22,feature_23,feature_24,feature_25,feature_26,feature_27,feature_28,feature_29,feature_30,feature_31,feature_32,feature_33,feature_34,feature_35,feature_36,feature_37,feature_38,feature_39,feature_40,feature_41,feature_42,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49,feature_50,feature_51,feature_52,feature_53,feature_54,feature_55,feature_56,feature_57,feature_58,feature_59,feature_60,feature_61,feature_62,feature_63,feature_64,feature_65,feature_66,feature_67,feature_68,feature_69,feature_70,feature_71,feature_72,feature_73,feature_74,feature_75,feature_76,feature_77,feature_78
0,4.534279,-0.626509,1.168237,0.841137,-0.164749,-2.617933,0.747355,0.742167,0.70665,-0.529501,11,7,76,-0.830131,1.07705,-0.343497,,-0.336991,,-1.35938,-1.810651,0.278623,0.10855,1.462774,1.040918,0.357053,0.191162,1.282461,0.990893,0.054168,-0.439425,-0.690112,0.077168,,,0.267566,0.544378,0.806811,-0.158676,-0.521487,,0.807561,,,-0.636782,,-1.488725,1.106105,0.710943,1.472678,1.39684,,0.814154,,,-0.637685,,-0.97244,1.921358,,1.513703,0.942349,0.266662,-0.409555,-0.455251,-0.267179,-2.112979,-1.745011,-0.753951,0.736045,-0.394136,-0.835504,1.251009,-0.33446,,,-0.262358,-0.216814,-0.251791,-0.183686
1,5.797275,-0.356928,1.140389,-0.342267,-0.235083,-2.594887,1.120063,1.246772,0.816427,-0.621949,11,7,76,-0.733625,-0.157744,-0.970641,,-0.426202,,-1.394881,-1.095173,-0.914965,0.404173,1.313849,1.457173,-1.106613,-1.251292,0.487547,1.170769,1.105058,-0.802217,-0.551586,0.34285,,,0.097886,0.223581,0.572655,-0.0558,-0.42355,,0.202044,,,-0.018392,,-1.858036,0.969326,1.016119,0.025655,0.396607,,0.825236,,,0.39417,,-1.532965,1.740133,,0.916378,0.525666,0.266662,-0.386195,-0.353637,-0.243203,-1.556582,-1.406703,-1.031861,-0.16505,-0.216324,-0.872458,-0.092202,-0.468728,,,-0.333664,-0.222176,-0.306501,-0.313284
2,1.530522,0.117436,1.086291,0.145496,-0.653965,-2.838544,1.39474,1.086029,0.75445,-0.640794,81,2,59,-1.151384,0.391417,-0.455426,,0.45713,,-2.589605,-1.367003,0.165818,-0.140906,-0.641746,-0.858845,-0.267364,-1.141551,0.864914,0.401389,0.111483,-0.528512,-0.489764,-0.294506,,,0.730083,0.1292,1.140404,-0.26318,-0.545856,,1.261461,,,-0.136209,,-0.25403,1.605022,1.374636,5.794483,2.691928,,2.346244,,,0.080502,,-0.221826,2.661624,,8.443212,3.469574,0.266662,-0.431326,-0.194226,-0.348176,-2.158284,-1.227555,-0.763014,-0.286154,-0.532978,-0.783536,1.160471,-0.337264,,,2.17335,1.761504,0.409557,0.420885
3,1.400832,-0.599904,1.460633,-0.225356,-1.264623,-2.860769,0.837587,0.797671,0.567392,-0.520822,4,3,11,-1.291897,0.024346,-0.573059,,-0.138923,,-2.093665,-1.051413,0.270792,0.099605,-0.804024,-0.971089,0.986695,1.050429,-0.412731,-0.795515,-0.922918,-0.547456,-0.788562,0.096073,,,-0.788471,-0.381188,1.830147,-0.217963,-0.450954,,1.429685,,,-0.688811,,-0.909992,0.708367,1.422506,-0.205077,0.524089,,2.403196,,,-0.282071,,-0.617205,1.993816,,1.470458,1.039777,0.266662,-0.294487,0.058985,-0.099771,-2.152787,-1.895801,-0.979656,-0.077958,-0.635146,-0.858337,0.078776,-0.548794,,,5.577743,4.346803,9.644039,10.049657
4,1.517937,-0.519632,1.492868,-0.246293,-0.243128,-2.675119,0.912146,0.758439,0.83121,-0.547962,15,1,9,-0.766486,3.078034,0.185537,,0.234047,,-1.37149,-1.406495,-0.457296,0.35833,-0.736065,-1.35742,-0.275346,-0.715867,-1.859262,-1.589148,-0.929442,-0.717562,-0.671324,0.268874,,,-0.31161,-0.598788,-2.388349,-0.094777,-1.165328,,1.545321,,,-2.204641,,-0.971156,0.489344,0.58017,-0.580584,-0.872218,,2.203508,,,-1.812004,,-0.780027,0.947135,,4.041653,1.927541,0.266662,-0.359124,-0.199684,-0.329408,-1.489783,-1.972752,-0.505794,2.086797,-0.000828,-0.884418,2.77767,0.279086,,,4.978641,4.493278,4.664584,3.073956


In [9]:
# Preview the first five training target values.
y_train.head(n=5)

0    0.203010
1    1.473158
2    1.101417
3    2.728106
4    0.960657
Name: responder_6, dtype: float32

In [11]:
# Support Vector Machine Model
def SVM_model(seed):
    """Build an unfitted RBF-kernel support vector regressor.

    Args:
        seed: Accepted so the signature matches the other model factories
            in this notebook; it is not passed to the estimator.

    Returns:
        An ``sklearn.svm.SVR`` instance created with the fixed parameters
        below. The estimator is not fitted.
    """
    # SVR is not imported by the notebook's import cell, so calling this
    # factory used to raise NameError. Import it locally to keep the
    # function self-contained.
    from sklearn.svm import SVR

    # SVM parameters
    SVM_Params = {
        'gamma': 1e-7,
        'C': 1e5,
        'kernel': 'rbf',
    }

    SVM_Model = SVR(**SVM_Params)
    return SVM_Model

# Random Forest Model
def RF_model(seed):
    """Build an unfitted random forest regressor.

    Args:
        seed: Value forwarded to the estimator as ``random_state``.

    Returns:
        An ``sklearn.ensemble.RandomForestRegressor`` instance created with
        the fixed parameters below. The estimator is not fitted.
    """
    # RandomForestRegressor is not imported by the notebook's import cell
    # (only VotingRegressor is), so calling this factory used to raise
    # NameError. Import it locally to keep the function self-contained.
    from sklearn.ensemble import RandomForestRegressor

    # Random Forest parameters
    RF_Params = {
        'n_estimators': 100,
        'max_depth': 20,
        'random_state': seed
    }

    RF_Model = RandomForestRegressor(**RF_Params)
    return RF_Model


# XGBoost Model
def XGB_model(seed):
    """Create an unfitted XGBoost regressor with this notebook's settings.

    Args:
        seed: Value forwarded to the estimator as ``random_state``.

    Returns:
        An ``XGBRegressor`` built from the fixed hyper-parameters below.
        The estimator is not fitted.
    """
    return XGBRegressor(
        learning_rate=0.05,
        max_depth=6,
        n_estimators=200,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=1,
        reg_lambda=5,
        random_state=seed,
        tree_method='hist',
    )

In [12]:
%%time
model = XGB_model(CONFIG.seed)
model.fit(X_train, y_train, sample_weight=w_train)

CPU times: user 8min 37s, sys: 16.2 s, total: 8min 53s
Wall time: 1min 35s


In [13]:
# Weighted R^2 of the fitted model on the data it was trained on.
y_pred_train = model.predict(X_train)
train_score = r2_score(
    y_true=y_train,
    y_pred=y_pred_train,
    sample_weight=w_train,
)
train_score

0.044039427598210557

In [14]:
# Weighted R^2 of the fitted model on the held-out validation split.
y_pred_valid = model.predict(X_valid)
valid_score = r2_score(
    y_true=y_valid,
    y_pred=y_pred_valid,
    sample_weight=w_valid,
)
valid_score

0.0056970421717220665

In [15]:
# Bundle the fitted model for later reuse. The "preprocessor" entry is
# currently disabled, so only the model is stored.
result = dict(model=model)

# Serialise the bundle next to the notebook.
with open("result_null_nolags.pkl", "wb") as fp:
    pickle.dump(result, fp)

In [16]:
# y_means = { symbol_id : -1 for symbol_id in range(39) }
# for symbol_id, gdf in train[["symbol_id", CONFIG.target_col]].groupby("symbol_id"):
#     y_mean = gdf[ CONFIG.target_col ].mean()
#     y_means[symbol_id] = y_mean
#     print(f"symbol_id = {symbol_id}, y_means = {y_mean:.5f}")

In [17]:
# cv_detail = { symbol_id : 0 for symbol_id in range(39) }
# for symbol_id, gdf in valid.groupby("symbol_id"):
#     X_valid = gdf[ CONFIG.feature_cols ]
#     y_valid = gdf[ CONFIG.target_col ]
#     w_valid = gdf[ "weight" ]
#     y_pred_valid = model.predict(X_valid)
#     score = r2_score(y_valid, y_pred_valid, sample_weight=w_valid )
#     cv_detail[symbol_id] = score
    
#     print(f"symbol_id = {symbol_id}, score = {score:.5f}")

In [18]:
# sids = list(cv_detail.keys())
# plt.bar(sids, [cv_detail[sid] for sid in sids])
# plt.grid()
# plt.xlabel("symbol_id")
# plt.ylabel("CV score")
# plt.show()