In [1]:
import os
import gc
from joblib import dump, load
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb

# load files

In [2]:
# Read the test feature table from parquet.
test = pd.read_parquet("../input/test_base_normalized.parquet")

In [3]:
submission = pd.read_csv("../input/sample_submission.csv")


def hex_to_int(x):
    """Parse a hexadecimal string into a non-negative Python int."""
    return int(x, 16)


# The last 16 hex characters of customer_ID carry 64 bits. Parse them as an
# unsigned 64-bit value, then reinterpret the same bits as a signed int64
# (two's complement) so the key is comparable with int64 customer keys.
# Doing the reinterpretation explicitly avoids depending on how pandas infers
# the dtype of very large Python ints, and avoids the deprecated
# DataFrame.applymap.
submission["customer_ID_16"] = (
    submission["customer_ID"]
    .str[-16:]
    .map(hex_to_int)
    .to_numpy(dtype="uint64")
    .view("int64")
)

In [4]:
# Preview the first rows of the test feature table.
test.head()

Unnamed: 0_level_0,P_2_nanstd,P_2_nanmin,P_2_nanmax,P_2_last,D_39_nanstd,D_39_nanmin,D_39_nanmax,D_39_last,B_1_nanstd,B_1_nanmin,...,D_63_nunique,D_64_count,D_64_last,D_64_nunique,D_66_count,D_66_last,D_66_nunique,D_68_count,D_68_last,D_68_nunique
customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-9223277493928322471,0.051276,0.255823,0.385323,0.348086,0.030696,-0.569252,-0.458575,-0.569252,0.047913,-0.588713,...,1,13,2,1,0,1,0,13,1,2
-9223220269070810982,0.194608,0.569106,0.99075,0.587732,0.0,-0.569252,-0.569252,-0.569252,0.06127,-0.58898,...,1,13,2,1,0,1,0,13,5,1
-9223219380479694318,0.067203,0.373986,0.6205,0.508076,0.066312,-0.569252,-0.347897,-0.569252,0.033638,-0.597878,...,1,13,0,1,0,1,0,13,5,1
-9223202973368451495,0.114133,0.097991,0.459981,0.459981,0.244931,-0.569252,0.316168,-0.569252,0.01504,-0.596622,...,1,13,0,1,0,1,0,13,5,1
-9223190037945288673,0.102632,1.140431,1.418187,1.233938,0.822813,-0.569252,1.644298,1.644298,0.181185,-0.573914,...,1,13,0,2,0,1,0,13,2,2


In [5]:
# Preview the sample submission together with the derived integer key column.
submission.head()

Unnamed: 0,customer_ID,prediction,customer_ID_16
0,00000469ba478561f23a92a868bd366de6f6527a684c9a...,0,8717704911770597815
1,00001bf2e77ff879fab36aa4fac689b9ba411dae63ae39...,0,4783907996972277493
2,0000210045da4f81e5f122c6bde5c2a617d03eef67f82c...,0,4616129756878093544
3,00003b41e58ede33b8daf61ab56d9952f17c9ad1c3976c...,0,-1916505587365783916
4,00004b22eaeeeb0ec976890c1d9bfc14fd9427e98c4ee9...,0,7583456031722841431


# config

In [6]:
# Number of per-fold checkpoints to average, and the seed used in their file names.
n_folds, seed = 5, 42

# Columns excluded from the model inputs.
exclude_features = ["customer_ID", "target", "number_of_observations"]

# Every remaining column of the test table is used as a feature.
features = [name for name in test.columns if name not in exclude_features]

# inference

In [7]:
def inference(df):
    """Average the predictions of the saved per-fold XGBoost boosters on ``df``.

    One booster per fold is loaded from
    ``../ckpt/base_normalize/xgb_{fold}_{seed}.xgb``; each fold's prediction is
    divided by ``n_folds`` and accumulated, so the result is the mean over folds.

    Relies on the notebook-level names ``n_folds``, ``seed`` and ``features``.

    Parameters
    ----------
    df : object indexable with the ``features`` column list
        Table holding the model inputs (a pandas DataFrame in this notebook).

    Returns
    -------
    The fold-averaged output of ``Booster.predict``, or ``None`` when
    ``n_folds`` is 0.
    """
    # The feature matrix is the same for every fold, so build the DMatrix once
    # instead of re-slicing and re-converting the data on each iteration.
    xgb_test = xgb.DMatrix(data=df[features])

    preds = None

    for fold in range(n_folds):

        print("inference seed {} fold {}".format(seed, fold))

        model = xgb.Booster()
        model.load_model("../ckpt/base_normalize/xgb_{}_{}.xgb".format(fold, seed))

        fold_preds = model.predict(xgb_test) / n_folds
        if preds is None:
            preds = fold_preds
        else:
            preds += fold_preds

        # Release the booster before loading the next one.
        del model
        _ = gc.collect()

    del xgb_test
    _ = gc.collect()

    return preds

In [8]:
# Run the fold-averaged inference over the whole test table.
preds = inference(test)

inference seed 42 fold 0
inference seed 42 fold 1
inference seed 42 fold 2
inference seed 42 fold 3
inference seed 42 fold 4


In [10]:
# Pair every row key of `test` (its index) with the averaged prediction for that row.
test_prediction = pd.DataFrame(
    data={
        "customer_ID_16": test.index,
        "prediction_orig": preds,
    }
)

In [11]:
# Display the key/prediction table built from the test index and `preds`.
test_prediction

Unnamed: 0,customer_ID_16,prediction_orig
0,-9223277493928322471,0.021320
1,-9223220269070810982,0.012178
2,-9223219380479694318,0.023670
3,-9223202973368451495,0.072177
4,-9223190037945288673,0.009027
...,...,...
924616,9223311419908670169,0.010892
924617,9223316227884056852,0.046076
924618,9223317482642190638,0.002591
924619,9223341949877516615,0.006378


In [12]:
# Attach the predictions to the submission rows via the 64-bit integer key.
# how="left" keeps every submission row (an inner merge would silently drop
# rows without a matching prediction); validate="one_to_one" raises if the
# key is duplicated on either side, which would otherwise duplicate rows.
submission = submission.merge(
    test_prediction,
    on="customer_ID_16",
    how="left",
    validate="one_to_one",
)

# Fail loudly instead of writing a submission with missing predictions.
n_missing = int(submission["prediction_orig"].isna().sum())
assert n_missing == 0, "{} submission rows have no prediction after the merge".format(n_missing)

In [13]:
# Remove the join key and the placeholder prediction column, then give the
# merged prediction column the name "prediction".
submission = (
    submission
    .drop(columns=["customer_ID_16", "prediction"])
    .rename(columns={"prediction_orig": "prediction"})
)

In [14]:
# Preview the final submission table (customer_ID and prediction columns).
submission.head()

Unnamed: 0,customer_ID,prediction
0,00000469ba478561f23a92a868bd366de6f6527a684c9a...,0.077972
1,00001bf2e77ff879fab36aa4fac689b9ba411dae63ae39...,0.001503
2,0000210045da4f81e5f122c6bde5c2a617d03eef67f82c...,0.159563
3,00003b41e58ede33b8daf61ab56d9952f17c9ad1c3976c...,0.296909
4,00004b22eaeeeb0ec976890c1d9bfc14fd9427e98c4ee9...,0.913397


In [15]:
# Write the submission to CSV; index=False leaves the DataFrame index out of the file.
submission.to_csv("submission_base_normalized.csv", index=False)