In [14]:
import os
import gc
from joblib import dump, load
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb

# load files

In [15]:
# Read the test-set feature table from parquet; the feature list and the
# inference call further down are both derived from this frame.
test = pd.read_parquet("../input/test_base_shifted_knn.parquet")

In [34]:
submission = pd.read_csv("../input/sample_submission.csv")


def hex_to_int(x):
    """Parse a hexadecimal string into a Python int."""
    return int(x, 16)


# Build an int64 join key from the last 16 hex characters of customer_ID.
# Sixteen hex digits cover the full unsigned 64-bit range, so values >= 2**63
# do not fit into int64 directly. Parse into uint64 and reinterpret the bits
# as int64 (two's complement) so the wrap-around to negative keys is explicit
# rather than dependent on pandas dtype inference plus a wrapping astype.
submission["customer_ID_16"] = np.array(
    [hex_to_int(suffix) for suffix in submission["customer_ID"].str[-16:]],
    dtype=np.uint64,
).view(np.int64)

In [17]:
# Preview the first rows of the test feature table.
test.head()

Unnamed: 0,customer_ID,P_2_nanstd,P_2_nanmin,P_2_nanmax,P_2_last,D_39_nanstd,D_39_nanmin,D_39_nanmax,D_39_last,B_1_nanstd,...,B_5_last_knn_500_mean,R_1_last_knn_500_mean,D_48_last_knn_500_mean,B_1_last_knn_500_mean,S_3_last_knn_500_mean,D_43_last_knn_500_mean,B_11_last_knn_500_mean,D_44_last_knn_500_mean,B_2_last_knn_500_mean,target_knn_500_mean
458913,-9223277493928322471,0.01266,0.720849,0.752822,0.743628,0.27735,0,1,0,0.010191,...,0.068747,0.013053,0.246778,0.077179,0.178769,0.114419,0.068687,0.221088,0.752562,0.035484
458914,-9223220269070810982,0.048047,0.798196,0.902296,0.802795,0.0,0,0,0,0.013031,...,0.121367,0.009432,0.200077,0.051752,0.165336,0.099288,0.044405,0.0875,0.804182,0.018293
458915,-9223219380479694318,0.016592,0.750023,0.810885,0.783128,0.599145,0,2,0,0.007154,...,0.107906,0.01371,0.216764,0.052946,0.17605,0.105777,0.046146,0.125806,0.795941,0.018405
458916,-9223202973368451495,0.028179,0.681882,0.771254,0.771254,2.213015,0,8,0,0.003199,...,0.109443,0.010873,0.245121,0.060688,0.170872,0.106883,0.053585,0.157407,0.777937,0.035821
458917,-9223190037945288673,0.025339,0.939252,1.007827,0.962337,7.434328,0,20,20,0.038536,...,0.086469,0.006536,0.080037,0.029174,0.147773,0.067908,0.021959,0.003115,0.859121,0.003077


In [35]:
# Preview the sample submission together with the derived customer_ID_16 key.
submission.head()

Unnamed: 0,customer_ID,prediction,customer_ID_16
0,00000469ba478561f23a92a868bd366de6f6527a684c9a...,0,8717704911770597815
1,00001bf2e77ff879fab36aa4fac689b9ba411dae63ae39...,0,4783907996972277493
2,0000210045da4f81e5f122c6bde5c2a617d03eef67f82c...,0,4616129756878093544
3,00003b41e58ede33b8daf61ab56d9952f17c9ad1c3976c...,0,-1916505587365783916
4,00004b22eaeeeb0ec976890c1d9bfc14fd9427e98c4ee9...,0,7583456031722841431


# config

In [19]:
# Number of per-fold checkpoints to average, and the seed used in their file names.
n_folds = 5
seed = 42

# Columns that must never be fed to the model.
exclude_features = ["customer_ID", "target", "number_of_observations"]

# Start from every remaining column of the test frame.
features = [col for col in test.columns if col not in exclude_features]

# Columns whose name contains "target" are collected separately so they can be
# appended after the filtered list below.
target_encoded_features = [col for col in features if "target" in col]

# Drop every D_66 / D_87 / knn column, then append the "target" columns at the end.
features = [
    col
    for col in features
    if not any(tag in col for tag in ("D_66", "D_87", "knn"))
] + target_encoded_features

# inference

In [20]:
def inference(df):
    """Average the predictions of the per-fold XGBoost boosters on ``df``.

    Reads the module-level ``features``, ``n_folds`` and ``seed``. For each
    fold a booster is loaded from ``../ckpt/base_knn/xgb_{fold}_{seed}.xgb``
    and its prediction, divided by ``n_folds``, is added to the running total.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column listed in ``features``.

    Returns
    -------
    The accumulated ``model.predict`` output (the mean over folds), or
    ``None`` when ``n_folds`` is 0.
    """
    # The feature matrix is the same for every fold, so build the DMatrix once
    # instead of re-slicing the frame and re-encoding it on each iteration.
    xgb_test = xgb.DMatrix(data=df[features])

    preds = None

    for fold in range(n_folds):

        print("inference seed {} fold {}".format(seed, fold))

        model = xgb.Booster()
        model.load_model("../ckpt/base_knn/xgb_{}_{}.xgb".format(fold, seed))

        # Dividing each fold by n_folds before summing yields the fold mean.
        fold_preds = model.predict(xgb_test) / n_folds
        if preds is None:
            preds = fold_preds
        else:
            preds += fold_preds

        # Release the booster before the next one is loaded.
        del model, fold_preds
        _ = gc.collect()

    del xgb_test
    _ = gc.collect()

    return preds

In [21]:
# Run the fold-averaged XGBoost inference over the full test frame.
preds = inference(test)

inference seed 42 fold 0
inference seed 42 fold 1
inference seed 42 fold 2
inference seed 42 fold 3
inference seed 42 fold 4


In [30]:
# Pair each test customer key with its averaged prediction. The key column is
# named customer_ID_16 so it lines up with the join key on the submission frame.
test_prediction = (
    test[["customer_ID"]]
    .rename(columns={"customer_ID": "customer_ID_16"})
    .assign(prediction_orig=preds)
)

In [31]:
# Display the per-customer predictions (pandas truncates the rendering of long frames).
test_prediction

Unnamed: 0,customer_ID_16,prediction_orig
458913,-9223277493928322471,0.010261
458914,-9223220269070810982,0.006230
458915,-9223219380479694318,0.011915
458916,-9223202973368451495,0.017023
458917,-9223190037945288673,0.003844
...,...,...
1383529,9223311419908670169,0.004994
1383530,9223316227884056852,0.033025
1383531,9223317482642190638,0.000866
1383532,9223341949877516615,0.003030


In [36]:
# Attach the model predictions to the submission rows via the int64 key.
# A left join keeps every submission row (the default inner join would
# silently drop customers that have no matching prediction), and
# validate="one_to_one" fails fast if a key is duplicated on either side.
submission = submission.merge(
    test_prediction, on="customer_ID_16", how="left", validate="one_to_one"
)
if submission["prediction_orig"].isna().any():
    raise ValueError("some submission rows have no prediction after the merge")

In [37]:
# Discard the join key and the original "prediction" column, then expose the
# model output under the column name "prediction".
submission = (
    submission
    .drop(columns=["customer_ID_16", "prediction"])
    .rename(columns={"prediction_orig": "prediction"})
)

In [38]:
# Preview the first rows of the final submission frame before writing it out.
submission.head()

Unnamed: 0,customer_ID,prediction
0,00000469ba478561f23a92a868bd366de6f6527a684c9a...,0.031913
1,00001bf2e77ff879fab36aa4fac689b9ba411dae63ae39...,0.000964
2,0000210045da4f81e5f122c6bde5c2a617d03eef67f82c...,0.083659
3,00003b41e58ede33b8daf61ab56d9952f17c9ad1c3976c...,0.252436
4,00004b22eaeeeb0ec976890c1d9bfc14fd9427e98c4ee9...,0.88221


In [39]:
# Write the final submission to CSV; index=False leaves the DataFrame index
# out of the file so it holds only the data columns.
submission.to_csv("submission_base_knn.csv", index=False)