In [1]:
import os
import gc
from joblib import dump, load
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb

# load files

In [2]:
# Load the test feature table from parquet; the customer_ID column is later
# used as the join key against the submission's integer customer_ID_16.
test = pd.read_parquet("../input/test_full_features.parquet")

In [3]:
submission = pd.read_csv("../input/sample_submission.csv")


def hex_to_int(x):
    """Parse the hexadecimal string ``x`` into a Python int."""
    return int(x, 16)


# Build an integer join key from the last 16 hex characters of customer_ID.
# 16 hex digits span the full unsigned 64-bit range, so the parsed values are
# materialised as uint64 and then reinterpreted bit-for-bit as int64 (values
# above the int64 maximum become negative). This makes explicit the wrap-around
# that the former uint64 -> int64 ``astype`` performed implicitly, and it avoids
# ``DataFrame.applymap``, which is deprecated in recent pandas releases.
submission["customer_ID_16"] = (
    submission["customer_ID"]
    .str[-16:]
    .map(hex_to_int)
    .to_numpy(dtype="uint64")
    .view("int64")
)

In [4]:
# Preview the first rows of the test feature table.
test.head()

Unnamed: 0,customer_ID,B_30_count,B_30_last,B_30_nunique,B_38_count,B_38_last,B_38_nunique,D_114_count,D_114_last,D_114_nunique,...,D_136_drawup_duration,D_137_drawup_duration,D_138_drawup_duration,D_139_drawup_duration,D_140_drawup_duration,D_141_drawup_duration,D_142_drawup_duration,D_143_drawup_duration,D_144_drawup_duration,D_145_drawup_duration
0,-9223277493928322471,13,0,1,13,0,3,13,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,-9223220269070810982,13,0,1,13,1,1,13,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0
2,-9223219380479694318,13,0,1,13,0,1,13,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,-9223202973368451495,13,0,1,13,0,3,13,0,1,...,0.0,0.0,0.0,5.0,0.0,5.0,0.0,5.0,1.0,5.0
4,-9223190037945288673,13,0,1,13,1,1,13,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0


In [5]:
# Preview the sample submission together with the derived customer_ID_16 key.
submission.head()

Unnamed: 0,customer_ID,prediction,customer_ID_16
0,00000469ba478561f23a92a868bd366de6f6527a684c9a...,0,8717704911770597815
1,00001bf2e77ff879fab36aa4fac689b9ba411dae63ae39...,0,4783907996972277493
2,0000210045da4f81e5f122c6bde5c2a617d03eef67f82c...,0,4616129756878093544
3,00003b41e58ede33b8daf61ab56d9952f17c9ad1c3976c...,0,-1916505587365783916
4,00004b22eaeeeb0ec976890c1d9bfc14fd9427e98c4ee9...,0,7583456031722841431


# config

In [6]:
# Ensemble layout: how many fold models to average and which seed they belong to.
n_folds = 5
seed = 1024

# Bookkeeping list of non-feature columns; no cell shown here reads it.
exclude_features = [
    "customer_ID",
    "target",
    "number_of_observations",
]

# Feature names fed to the models. joblib's load unpickles the file, so it
# must come from a trusted source.
features = load("selected_features.pkl")

# Base names of the categorical raw columns.
cat_features_base = [
    "B_30",
    "B_38",
    "D_114",
    "D_116",
    "D_117",
    "D_120",
    "D_126",
    "D_63",
    "D_64",
    "D_66",
    "D_68",
]

# Every selected feature whose name contains one of the categorical base names
# (plain substring match; a feature is listed once per base it contains).
cat_features = [
    name
    for name in features
    for base in cat_features_base
    if base in name
]

# inference

In [7]:
def inference(df):
    """Return the fold-averaged XGBoost prediction for the rows of ``df``.

    Reads the module-level ``features``, ``n_folds`` and ``seed``. One booster
    per fold is loaded from ``../ckpt/xgb_seed_<seed>/xgb_<fold>_<seed>.xgb``
    and each booster's prediction contributes ``1 / n_folds`` to the result.

    Parameters
    ----------
    df
        Table supporting ``df[features]`` column selection (a pandas
        DataFrame in this notebook).

    Returns
    -------
    The accumulated output of ``Booster.predict``, or ``None`` when
    ``n_folds`` is 0.
    """
    preds = None

    # The feature matrix is the same for every fold, so build the DMatrix once
    # instead of re-slicing and re-converting the frame on each iteration.
    xgb_test = xgb.DMatrix(data=df[features])

    for fold in range(n_folds):

        print("inference seed {} fold {}".format(seed, fold))

        model = xgb.Booster()
        # The directory is derived from ``seed`` so it stays consistent with the
        # file name; for seed 1024 this is the same path as the former literal.
        model.load_model(
            "../ckpt/xgb_seed_{}/xgb_{}_{}.xgb".format(seed, fold, seed)
        )

        fold_preds = model.predict(xgb_test) / n_folds
        if preds is None:
            preds = fold_preds
        else:
            preds += fold_preds

        # Release the booster before loading the next one.
        del model
        _ = gc.collect()

    del xgb_test
    _ = gc.collect()

    return preds

In [8]:
# Fold-averaged predictions for every row of the test table.
preds = inference(test)

inference seed 1024 fold 0
inference seed 1024 fold 1
inference seed 1024 fold 2
inference seed 1024 fold 3
inference seed 1024 fold 4


In [9]:
# Pair each test row's integer customer key with its averaged prediction.
test_prediction = pd.DataFrame(
    {
        "customer_ID_16": test["customer_ID"],
        "prediction_orig": preds,
    }
)

In [10]:
# Display the key/prediction table.
test_prediction

Unnamed: 0,customer_ID_16,prediction_orig
0,-9223277493928322471,0.007082
1,-9223220269070810982,0.002114
2,-9223219380479694318,0.008842
3,-9223202973368451495,0.009784
4,-9223190037945288673,0.001553
...,...,...
924616,9223311419908670169,0.002216
924617,9223316227884056852,0.030073
924618,9223317482642190638,0.000614
924619,9223341949877516615,0.001060


In [11]:
# Attach predictions by the integer customer key. A left join keeps every
# submission row in its original order, ``validate`` raises if the key is
# duplicated on either side, and the indicator column exposes rows that found
# no prediction, so nothing is dropped or duplicated silently.
_merged = submission.merge(
    test_prediction,
    on="customer_ID_16",
    how="left",
    validate="one_to_one",
    indicator=True,
)
_unmatched = int((_merged["_merge"] != "both").sum())
if _unmatched:
    raise ValueError(
        "{} submission rows have no matching prediction".format(_unmatched)
    )
submission = _merged.drop(columns="_merge")

In [12]:
# Drop the join key and the placeholder column, then expose the model output
# under the column name "prediction".
submission = (
    submission
    .drop(columns=["customer_ID_16", "prediction"])
    .rename(columns={"prediction_orig": "prediction"})
)

In [13]:
# Preview the final submission table (customer_ID and prediction).
submission.head()

Unnamed: 0,customer_ID,prediction
0,00000469ba478561f23a92a868bd366de6f6527a684c9a...,0.021937
1,00001bf2e77ff879fab36aa4fac689b9ba411dae63ae39...,0.000498
2,0000210045da4f81e5f122c6bde5c2a617d03eef67f82c...,0.031003
3,00003b41e58ede33b8daf61ab56d9952f17c9ad1c3976c...,0.19296
4,00004b22eaeeeb0ec976890c1d9bfc14fd9427e98c4ee9...,0.87811


In [14]:
# Write the submission without the index column.
# NOTE(review): the file name says "2014" while the seed and checkpoint
# directory used above are 1024 -- possibly a typo; confirm before renaming,
# since other work may read this exact path.
submission.to_csv("submission_xgb_2014.csv", index=False)