In [1]:
import os
import gc
from joblib import dump, load
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb

# define metric

In [2]:
def amex_metric(y_true, y_pred):
    """Return 0.5 * (normalized weighted Gini + weighted top-4% capture rate).

    Rows whose label equals 0 carry weight 20; every other row carries weight 1.

    Args:
        y_true: 1-D sequence of labels (0 marks a negative row).
        y_pred: 1-D sequence of scores, same length as ``y_true``; rows are
            ranked by descending score.

    Returns:
        The mean of (a) the Gini of the score ranking divided by the Gini of
        the label ranking and (b) the share of all labels found in the
        highest-scored rows whose cumulative weight stays within 4% of the
        total weight.
    """

    def ranked_rows(sort_col):
        # Stack into (label, score) rows, then order by the chosen column, descending.
        pairs = np.transpose(np.array([y_true, y_pred]))
        descending = pairs[:, sort_col].argsort()[::-1]
        return pairs[descending]

    def row_weights(rows):
        return np.where(rows[:, 0] == 0, 20, 1)

    # Capture rate inside the top 4% (by cumulative weight) of the score ranking.
    by_score = ranked_rows(1)
    w = row_weights(by_score)
    budget = int(0.04 * np.sum(w))
    captured = by_score[np.cumsum(w) <= budget]
    top_four = np.sum(captured[:, 0]) / np.sum(by_score[:, 0])

    # Weighted Gini for the score ranking (key 1) and the label ranking (key 0).
    gini = {}
    for sort_col in (1, 0):
        rows = ranked_rows(sort_col)
        w = row_weights(rows)
        random_curve = np.cumsum(w / np.sum(w))
        weighted_pos = rows[:, 0] * w
        lorentz = np.cumsum(weighted_pos) / np.sum(weighted_pos)
        gini[sort_col] = np.sum((lorentz - random_curve) * w)

    return 0.5 * (gini[1] / gini[0] + top_four)

# define config

In [3]:
# Selected feature names, as stored by an earlier step.
# joblib.load unpickles the file, so only load files you trust.
features = load("selected_features.pkl")

# Name of the label column.
target = "target"

# Base names of the categorical variables.
cat_features_base = [
    "B_30", "B_38",
    "D_114", "D_116", "D_117", "D_120", "D_126",
    "D_63", "D_64", "D_66", "D_68",
]

# Keep the "<base>_last" variant of each categorical variable, but only when
# it is among the selected features.
cat_features = [
    name
    for name in (base + "_last" for base in cat_features_base)
    if name in features
]

In [4]:
# Number of cross-validation folds; one saved model is loaded per fold.
n_folds = 5
# Seed for the fold split; also embedded in checkpoint and output paths.
seed = 42

In [5]:
# Folder holding this seed's per-fold checkpoints; the OOF parquet is written here too.
save_folder = os.path.join("../ckpt/lgbm_seed_{seed}".format(seed=seed))

# Out-of-fold (OOF) predictions first

In [None]:
# Training table: the cells below read the feature columns, the target and customer_ID from it.
train = pd.read_parquet(path="../input/train_full_features.parquet")

In [None]:
# Rebuild out-of-fold (OOF) predictions from the saved per-fold models.
# NOTE(review): these are genuine OOF predictions only if training used the
# same StratifiedKFold settings (n_splits, shuffle, random_state) on the same
# rows in the same order — confirm against the training notebook.
oof_predictions = np.zeros(len(train))

kfold = StratifiedKFold(
    n_splits=n_folds,
    shuffle=True,
    random_state=seed
)

for fold, (trn_ind, val_ind) in enumerate(kfold.split(train, train[target])):

    # `split` yields positional row indices, so rows must be taken with
    # `.iloc`. Label-based `.loc` would select the wrong rows (or raise)
    # whenever `train` does not carry a default 0..n-1 index.
    val_rows = train.iloc[val_ind]
    x_val, y_val = val_rows[features], val_rows[target]

    # joblib.load unpickles the checkpoint: only load files you trust.
    save_path = os.path.join(save_folder, "fold_{}/fold_{}.pkl".format(fold, fold))
    model = load(save_path)

    # `oof_predictions` is a plain positional array, so `val_ind` indexes it directly.
    oof_predictions[val_ind] = model.predict(x_val)

In [None]:
# Score the assembled out-of-fold predictions against the training labels.
score = amex_metric(y_true=train[target], y_pred=oof_predictions)
print("oof:", score)

In [None]:
# Persist the OOF predictions next to the checkpoints, keyed by customer.
oof_df = pd.DataFrame(
    data={
        "customer_ID": train["customer_ID"],
        "target": train[target],
        "prediction": oof_predictions,
    }
)
oof_filename = "lgbm_oof_{}.parquet".format(seed)
oof_df.to_parquet(os.path.join(save_folder, oof_filename))

# inference

In [12]:
# Load the test table, set the customer key aside, then keep only the model inputs.
test = pd.read_parquet(path="../input/test_full_features.parquet")
customer_ID = test.loc[:, "customer_ID"]
test = test.loc[:, features]

In [8]:
submission = pd.read_csv("../input/sample_submission.csv")


def hex_to_int(x):
    """Parse a hexadecimal string into a Python int."""
    return int(x, 16)


# Join key: the last 16 hex characters of customer_ID, parsed as an integer.
# Series.map replaces the deprecated DataFrame.applymap and reuses the helper
# instead of duplicating it inline.
submission["customer_ID_16"] = submission["customer_ID"].str[-16:].map(hex_to_int)

# 16 hex digits can exceed the int64 maximum; the cast below wraps such values
# to negative numbers.
# NOTE(review): presumably the test parquet's integer customer_ID was encoded
# the same way, so the keys line up at merge time — confirm against the
# feature-building code.
submission["customer_ID_16"] = submission["customer_ID_16"].astype("int64")

In [9]:
# Preview the first rows, including the new integer join key.
submission.head(n=5)

Unnamed: 0,customer_ID,prediction,customer_ID_16
0,00000469ba478561f23a92a868bd366de6f6527a684c9a...,0,8717704911770597815
1,00001bf2e77ff879fab36aa4fac689b9ba411dae63ae39...,0,4783907996972277493
2,0000210045da4f81e5f122c6bde5c2a617d03eef67f82c...,0,4616129756878093544
3,00003b41e58ede33b8daf61ab56d9952f17c9ad1c3976c...,0,-1916505587365783916
4,00004b22eaeeeb0ec976890c1d9bfc14fd9427e98c4ee9...,0,7583456031722841431


In [10]:
# Average the test predictions of all fold models: every fold adds 1/n_folds
# of its prediction to the running total.
preds = None

for fold in range(n_folds):

    print("inference seed {} fold {}".format(seed, fold))

    save_path = os.path.join(save_folder, "fold_{0}/fold_{0}.pkl".format(fold))
    model = load(save_path)

    fold_share = model.predict(test) / n_folds
    preds = fold_share if preds is None else preds + fold_share

inference seed 42 fold 0
inference seed 42 fold 1
inference seed 42 fold 2
inference seed 42 fold 3
inference seed 42 fold 4


In [13]:
# Pair each test customer key with its fold-averaged prediction. The column
# names are chosen to line up with (and not clash with) the submission table.
test_prediction = pd.DataFrame(
    data={
        "customer_ID_16": customer_ID,
        "prediction_orig": preds,
    }
)

In [14]:
# Display the key/prediction table as a visual check before merging.
test_prediction

Unnamed: 0,customer_ID_16,prediction_orig
0,-9223277493928322471,0.005936
1,-9223220269070810982,0.001885
2,-9223219380479694318,0.004992
3,-9223202973368451495,0.007666
4,-9223190037945288673,0.001667
...,...,...
924616,9223311419908670169,0.001988
924617,9223316227884056852,0.037868
924618,9223317482642190638,0.000609
924619,9223341949877516615,0.001561


In [15]:
# Attach the predictions to the sample submission via the integer key.
# A plain inner merge silently drops unmatched rows and duplicates rows on
# repeated keys; both would produce an invalid submission, so fail loudly.
n_submission_rows = len(submission)
submission = submission.merge(
    test_prediction,
    on="customer_ID_16",
    how="inner",
    validate="one_to_one",  # raises MergeError if a key repeats on either side
)
if len(submission) != n_submission_rows:
    raise ValueError(
        "merge kept {} of {} submission rows; some customer keys have no prediction".format(
            len(submission), n_submission_rows
        )
    )

In [16]:
# Drop the join key and the placeholder prediction, then give the model
# output the column name "prediction".
submission = (
    submission
    .drop(columns=["customer_ID_16", "prediction"])
    .rename(columns={"prediction_orig": "prediction"})
)

In [17]:
# Preview the first rows of the finished submission table.
submission.head(n=5)

Unnamed: 0,customer_ID,prediction
0,00000469ba478561f23a92a868bd366de6f6527a684c9a...,0.017304
1,00001bf2e77ff879fab36aa4fac689b9ba411dae63ae39...,0.000377
2,0000210045da4f81e5f122c6bde5c2a617d03eef67f82c...,0.041863
3,00003b41e58ede33b8daf61ab56d9952f17c9ad1c3976c...,0.167246
4,00004b22eaeeeb0ec976890c1d9bfc14fd9427e98c4ee9...,0.884664


In [18]:
# Derive the file name from `seed`, like the other artifact paths, so a run
# with a different seed does not write a file labelled 42. The name is
# unchanged for seed = 42.
submission.to_csv("submission_lgbm_{}.csv".format(seed), index=False)