In [1]:
import gc
import os
import json
import joblib
import random
import requests
import itertools

import scipy as sp
import numpy as np
import pandas as pd
from tqdm import tqdm
import lightgbm as lgb

pd.set_option('display.width', 1000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.options.display.max_info_columns = 100000

import warnings; warnings.filterwarnings('ignore')

from itertools import combinations
from sklearn.model_selection import StratifiedKFold, train_test_split

# 学習と推論

## 準備

In [2]:
# Load the training data from a pickle file into `train`.
# NOTE: pd.read_pickle can execute arbitrary code while unpickling, so only load files
# from a trusted source.
# Alternative CSV source, kept for reference:
# train = pd.read_csv('/content/drive/MyDrive/competition/Amex0824/data/train_all.csv')
train = pd.read_pickle("/content/drive/MyDrive/competition/Amex0824/data/df_train_all.pkl")

In [3]:
# test = pd.read_pickle("/content/drive/MyDrive/competition/Amex0824/data/df_test_all.pkl")

In [4]:
# Read the label file and attach its columns to the training table, joining on customer_ID.
train_labels = pd.read_csv("/content/drive/MyDrive/competition/Amex0824/data/train_labels.csv")

# Guard the merge so re-running this cell is a no-op. Merging a second time would make
# pandas suffix the duplicated label columns (_x / _y), and any later lookup of a label
# column by its plain name would then fail.
label_columns = [col for col in train_labels.columns if col != "customer_ID"]
already_merged = bool(label_columns) and set(label_columns).issubset(train.columns)
if not already_merged:
    train = pd.merge(train, train_labels, on="customer_ID")

In [5]:
class CFG:
    """Notebook-wide configuration constants."""

    # Seed passed to seed_everything, the fold splitter and the LightGBM params.
    seed = 42
    # Number of stratified cross-validation folds.
    n_folds = 5
    # Name of the label column in the training table.
    target = 'target'
    # Input directory; not referenced by the cells visible in this notebook.
    input_dir = ''

In [6]:
def seed_everything(seed):
    """Apply one seed to Python's `random`, NumPy's global RNG and PYTHONHASHSEED.

    Args:
        seed: Seed value handed to `random.seed` and `np.random.seed`, and written
            (as a string) to the PYTHONHASHSEED environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)

In [7]:
def amex_metric(y_true, y_pred):
    """Compute the competition metric: mean of normalized weighted Gini and top-4% capture.

    Rows with label 0 carry weight 20 and all other rows weight 1.

    Args:
        y_true: 1-D sequence of ground-truth labels.
        y_pred: 1-D sequence of predicted scores, aligned with `y_true`.

    Returns:
        0.5 * (G + D), where G is the weighted Gini of the score ordering divided by
        the weighted Gini of the label ordering, and D is the share of all positives
        found among the highest-scored rows whose cumulative weight stays within 4%
        of the total weight.
    """
    # Column 0 holds the label, column 1 the predicted score.
    stacked = np.array([y_true, y_pred]).T

    # --- D: capture rate inside the top 4% (by cumulative weight) of the score ranking ---
    by_score = stacked[stacked[:, 1].argsort()[::-1]]
    row_weights = np.where(by_score[:, 0] == 0, 20, 1)
    weight_budget = int(0.04 * np.sum(row_weights))
    captured = by_score[np.cumsum(row_weights) <= weight_budget]
    top_four = np.sum(captured[:, 0]) / np.sum(by_score[:, 0])

    def _weighted_gini(sort_col):
        # Weighted Gini of the rows ordered (descending) by the given column.
        ordered = stacked[stacked[:, sort_col].argsort()[::-1]]
        w = np.where(ordered[:, 0] == 0, 20, 1)
        random_curve = np.cumsum(w / np.sum(w))
        positives_total = np.sum(ordered[:, 0] * w)
        positives_running = np.cumsum(ordered[:, 0] * w)
        lorentz_curve = positives_running / positives_total
        return np.sum((lorentz_curve - random_curve) * w)

    # Gini under the model's ranking, normalized by the Gini of the label ranking.
    gini_of_scores = _weighted_gini(1)
    gini_of_labels = _weighted_gini(0)
    return 0.5 * (gini_of_scores / gini_of_labels + top_four)

In [8]:
def amex_metric_np(preds, target):
    """Vectorized competition metric using a closed-form maximum Gini.

    Note the argument order (predictions first) is the reverse of `amex_metric`.

    Args:
        preds: 1-D array-like of predicted scores.
        target: 1-D array-like of labels, aligned with `preds`. The weighting
            `20 - 19 * target` gives weight 20 to label 0 and weight 1 to label 1.

    Returns:
        0.5 * (g + d), where g is the weighted Gini of the score ordering divided by
        the closed-form `gini_max`, and d is the share of positives among the
        highest-scored rows whose cumulative normalized weight is at most 0.04.
    """
    # Coerce to ndarrays so the positional fancy-indexing below also works for lists
    # and for pandas Series whose index is not 0..n-1 (e.g. a fold slice), where
    # `series[indices]` would look rows up by label instead of by position.
    preds = np.asarray(preds)
    target = np.asarray(target)

    # Rank rows by descending score.
    indices = np.argsort(preds)[::-1]
    preds, target = preds[indices], target[indices]

    weight = 20.0 - target * 19.0
    cum_norm_weight = (weight / weight.sum()).cumsum()

    # d: positives captured within the top 4% of cumulative weight.
    four_pct_mask = cum_norm_weight <= 0.04
    d = np.sum(target[four_pct_mask]) / np.sum(target)

    # g: weighted Gini of the ranking, normalized by its closed-form maximum.
    weighted_target = target * weight
    lorentz = (weighted_target / weighted_target.sum()).cumsum()
    gini = ((lorentz - cum_norm_weight) * weight).sum()
    n_pos = np.sum(target)
    n_neg = target.shape[0] - n_pos
    gini_max = 10 * n_neg * (n_pos + 20 * n_neg - 19) / (n_pos + 20 * n_neg)
    g = gini / gini_max
    return 0.5 * (g + d)

In [9]:
def lgb_amex_metric(y_pred, y_true):
    """Adapter exposing `amex_metric` as a LightGBM custom evaluation function.

    Args:
        y_pred: Predicted scores handed over by LightGBM.
        y_true: Object exposing `get_label()` that returns the ground-truth labels.

    Returns:
        Tuple `('amex_metric', value, True)`; in LightGBM's feval convention the
        trailing flag marks the metric as higher-is-better.
    """
    labels = y_true.get_label()
    metric_value = amex_metric(labels, y_pred)
    return 'amex_metric', metric_value, True

## モデル

In [10]:
def train_and_evaluate(train):
    """Train one LightGBM model per stratified fold and print validation scores.

    Args:
        train: DataFrame containing the feature columns plus 'customer_ID' and the
            label column named by `CFG.target`.

    Returns:
        List of trained models, one per fold, in fold order.
    """
    # Every column except the ID and the label is used as a model input.
    features = [col for col in train.columns if col not in ['customer_ID', CFG.target]]
    params = {
        'objective': 'binary',
        'metric': "binary_logloss",
        'boosting': 'dart',
        'seed': CFG.seed,
        'num_leaves': 100,
        'learning_rate': 0.01,
        'feature_fraction': 0.20,
        'bagging_freq': 10,
        'bagging_fraction': 0.50,
        'lambda_l2': 2,
        'n_jobs': -1,
        'min_data_in_leaf': 40,
        'force_col_wise' : True
        }

    # Select the feature/label views once instead of re-slicing the frame in every fold.
    x_all = train[features]
    y_all = train[CFG.target]

    models = []
    fold_scores = []
    kfold = StratifiedKFold(n_splits = CFG.n_folds, shuffle = True, random_state = CFG.seed)
    for fold, (trn_ind, val_ind) in enumerate(kfold.split(train, y_all)):
        print('-'*50)
        print(f'Training fold {fold} with {len(features)} features...')
        x_train, x_val = x_all.iloc[trn_ind], x_all.iloc[val_ind]
        y_train, y_val = y_all.iloc[trn_ind], y_all.iloc[val_ind]
        lgb_train = lgb.Dataset(x_train, y_train)
        lgb_valid = lgb.Dataset(x_val, y_val)
        # NOTE: LightGBM documents that early stopping is not available in dart mode,
        # so with 'boosting': 'dart' the early_stopping_rounds setting is not expected
        # to cut training short.
        # NOTE(review): the early_stopping_rounds / verbose_eval keyword arguments were
        # replaced by callbacks in newer LightGBM releases - confirm the pinned version.
        model = lgb.train(
            params = params,
            train_set = lgb_train,
            num_boost_round = 10500,
            valid_sets = [lgb_train, lgb_valid],
            early_stopping_rounds = 100,
            verbose_eval = 100,
            feval = lgb_amex_metric,
            )
        # Predictions on the validation data of this fold.
        val_pred = model.predict(x_val)
        score = amex_metric(y_val, val_pred)
        print(f'Our fold {fold} CV score is {score}')

        fold_scores.append(score)
        models.append(model)
        print("*" * 100)

    # Mean validation score across folds; divide by the real fold count rather than a
    # hard-coded 5 so the average stays correct when CFG.n_folds changes.
    scores = sum(fold_scores) / len(fold_scores)
    print(f"All fold average score : {scores:.4f}")
    return models

In [None]:
# Seed the random number generators, then train one model per fold.
# `models` holds the list returned by train_and_evaluate.
seed_everything(CFG.seed)
models = train_and_evaluate(train)

--------------------------------------------------
Training fold 0 with 35 features...
[100]	training's binary_logloss: 0.522018	training's amex_metric: 0.568333	valid_1's binary_logloss: 0.522169	valid_1's amex_metric: 0.565896
[200]	training's binary_logloss: 0.514329	training's amex_metric: 0.56962	valid_1's binary_logloss: 0.514522	valid_1's amex_metric: 0.567111
[300]	training's binary_logloss: 0.476866	training's amex_metric: 0.573754	valid_1's binary_logloss: 0.477156	valid_1's amex_metric: 0.570999
[400]	training's binary_logloss: 0.459049	training's amex_metric: 0.578937	valid_1's binary_logloss: 0.459386	valid_1's amex_metric: 0.576087
[500]	training's binary_logloss: 0.437665	training's amex_metric: 0.580922	valid_1's binary_logloss: 0.438065	valid_1's amex_metric: 0.578053
[600]	training's binary_logloss: 0.424136	training's amex_metric: 0.582139	valid_1's binary_logloss: 0.424587	valid_1's amex_metric: 0.579192
[700]	training's binary_logloss: 0.399012	training's amex_metr

In [None]:
# Drop the reference to the training table and run a garbage-collection pass;
# presumably done to free memory now that training has finished.
del train
gc.collect()

In [None]:
# Load the test table.
test = pd.read_csv('/content/drive/MyDrive/competition/Amex0824/data/test_all.csv')

# Feed the models only the columns they were trained on, in training order. The raw
# test frame also carries 'customer_ID' (used for the output file below), which was
# excluded from the training features and must not be passed to predict().
feature_cols = models[0].feature_name()
test_features = test[feature_cols]

# Accumulator for the averaged prediction: one zero-initialised entry per test row.
preds = np.zeros(len(test))

# Predict with every fold model and average the results.
for model in models:
    pred = model.predict(test_features, num_iteration=model.best_iteration)
    preds += pred / len(models)

# Write one prediction per customer_ID.
test_df = pd.DataFrame({'customer_ID': test['customer_ID'], 'prediction': preds})
test_df.to_csv(f'test_lgbm_baseline_{CFG.n_folds}fold_seed{CFG.seed}.csv', index = False)