In [4]:
# library import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
%matplotlib inline
import seaborn as sns
import dask
import dask.array as da
import dask.dataframe as dd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import KFold
from sklearn.utils import shuffle
import os
import re
import glob
import shutil
import gc
from pathlib import Path
from tqdm import tqdm
# showing module
from IPython.display import display

# output display option adjustment
# precision of floating point in numpy
np.set_printoptions(suppress=True, precision=4)

# precision of floating point in pandas
pd.options.display.float_format = '{:.4f}'.format

# display all columns in dataframe
pd.set_option("display.max_columns",None)

# default font size in graph
plt.rcParams["font.size"] = 14

# graph display
sns.set(rc={'figure.figsize':(12,5)});
plt.figure(figsize=(12,5));

# random seed
random_seed = 45

<Figure size 864x360 with 0 Axes>

## 機械学習とは
機械学習とは、ざっと言ってしまうとあるデータ X を入力として対応する予測値 y を取り出すような対応関係を作成することです。

例：タイタニック号で、乗客が生きるか死ぬかを予測する問題だと X は乗客の年齢, 性別, 船室のグレード… など乗客に紐づく情報のことを指します。通常、この情報のことを特徴量とよびます。

特徴量 X と 予測値 y が用意できれば学習用データ (X - y の関係がわかっているデータ) を元にして X をいれて y になるようにモデルを調整する。この調整の段階を学習とよびます。学習には様々なアルゴリズムがあるが、X, y を用意しなくてはならない部分は基本的に変わらないです。

# 今回のコンペに関して

コンペ自体のデータサイズが大きいのでデータハンドリングが僕たちに取っては肝になりそうです（この辺をよく知っている人教えてください！）

### データの読み込みに関して
* 形式を変えたデータセットをpandasで読み込む
  * [`feather`形式](https://www.kaggle.com/datasets/munumbutt/amexfeather) or [`Parquet`形式](https://www.kaggle.com/datasets/odins0n/amex-parquet)
* [`pyspark`](https://www.kaggle.com/code/rakkaalhazimi/export-large-dataset-to-spark) or [`dask`](https://docs.dask.org/en/latest/dataframe.html) で読み込む

### サイズが大きい場合の対処方法は以下のkaggle notebookを参考にしてみてください

[> How to Work with BIG Datasets on 16G RAM (+Dask)](https://www.kaggle.com/code/yuliagm/how-to-work-with-big-datasets-on-16g-ram-dask)

上記notebookの内容概要
* TIP 1 - 使用していない変数を [`delete`](https://www.sejuku.net/blog/74447) するか & gc.collect()` で[ガベージコレクション](https://techacademy.jp/magazine/19437)(不要になったメモリ領域を開放して再利用する機能)をする
* TIP 2 - データセット内の各カラムのデータタイプを予め定義しておく
  * eg: 本来 float64 だったものを -> float32 と定義してサイズを圧縮する
* TIP 3 - 読み込むデータセットを選択する (including generating your own subsamples)
  * 読み込む行数を選ぶ
  * (`skiprows`) で[読み飛ばす行数を指定する](https://bit.ly/3O90Ze7)
  * 読み飛ばすリストを作成して、読み飛ばす（以下例）
    * ```
      skiplines = np.random.choice(np.arange(1, lines), size=lines-1-1000000, replace=False)
      skiplines=np.sort(skiplines)
      train = pd.read_csv('../input/train.csv', skiprows=skiplines, dtype=dtypes)
      ```
* TIP 4 - バッチ処理をする
  * ひとまとまりのデータに対して、一連の処理を連続で実行する処理方式のこと。大きなデータに関しても、設定したデータ数(チャンク)ごとに処理をする
* TIP 5 - 特定のカラムのみ `import` する
  * 450,000行 × 150カラム より 100万行 × 2カラム の方がメモリ消費が少ないことは容易に想像できる
* TIP 6 - groupby などの処理をするときも一部カラムの一部データのみにするˆ
* TIP 7 - `Dask` を使用する。
  * [DaskについてのQiita記事](https://qiita.com/simonritchie/items/e174f243bc03fb25462e)

## コンペの概要
* コンペ名：[American Express - Default Prediction](https://www.kaggle.com/competitions/amex-default-prediction)

* 目的：毎月の顧客プロファイルから、顧客がクレジットカードの残高分を将来返済しない確率を予測すること
  * ターゲットのバイナリ変数は、最新のクレジットカード明細書から18ヶ月間のパフォーマンスウィンドウを観察することによって計算され、顧客が最新の明細書の日付から120日以内に支払額を支払わない場合、デフォルトとみなされる。

* 評価方法(Evaluation)
  * このコンペではクレカのデフォルト率を予測する。サブミットはちょっと特殊な評価方法で評価される。以下の通り:
    * ```
      M = 0.5*(G+D)  (*G = 正規化ジニ係数, D = デフォルト率 4% )
      ```
      機械学習における `正規化ジニ係数` は経済学などで使用される ジニ係数とは違うので注意です。以下記事を参照にしてみてください
      * [機械学習のモデル評価、説明可能性のための指標　その１。ジニ係数とAUC](https://qiita.com/Derek/items/4ded249f7a75f8da176c)
      * [DataRobot](https://docs.datarobot.com/ja/docs/modeling/reference/model-detail/opt-metric.html#gini-coefficient)
      * [GINI and AUC relationship](https://stats.stackexchange.com/questions/342329/gini-and-auc-relationship)
      * [Why use Normalized Gini Score instead of AUC as evaluation?](https://stats.stackexchange.com/questions/306287/why-use-normalized-gini-score-instead-of-auc-as-evaluation)

# データ管理環境整備

≒ディレクトリ整備

In [2]:
# # input_dir（input directory） を作る
current_note_path = os.path.dirname(os.path.abspath('__file__'))
INPUT_DIR = os.path.join(current_note_path, "data")

# INPUT_DIRがまだ作られていなければ作成
if not os.path.isdir(INPUT_DIR):
    os.mkdir(INPUT_DIR)

# output_dir(output directory) を作る
OUTPUT_DIR = os.path.join(current_note_path, 'outputs')

# OUTPUT_DIRがまだ作られていなければ作成
if not os.path.isdir(OUTPUT_DIR):
    os.mkdir(OUTPUT_DIR)

In [3]:
# csv(or any file)を簡単に読み込めるようにする関数
# csv を読み取る関数を設定したあげると、pathや拡張子を書かずに読み込めるので入力が楽になります

# 通常のcsvファイルを読み取るときは以下
def read_csv(name, **kwrgs):
    path = os.path.join(INPUT_DIR, name + '.csv')
    print(f'Load: {path}')
    return pd.read_csv(path, **kwrgs)

# 今回はparquet形式を dask モジュールで読み込むため別途用意
def read_parquet(name, **kwrgs):
    path = os.path.join(INPUT_DIR, name + '.parquet')
    print(f'Load: {path}')
    return dd.read_parquet(path, **kwrgs)

# データ読み込み(test data 以外)

前記の通り、データサイズが大きく、安易に `pandas` などを使ってもローカルではメモリが足りないので、ここではcsv形式を `parquet` 形式に変えてimport します

(*本来は自分でcsv -> parquet形式に変える必要がありますが、心優しい人がparquet形式にしてくれているのでそれを参照します。)
* [該当データダウンロードページ](https://www.kaggle.com/datasets/odins0n/amex-parquet)
* [Load Parquet Files with Low Memory](https://www.kaggle.com/code/odins0n/load-parquet-files-with-low-memory)
* [parquetデータを使用したEDA](https://www.kaggle.com/code/odins0n/amex-default-prediction-detailed-eda)

In [4]:
train = read_parquet('train_data')

Load: /Users/satoshiido/Documents/coding_general/kaggle/amex-default-prediction/data/train_data.parquet


In [4]:
# parquet データを import
train = read_parquet('train_data')
train_labels = read_csv('train_labels')
sample_sub = read_csv('sample_submission')

Load: /Users/satoshiido/Documents/coding_general/kaggle/amex-default-prediction/data/test_data2.parquet
Load: /Users/satoshiido/Documents/coding_general/kaggle/amex-default-prediction/data/train_data.parquet
Load: /Users/satoshiido/Documents/coding_general/kaggle/amex-default-prediction/data/train_labels.csv
Load: /Users/satoshiido/Documents/coding_general/kaggle/amex-default-prediction/data/sample_submission.csv


In [5]:
# ガベージコレクション
gc.collect()

126

In [6]:
# 以下のカラムは頻出と考えられ、毎回入力するのはめんどくさいので、ポップアップされるように定義します
customer_ID = 'customer_ID'
TARGET = 'target'

# 各テーブルの概説

* データについて
  * データセットには各顧客の特徴が各明細書日付ごとに集約されたものが含まれている。特徴は匿名化され、正規化されており、以下のカテゴリに分類されている（カラムの prefix を見ると分かる）:
    * D_*: Delinquency variables
    * S_*: Spend variables
    * P_*: Payment variables
    * B_*: Balance variables
    * R_*: Risk variables
  * 以下のカラムはカテゴリ変数である:
    * B_30, B_38, D_63, D_64, D_66, D_68, D_114, D_116, D_117, D_120, D_126

# EDA (データ確認)

> Daskを使えたらいいなあ

* [PythonのDaskをしっかり調べてみた（大きなデータセットを快適に扱う）](https://qiita.com/simonritchie/items/e174f243bc03fb25462e)

> EDA はこの辺もまずは参考にしてみる
* [AMEX EDA (Comparison of training and test data)](https://www.kaggle.com/code/onodera1/amex-eda-comparison-of-training-and-test-data)

## train data

In [27]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5531451 entries, 0 to 5531450
Columns: 191 entries, customer_ID to target
dtypes: category(12), datetime64[ns](1), float32(176), int8(1), object(1)
memory usage: 3.8+ GB


In [28]:
train.head()

Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,D_42,D_43,D_44,B_4,D_45,B_5,R_2,D_46,D_47,D_48,D_49,B_6,B_7,B_8,D_50,D_51,B_9,R_3,D_52,P_3,B_10,D_53,S_5,B_11,S_6,D_54,R_4,S_7,B_12,S_8,D_55,D_56,B_13,R_5,D_58,S_9,B_14,D_59,D_60,D_61,B_15,S_11,D_62,D_63,D_64,D_65,B_16,B_17,B_18,B_19,D_66,B_20,D_68,S_12,R_6,S_13,B_21,D_69,B_22,D_70,D_71,D_72,S_15,B_23,D_73,P_4,D_74,D_75,D_76,B_24,R_7,D_77,B_25,B_26,D_78,D_79,R_8,R_9,S_16,D_80,R_10,R_11,B_27,D_81,D_82,S_17,R_12,B_28,R_13,D_83,R_14,R_15,D_84,R_16,B_29,B_30,S_18,D_86,D_87,R_17,R_18,D_88,B_31,S_19,R_19,B_32,S_20,R_20,R_21,B_33,D_89,R_22,R_23,D_91,D_92,D_93,D_94,R_24,R_25,D_96,S_22,S_23,S_24,S_25,S_26,D_102,D_103,D_104,D_105,D_106,D_107,B_36,B_37,R_26,R_27,B_38,D_108,D_109,D_110,D_111,B_39,D_112,B_40,S_27,D_113,D_114,D_115,D_116,D_117,D_118,D_119,D_120,D_121,D_122,D_123,D_124,D_125,D_126,D_127,D_128,D_129,B_41,B_42,D_130,D_131,D_132,D_133,R_28,D_134,D_135,D_136,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145,target
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-03-09,0.9385,0.0017,0.0087,1.0068,0.0092,0.124,0.0088,0.0047,,,0.0006,0.081,0.7089,0.1706,0.0062,0.3586,0.5254,0.2557,,0.0639,0.0594,0.0065,0.1487,1.3359,0.0082,0.0014,0.2073,0.7365,0.0962,,0.0234,0.0028,0.0083,1.0015,0.0083,0.1613,0.1483,0.923,0.3546,0.152,0.1181,0.0019,0.1586,0.0657,0.0184,0.0636,0.1996,0.3082,0.0164,0.4016,0.0911,CR,O,0.0071,0.0077,,0.653,0.0085,,0.0047,6.0,0.272,0.0084,0.5152,0.0026,0.009,0.0048,0.0083,0.1194,0.0048,0.1083,0.0509,,0.0076,0.0804,0.0691,,0.0043,0.0076,,0.0077,0.0003,0.0016,0.0042,0.0014,,0.0023,0.0041,0.0071,0.0025,0.0023,0.0035,0.5066,0.008,1.0098,0.0847,0.0038,0.007,0.0004,0.0065,0.0008,0.0051,,0.0,0.0057,0.0071,,0.0002,0.0089,,1,0.0025,0.0052,0.0066,0.0097,0.0078,0.0024,1.0011,0.0027,0.0075,0.0069,1.5037,1.0061,0.0036,0.0089,0.0039,0.0036,0.005,0.8941,0.1356,0.9112,0.9745,0.0012,0.7667,1.0087,1.0046,0.8937,,0.67,0.01,0.0046,,1.0089,2.0,,0.0043,,,,1.0073,0.2101,0.6769,0.0079,1.0,0.2383,0.0,4.0,0.2321,0.2363,0.0,0.7023,0.4343,0.0031,0.6865,0.0087,1.0,1.0033,1.0078,1.0001,0.0068,,0.0021,0.006,,0.0043,0.0015,,,,,,0.0024,0.0037,0.0038,,0.0006,0.0006,0.0027,0
1,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-04-07,0.9367,0.0058,0.0049,1.0007,0.0062,0.1267,0.0008,0.0027,,,0.0025,0.0694,0.7128,0.1132,0.0062,0.3536,0.5213,0.2233,,0.0653,0.0577,0.0016,0.1497,1.3398,0.0084,0.002,0.2028,0.7209,0.0998,,0.0306,0.0027,0.0025,1.009,0.0051,0.141,0.1435,0.9194,0.3268,0.1562,0.1187,0.0016,0.1485,0.0939,0.013,0.0655,0.1514,0.265,0.0177,0.4063,0.0868,CR,O,0.0024,0.0071,,0.6471,0.0022,,0.0039,6.0,0.189,0.004,0.509,0.0042,0.0078,0.0013,0.0065,0.1406,0.0001,0.101,0.0405,,0.0048,0.0814,0.0742,,0.0042,0.0053,,0.0019,0.001,0.0099,0.0076,0.0005,,0.0098,0.0001,0.006,0.0004,0.0013,0.0078,0.5009,0.0008,1.0095,0.0818,0.0003,0.0078,0.0043,0.0023,0.0095,0.0038,,0.0,0.0076,0.0067,,0.0011,0.0059,,1,0.0084,0.009,0.0019,0.0099,0.006,0.0022,1.0068,0.0025,0.0068,0.0028,1.5036,1.0058,0.0006,0.0004,0.0084,0.0088,0.0032,0.9021,0.1363,0.9199,0.9756,0.0046,0.786,1.0001,1.0041,0.9068,,0.6686,0.0039,0.0047,,1.0032,2.0,,0.0087,,,,1.0077,0.1841,0.8223,0.0034,1.0,0.2472,0.0,4.0,0.2435,0.2419,0.0,0.707,0.4305,0.0013,0.6864,0.0008,1.0,1.0084,1.0043,1.0083,0.0044,,0.001,0.0048,,0.0075,0.0049,,,,,,0.004,0.0032,0.005,,0.0096,0.0055,0.0092,0
2,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-05-28,0.9542,0.0915,0.0217,1.0097,0.0068,0.124,0.0076,0.0094,,,0.0076,0.0688,0.7209,0.0605,0.0033,0.3347,0.5246,0.1894,,0.067,0.0566,0.0051,0.152,1.3372,0.0094,0.0074,0.2066,0.738,0.1341,,0.0484,0.0101,0.0005,1.0092,0.007,0.1122,0.137,1.002,0.3041,0.1538,0.1145,0.0063,0.1395,0.0848,0.0567,0.0706,0.3059,0.2122,0.064,0.4068,0.094,CR,O,0.0019,0.0036,,0.6458,0.0004,,0.0046,6.0,0.4953,0.0068,0.6793,0.0013,0.006,0.0094,0.0026,0.0759,0.0072,0.1032,0.0475,,0.0066,0.0789,0.0765,,0.0018,0.0014,,0.0054,0.0061,0.0096,0.0031,0.0083,,0.0094,0.001,0.0054,0.0073,0.0076,0.0088,0.5046,0.0041,1.0043,0.082,0.0027,0.0041,0.0071,0.0084,0.0023,0.0074,,0.0,0.0059,0.0012,,0.008,0.0089,,1,0.0073,0.002,0.0087,0.0084,0.0073,0.0078,1.001,0.0096,0.0098,0.0051,1.5034,1.0058,0.0074,0.0092,0.0025,0.0098,0.0054,0.9397,0.1349,0.9587,0.9741,0.0117,0.8068,1.003,1.0093,0.9287,,0.6709,0.0013,0.0192,,1.0008,2.0,,0.0041,,,,1.0043,0.1548,0.8535,0.0033,1.0,0.2399,0.0,4.0,0.2408,0.2397,0.0,0.7048,0.4344,0.004,0.6901,0.0096,1.0,1.0093,1.0078,1.0069,0.0032,,0.0057,0.0055,,0.0092,0.0091,,,,,,0.0033,0.0073,0.0004,,0.0034,0.007,0.0026,0
3,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-06-13,0.9604,0.0025,0.0137,1.0027,0.0014,0.1172,0.0007,0.0055,,,0.0064,0.0556,0.724,0.1668,0.0099,0.3233,0.5309,0.1356,,0.0837,0.0493,0.0014,0.1512,1.3399,0.0068,0.0035,0.2082,0.7418,0.1344,,0.0301,0.0097,0.0008,1.0075,0.0087,0.1028,0.129,0.704,0.2751,0.1558,0.1207,0.005,0.1381,0.0484,0.0125,0.0659,0.2736,0.2043,0.0227,0.4052,0.0949,CR,O,0.0059,0.0059,,0.6544,0.0059,,0.0052,6.0,0.5087,0.0082,0.5153,0.0087,0.0053,0.0046,0.0021,0.1502,0.0054,0.2064,0.0317,,0.0096,0.0775,0.0715,,0.0056,0.0064,,0.0006,0.0092,0.0086,0.0039,0.0052,,0.0049,0.0057,0.0019,0.005,0.0,0.0047,0.509,0.007,1.0047,0.0606,0.01,0.0088,0.0087,0.0074,0.0059,0.0088,,0.0,0.0025,0.0033,,0.0095,0.0083,,1,0.0071,0.0039,0.0025,0.0066,0.01,0.0077,1.0028,0.0078,0.0005,0.0073,1.5037,1.007,0.0007,0.0032,0.0085,0.0049,0.0001,0.9132,0.1401,0.9263,0.9755,0.0076,0.8082,1.0015,1.0045,0.9354,,0.6726,0.0027,0.0117,,1.0053,2.0,,0.0097,,,,1.0025,0.1539,0.8447,0.0001,1.0,0.2409,0.0,4.0,0.2394,0.2407,0.0,0.7115,0.4369,0.0051,0.6878,0.0046,1.0,1.0017,1.0035,1.0076,0.0077,,0.0071,0.0083,,0.0072,0.0024,,,,,,0.0061,0.0045,0.0032,,0.0084,0.0065,0.0096,0
4,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-07-16,0.9472,0.0025,0.0152,1.0007,0.0076,0.1173,0.0047,0.0093,,,0.0077,0.0389,0.7206,0.1436,0.0067,0.231,0.5293,,,0.0759,0.0489,0.0012,0.154,1.3417,0.0005,0.0014,0.2055,0.692,0.1215,,0.0542,0.0095,0.0067,1.0037,0.0038,0.0943,0.1295,0.9171,0.2311,0.1549,0.0952,0.0017,0.1264,0.0393,0.0279,0.0637,0.2331,0.1757,0.0312,0.4875,0.0939,CR,O,0.0095,0.0017,,0.6501,0.0078,,0.0059,6.0,0.2165,0.0086,0.5077,0.0068,0.0002,0.0001,0.0014,0.0964,0.008,0.106,0.0327,,0.0082,0.0766,0.0744,,0.0049,0.0048,,0.0018,0.0057,0.0033,0.0026,0.0073,,0.0074,0.0045,0.0061,0.0022,0.0021,0.0011,0.5062,0.0018,1.0009,0.0625,0.0059,0.0018,0.0078,0.0025,0.0055,0.0072,,0.0,0.0002,0.0015,,0.002,0.0027,,1,0.0077,0.0034,0.0022,0.0055,0.0041,0.0097,1.0065,0.0052,0.0033,0.0003,1.5099,1.0029,0.0031,0.0038,0.0072,0.003,0.0005,0.921,0.1316,0.9335,0.978,0.0182,0.8223,1.0061,1.0057,0.9534,,0.6739,0.01,0.0176,,1.0032,2.0,,0.0091,,,,1.0001,0.1207,0.8112,0.0087,1.0,0.2479,0.0,4.0,0.2442,0.2423,0.0,0.7053,0.4374,0.0028,0.6888,0.0001,1.0,1.0099,1.0051,1.0081,0.0098,,0.0097,0.0048,,0.0063,0.0045,,,,,,0.0037,0.0049,0.0089,,0.0017,0.0081,0.0098,0


In [8]:
# カラム名をわかりやすくする
titles=['Delinquency '+str(i).split('_')[1] if i.startswith('D') 
        else 'Spend '+str(i).split('_')[1] if i.startswith('S') 
        else 'Payment '+str(i).split('_')[1] if i.startswith('P') 
        else 'Balance '+str(i).split('_')[1] if i.startswith('B') 
        else 'Risk '+str(i).split('_')[1] if i.startswith('R')
        else customer_ID
        for i in train.columns[:-1]
        ]

In [9]:
titles.append('target')

In [12]:
import csv
# 設定したアウトプットファイル
f = open(os.path.join(OUTPUT_DIR, 'train_df_names.csv'), 'w')
writer = csv.writer(f)
writer.writerow(titles)
f.close()

In [9]:
# train データのカラム名変更
titles.append('target')
train.columns = titles

## train_labels 予測対象

In [47]:
train_labels.head()

Unnamed: 0,customer_ID,target
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0
1,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,0
2,00001b22f846c82c51f6e3958ccd81970162bae8b007e8...,0
3,000041bdba6ecadd89a52d11886e8eaaec9325906c9723...,0
4,00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8a...,0


In [12]:
# 予測するID数は 458193 個らしい
train_labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458913 entries, 0 to 458912
Data columns (total 2 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   customer_ID  458913 non-null  object
 1   target       458913 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 7.0+ MB


# 前処理 + 特徴量エンジニアリング(train data)

クラッシュするため、各IDごとのレコード数を減らす

In [13]:
# train データに関して、各IDごと2レコードのみにして、プロットするために category カラムを追加
train2 = train.groupby(customer_ID).tail(2)

欠損値率95%以上のカラムを削除する

In [14]:
miss_val = ['Delinquency 87', 'Delinquency 88', 'Delinquency 108', 'Delinquency 111', 'Delinquency 110', 'Balance 39', 'Delinquency 73', 'Balance 42']

train2 = train2.drop(columns=miss_val, axis=1)

## 基本統計量の計算(train data)

In [15]:
# 改めて連続変数とカテゴリ変数のリスト作成
COLS = list(train2.columns[2:190])
cat_cols = ['Balance 30', 'Balance 38', 'Delinquency 63', 'Delinquency 64', 'Delinquency 66', 'Delinquency 68',
          'Delinquency 114', 'Delinquency 116', 'Delinquency 117', 'Delinquency 120', 'Delinquency 126']
# カテゴリー変数に加えて、customer_ID, timestamp のカラムは除く
con_cols = [col for col in COLS if col not in cat_cols and col != ['Spend 2', customer_ID]]

In [16]:
# 平均・分散・最小・最大値・最大・最後の値
train_num_agg = train2.groupby(customer_ID)[con_cols].agg(['mean', 'std', 'min', 'max', 'last'])
# マルチカラムになっているのでそれをシングルカラムにしてあげる
## 参考記事：(https://qiita.com/rinascimento741/items/e2fceb8626ac97ebf49b)
train_num_agg.columns = ['_'.join(x) for x in train_num_agg.columns]
train_num_agg.reset_index(inplace = True)

# customer_IDごとの各カテゴリーに該当する数、最後の値
train_cat_agg = train2.groupby(customer_ID)[cat_cols].agg(['count', 'last', 'nunique'])
# マルチカラムになっているのでそれをシングルカラムにしてあげる
train_cat_agg.columns = ['_'.join(x) for x in train_cat_agg.columns]
train_cat_agg.reset_index(inplace = True)

In [29]:
# 進捗状況を可視化するメソッドをdataframe オブジェクトに追加する
tqdm.pandas()
# 計算コスト削減のためにfloat64をfloat32に変換
cols = list(train_num_agg.dtypes[train_num_agg.dtypes == 'float64'].index)
train_num_agg.loc[:,cols] = train_num_agg.loc[:,cols].progress_apply(lambda x: x.astype(np.float32))

# 計算コスト削減のためにint64をint32に変換
cols = list(train_cat_agg.dtypes[train_cat_agg.dtypes == 'int64'].index)
train_cat_agg.loc[:,cols] = train_cat_agg.loc[:,cols].progress_apply(lambda x: x.astype(np.int32))

100%|██████████| 172/172 [00:00<00:00, 295.85it/s]
100%|██████████| 22/22 [00:00<00:00, 2115.37it/s]


## 差分の計算(train)

以下の方法がメモリを節約しながらかなり早くできる

In [33]:
train_diff = train2.loc[:,con_cols+[customer_ID]].groupby([customer_ID]).progress_apply(lambda x:np.diff(x.values[-2:,:], axis=0).squeeze().astype(np.float32))
index = train_diff.index
cols = [col + '_diff1' for col in train2[con_cols].columns]
train_diff = pd.DataFrame(train_diff.values.tolist(), columns = cols)
train_diff[customer_ID] = index

100%|██████████| 458913/458913 [00:13<00:00, 35043.21it/s]


以下の方法はnotebookに記載されていた方法だが、若干重い。自分のCPUではクラッシュしてしまうことがある

In [37]:
def get_difference(data, num_features):
    df1 = []
    customer_ids = []
    # `tqdm` customer_IDごとでグループ分けをした groupby オブジェクトをcustomer_ID, dfに分ける
    for customer_id, df in tqdm(data.groupby(['customer_ID'])):
        df1.append(
            # 各変数 ごとに同じcustomer_ID内で前のレコードとの差分を出して、そのうち一番最後のレコードを取得する
            df[num_features].diff(1).iloc[[-1]].values.astype(np.float32)
        )
        customer_ids.append(customer_id)
    # よくわからん
    df1 = np.concatenate(df1, axis = 0)
    # カラム名に `_diff1`と付けた上で、dataframe型に変える
    df1 = pd.DataFrame(df1, columns = [col + '_diff1' for col in df[num_features].columns])
    # customer_IDカラムを追加する
    df1['customer_ID'] = customer_ids
    return df1

## マージ(train data)

In [35]:
# train_num_agg, train_cat_agg, train_diff, train_labels をマージして
train2 = pd.merge(train_num_agg, train_cat_agg, how='inner', on=customer_ID).merge(train_diff, how='inner', on=customer_ID).merge(train_labels, how='inner', on=customer_ID)

del train_num_agg, train_cat_agg, train_diff
gc.collect()

0

## データの書き出し(train data)

In [40]:
train2.to_parquet('/Users/satoshiido/Documents/coding_general/kaggle/amex-default-prediction/outputs/train2.parquet')

In [None]:
del train2

# データ読み込み(test data )

In [None]:
test = read_parquet('test_data2')

In [None]:
test.info()

In [None]:
test.head()

In [None]:
# test データのカラム名変更
test.columns = titles[:-1]

# 前処理・特徴量エンジニアリング(test data)

クラッシュするため、各IDごとのレコード数を減らす

In [None]:
# 1000万個以上あったので各IDごと2レコードのみにして、category カラム target カラムを追加
test2 = test.groupby(customer_ID).tail(2)

欠損値率95%以上のカラムを削除する

In [None]:
test2 = test2.drop(columns=miss_val, axis=1)

## 基本統計量の計算(test data)

In [None]:
# test data向けに target を削除する
con_cols_test = [col for col in con_cols if col != TARGET]

In [None]:
# 平均・分散・最小・最大値・最大・最後の値
test_num_agg = test2.groupby(customer_ID)[con_cols_test].agg(['mean', 'std', 'min', 'max', 'last'])
# マルチカラムになっているのでそれをシングルカラムにしてあげる
## 参考記事：(https://qiita.com/rinascimento741/items/e2fceb8626ac97ebf49b)
test_num_agg.columns = ['_'.join(x) for x in test_num_agg.columns]
test_num_agg.reset_index(inplace = True)

# customer_IDごとの各カテゴリーに該当する数、最後の値
test_cat_agg = test2.groupby(customer_ID)[cat_cols].agg(['count', 'last', 'nunique'])
# マルチカラムになっているのでそれをシングルカラムにしてあげる
test_cat_agg.columns = ['_'.join(x) for x in test_cat_agg.columns]
test_cat_agg.reset_index(inplace = True)

In [None]:
# 進捗状況を可視化するメソッドをdataframe オブジェクトに追加する
tqdm.pandas()
# 計算コスト削減のためにfloat64をfloat32に変換
cols = list(test_num_agg.dtypes[test_num_agg.dtypes == 'float64'].index)
test_num_agg.loc[:,cols] = test_num_agg.loc[:,cols].progress_apply(lambda x: x.astype(np.float32))

# 計算コスト削減のためにint64をint32に変換
cols = list(test_cat_agg.dtypes[test_cat_agg.dtypes == 'int64'].index)
test_cat_agg.loc[:,cols] = test_cat_agg.loc[:,cols].progress_apply(lambda x: x.astype(np.int32))

## 差分の計算(test data)

In [None]:
test_diff = test2.loc[:,con_cols_test+[customer_ID]].groupby([customer_ID]).progress_apply(lambda x:np.diff(x.values[-2:,:], axis=0).squeeze().astype(np.float32))
index = test_diff.index
cols = [col + '_diff1' for col in test2[con_cols_test].columns]
test_diff = pd.DataFrame(test_diff.values.tolist(), columns = cols)
test_diff[customer_ID] = index

## データマージ・書き出し

In [None]:
# test_num_agg, test_cat_agg, test_diffをマージして
test2 = pd.merge(test_num_agg, test_cat_agg, how='inner', on=customer_ID).merge(test_diff, how='inner', on=customer_ID)

del test_num_agg, test_cat_agg, test_diff
gc.collect()

## カテゴリ変数の変換

カテゴリデータは基本的にそのまま特徴量として扱えないので、数値化する

* One-Hot Encoding -> gbdt系以外（線形モデル etc..）におすすめ
* Label Encoding-> gbdt系 にもおすすめ\
[【sklearn】LabelEncoderの使い方を丁寧に](https://gotutiyan.hatenablog.com/entry/2020/09/08/122621)
* Target Encoding -> gbdt系にはより効果的らしい\
[Target Encoding はなぜ有効なのか](https://speakerdeck.com/hakubishin3/target-encoding-hanazeyou-xiao-nafalseka)

## データマージ

# 学習・予測・サブミットファイル作成

## モデル構築

## モデル評価

## サブミットファイル作成