In [None]:
import os
import gc
import time
import numpy as np
import pandas as pd
from contextlib import contextmanager
import multiprocessing as mp
from functools import partial
from scipy.stats import kurtosis, iqr, skew
from lightgbm import LGBMClassifier
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Show which files are present in the 'inputs' directory.
os.listdir('inputs')

In [None]:
# Cap how much of a DataFrame the notebook renders: 60 rows, 100 columns.
pd.options.display.max_rows = 60
pd.options.display.max_columns = 100

# lightgbm-7th-place-solution.py 를 하나하나씩 다 뜯어보자.

In [None]:
def main(debug= False):
    """ Assemble the full feature table and run the LightGBM cross-validation on it.

    debug: when truthy, 30000 is passed as the row limit to every table loader;
           otherwise None is passed.

    The application frame from get_train_test is left-merged on SK_ID_CURR with the
    bureau, previous-application, POS-cash, installment and credit-card frames, in
    that order. add_ratios_features and reduce_memory are then applied and the result
    is handed to kfold_lightgbm_sklearn together with the categorical column names.
    The shape of every loaded table and the returned feature importance are printed;
    nothing is returned.
    """
    row_limit = 30000 if debug else None

    def attach(base, extra, shape_message):
        """ Left-merge `extra` onto `base` by SK_ID_CURR, then print the shape of `extra`. """
        combined = pd.merge(base, extra, on='SK_ID_CURR', how='left')
        print(shape_message, extra.shape)
        return combined

    with timer("application_train and application_test"):
        df = get_train_test(DATA_DIRECTORY, num_rows= row_limit)
        print("Application dataframe shape: ", df.shape)

    with timer("Bureau and bureau_balance data"):
        df = attach(df, get_bureau(DATA_DIRECTORY, num_rows= row_limit), "Bureau dataframe shape: ")
        gc.collect()  # the loaded side table is unreferenced by now

    with timer("previous_application"):
        df = attach(df, get_previous_applications(DATA_DIRECTORY, row_limit), "Previous dataframe shape: ")
        gc.collect()

    # One timer covers the three balance-style tables.
    with timer("previous applications balances"):
        df = attach(df, get_pos_cash(DATA_DIRECTORY, row_limit), "Pos-cash dataframe shape: ")
        gc.collect()
        df = attach(df, get_installment_payments(DATA_DIRECTORY, row_limit), "Installments dataframe shape: ")
        gc.collect()
        df = attach(df, get_credit_card(DATA_DIRECTORY, row_limit), "Credit card dataframe shape: ")
        gc.collect()

    # Add ratios and groupby between different tables
    df = add_ratios_features(df)
    df = reduce_memory(df)

    # Columns passed to kfold_lightgbm_sklearn as the categorical features.
    lgbm_categorical_feat = [
        'CODE_GENDER',
        'FLAG_OWN_CAR',
        'NAME_CONTRACT_TYPE',
        'NAME_EDUCATION_TYPE',
        'NAME_FAMILY_STATUS',
        'NAME_HOUSING_TYPE',
        'NAME_INCOME_TYPE',
        'OCCUPATION_TYPE',
        'ORGANIZATION_TYPE',
        'WEEKDAY_APPR_PROCESS_START',
        'NAME_TYPE_SUITE',
        'WALLSMATERIAL_MODE',
    ]
    with timer("Run LightGBM"):
        print(kfold_lightgbm_sklearn(df, lgbm_categorical_feat))


In [None]:
# Debug switch read when num_rows is computed (True -> 30000 rows, False -> no limit).
debug = True

In [None]:
# Row cap later passed to pd.read_csv(nrows=...): 30000 in debug mode, otherwise None.
num_rows = 30000 if debug else None

In [None]:
# NOTE: the one-shot call `get_train_test(DATA_DIRECTORY, num_rows=num_rows)` used to sit in
# this cell. Neither `get_train_test` nor `DATA_DIRECTORY` is defined yet at this point (both
# appear in later cells), so the cell raised NameError on Restart & Run All. `df` is instead
# built step by step in the cells that follow (read train/test, then combine them).

In [None]:
# Notebook-wide settings.
NUM_THREADS = 4  # not referenced in the visible cells; presumably a worker/thread count — TODO confirm
DATA_DIRECTORY = "inputs/"  # directory the CSV files are read from
SUBMISSION_SUFIX = "_model2_04"  # not referenced in the visible cells; presumably a submission file-name suffix — TODO confirm


In [None]:
# Mirror the parameters of get_train_test(path, num_rows) so its body can be replayed cell by cell.
path = DATA_DIRECTORY
num_rows= num_rows  # no-op self-assignment; presumably kept only to mirror the function signature

In [None]:
# Load the training applications (at most num_rows rows; None means all rows).
train = pd.read_csv(os.path.join(path, 'application_train.csv'), nrows = num_rows)

In [None]:
# Load the test applications with the same row cap.
test = pd.read_csv(os.path.join(path, 'application_test.csv'), nrows = num_rows)

In [None]:
# Preview the first rows of the training table.
train.head()

맨 먼저 해야할 일 - 타겟값이 불균형한지 확인

In [None]:
# Bar chart of how many rows fall into each TARGET value, to check for class imbalance.
train['TARGET'].value_counts().plot.bar()

In [None]:
# Stack train on top of test. DataFrame.append was removed in pandas 2.0; pd.concat is the
# equivalent call (row labels are kept as-is, columns missing from one frame become NaN).
df = pd.concat([train, test])


In [None]:
# The separate frames are not needed once they have been combined into df.
del train, test
gc.collect() # free memory

# preprocessing

- EDA를 하고 해야함. 그런데 캐글의 경우, 다른 전문가들의 글이 많기에 이를 참고하면서 해보자.
- 하지만, 강의때문에 이렇게 진행하는 것이지 자신이 직접 EDA 하는 능력을 길러야 함.

In [None]:
def get_train_test(path, num_rows = None):
    """ Process application_train.csv and application_test.csv and return a pandas dataframe.

    Parameters
    ----------
    path : str
        Directory that contains application_train.csv and application_test.csv.
    num_rows : int or None
        Passed to ``pd.read_csv`` as ``nrows`` for both files; None reads every row.

    Returns
    -------
    pandas.DataFrame
        Train rows followed by test rows, cleaned and extended with the engineered
        columns below, after ``label_encoder`` and ``drop_application_columns``
        (both defined elsewhere in the notebook) have been applied.
    """
    train = pd.read_csv(os.path.join(path, 'application_train.csv'), nrows= num_rows)
    test = pd.read_csv(os.path.join(path, 'application_test.csv'), nrows= num_rows)
    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames the same way
    # (no ignore_index, so the original row labels are kept).
    df = pd.concat([train, test])
    del train, test; gc.collect()

    # Data cleaning
    df = df[df['CODE_GENDER'] != 'XNA']  # 4 people with XNA code gender
    df = df[df['AMT_INCOME_TOTAL'] < 20000000]  # Max income in test is 4M; train has a 117M value
    # Assign the result back instead of df[col].replace(..., inplace=True): the in-place call on
    # a column selection is chained assignment and does not update df under pandas copy-on-write.
    df['DAYS_EMPLOYED'] = df['DAYS_EMPLOYED'].replace(365243, np.nan)
    df['DAYS_LAST_PHONE_CHANGE'] = df['DAYS_LAST_PHONE_CHANGE'].replace(0, np.nan)

    # Flag_document features - count and kurtosis
    docs = [f for f in df.columns if 'FLAG_DOC' in f]
    df['DOCUMENT_COUNT'] = df[docs].sum(axis=1)
    df['NEW_DOC_KURT'] = df[docs].kurtosis(axis=1)
    # Categorical age - based on target=1 plot
    df['AGE_RANGE'] = df['DAYS_BIRTH'].apply(get_age_label)

    # New features based on External sources
    df['EXT_SOURCES_PROD'] = df['EXT_SOURCE_1'] * df['EXT_SOURCE_2'] * df['EXT_SOURCE_3']
    df['EXT_SOURCES_WEIGHTED'] = df['EXT_SOURCE_1'] * 2 + df['EXT_SOURCE_2'] * 1 + df['EXT_SOURCE_3'] * 3
    # np.warnings no longer exists in current NumPy; use the standard warnings module directly.
    warnings.filterwarnings('ignore', r'All-NaN (slice|axis) encountered')
    ext_sources = df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']]
    for function_name in ['min', 'max', 'mean', 'nanmedian', 'var']:
        feature_name = 'EXT_SOURCES_{}'.format(function_name.upper())
        # Look the NumPy function up by name (no need to eval a string as code).
        df[feature_name] = getattr(np, function_name)(ext_sources, axis=1)

    # Credit ratios
    df['CREDIT_TO_ANNUITY_RATIO'] = df['AMT_CREDIT'] / df['AMT_ANNUITY']
    df['CREDIT_TO_GOODS_RATIO'] = df['AMT_CREDIT'] / df['AMT_GOODS_PRICE']
    # Income ratios
    df['ANNUITY_TO_INCOME_RATIO'] = df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL']
    df['CREDIT_TO_INCOME_RATIO'] = df['AMT_CREDIT'] / df['AMT_INCOME_TOTAL']
    df['INCOME_TO_EMPLOYED_RATIO'] = df['AMT_INCOME_TOTAL'] / df['DAYS_EMPLOYED']
    df['INCOME_TO_BIRTH_RATIO'] = df['AMT_INCOME_TOTAL'] / df['DAYS_BIRTH']
    # Time ratios
    df['EMPLOYED_TO_BIRTH_RATIO'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']
    df['ID_TO_BIRTH_RATIO'] = df['DAYS_ID_PUBLISH'] / df['DAYS_BIRTH']
    df['CAR_TO_BIRTH_RATIO'] = df['OWN_CAR_AGE'] / df['DAYS_BIRTH']
    df['CAR_TO_EMPLOYED_RATIO'] = df['OWN_CAR_AGE'] / df['DAYS_EMPLOYED']
    df['PHONE_TO_BIRTH_RATIO'] = df['DAYS_LAST_PHONE_CHANGE'] / df['DAYS_BIRTH']

    # Groupby: Statistics for applications in the same group
    group = ['ORGANIZATION_TYPE', 'NAME_EDUCATION_TYPE', 'OCCUPATION_TYPE', 'AGE_RANGE', 'CODE_GENDER']
    df = do_median(df, group, 'EXT_SOURCES_MEAN', 'GROUP_EXT_SOURCES_MEDIAN')
    df = do_std(df, group, 'EXT_SOURCES_MEAN', 'GROUP_EXT_SOURCES_STD')
    df = do_mean(df, group, 'AMT_INCOME_TOTAL', 'GROUP_INCOME_MEAN')
    df = do_std(df, group, 'AMT_INCOME_TOTAL', 'GROUP_INCOME_STD')
    df = do_mean(df, group, 'CREDIT_TO_ANNUITY_RATIO', 'GROUP_CREDIT_TO_ANNUITY_MEAN')
    df = do_std(df, group, 'CREDIT_TO_ANNUITY_RATIO', 'GROUP_CREDIT_TO_ANNUITY_STD')
    df = do_mean(df, group, 'AMT_CREDIT', 'GROUP_CREDIT_MEAN')
    df = do_mean(df, group, 'AMT_ANNUITY', 'GROUP_ANNUITY_MEAN')
    df = do_std(df, group, 'AMT_ANNUITY', 'GROUP_ANNUITY_STD')

    # Encode categorical features (LabelEncoder)
    df, le_encoded_cols = label_encoder(df, None)
    df = drop_application_columns(df)
    return df

In [None]:
df['CODE_GENDER'].value_counts() # none show up in debug mode, but the full dataset has 4 rows with the value 'XNA'

In [None]:
# Keep only the rows whose CODE_GENDER is not 'XNA'.
df = df[df['CODE_GENDER'] != 'XNA']

In [None]:
# Filter applied so that train and test cover a comparable income range.
# The largest income in train is 117 million while test tops out far lower (the original note
# said 400 million, whereas the comment in get_train_test says 4M — TODO confirm).
# This should be verified with EDA.
df = df[df['AMT_INCOME_TOTAL'] < 20000000]

In [None]:
df[df['DAYS_EMPLOYED'] == 365243] # as many as 11120 outlier rows (author's count), so replace the value with NaN for now

In [None]:
# Assign the result back: replace(..., inplace=True) on df[col] is chained assignment and
# leaves df unchanged under pandas copy-on-write.
df['DAYS_EMPLOYED'] = df['DAYS_EMPLOYED'].replace(365243, np.nan)

In [None]:
df[df['DAYS_LAST_PHONE_CHANGE'] == 0] # these are treated as outliers as well

In [None]:
# Assign the result back: replace(..., inplace=True) on df[col] is chained assignment and
# leaves df unchanged under pandas copy-on-write.
df['DAYS_LAST_PHONE_CHANGE'] = df['DAYS_LAST_PHONE_CHANGE'].replace(0, np.nan)

In [None]:
# Inspect which column names contain 'FLAG_DOC'.
[f for f in df.columns if 'FLAG_DOC' in f]

In [None]:
# Keep that list of 'FLAG_DOC' column names for the features computed next.
docs = [f for f in df.columns if 'FLAG_DOC' in f]

In [None]:
# Row-wise sum over the FLAG_DOC columns.
df['DOCUMENT_COUNT'] = df[docs].sum(axis=1)

In [None]:
# Sum of the "yes" answers over the 20 yes/no document flags; no row has more than 3 (author's observation).
df['DOCUMENT_COUNT'].value_counts() # .hist() shows the same thing as a chart

In [None]:
# kurtosis: the statistical kurtosis, computed here per row across the FLAG_DOC columns
df['NEW_DOC_KURT'] = df[docs].kurtosis(axis=1)

In [None]:
# Histogram of the row-wise kurtosis values.
df[docs].kurtosis(axis=1).hist()

In [None]:
def get_age_label(days_birth):
    """ Return the age group label (int).

    days_birth is negated and divided by 365 to obtain an age in years. The label is
    1 below 27, 2 below 40, 3 below 50, 4 below 65, 5 below 99, and 0 for anything
    else (including values that compare false against every bound, such as NaN).
    """
    age_years = -days_birth / 365
    # (exclusive upper bound in years, label), checked in ascending order
    brackets = ((27, 1), (40, 2), (50, 3), (65, 4), (99, 5))
    for upper_bound, label in brackets:
        if age_years < upper_bound:
            return label
    return 0

In [None]:
# Turning a continuous variable into a categorical one can lose information, but the
# categorical version can also carry information of its own, and it can be fed into
# groupby-style aggregations to build further features, so it is worth trying.
df['AGE_RANGE'] = df['DAYS_BIRTH'].apply(get_age_label)

In [None]:
# The EXT_SOURCE_ features come out near the top of the feature importance every time,
# even though people did not know exactly what they represent.
# Combining features that each have high importance often yields another good feature,
# which is why they are combined here.

# When multiplying, a NaN in any one of the features turns the product into NaN, so a lot of information is lost.
# For that reason people also build several pairwise features: df['EXT_SOURCE_1'] * df['EXT_SOURCE_2'],
# df['EXT_SOURCE_1'] * df['EXT_SOURCE_3'],
# df['EXT_SOURCE_2'] * df['EXT_SOURCE_3'].

# On Kaggle the aim is to raise the score by creating as many features as possible.

df['EXT_SOURCES_PROD'] = df['EXT_SOURCE_1'] * df['EXT_SOURCE_2'] * df['EXT_SOURCE_3']

In [None]:
# These weights are the result of trying several combinations:
# the set that gave the highest feature importance was kept.

# The first column is 'EXT_SOURCE_1'; the previous 'EXT_SOURCES_1' spelling raised KeyError.
df['EXT_SOURCES_WEIGHTED'] = df['EXT_SOURCE_1'] * 2 + df['EXT_SOURCE_2'] * 1 + df['EXT_SOURCE_3'] * 3

In [None]:
np.warnings.filterwarnings('ignore', r'All-NaN (slice|axis) encountered')

In [None]:
# Row-wise statistics over the three EXT_SOURCE columns, one new column per NumPy function.
# getattr(np, name) fetches the same function that eval('np.<name>') would, without
# evaluating a string as code.

for function_name in ['min', 'max', 'mean', 'nanmedian', 'var']:
    feature_name = 'EXT_SOURCES_{}'.format(function_name.upper())
    df[feature_name] = getattr(np, function_name)(
        df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']], axis=1)

In [None]:
# eval demo: build the name of a NumPy function as a string, evaluate it, and call the result;
# handy for running several functions in one loop.
for i in ['min', 'max', 'std']:
    print(eval('np.{}'.format(i))([1, 2, 3]))

https://youtu.be/aoo1xrKQXFc?t=3219