In [None]:
import numpy as np
import pandas as pd
import gc
import time
import matplotlib.pyplot as plt
import seaborn as sns
#import warning

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Raise pandas' display limits so wide/long frames are truncated later.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 200)

In [None]:
import os, sys
from google.colab import drive

# Mount Google Drive at /content/gdrive so the dataset files can be read (Colab only).
drive.mount('/content/gdrive')

In [None]:
# Base directory of the dataset on the mounted Google Drive.
# (Despite its name, default_url holds a filesystem path, not a URL.)

default_url = '/content/gdrive/MyDrive/home-credit-default-risk'
app_train = pd.read_csv(os.path.join(default_url, 'application_train.csv'))
app_test = pd.read_csv(os.path.join(default_url, 'application_test.csv'))

In [None]:
# Preview the first rows of the training data.
app_train.head()

In [None]:
# Compare the (rows, columns) shapes of the train and test frames.
print(app_train.shape, app_test.shape)

#### TARGET 값 분포 및 AMT_INCOME_TOTAL 값 Histogram
* TARGET값 별 분포도, Pandas, Matplotlib,
Seaborn으로 Histogram 표현

In [None]:
# Number of rows per TARGET value.
app_train['TARGET'].value_counts()

In [None]:
# Histogram of AMT_CREDIT through the pandas plotting wrapper.
app_train['AMT_CREDIT'].hist()

In [None]:
# Histogram of AMT_INCOME_TOTAL, this time calling matplotlib directly.
plt.hist(app_train['AMT_INCOME_TOTAL'])

In [None]:
# Distribution plot of AMT_INCOME_TOTAL with seaborn.
# NOTE(review): sns.distplot is deprecated since seaborn 0.11; when upgrading, the
# documented replacement is sns.histplot(..., kde=True, stat='density') — confirm installed version.
sns.distplot(app_train['AMT_INCOME_TOTAL'])

In [None]:
# Box plot of AMT_INCOME_TOTAL.
# The Series is passed as x= explicitly: the meaning of the first positional
# argument is version-dependent (older seaborn treated it as x and emitted a
# warning, newer releases treat it as data), so the keyword keeps the
# horizontal single-box layout stable across versions.
sns.boxplot(x=app_train['AMT_INCOME_TOTAL'])

In [None]:
# Keep only the rows whose total income is below 1,000,000 and plot the
# AMT_INCOME_TOTAL histogram of that subset. cond_1 is reused by the next cell.
cond_1 = app_train['AMT_INCOME_TOTAL'].lt(1000000)
app_train.loc[cond_1, 'AMT_INCOME_TOTAL'].hist()

In [None]:
# Distribution of AMT_CREDIT for the rows selected by cond_1. When the notebook is run
# top to bottom, cond_1 is still the AMT_INCOME_TOTAL < 1000000 mask from the previous
# cell here (the name is reassigned to a TARGET filter further down).
sns.distplot(app_train[cond_1]['AMT_CREDIT'])

In [None]:
# Row filters for each TARGET class.
cond_1 = app_train['TARGET'] == 1
cond_0 = app_train['TARGET'] == 0
# AMT_INCOME_TOTAL contains some very large values, so those rows are excluded.
cond_amt = app_train['AMT_INCOME_TOTAL'] < 500000
# Overlay both class distributions with distplot: TARGET=0 in blue, TARGET=1 in red.
sns.distplot(app_train[cond_0 & cond_amt]['AMT_INCOME_TOTAL'], label='0', color='blue')
sns.distplot(app_train[cond_1 & cond_amt]['AMT_INCOME_TOTAL'], label='1', color='red')
# The label= values are only displayed once a legend is drawn on the current axes.
plt.legend(title='TARGET')

In [None]:
# A violin plot shows the distribution of a continuous column for each category value:
# x is the categorical column, y is the continuous column.
sns.violinplot(x='TARGET', y="AMT_INCOME_TOTAL", data= app_train[cond_amt])

In [None]:
# Create a figure with a 1x2 grid of (still empty) axes, figsize 12x4.
fig, axs = plt.subplots(figsize=(12,4), nrows=1, ncols=2)

In [None]:
# Class and amount filters, redefined here so the cell does not rely on earlier ones.
cond_1 = app_train['TARGET'] == 1
cond_0 = app_train['TARGET'] == 0
cond_amt = app_train['AMT_INCOME_TOTAL'] < 500000
# Draw both views side by side; squeeze=False keeps axs two-dimensional so the
# axs[0][i] indexing below is valid.
fig, axs = plt.subplots(figsize=(12,4), nrows=1, ncols=2, squeeze=False)
sns.violinplot(x='TARGET', y='AMT_INCOME_TOTAL', data=app_train[cond_amt], ax=axs[0][0] )
sns.distplot(app_train[cond_0 & cond_amt]['AMT_INCOME_TOTAL'], label='0', color='blue', ax=axs[0][1])
sns.distplot(app_train[cond_1 & cond_amt]['AMT_INCOME_TOTAL'], label='1', color='red', ax=axs[0][1])
# The label= values are only displayed once a legend is requested on that axes.
axs[0][1].legend(title='TARGET')

In [None]:
def show_column_hist_by_target(df, column, is_amt=False, amt_threshold=500000):
    """Plot how `column` is distributed for each TARGET class, side by side.

    Left panel: violin plot of `column` grouped by TARGET.
    Right panel: overlaid distplots of `column` for TARGET=0 (blue) and
    TARGET=1 (red), with a legend.

    Args:
        df: DataFrame containing a 'TARGET' column and `column`.
        column: name of the column to plot.
        is_amt: when True, only rows with df[column] < amt_threshold are
            plotted; when False, all rows are plotted.
        amt_threshold: exclusive upper bound applied when is_amt is True.
            Defaults to 500000, the value that used to be hard-coded.

    Returns:
        None. The figure is drawn as a side effect.
    """
    cond1 = df['TARGET'] == 1
    cond0 = df['TARGET'] == 0

    # Always build the row filter as a boolean array with one entry per row.
    # The previous scalar default (True) turned df[cond_amt] into the column
    # lookup df[True], which raised KeyError whenever is_amt was False.
    if is_amt:
        cond_amt = df[column] < amt_threshold
    else:
        cond_amt = np.ones(len(df), dtype=bool)

    fig, axs = plt.subplots(figsize=(12,4), nrows=1, ncols=2, squeeze=False)
    sns.violinplot(x='TARGET', y=column, data=df[cond_amt], ax=axs[0][0])
    sns.distplot(df[cond0 & cond_amt][column], label='0', color='blue', ax=axs[0][1])
    sns.distplot(df[cond1 & cond_amt][column], label='1', color='red', ax=axs[0][1])
    # Without a legend the label= values above are never shown.
    axs[0][1].legend(title='TARGET')

show_column_hist_by_target(app_train, 'AMT_INCOME_TOTAL',is_amt=True)

#### app_train과 app_test를 합쳐서 한번에 데이터 전처리 수행

In [None]:
# Shapes of the two frames before they are combined.
print(app_train.shape, app_test.shape)

In [None]:
# Stack app_train and app_test row-wise with pandas concat() so that the
# preprocessing below is applied to both at once.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; both inputs
# come straight from read_csv, so without it their row labels would repeat and
# any label-aligned operation on apps could fail or misalign.
apps = pd.concat([app_train, app_test], ignore_index=True)
apps.shape

In [None]:
# The rows that came from app_test have no TARGET value, so after the concat they
# are counted as NaN here (dropna=False keeps the NaN bucket in the result).
apps['TARGET'].value_counts(dropna=False)

####  Object feature들을 Label Encoding
* pandas의 factorize()를 이용

In [None]:
# Summary of the combined frame (index, dtypes, memory usage).
apps.info()

In [None]:
# pd.factorize() is a convenient way to label-encode a categorical column.
# pd.factorize(series) returns a pair (integer codes, unique category values);
# indexing the result with [0] keeps only the integer codes.
# Display the combined frame before encoding.
apps

In [None]:
# Column dtypes before encoding.
apps.dtypes

In [None]:
# Names of the columns whose dtype is object.
apps.dtypes[apps.dtypes == 'object'].index

In [None]:
# Keep those column names as a plain list for the encoding loop below.
object_columns = apps.dtypes[apps.dtypes == 'object'].index.tolist()

In [None]:
object_columns

In [None]:
# Label-encode every object column: each column is overwritten with the integer
# codes returned by pd.factorize().
# NOTE: per the pandas documentation, factorize() codes missing values as -1 by
# default, so these columns should hold -1 (not NaN) for missing entries and the
# later fillna(-999) would not touch them — confirm if a single sentinel is wanted.
for column in object_columns:
    apps[column] = pd.factorize(apps[column])[0]

In [None]:
# Display the frame after label encoding.
apps

In [None]:
# Column dtypes after encoding.
apps.dtypes

#### NULL값 일괄 변환

In [None]:
# Null count per column (first 100 columns) before filling.
apps.isnull().sum().head(100)

In [None]:
# Replace the nulls in every column with the sentinel value -999.
apps = apps.fillna(-999)

In [None]:
# Same check again after filling.
apps.isnull().sum().head(100)

#### 학습 데이터와 테스트 데이터 다시 분리

In [None]:
# The test rows had a null TARGET, which fillna(-999) turned into -999 above.
# That sentinel therefore identifies the test rows and lets us undo the concat.
is_test_row = apps['TARGET'] == -999
app_train, app_test = apps[~is_test_row], apps[is_test_row]
print(app_train.shape, app_test.shape)

In [None]:
# On the test rows TARGET only holds the -999 filler, so remove the column.
app_test = app_test.drop(columns='TARGET')

In [None]:
# Shapes after the split; app_test no longer has the TARGET column.
print(app_train.shape, app_test.shape)

#### 학습 데이터를 검증 데이터로 분리하고 LGBM Classifier로 학습 수행
* 피처용 데이터와 타겟 데이터 분리
* 학습용/검증용 데이터 세트 분리

In [None]:
# Labels, then the feature matrix: every column except the row id and the label.
target_app = app_train['TARGET']
ftr_app = app_train.drop(columns=['SK_ID_CURR', 'TARGET'])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for validation; the fixed random_state makes the
# split repeatable.
train_x, valid_x, train_y, valid_y = train_test_split(
    ftr_app,
    target_app,
    test_size=0.3,
    random_state=2020,
)
print(train_x.shape, valid_x.shape)

In [None]:
# Shapes of the training and validation label vectors.
print(train_y.shape, valid_y.shape)

In [None]:
from lightgbm import LGBMClassifier, early_stopping, log_evaluation

# LightGBM classifier: up to 1000 boosting rounds at a small learning rate;
# early stopping (configured in fit below) decides how many rounds are kept.
# The removed `silent` argument is dropped; verbose=-1 already controls the
# library's own log output.
clf = LGBMClassifier(
    n_jobs=-1,
    n_estimators=1000,
    learning_rate=0.02,
    num_leaves=32,
    subsample=0.8,
    max_depth=12,
    verbose=-1
)

# Logging and early stopping are configured through callbacks: fit()'s
# verbose= and early_stopping_rounds= arguments were removed in LightGBM 4.0
# (passing them raises TypeError there), while the callbacks below are also
# available in LightGBM 3.3.
clf.fit(
    train_x, train_y,
    eval_set=[(train_x, train_y), (valid_x, valid_y)],
    eval_metric='auc',
    callbacks=[
        early_stopping(stopping_rounds=50),  # stop after 50 rounds without improvement
        log_evaluation(period=100),          # report the metrics every 100 rounds
    ],
)

In [None]:
from lightgbm import plot_importance
# Plot the fitted classifier's feature importances on a 16x32 figure.
plot_importance(clf, figsize=(16,32))


#### 학습된 Classifier를 이용하여 테스트 데이터를 예측하고 결과를 Kaggle로 Submit 수행

In [None]:
# Score the test rows with the trained classifier's predict_proba().
# TARGET is dropped with errors='ignore' because a later cell stores the
# predictions in app_test['TARGET']; without this, re-running the cell would
# pass that extra column to the model as a feature.
test_features = app_test.drop(columns=['SK_ID_CURR', 'TARGET'], errors='ignore')
preds = clf.predict_proba(test_features)

In [None]:
# Each row is [probability of class 0, probability of class 1].
preds

In [None]:
# For binary classification keep only the probability of class 1.
# NOTE: this overwrites preds with its second column, so re-running this cell
# without re-running the prediction cell first will fail on the 2-D indexing.
preds = preds[:, 1]

In [None]:
# Store the predicted probabilities on the test rows as TARGET and preview them.
app_test['TARGET'] = preds
app_test['TARGET'].head(10)

##### 코랩 버전은 google drive로 예측결과 csv 생성

In [None]:
import os, sys
from google.colab import drive

# Same imports and mount call as in the earlier setup cell, repeated here
# before the prediction CSV is written to Google Drive.
drive.mount('/content/gdrive')

In [None]:
# Write only the SK_ID_CURR and TARGET columns as a CSV file. On Colab the
# destination is given as an absolute Google Drive path.
submission_path = os.path.join(default_url, 'app_baseline_01.csv')
submission = app_test[['SK_ID_CURR', 'TARGET']]
submission.to_csv(submission_path, index=False)

In [None]:
# List the dataset directory on Google Drive (where the CSV above was written).
%ls "/content/gdrive/MyDrive/home-credit-default-risk"