# [作業目標]
- 使用 Day 17 剛學到的方法, 對較完整的資料生成離散化特徵
- 觀察上述離散化特徵, 對於目標值的預測有沒有幫助

# [作業重點]
- 仿照 Day 17 的語法, 將年齡資料 ('DAYS_BIRTH' 除以 365) 離散化
- 繪製上述的 "離散化標籤" 與目標值 ('TARGET') 的長條圖

In [1]:
# 載入需要的套件
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# 設定 data_path
dir_data = './data/'

### 之前做過的處理

In [2]:
# 讀取資料檔
f_app_train = os.path.join(dir_data, 'application_train.csv')
app_train = pd.read_csv(f_app_train).copy()
app_train.shape

(307511, 122)

In [3]:
# 將只有兩種值的類別型欄位, 做 Label Encoder, 計算相關係數時讓這些欄位可以被包含在內
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# 檢查每一個 column
for col in app_train:
    if app_train[col].dtype == 'object':
        # 如果只有兩種值的類別型欄位
        if len(list(app_train[col].unique())) <= 2:
            # 就做 Label Encoder, 以加入相關係數檢查
            app_train[col] = le.fit_transform(app_train[col])            
print(app_train.shape)
app_train.head()

(307511, 122)


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,0,M,0,1,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,0,F,0,0,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,1,M,1,1,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,0,F,0,1,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,0,M,0,1,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
app_train['DAYS_EMPLOYED'].value_counts()

 365243    55374
-200         156
-224         152
-199         151
-230         151
           ...  
-11115         1
-11371         1
-11883         1
-13536         1
-8795          1
Name: DAYS_EMPLOYED, Length: 12574, dtype: int64

In [5]:
# 受雇日數為異常值的資料, 另外設一個欄位記錄, 並將異常的日數轉成空值 (np.nan)
app_train['DAYS_EMPLOYED_ANOM'] = app_train["DAYS_EMPLOYED"] == 365243
app_train['DAYS_EMPLOYED'].replace({365243: np.nan}, inplace = True)

# 出生日數 (DAYS_BIRTH) 取絕對值 
app_train['DAYS_BIRTH'] = abs(app_train['DAYS_BIRTH'])

## 練習時間
參考 Day 17 範例程式，離散化你覺得有興趣的欄位，並嘗試找出有趣的訊息

In [6]:
top_5 = app_train.corr()['TARGET'].nlargest(n=5).index
top_5

Index(['TARGET', 'DAYS_EMPLOYED', 'REGION_RATING_CLIENT_W_CITY',
       'REGION_RATING_CLIENT', 'DAYS_LAST_PHONE_CHANGE'],
      dtype='object')

In [7]:
small_5 = app_train.corr()['TARGET'].nsmallest(n=5).index
small_5

Index(['EXT_SOURCE_3', 'EXT_SOURCE_2', 'EXT_SOURCE_1', 'DAYS_BIRTH',
       'DAYS_EMPLOYED_ANOM'],
      dtype='object')

In [8]:
app_top5 = app_train[top_5].copy()
app_small5 = app_train[small_5].copy()

In [9]:
app_top5.isnull().sum()

TARGET                             0
DAYS_EMPLOYED                  55374
REGION_RATING_CLIENT_W_CITY        0
REGION_RATING_CLIENT               0
DAYS_LAST_PHONE_CHANGE             1
dtype: int64

In [10]:
app_top5['DAYS_EMPLOYED'].fillna(app_top5['DAYS_EMPLOYED'].mean(), inplace=True)

In [11]:
app_top5.isnull().sum()

TARGET                         0
DAYS_EMPLOYED                  0
REGION_RATING_CLIENT_W_CITY    0
REGION_RATING_CLIENT           0
DAYS_LAST_PHONE_CHANGE         1
dtype: int64

In [12]:
# small
app_small5.isnull().sum()

EXT_SOURCE_3           60965
EXT_SOURCE_2             660
EXT_SOURCE_1          173378
DAYS_BIRTH                 0
DAYS_EMPLOYED_ANOM         0
dtype: int64

In [13]:
app_small5['EXT_SOURCE_3'].fillna(value=app_small5['EXT_SOURCE_3'].mean(),inplace=True)
app_small5['EXT_SOURCE_2'].fillna(value=app_small5['EXT_SOURCE_2'].mean(),inplace=True)
app_small5['EXT_SOURCE_1'].fillna(value=app_small5['EXT_SOURCE_1'].mean(),inplace=True)

In [14]:
# small
app_small5.isnull().sum()

EXT_SOURCE_3          0
EXT_SOURCE_2          0
EXT_SOURCE_1          0
DAYS_BIRTH            0
DAYS_EMPLOYED_ANOM    0
dtype: int64

In [15]:
app_small5 = app_small5.drop('DAYS_EMPLOYED_ANOM', axis=1)

In [16]:
app_top5

Unnamed: 0,TARGET,DAYS_EMPLOYED,REGION_RATING_CLIENT_W_CITY,REGION_RATING_CLIENT,DAYS_LAST_PHONE_CHANGE
0,1,-637.000000,2,2,-1134.0
1,0,-1188.000000,1,1,-828.0
2,0,-225.000000,2,2,-815.0
3,0,-3039.000000,2,2,-617.0
4,0,-3038.000000,2,2,-1106.0
...,...,...,...,...,...
307506,0,-236.000000,1,1,-273.0
307507,0,-2384.169325,2,2,0.0
307508,0,-7921.000000,3,3,-1909.0
307509,1,-4786.000000,2,2,-322.0


In [17]:
app_small5

Unnamed: 0,EXT_SOURCE_3,EXT_SOURCE_2,EXT_SOURCE_1,DAYS_BIRTH
0,0.139376,0.262949,0.083037,9461
1,0.510853,0.622246,0.311267,16765
2,0.729567,0.555912,0.502130,19046
3,0.510853,0.650442,0.502130,19005
4,0.510853,0.322738,0.502130,19932
...,...,...,...,...
307506,0.510853,0.681632,0.145570,9327
307507,0.510853,0.115992,0.502130,20775
307508,0.218859,0.535722,0.744026,14966
307509,0.661024,0.514163,0.502130,11961


In [18]:
df = pd.concat([app_top5, app_small5], axis=1)
df

Unnamed: 0,TARGET,DAYS_EMPLOYED,REGION_RATING_CLIENT_W_CITY,REGION_RATING_CLIENT,DAYS_LAST_PHONE_CHANGE,EXT_SOURCE_3,EXT_SOURCE_2,EXT_SOURCE_1,DAYS_BIRTH
0,1,-637.000000,2,2,-1134.0,0.139376,0.262949,0.083037,9461
1,0,-1188.000000,1,1,-828.0,0.510853,0.622246,0.311267,16765
2,0,-225.000000,2,2,-815.0,0.729567,0.555912,0.502130,19046
3,0,-3039.000000,2,2,-617.0,0.510853,0.650442,0.502130,19005
4,0,-3038.000000,2,2,-1106.0,0.510853,0.322738,0.502130,19932
...,...,...,...,...,...,...,...,...,...
307506,0,-236.000000,1,1,-273.0,0.510853,0.681632,0.145570,9327
307507,0,-2384.169325,2,2,0.0,0.510853,0.115992,0.502130,20775
307508,0,-7921.000000,3,3,-1909.0,0.218859,0.535722,0.744026,14966
307509,1,-4786.000000,2,2,-322.0,0.661024,0.514163,0.502130,11961


In [19]:
df.isnull().sum()

TARGET                         0
DAYS_EMPLOYED                  0
REGION_RATING_CLIENT_W_CITY    0
REGION_RATING_CLIENT           0
DAYS_LAST_PHONE_CHANGE         1
EXT_SOURCE_3                   0
EXT_SOURCE_2                   0
EXT_SOURCE_1                   0
DAYS_BIRTH                     0
dtype: int64

In [20]:
df = df.dropna()
df.isnull().sum()

TARGET                         0
DAYS_EMPLOYED                  0
REGION_RATING_CLIENT_W_CITY    0
REGION_RATING_CLIENT           0
DAYS_LAST_PHONE_CHANGE         0
EXT_SOURCE_3                   0
EXT_SOURCE_2                   0
EXT_SOURCE_1                   0
DAYS_BIRTH                     0
dtype: int64

In [21]:
train_Y = df['TARGET']
train_X = df.drop('TARGET', axis=1)

In [22]:
train_X

Unnamed: 0,DAYS_EMPLOYED,REGION_RATING_CLIENT_W_CITY,REGION_RATING_CLIENT,DAYS_LAST_PHONE_CHANGE,EXT_SOURCE_3,EXT_SOURCE_2,EXT_SOURCE_1,DAYS_BIRTH
0,-637.000000,2,2,-1134.0,0.139376,0.262949,0.083037,9461
1,-1188.000000,1,1,-828.0,0.510853,0.622246,0.311267,16765
2,-225.000000,2,2,-815.0,0.729567,0.555912,0.502130,19046
3,-3039.000000,2,2,-617.0,0.510853,0.650442,0.502130,19005
4,-3038.000000,2,2,-1106.0,0.510853,0.322738,0.502130,19932
...,...,...,...,...,...,...,...,...
307506,-236.000000,1,1,-273.0,0.510853,0.681632,0.145570,9327
307507,-2384.169325,2,2,0.0,0.510853,0.115992,0.502130,20775
307508,-7921.000000,3,3,-1909.0,0.218859,0.535722,0.744026,14966
307509,-4786.000000,2,2,-322.0,0.661024,0.514163,0.502130,11961


In [23]:
train_Y

0         1
1         0
2         0
3         0
4         0
         ..
307506    0
307507    0
307508    0
307509    1
307510    0
Name: TARGET, Length: 307510, dtype: int64

In [24]:
# this is the result before pd.cut
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
train_X = StandardScaler().fit_transform(train_X)
estimator = LogisticRegression()
cross_val_score(estimator, train_X, train_Y, cv=5).mean()

0.9191473448017952

In [25]:
train_X1 = df.drop('TARGET', axis=1)

In [26]:
train_X1

Unnamed: 0,DAYS_EMPLOYED,REGION_RATING_CLIENT_W_CITY,REGION_RATING_CLIENT,DAYS_LAST_PHONE_CHANGE,EXT_SOURCE_3,EXT_SOURCE_2,EXT_SOURCE_1,DAYS_BIRTH
0,-637.000000,2,2,-1134.0,0.139376,0.262949,0.083037,9461
1,-1188.000000,1,1,-828.0,0.510853,0.622246,0.311267,16765
2,-225.000000,2,2,-815.0,0.729567,0.555912,0.502130,19046
3,-3039.000000,2,2,-617.0,0.510853,0.650442,0.502130,19005
4,-3038.000000,2,2,-1106.0,0.510853,0.322738,0.502130,19932
...,...,...,...,...,...,...,...,...
307506,-236.000000,1,1,-273.0,0.510853,0.681632,0.145570,9327
307507,-2384.169325,2,2,0.0,0.510853,0.115992,0.502130,20775
307508,-7921.000000,3,3,-1909.0,0.218859,0.535722,0.744026,14966
307509,-4786.000000,2,2,-322.0,0.661024,0.514163,0.502130,11961


In [27]:
train_X1['DAYS_EMPLOYED'], year_bins = pd.cut(abs(train_X1['DAYS_EMPLOYED'])/365,4,retbins=True)

In [28]:
print(year_bins)

[-0.04907397 12.26849315 24.5369863  36.80547945 49.0739726 ]


In [29]:
train_X1['DAYS_BIRTH'], birth_bins = pd.cut(abs(train_X1['DAYS_BIRTH'])/365, 4, retbins=True)

In [30]:
print(birth_bins)

[20.46920548 32.66849315 44.81917808 56.96986301 69.12054795]


In [31]:
train_X1['DAYS_LAST_PHONE_CHANGE'], phone_bins = pd.cut(abs(train_X1['DAYS_LAST_PHONE_CHANGE'])/365, 4, retbins=True)

In [32]:
print(phone_bins)

[-0.0117589   2.93972603  5.87945205  8.81917808 11.75890411]


In [33]:
em_dum = pd.get_dummies(train_X1['DAYS_EMPLOYED'],drop_first=True)
birth_dum = pd.get_dummies(train_X1['DAYS_BIRTH'],drop_first=True)
phone_dum = pd.get_dummies(train_X1['DAYS_LAST_PHONE_CHANGE'],drop_first=True)

train_X1 = pd.concat([train_X1, em_dum, birth_dum, phone_dum], axis=1)

In [34]:
# drop the old data
train_X1 = train_X1.drop(['DAYS_EMPLOYED','DAYS_BIRTH','DAYS_LAST_PHONE_CHANGE'], axis=1)

In [35]:
train_X1

Unnamed: 0,REGION_RATING_CLIENT_W_CITY,REGION_RATING_CLIENT,EXT_SOURCE_3,EXT_SOURCE_2,EXT_SOURCE_1,"(12.268, 24.537]","(24.537, 36.805]","(36.805, 49.074]","(32.668, 44.819]","(44.819, 56.97]","(56.97, 69.121]","(2.94, 5.879]","(5.879, 8.819]","(8.819, 11.759]"
0,2,2,0.139376,0.262949,0.083037,0,0,0,0,0,0,1,0,0
1,1,1,0.510853,0.622246,0.311267,0,0,0,0,1,0,0,0,0
2,2,2,0.729567,0.555912,0.502130,0,0,0,0,1,0,0,0,0
3,2,2,0.510853,0.650442,0.502130,0,0,0,0,1,0,0,0,0
4,2,2,0.510853,0.322738,0.502130,0,0,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307506,1,1,0.510853,0.681632,0.145570,0,0,0,0,0,0,0,0,0
307507,2,2,0.510853,0.115992,0.502130,0,0,0,0,1,0,0,0,0
307508,3,3,0.218859,0.535722,0.744026,1,0,0,1,0,0,1,0,0
307509,2,2,0.661024,0.514163,0.502130,1,0,0,1,0,0,0,0,0


In [41]:
# this is the result before pd.cut
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
train_X1 = StandardScaler().fit_transform(train_X1)
estimator = LogisticRegression()
cross_val_score(estimator, train_X1, train_Y, cv=5).mean()

0.91924815453156