In [58]:
# Import 需要的套件
import os
import numpy as np 
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

### 之前做過的處理

In [59]:
# 設定 data_path
dir_data = './data/'
f_app_train = os.path.join(dir_data, 'application_train.csv')
f_app_test = os.path.join(dir_data, 'application_test.csv')

app_train = pd.read_csv(f_app_train)
app_test = pd.read_csv(f_app_test)

from sklearn.preprocessing import LabelEncoder

# Create a label encoder object
le = LabelEncoder()
le_count = 0

# Iterate through the columns
for col in app_train:
    if app_train[col].dtype == 'object':
        # If 2 or fewer unique categories
        if len(list(app_train[col].unique())) <= 2:
            # Train on the training data
            le.fit(app_train[col])
            # Transform both training and testing data
            app_train[col] = le.transform(app_train[col])
            app_test[col] = le.transform(app_test[col])
            
            # Keep track of how many columns were label encoded
            le_count += 1
            
app_train = pd.get_dummies(app_train)
app_test = pd.get_dummies(app_test)

# Create an anomalous flag column
app_train['DAYS_EMPLOYED_ANOM'] = app_train["DAYS_EMPLOYED"] == 365243
app_train['DAYS_EMPLOYED'].replace({365243: np.nan}, inplace = True)
# also apply to testing dataset
app_test['DAYS_EMPLOYED_ANOM'] = app_test["DAYS_EMPLOYED"] == 365243
app_test["DAYS_EMPLOYED"].replace({365243: np.nan}, inplace = True)

# absolute the value of DAYS_BIRTH
app_train['DAYS_BIRTH'] = abs(app_train['DAYS_BIRTH'])
app_test['DAYS_BIRTH'] = abs(app_test['DAYS_BIRTH'])


In [35]:
app_train.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_No,EMERGENCYSTATE_MODE_Yes,DAYS_EMPLOYED_ANOM
0,100002,1,0,0,1,0,202500.0,406597.5,24700.5,351000.0,...,0,0,0,0,0,1,0,1,0,False
1,100003,0,0,0,0,0,270000.0,1293502.5,35698.5,1129500.0,...,1,0,0,0,0,0,0,1,0,False
2,100004,0,1,1,1,0,67500.0,135000.0,6750.0,135000.0,...,0,0,0,0,0,0,0,0,0,False
3,100006,0,0,0,1,0,135000.0,312682.5,29686.5,297000.0,...,0,0,0,0,0,0,0,0,0,False
4,100007,0,0,0,1,0,121500.0,513000.0,21865.5,513000.0,...,0,0,0,0,0,0,0,0,0,False


### 做好前處理
開始擬合模型之前，我們要確保 training & testing data 的欄位數量一致，原因是因為 One hot encoding 會製造多的欄位，有些類別出現在 training data 而沒有出現 testing data 中，我們就要把這些多餘的欄位去除

In [60]:
train_labels = app_train['TARGET']

# Align the training and testing data, keep only columns present in both dataframes
app_train, app_test = app_train.align(app_test, join = 'inner', axis = 1)

# Add the target back in
app_train['TARGET'] = train_labels



Imputer(axis=0, copy=True, missing_values='NaN', strategy='median', verbose=0)

## Imputer 說明範例 (fit & transform)
使用數組X去“訓練”一個Imputer類，
然後用該類的對象去處理數組Y中的缺失值  
缺失值的處理方式是使用X中的均值（axis=0表示按列進行）代替Y中的缺失值。  
當然也可以使用imp對象來對X數組本身進行處理

In [57]:
train_X = np.array([[1, 2], [np.nan, 3], [7, 6]])
imp = Imputer(missing_values=np.nan , strategy='mean', axis=0)
print(imp.fit(train_X))

test_X = np.array([[np.nan, 2], [6, np.nan], [7, 6]])
print(train_X)
print(test_X)

imp.transform(test_X)
# imp.transform(train_X)

Imputer(axis=0, copy=True, missing_values=nan, strategy='mean', verbose=0)
[[ 1.  2.]
 [nan  3.]
 [ 7.  6.]]
[[nan  2.]
 [ 6. nan]
 [ 7.  6.]]




array([[4.        , 2.        ],
       [6.        , 3.66666667],
       [7.        , 6.        ]])

## MinMaxScaler 說明範例 (fit & transform)
轉換過程: (max - min) / (X.max(axis=0) - X.min(axis=0))

In [84]:
data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]
scaler = MinMaxScaler()
print("",data[0],"\n",data[1], "\n",data[2],"\n",data[3])
print("")
print(scaler.fit(data))
MinMaxScaler(copy=True, feature_range=(0, 1))
print(scaler.data_max_)
print("")
print(scaler.transform(data))
print("")
print(scaler.transform([[2, 2]]))


 [-1, 2] 
 [-0.5, 6] 
 [0, 10] 
 [1, 18]

MinMaxScaler(copy=True, feature_range=(0, 1))
[ 1. 18.]

[[0.   0.  ]
 [0.25 0.25]
 [0.5  0.5 ]
 [1.   1.  ]]

[[1.5 0. ]]


## 先處理缺失值(Imputer)
## 再處理Scale(MinMaxScaler)

In [61]:
from sklearn.preprocessing import MinMaxScaler, Imputer

# Drop the target from the training data
if 'TARGET' in app_train:
    train = app_train.drop(labels = ['TARGET'], axis=1)
else:
    train = app_train.copy()
    
# Feature names
features = list(train.columns)

# Copy of the testing data
test = app_test.copy()

# Median imputation of missing values
imputer = Imputer(strategy = 'median')

# Scale each feature to 0-1
scaler = MinMaxScaler(feature_range = (0, 1))

# Fit on the training data
imputer.fit(train)



Imputer(axis=0, copy=True, missing_values='NaN', strategy='median', verbose=0)

In [62]:
# Transform both training and testing data
train = imputer.transform(train)
test = imputer.transform(app_test)

# Repeat with the scaler
scaler.fit(train)
train = scaler.transform(train)
test = scaler.transform(test)

print('Training data shape: ', train.shape)
print('Testing data shape: ', test.shape)

Training data shape:  (307511, 240)
Testing data shape:  (48744, 240)


### Fit the model
參數 C 是 Logistic Regression 中作 regularization 的參數，也可以稱為懲罰項。  
Regularization（正則化）的目的是為了不要讓模型變得太負責而導致 Overfitting（過擬合）的問題，所以加這個變數使模型會被限縮在某個維度。  
Logistic Reg. 的 C 與其他模型 (例如 :SVM) 的 C 概念類似
都是用來調整正規化程度用的, 數字接近0, 正規化成度越高(越接近線性迴歸)  
http://arbu00.blogspot.com/2017/02/5-logistic-regressionoverfitting-and.html  
https://murphymind.blogspot.com/2017/05/machine.learning.regularization.html  
https://blog.csdn.net/jark_/article/details/78342644

In [85]:
from sklearn.linear_model import LogisticRegression

# Make the model with the specified regularization parameter
log_reg = LogisticRegression(C = 0.0001)

# Train on the training data
log_reg.fit(train, train_labels)



LogisticRegression(C=0.0001, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='warn', n_jobs=None, penalty='l2', random_state=None,
          solver='warn', tol=0.0001, verbose=0, warm_start=False)

In [98]:
log_reg.predict_proba(test)

array([[0.93494885, 0.06505115],
       [0.87359914, 0.12640086],
       [0.91876117, 0.08123883],
       ...,
       [0.94297619, 0.05702381],
       [0.92586477, 0.07413523],
       [0.91016771, 0.08983229]])

In [97]:
log_reg.predict_proba(train)

array([[0.85203243, 0.14796757],
       [0.94467049, 0.05532951],
       [0.90635339, 0.09364661],
       ...,
       [0.92758858, 0.07241142],
       [0.92055768, 0.07944232],
       [0.94014869, 0.05985131]])

模型 fit 好以後，就可以用來預測 testing data 中的客戶違約遲繳貸款的機率咯! (記得要用 predict_proba 才會輸出機率)

In [100]:
# Make predictions
# Make sure to select the second column only
# log_reg.predict_proba(test)，分別計算違約率是yes的機率、是no的機率
# [No(0), Yes(1)]
log_reg_pred = log_reg.predict_proba(test)[:, 1]
log_reg_pred

array([0.06505115, 0.12640086, 0.08123883, ..., 0.05702381, 0.07413523,
       0.08983229])

### 儲存預測結果

In [103]:
# Submission dataframe
submit = app_test[['SK_ID_CURR']]
submit['TARGET'] = log_reg_pred

submit.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.065051
1,100005,0.126401
2,100013,0.081239
3,100028,0.061509
4,100038,0.128308


In [109]:
submit.set_index(['SK_ID_CURR'],inplace=True)

## 練習時間
將你的結果存成 csv, 上傳你的第一份 Kaggle 成績

Hints: https://stackoverflow.com/questions/16923281/pandas-writing-dataframe-to-csv-file

In [110]:
submit.to_csv('kgfish7105_out2.csv')
# submit.to_csv('test.csv', sep='\t')