# Pipeline - Loan Dataset

In [120]:
import pandas as pd
import numpy as np

# pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin

# preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# modeling
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier

In [90]:
# Print NumPy arrays in full instead of summarising long ones with "...".
np.set_printoptions(threshold=np.inf)

In [91]:
# Load loan-clean.csv (path is relative to the notebook's working directory)
# and preview the first rows.
df = pd.read_csv("loan-clean.csv")
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,128.0,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


## Target Encoding

In [92]:
# Encode the target as 1 ('Y') / 0 (anything else).
# The dtype guard makes this cell safe to re-run: once the column is numeric,
# comparing it with 'Y' again would silently overwrite every label with 0.
if not pd.api.types.is_numeric_dtype(df["Loan_Status"]):
    df["Loan_Status"] = np.where(df["Loan_Status"] == 'Y', 1, 0)
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,128.0,360.0,1.0,Urban,1
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,0
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,1
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,1
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,1


## Feature Selection

In [93]:
# Target vector: the encoded Loan_Status column.
y = df.loc[:, 'Loan_Status']
# Feature matrix: every column except the Loan_ID identifier and the target.
X = df.drop(['Loan_ID', 'Loan_Status'], axis="columns")

X.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,Male,No,0,Graduate,No,5849,0.0,128.0,360.0,1.0,Urban
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban


## Split Dataset

In [94]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

## Debug Object

In [113]:
class Debug(BaseEstimator, TransformerMixin):
    """Pass-through pipeline step that prints a random sample of the data it receives.

    Place it between two pipeline steps to inspect what the preceding step
    produced. The data is returned unchanged.

    Parameters
    ----------
    n_rows : int, default=10
        Maximum number of rows printed per call. Fewer rows are printed when
        the input is smaller than this.
    """

    def __init__(self, n_rows=10):
        # Stored as-is (no validation) so BaseEstimator.get_params/set_params work.
        self.n_rows = n_rows

    def _show(self, banner, data):
        """Print `banner`, a random sample of `data`, and a separator line."""
        frame = pd.DataFrame(data)
        # Cap the sample size: DataFrame.sample raises ValueError when asked for
        # more rows than exist (e.g. when predicting on a handful of rows).
        print(banner)
        print(frame.sample(min(self.n_rows, len(frame))))
        print("*************")

    def fit(self, X, y=None):
        """Print a sample of the training input; learns nothing.

        `y` is accepted for pipeline compatibility and ignored. It defaults to
        None so the step also works when fit is called without a target.

        Returns
        -------
        self
        """
        self._show("FITTING...", X)
        return self

    def transform(self, x):
        """Print a sample of `x` and return `x` unchanged."""
        self._show("TRANSFORMING...", x)
        return x

## Pipeline

In [114]:
# Categorical columns to one-hot encode.
columns_ohe = ["Gender", "Married", "Dependents",
               "Education", "Self_Employed", "Property_Area"]
# Numeric columns to standardise.
columns_scale = ["ApplicantIncome", "CoapplicantIncome",
                 "LoanAmount", "Loan_Amount_Term"]

# handle_unknown="ignore" encodes a category that was not seen during fit as
# all zeros instead of raising at transform/predict time (e.g. a rare level
# that only lands in the test split).
# Columns in neither list (e.g. Credit_History) pass through unchanged.
pipe_columns = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(handle_unknown="ignore"), columns_ohe),
                  ('scaler', StandardScaler(), columns_scale)],
    remainder="passthrough")

# Encoding-only variant: no scaling, every other column passes through unchanged.
pipe_ohe = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(handle_unknown="ignore"), columns_ohe)],
    remainder="passthrough")
pipe_columns

In [115]:
# MLP settings: a single hidden layer of 10 units, fixed seed, early stopping on.
hyperparams_nn = dict(
    random_state=42,
    max_iter=1000,
    hidden_layer_sizes=(10,),
    n_iter_no_change=200,
    early_stopping=True,
    verbose=True,
)

# Preprocess -> print a sample of the transformed data -> classify.
pipe_nn = Pipeline(steps=[
    ('preprocessor', pipe_columns),
    ('debug', Debug()),
    ('model', MLPClassifier(**hyperparams_nn)),
])
pipe_nn

In [116]:
# random_state fixes the forest's internal randomness so the fitted model and
# its score are the same on every run, matching the seeded split and MLP.
hyperparams_rf = {"n_estimators": 9,
                  "random_state": 42,
                  "n_jobs": -1,
                  "verbose": True}

# One-hot encode the categorical columns (no scaling), then fit the forest.
pipe_rf = Pipeline([('preprocessor', pipe_ohe),
                    ('model', RandomForestClassifier(**hyperparams_rf))])
pipe_rf


### Training

In [117]:
# Fit the preprocessing + MLP pipeline on the training split; the Debug step
# prints a sample of the transformed features along the way.
pipe_nn.fit(X_train, y_train)

FITTING...
      0    1    2    3    4    5    6    7    8    9    10   11   12   13  \
146  0.0  1.0  0.0  1.0  1.0  0.0  0.0  0.0  1.0  0.0  1.0  0.0  0.0  0.0   
198  0.0  1.0  0.0  1.0  0.0  1.0  0.0  0.0  1.0  0.0  1.0  0.0  0.0  0.0   
436  0.0  1.0  1.0  0.0  0.0  0.0  0.0  1.0  1.0  0.0  1.0  0.0  1.0  0.0   
225  0.0  1.0  1.0  0.0  1.0  0.0  0.0  0.0  0.0  1.0  1.0  0.0  1.0  0.0   
467  0.0  1.0  0.0  1.0  1.0  0.0  0.0  0.0  1.0  0.0  1.0  0.0  0.0  1.0   
69   1.0  0.0  0.0  1.0  0.0  1.0  0.0  0.0  1.0  0.0  0.0  1.0  0.0  0.0   
24   0.0  1.0  1.0  0.0  1.0  0.0  0.0  0.0  1.0  0.0  0.0  1.0  0.0  0.0   
60   0.0  1.0  0.0  1.0  1.0  0.0  0.0  0.0  1.0  0.0  1.0  0.0  1.0  0.0   
243  1.0  0.0  0.0  1.0  1.0  0.0  0.0  0.0  0.0  1.0  1.0  0.0  0.0  1.0   
190  0.0  1.0  0.0  1.0  0.0  1.0  0.0  0.0  1.0  0.0  1.0  0.0  0.0  0.0   

      14        15        16        17        18   19  
146  1.0 -0.621344  0.013392 -0.556964  0.287611  1.0  
198  1.0 -0.274620 -0.528127 

In [118]:
# Fit the one-hot-encoding + random forest pipeline on the training split.
pipe_rf.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   9 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:    0.0s finished


### Evaluation

In [119]:
# Score the MLP pipeline on the held-out test split; the Debug step prints a
# sample of the transformed test features before the score is returned.
pipe_nn.score(X_test, y_test)

TRANSFORMING...
      0    1    2    3    4    5    6    7    8    9    10   11   12   13  \
19   0.0  1.0  0.0  1.0  1.0  0.0  0.0  0.0  1.0  0.0  1.0  0.0  0.0  1.0   
7    0.0  1.0  0.0  1.0  1.0  0.0  0.0  0.0  1.0  0.0  0.0  1.0  0.0  0.0   
42   0.0  1.0  0.0  1.0  0.0  1.0  0.0  0.0  1.0  0.0  0.0  1.0  1.0  0.0   
104  0.0  1.0  0.0  1.0  0.0  1.0  0.0  0.0  1.0  0.0  1.0  0.0  0.0  0.0   
69   0.0  1.0  0.0  1.0  1.0  0.0  0.0  0.0  0.0  1.0  1.0  0.0  0.0  0.0   
84   0.0  1.0  0.0  1.0  1.0  0.0  0.0  0.0  1.0  0.0  1.0  0.0  0.0  0.0   
91   0.0  1.0  1.0  0.0  1.0  0.0  0.0  0.0  1.0  0.0  1.0  0.0  1.0  0.0   
14   0.0  1.0  0.0  1.0  0.0  0.0  1.0  0.0  1.0  0.0  0.0  1.0  0.0  0.0   
67   0.0  1.0  0.0  1.0  0.0  1.0  0.0  0.0  1.0  0.0  0.0  1.0  0.0  1.0   
55   0.0  1.0  0.0  1.0  0.0  0.0  1.0  0.0  0.0  1.0  1.0  0.0  1.0  0.0   

      14        15        16        17        18   19  
19   0.0 -0.152296 -0.528127 -0.869951 -2.403945  1.0  
7    1.0 -0.422252 -0.52

0.7886178861788617

In [81]:
# Score the random forest pipeline on the held-out test split.
pipe_rf.score(X_test, y_test)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   4 out of   9 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=8)]: Done   9 out of   9 | elapsed:    0.0s finished


0.7398373983739838