# Feature Engineering and Selection

In [53]:
import pandas as pd
import numpy as np
import joblib
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

Based on the previous parts, we will build two models, distinguished by whether or not bureau data is available. We need to balance the classes, and we will use the following features:
- AGE (binned)
- INCOME (binned)
- GENDER
- EDUCATION
- LOAN_PURPOSE
- HAS_APPLIED_BEFORE
- HAS_INCOME_VERIFICATION
- Bureau data ("LOANS_WITHOUT_DELAYS", "LOANS_WITH_DELAYS")

## Load Data

In [9]:
# Load the raw data set saved under output/.
# joblib files are pickles: only load files from a trusted source.
data = joblib.load('output/data.pkl')

## Split with or without bureau

* Because only a few rows lack bureau data, the model without bureau data uses all rows, with the bureau columns dropped.
* To overcome the class imbalance, we draw one balanced data set from the raw data and then split it into training, validation and test sets.
* We keep every row of the smaller class and randomly down-sample the larger class to the same number of rows.

In [15]:
def split_data_bureau(dataset, random_state=123):
    """Build two class-balanced frames: one with bureau columns, one without.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain ``LOAN_WAS_PAID_BACK`` (0/1), ``LOANS_WITHOUT_DELAYS``
        and ``LOANS_WITH_DELAYS``. It is not modified.
    random_state : int or None, default 123
        Seed for the down-sampling so the balanced sets are reproducible.
        Pass ``None`` for a different draw on every call.

    Returns
    -------
    (df_with_bureau, df_without_bureau) : tuple of pandas.DataFrame
        ``df_with_bureau`` keeps only rows where both bureau columns are
        present. ``df_without_bureau`` uses every row but drops the two
        bureau columns. Both have a fresh 0..n-1 index and equal counts of
        label 0 and label 1.
    """
    bureau_cols = ["LOANS_WITHOUT_DELAYS", "LOANS_WITH_DELAYS"]
    target = "LOAN_WAS_PAID_BACK"

    def _balance(frame):
        # Keep the smaller class whole and down-sample the larger one.
        unpaid = frame.loc[frame[target] == 0, :]
        paid = frame.loc[frame[target] == 1, :]
        n_rows = min(len(unpaid), len(paid))
        paid = paid.sample(n=n_rows, random_state=random_state)
        if len(unpaid) > n_rows:
            unpaid = unpaid.sample(n=n_rows, random_state=random_state)
        return pd.concat([unpaid, paid], ignore_index=True)

    # Work on the argument (not a notebook-level global) and leave it intact.
    df = dataset.copy()

    # data with Bureau: rows where both bureau columns are filled in
    df_with_bureau = _balance(df.dropna(subset=bureau_cols))

    # data without Bureau: all rows, bureau columns removed
    df_without_bureau = _balance(df.drop(columns=bureau_cols))

    return df_with_bureau, df_without_bureau

In [18]:
# Build the two balanced data sets: one restricted to rows with bureau data,
# one using all rows with the bureau columns removed.
df_with_bureau, df_without_bureau = split_data_bureau(data)

In [19]:
# Sanity check: both labels should have the same number of rows after balancing.
df_with_bureau.LOAN_WAS_PAID_BACK.value_counts()

0    761
1    761
Name: LOAN_WAS_PAID_BACK, dtype: int64

In [20]:
# Sanity check: both labels should have the same number of rows after balancing.
df_without_bureau.LOAN_WAS_PAID_BACK.value_counts()

0    963
1    963
Name: LOAN_WAS_PAID_BACK, dtype: int64

## Split Data into Train, Validation, and Test

In [61]:
def split_input_output(dataset,
                       target_column,
                       save_file=True,
                       return_file=True,
                       data_bureau=False):
    """Separate a frame into its target column and the remaining features.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding both the features and the target.
    target_column : str
        Name of the column to split off as the output.
    save_file : bool, default True
        When true, dump both parts under ``output/`` with joblib.
    return_file : bool, default True
        When true, return the two parts; otherwise return ``None``.
    data_bureau : bool, default False
        Selects the ``with_bureau`` / ``without_bureau`` file names.

    Returns
    -------
    (output_df, input_df) or None
        Note the order: the target comes first, the features second.
    """
    target = dataset[target_column]
    features = dataset.drop(columns=[target_column])

    if save_file and data_bureau:
        joblib.dump(target, "output/output_df_with_bureau.pkl")
        joblib.dump(features, "output/input_df_with_bureau.pkl")
    elif save_file:
        joblib.dump(target, "output/output_df_without_bureau.pkl")
        joblib.dump(features, "output/input_df_without_bureau.pkl")

    if not return_file:
        return None
    return target, features
    
def split_train_test(x, y, TEST_SIZE, stratify=True):
    """Split features and labels into two parts with a fixed seed.

    Parameters
    ----------
    x : array-like / pandas.DataFrame
        Features.
    y : array-like / pandas.Series
        Labels, aligned with ``x``.
    TEST_SIZE : float or int
        Forwarded to ``train_test_split`` as ``test_size``.
    stratify : bool, default True
        When true, split with ``stratify=y`` so both parts keep the label
        proportions of ``y``. This is a classification task, so stratifying
        is the default. Pass ``False`` for a purely random split.

    Returns
    -------
    x_train, x_test, y_train, y_test
    """
    x_train, x_test,\
        y_train, y_test = train_test_split(x,
                                           y,
                                           test_size=TEST_SIZE,
                                           random_state=123,
                                           stratify=y if stratify else None)

    return x_train, x_test, y_train, y_test

def split_data(data_input,
               data_ouput,
               data_bureau=False,
               return_file=False,
               TEST_SIZE=0.3):
    """Split features/labels into train, validation and test sets and save them.

    The data is split twice with the same ``TEST_SIZE``: first to hold out the
    test set, then again on the remainder to hold out the validation set.
    All six parts are always dumped under ``output/`` with joblib;
    ``data_bureau`` selects the ``with_bureau`` / ``without_bureau`` file names.

    Parameters
    ----------
    data_input : features.
    data_ouput : labels aligned with ``data_input``.
    data_bureau : bool, default False
    return_file : bool, default False
        When true, return the six parts; otherwise return ``None``.
    TEST_SIZE : float, default 0.3

    Returns
    -------
    (x_train, y_train, x_valid, y_valid, x_test, y_test) or None
    """
    # First cut: hold out the test set.
    x_rest, x_test, y_rest, y_test = split_train_test(data_input,
                                                      data_ouput,
                                                      TEST_SIZE)
    # Second cut: hold out the validation set from what is left.
    x_train, x_valid, y_train, y_valid = split_train_test(x_rest,
                                                          y_rest,
                                                          TEST_SIZE)

    if data_bureau:
        artifacts = (
            (x_train, "output/x_train_with_bureau.pkl"),
            (y_train, "output/y_train_with_bureau.pkl"),
            (x_valid, "output/x_valid_with_bureau.pkl"),
            (y_valid, "output/y_valid_with_bureau.pkl"),
            (x_test, "output/x_test_with_bureau.pkl"),
            (y_test, "output/y_test_with_bureau.pkl"),
        )
    else:
        artifacts = (
            (x_train, "output/x_train_without_bureau.pkl"),
            (y_train, "output/y_train_without_bureau.pkl"),
            (x_valid, "output/x_valid_without_bureau.pkl"),
            (y_valid, "output/y_valid_without_bureau.pkl"),
            (x_test, "output/x_test_without_bureau.pkl"),
            (y_test, "output/y_test_without_bureau.pkl"),
        )
    for payload, path in artifacts:
        joblib.dump(payload, path)

    if not return_file:
        return None
    return (x_train, y_train,
            x_valid, y_valid,
            x_test, y_test)

In [103]:
TARGET = "LOAN_WAS_PAID_BACK"

# --- data with bureau: separate target/features, then train/valid/test ---
output_df_with_bureau, input_df_with_bureau = split_input_output(
    df_with_bureau,
    TARGET,
    save_file=False,
    data_bureau=True,
)

(x_train_with_bureau, y_train_with_bureau,
 x_valid_with_bureau, y_valid_with_bureau,
 x_test_with_bureau, y_test_with_bureau) = split_data(
    input_df_with_bureau,
    output_df_with_bureau,
    data_bureau=True,
    return_file=True,
)

# --- data without bureau: same two steps ---
output_df_without_bureau, input_df_without_bureau = split_input_output(
    df_without_bureau,
    TARGET,
    save_file=False,
    data_bureau=False,
)

(x_train_without_bureau, y_train_without_bureau,
 x_valid_without_bureau, y_valid_without_bureau,
 x_test_without_bureau, y_test_without_bureau) = split_data(
    input_df_without_bureau,
    output_df_without_bureau,
    data_bureau=False,
    return_file=True,
)

In [67]:
def binning_features(df):
    """Replace AGE and INCOME with ordinal bin codes, in place.

    Bins are right-closed. AGE uses the edges 20/34/40/47 (codes 0-4) and
    INCOME uses the edges 4e6/5e6/6e6/8e6/11e6 (codes 0-5). Missing values
    match no bin and are left untouched.

    The frame passed in is modified and also returned. Do not call this twice
    on the same frame: the codes themselves fall into the lowest bin, so a
    second pass would turn every value into 0.
    """
    bin_edges = (
        ("AGE", (20, 34, 40, 47)),
        ("INCOME", (4e6, 5e6, 6e6, 8e6, 11e6)),
    )

    for column, edges in bin_edges:
        # lowest bin: everything up to and including the first edge
        df.loc[df[column] <= edges[0], column] = 0

        # interior bins: (lower, upper] for each consecutive pair of edges
        for code, (lower, upper) in enumerate(zip(edges, edges[1:]), start=1):
            in_bin = (df[column] > lower) & (df[column] <= upper)
            df.loc[in_bin, column] = code

        # highest bin: everything above the last edge
        df.loc[df[column] > edges[-1], column] = len(edges)

    return df

In [68]:
def one_hot_encoder(x_cat):
    """One-hot encode every column of a categorical frame.

    A new ``OneHotEncoder`` is fitted on ``x_cat`` itself, so the resulting
    columns reflect only the categories present in ``x_cat``.

    Parameters
    ----------
    x_cat : pandas.DataFrame
        Categorical columns to encode.

    Returns
    -------
    pandas.DataFrame
        Dense 0/1 indicator columns named ``<column>_<category>``, carrying
        the same index as ``x_cat``.
    """
    # scikit-learn renamed `sparse` to `sparse_output`; support both spellings.
    try:
        encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    except TypeError:
        encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

    encoded = encoder.fit_transform(x_cat)

    # `get_feature_names` was replaced by `get_feature_names_out`.
    if hasattr(encoder, "get_feature_names_out"):
        feat_names = encoder.get_feature_names_out(x_cat.columns)
    else:
        feat_names = encoder.get_feature_names(x_cat.columns)

    return pd.DataFrame(encoded, index=x_cat.index, columns=feat_names)

In [117]:
#compile all engineering
def feature_engineering(df_raw, df):
    """Turn one raw feature split into a model-ready numeric frame.

    Steps: drop unused columns, drop rows with missing INCOME or LOAN_PURPOSE,
    bin AGE and INCOME, map the binary text columns to 0/1, and one-hot encode
    EDUCATION and LOAN_PURPOSE.

    The one-hot columns are laid out in the order obtained by encoding
    ``df_raw``, and categories absent from ``df`` are added as all-zero
    columns. Every split engineered against the same ``df_raw`` therefore gets
    the same columns in the same order.

    Parameters
    ----------
    df_raw : pandas.DataFrame
        Reference data whose EDUCATION / LOAN_PURPOSE values define the full
        set of one-hot columns.
    df : pandas.DataFrame
        The split to transform. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Engineered features, indexed like the surviving rows of ``df``.

    Notes
    -----
    Rows can be dropped here, so the result may be shorter than ``df``.
    NOTE(review): callers presumably need to align their labels to the
    returned index (e.g. ``y.loc[result.index]``) - confirm downstream.
    """
    categorical_cols = ['EDUCATION', 'LOAN_PURPOSE']
    yes_no = {'Yes': 1, 'No': 0}

    #drop some features
    df = df.drop(["APPLICATION_RECEIVE_TIME", "KNOWN_ASSETS"], axis=1)
    #dropna for Income and Loan Purpose; copy so the assignments below act on
    #an independent frame rather than a slice of the caller's data
    df = df.dropna(subset = ['INCOME', 'LOAN_PURPOSE']).copy()

    #binning
    df = binning_features(df)

    #converting categorical Features
    df['GENDER'] = df['GENDER'].map( {'Male': 1, 'Female': 0} ).astype(int)
    df['HAS_APPLIED_BEFORE'] = df['HAS_APPLIED_BEFORE'].map(yes_no).astype(int)
    df['HAS_INCOME_VERIFICATION'] = df['HAS_INCOME_VERIFICATION'].map(yes_no).astype(int)

    #one hot encoding
    df_num = df.drop(categorical_cols, axis=1)
    df_hot = one_hot_encoder(df[categorical_cols])
    df_ready = df_num.join(df_hot)

    #reference one-hot columns taken from the raw data
    df_raw = df_raw.dropna(subset = categorical_cols)
    df_hot_cek = one_hot_encoder(df_raw[categorical_cols])
    reference_cols = list(df_hot_cek.columns)

    #report which reference columns this split is missing
    mis_col = set(reference_cols).difference(set(df_hot.columns))
    print(mis_col)

    #add the missing columns as zeros and fix the column order so that it
    #does not depend on which categories happen to occur in this split
    extra_cols = [c for c in df_hot.columns if c not in set(reference_cols)]
    ordered_cols = list(df_num.columns) + reference_cols + extra_cols
    df_ready = df_ready.reindex(columns=ordered_cols, fill_value=0)

    return df_ready

In [118]:
# Engineer each with-bureau split against the full raw data.
# Each call prints the one-hot columns that were absent from that split
# and therefore filled with zeros.
x_train_with_bureau_ready = feature_engineering(data, x_train_with_bureau)
x_valid_with_bureau_ready = feature_engineering(data, x_valid_with_bureau)
x_test_with_bureau_ready = feature_engineering(data, x_test_with_bureau)

set()
{'EDUCATION_Other', 'LOAN_PURPOSE_Car/Motorcycle'}
{'LOAN_PURPOSE_Car/Motorcycle'}




In [119]:
# Engineer each without-bureau split against the full raw data.
# Each call prints the one-hot columns that were absent from that split
# and therefore filled with zeros.
x_train_without_bureau_ready = feature_engineering(data, x_train_without_bureau)
x_valid_without_bureau_ready = feature_engineering(data, x_valid_without_bureau)
x_test_without_bureau_ready = feature_engineering(data, x_test_without_bureau)

set()
{'LOAN_PURPOSE_Car/Motorcycle'}
{'LOAN_PURPOSE_Car/Motorcycle'}




In [121]:
# Preview the engineered training features (with bureau columns).
x_train_with_bureau_ready.head()

Unnamed: 0,AGE,INCOME,GENDER,HAS_APPLIED_BEFORE,HAS_INCOME_VERIFICATION,LOANS_WITHOUT_DELAYS,LOANS_WITH_DELAYS,EDUCATION_Bachelor Degree,EDUCATION_Diploma,EDUCATION_High School,...,LOAN_PURPOSE_Credit card,LOAN_PURPOSE_Education,LOAN_PURPOSE_Electronic unsecured loan,LOAN_PURPOSE_Holiday,LOAN_PURPOSE_Housing loan,LOAN_PURPOSE_Investment,LOAN_PURPOSE_Other,LOAN_PURPOSE_Renovation,LOAN_PURPOSE_Venture capital,LOAN_PURPOSE_Working Capital
718,2,0.0,1,0,1,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
446,4,3.0,0,1,1,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1248,2,3.0,1,0,1,1.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
177,1,0.0,1,0,1,3.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
978,2,1.0,1,1,1,2.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [122]:
# Preview the engineered training features (without bureau columns).
x_train_without_bureau_ready.head()

Unnamed: 0,AGE,INCOME,GENDER,HAS_APPLIED_BEFORE,HAS_INCOME_VERIFICATION,EDUCATION_Bachelor Degree,EDUCATION_Diploma,EDUCATION_High School,EDUCATION_Master's Degree/Post graduate,EDUCATION_Other,...,LOAN_PURPOSE_Credit card,LOAN_PURPOSE_Education,LOAN_PURPOSE_Electronic unsecured loan,LOAN_PURPOSE_Holiday,LOAN_PURPOSE_Housing loan,LOAN_PURPOSE_Investment,LOAN_PURPOSE_Other,LOAN_PURPOSE_Renovation,LOAN_PURPOSE_Venture capital,LOAN_PURPOSE_Working Capital
889,4,5.0,0,0,1,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1325,3,5.0,1,0,1,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1329,1,4.0,0,0,1,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
575,1,2.0,0,1,1,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1171,2,0.0,0,1,1,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
