# Data Prep 2

* Data Transformation
* Feature Selection
* Preliminary Model Testing


## Data Transformation
1. Sort variables by type
2. Build transformer
3. Recreate Dataframe

In [1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#import seaborn as sns

pd.set_option('display.max_columns', None)
plt.rcParams['figure.facecolor']='w'

import warnings
warnings.filterwarnings('ignore')

### Pre-processing

Manual transformations:
* crop table to loans only
* set churn variables
* fill NaNs
* convert datetime to meaningful data
* scale district data by population and change
* drop columns

Data transformation pipeline:
* One Hot Encode categorical
* Ordinal Encode gender

In [2]:
financial = pd.read_csv('data/financial.csv', index_col=0)
financial.head()

Unnamed: 0,account_id,account_frequency,account_date,n_orders,total_debit,n_trans,trans_amount,latest_balance,card_type,card_issued,gender,birth_date,district_id,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16,n_clients,loan_date,loan_amount,loan_duration,loan_payments,loan_status
0,1,monthly,1995-03-24,1.0,2452.0,239,375192,13467,,,F,1970-12-13,18,Pisek,south Bohemia,70699,60,13,2,1,4,65.3,8968,2.8,3.35,131,1740.0,1910,1,,,,,
1,2,monthly,1993-02-26,2.0,10638.7,478,3151514,42628,,,M,1945-02-04,1,Hl.m. Praha,Prague,1204953,0,0,0,1,1,100.0,12541,0.2,0.43,167,85677.0,99107,2,1994-01-05,80952.0,24.0,3373.0,A
2,3,monthly,1997-07-07,3.0,5001.0,117,295030,51096,,,M,1956-12-01,5,Kolin,central Bohemia,95616,65,30,4,1,6,51.4,9307,3.8,4.43,118,2616.0,3040,2,,,,,
3,4,monthly,1996-02-21,2.0,3363.0,186,350986,33721,,,M,1919-09-22,12,Pribram,central Bohemia,107870,84,29,6,1,6,58.0,8754,3.8,4.31,137,3804.0,3868,1,,,,,
4,5,monthly,1997-05-30,1.0,2668.0,84,166891,28088,,,M,1929-01-25,15,Cesky Krumlov,south Bohemia,58796,22,16,7,1,5,51.9,9045,3.1,3.6,124,1845.0,1879,1,,,,,


In [3]:
financial.columns

Index(['account_id', 'account_frequency', 'account_date', 'n_orders',
       'total_debit', 'n_trans', 'trans_amount', 'latest_balance', 'card_type',
       'card_issued', 'gender', 'birth_date', 'district_id', 'A2', 'A3', 'A4',
       'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'A11', 'A12', 'A13', 'A14', 'A15',
       'A16', 'n_clients', 'loan_date', 'loan_amount', 'loan_duration',
       'loan_payments', 'loan_status'],
      dtype='object')

In [4]:
financial.shape

(4500, 34)

In [5]:
#operate only on accounts that have loans (target variable)

financial = financial.dropna(subset=['loan_status'])
financial.shape

(682, 34)

In [6]:
# 606 approved loans + 72 denied = 682, values are as expected

financial['loan_status'].value_counts()

C    403
A    203
D     45
B     31
Name: loan_status, dtype: int64

In [7]:
# convert target to categorical accepted and unaccepted

financial = financial.replace({'loan_status':{'A':1, 'B':0, 'C':1, 'D':0}})


In [8]:
financial.head()

Unnamed: 0,account_id,account_frequency,account_date,n_orders,total_debit,n_trans,trans_amount,latest_balance,card_type,card_issued,gender,birth_date,district_id,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16,n_clients,loan_date,loan_amount,loan_duration,loan_payments,loan_status
1,2,monthly,1993-02-26,2.0,10638.7,478,3151514,42628,,,M,1945-02-04,1,Hl.m. Praha,Prague,1204953,0,0,0,1,1,100.0,12541,0.2,0.43,167,85677.0,99107,2,1994-01-05,80952.0,24.0,3373.0,1
18,19,monthly,1995-04-07,1.0,2523.2,303,1575532,10840,,,F,1939-04-23,21,Tabor,south Bohemia,103347,87,16,7,1,7,67.0,9104,1.5,2.07,123,2299.0,2354,1,1996-04-29,30276.0,12.0,2523.0,0
24,25,monthly,1996-07-28,4.0,10614.2,274,2958556,30199,,,M,1962-02-09,68,Frydek - Mistek,north Moravia,228848,15,40,18,2,6,57.2,9893,4.0,4.72,96,5623.0,5887,1,1997-12-08,30276.0,12.0,2523.0,1
35,37,monthly,1997-08-18,4.0,10305.5,130,948159,45905,,,M,1952-08-26,20,Strakonice,south Bohemia,70646,94,14,3,1,4,58.4,8547,2.6,3.64,120,1563.0,1542,1,1998-10-14,318480.0,60.0,5308.0,0
36,38,weekly,1997-08-08,4.0,9666.8,130,571952,44581,,,F,1940-01-30,19,Prachatice,south Bohemia,51428,50,11,3,1,4,52.7,8402,3.1,3.98,120,999.0,1099,1,1998-04-19,110736.0,48.0,2307.0,1


In [9]:
financial.isna().sum()

account_id             0
account_frequency      0
account_date           0
n_orders               0
total_debit            0
n_trans                0
trans_amount           0
latest_balance         0
card_type            512
card_issued          512
gender                 0
birth_date             0
district_id            0
A2                     0
A3                     0
A4                     0
A5                     0
A6                     0
A7                     0
A8                     0
A9                     0
A10                    0
A11                    0
A12                    8
A13                    0
A14                    0
A15                    8
A16                    0
n_clients              0
loan_date              0
loan_amount            0
loan_duration          0
loan_payments          0
loan_status            0
dtype: int64

In [10]:
financial['card_type'] = financial['card_type'].fillna('none')
financial['A15'] = financial['A15'].fillna(financial['A16'])

In [11]:
financial.head()

Unnamed: 0,account_id,account_frequency,account_date,n_orders,total_debit,n_trans,trans_amount,latest_balance,card_type,card_issued,gender,birth_date,district_id,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16,n_clients,loan_date,loan_amount,loan_duration,loan_payments,loan_status
1,2,monthly,1993-02-26,2.0,10638.7,478,3151514,42628,none,,M,1945-02-04,1,Hl.m. Praha,Prague,1204953,0,0,0,1,1,100.0,12541,0.2,0.43,167,85677.0,99107,2,1994-01-05,80952.0,24.0,3373.0,1
18,19,monthly,1995-04-07,1.0,2523.2,303,1575532,10840,none,,F,1939-04-23,21,Tabor,south Bohemia,103347,87,16,7,1,7,67.0,9104,1.5,2.07,123,2299.0,2354,1,1996-04-29,30276.0,12.0,2523.0,0
24,25,monthly,1996-07-28,4.0,10614.2,274,2958556,30199,none,,M,1962-02-09,68,Frydek - Mistek,north Moravia,228848,15,40,18,2,6,57.2,9893,4.0,4.72,96,5623.0,5887,1,1997-12-08,30276.0,12.0,2523.0,1
35,37,monthly,1997-08-18,4.0,10305.5,130,948159,45905,none,,M,1952-08-26,20,Strakonice,south Bohemia,70646,94,14,3,1,4,58.4,8547,2.6,3.64,120,1563.0,1542,1,1998-10-14,318480.0,60.0,5308.0,0
36,38,weekly,1997-08-08,4.0,9666.8,130,571952,44581,none,,F,1940-01-30,19,Prachatice,south Bohemia,51428,50,11,3,1,4,52.7,8402,3.1,3.98,120,999.0,1099,1,1998-04-19,110736.0,48.0,2307.0,1


In [12]:
# make ages from dates:
# account_age, client_age, loan_age
# datetime

from datetime import datetime

dates = ['account_date','birth_date', 'loan_date', 'card_issued']
ages = ['account_age', 'client_age', 'loan_age', 'card_age']

In [13]:
for i in dates:
    financial[i] = pd.to_datetime(financial[i])

In [14]:
#QC, dtypes as expected

financial[dates].dtypes

account_date    datetime64[ns]
birth_date      datetime64[ns]
loan_date       datetime64[ns]
card_issued     datetime64[ns]
dtype: object

In [15]:
for i, j in zip(ages, dates):
    financial[i] = 1999 - pd.DatetimeIndex(financial[j]).year

In [16]:
financial.head()

Unnamed: 0,account_id,account_frequency,account_date,n_orders,total_debit,n_trans,trans_amount,latest_balance,card_type,card_issued,gender,birth_date,district_id,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16,n_clients,loan_date,loan_amount,loan_duration,loan_payments,loan_status,account_age,client_age,loan_age,card_age
1,2,monthly,1993-02-26,2.0,10638.7,478,3151514,42628,none,NaT,M,1945-02-04,1,Hl.m. Praha,Prague,1204953,0,0,0,1,1,100.0,12541,0.2,0.43,167,85677.0,99107,2,1994-01-05,80952.0,24.0,3373.0,1,6,54,5,
18,19,monthly,1995-04-07,1.0,2523.2,303,1575532,10840,none,NaT,F,1939-04-23,21,Tabor,south Bohemia,103347,87,16,7,1,7,67.0,9104,1.5,2.07,123,2299.0,2354,1,1996-04-29,30276.0,12.0,2523.0,0,4,60,3,
24,25,monthly,1996-07-28,4.0,10614.2,274,2958556,30199,none,NaT,M,1962-02-09,68,Frydek - Mistek,north Moravia,228848,15,40,18,2,6,57.2,9893,4.0,4.72,96,5623.0,5887,1,1997-12-08,30276.0,12.0,2523.0,1,3,37,2,
35,37,monthly,1997-08-18,4.0,10305.5,130,948159,45905,none,NaT,M,1952-08-26,20,Strakonice,south Bohemia,70646,94,14,3,1,4,58.4,8547,2.6,3.64,120,1563.0,1542,1,1998-10-14,318480.0,60.0,5308.0,0,2,47,1,
36,38,weekly,1997-08-08,4.0,9666.8,130,571952,44581,none,NaT,F,1940-01-30,19,Prachatice,south Bohemia,51428,50,11,3,1,4,52.7,8402,3.1,3.98,120,999.0,1099,1,1998-04-19,110736.0,48.0,2307.0,1,2,59,1,


In [17]:
financial['card_age'] = financial['card_age'].fillna(0).astype('int')


In [18]:
# drop columns that are no longer necessary

financial.drop(columns = ['account_id',
                         'account_date',
                         'card_issued',
                         'A2','birth_date',
                         'loan_date'], inplace=True)

In [19]:
financial.head()

Unnamed: 0,account_frequency,n_orders,total_debit,n_trans,trans_amount,latest_balance,card_type,gender,district_id,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16,n_clients,loan_amount,loan_duration,loan_payments,loan_status,account_age,client_age,loan_age,card_age
1,monthly,2.0,10638.7,478,3151514,42628,none,M,1,Prague,1204953,0,0,0,1,1,100.0,12541,0.2,0.43,167,85677.0,99107,2,80952.0,24.0,3373.0,1,6,54,5,0
18,monthly,1.0,2523.2,303,1575532,10840,none,F,21,south Bohemia,103347,87,16,7,1,7,67.0,9104,1.5,2.07,123,2299.0,2354,1,30276.0,12.0,2523.0,0,4,60,3,0
24,monthly,4.0,10614.2,274,2958556,30199,none,M,68,north Moravia,228848,15,40,18,2,6,57.2,9893,4.0,4.72,96,5623.0,5887,1,30276.0,12.0,2523.0,1,3,37,2,0
35,monthly,4.0,10305.5,130,948159,45905,none,M,20,south Bohemia,70646,94,14,3,1,4,58.4,8547,2.6,3.64,120,1563.0,1542,1,318480.0,60.0,5308.0,0,2,47,1,0
36,weekly,4.0,9666.8,130,571952,44581,none,F,19,south Bohemia,51428,50,11,3,1,4,52.7,8402,3.1,3.98,120,999.0,1099,1,110736.0,48.0,2307.0,1,2,59,1,0


In [20]:
# convert unecessary floats to ints

notfloats = ['n_orders', 'A15', 'loan_amount', 'loan_duration', 'loan_payments']

for i in notfloats:
    financial[i] = financial[i].astype('int')

In [21]:
financial[notfloats].dtypes

n_orders         int64
A15              int64
loan_amount      int64
loan_duration    int64
loan_payments    int64
dtype: object

In [22]:
# transforming district data
# crime_rate_95, crime_rate_96, as crimes commited divided by population
# unemployment_change, crime_change as differences between 96 and 95


In [23]:
financial['crime_rate_95'] = financial['A15']/financial['A4']
financial['crime_rate_96'] = financial['A16']/financial['A4']

In [24]:
financial['unemployment_change'] = financial['A13'] - financial['A12']
financial['crime_change'] = financial['crime_rate_96'] - financial['crime_rate_95']

In [25]:
financial.head()

Unnamed: 0,account_frequency,n_orders,total_debit,n_trans,trans_amount,latest_balance,card_type,gender,district_id,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16,n_clients,loan_amount,loan_duration,loan_payments,loan_status,account_age,client_age,loan_age,card_age,crime_rate_95,crime_rate_96,unemployment_change,crime_change
1,monthly,2,10638.7,478,3151514,42628,none,M,1,Prague,1204953,0,0,0,1,1,100.0,12541,0.2,0.43,167,85677,99107,2,80952,24,3373,1,6,54,5,0,0.071104,0.08225,0.23,0.011146
18,monthly,1,2523.2,303,1575532,10840,none,F,21,south Bohemia,103347,87,16,7,1,7,67.0,9104,1.5,2.07,123,2299,2354,1,30276,12,2523,0,4,60,3,0,0.022245,0.022778,0.57,0.000532
24,monthly,4,10614.2,274,2958556,30199,none,M,68,north Moravia,228848,15,40,18,2,6,57.2,9893,4.0,4.72,96,5623,5887,1,30276,12,2523,1,3,37,2,0,0.024571,0.025724,0.72,0.001154
35,monthly,4,10305.5,130,948159,45905,none,M,20,south Bohemia,70646,94,14,3,1,4,58.4,8547,2.6,3.64,120,1563,1542,1,318480,60,5308,0,2,47,1,0,0.022124,0.021827,1.04,-0.000297
36,weekly,4,9666.8,130,571952,44581,none,F,19,south Bohemia,51428,50,11,3,1,4,52.7,8402,3.1,3.98,120,999,1099,1,110736,48,2307,1,2,59,1,0,0.019425,0.02137,0.88,0.001944


In [26]:
# data transformations

cat_var = ['account_frequency', 'card_type', 'A3']
bin_var = ['gender']
target = ['loan_status']
non_cont = cat_var + bin_var + target
cont_var = list(financial.drop(columns = non_cont).columns)

In [27]:
#QC

financial[cat_var].dtypes, financial[cont_var].dtypes

(account_frequency    object
 card_type            object
 A3                   object
 dtype: object,
 n_orders                 int64
 total_debit            float64
 n_trans                  int64
 trans_amount             int64
 latest_balance           int64
 district_id              int64
 A4                       int64
 A5                       int64
 A6                       int64
 A7                       int64
 A8                       int64
 A9                       int64
 A10                    float64
 A11                      int64
 A12                    float64
 A13                    float64
 A14                      int64
 A15                      int64
 A16                      int64
 n_clients                int64
 loan_amount              int64
 loan_duration            int64
 loan_payments            int64
 account_age              int64
 client_age               int64
 loan_age                 int64
 card_age                 int64
 crime_rate_95          float64
 

In [28]:
# data transformation

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer

In [29]:
data_transformer = ColumnTransformer(transformers = [
    ('cont', 'passthrough', cont_var),
    ('target', 'passthrough', target),
    ('binary', OrdinalEncoder(), bin_var),
    ('nominal', OneHotEncoder(sparse = False), cat_var)],
                                      remainder = 'drop')

In [30]:
data_transformer.fit(financial)

ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('cont', 'passthrough',
                                 ['n_orders', 'total_debit', 'n_trans',
                                  'trans_amount', 'latest_balance',
                                  'district_id', 'A4', 'A5', 'A6', 'A7', 'A8',
                                  'A9', 'A10', 'A11', 'A12', 'A13', 'A14',
                                  'A15', 'A16', 'n_clients', 'loan_amount',
                                  'loan_duration', 'loan_payments',
                                  'account_...
                                  'card_age', 'crime_rate_95', 'crime_rate_96',
                                  'unemployment_change', ...]),
                                ('target', 'passthrough', ['loan_status']),
                                ('binary',
                                 OrdinalEncoder(categories='auto',
              

In [31]:
nom_name=data_transformer.named_transformers_['nominal'].categories_

In [32]:
transformed_nomcat = []

for col, name in zip(cat_var, nom_name):
    for i in name:
        transformed_nomcat.append('_'.join([col,str(i)]))

In [33]:
transformed_nomcat

['account_frequency_monthly',
 'account_frequency_transaction',
 'account_frequency_weekly',
 'card_type_classic',
 'card_type_gold',
 'card_type_junior',
 'card_type_none',
 'A3_Prague',
 'A3_central Bohemia',
 'A3_east Bohemia',
 'A3_north Bohemia',
 'A3_north Moravia',
 'A3_south Bohemia',
 'A3_south Moravia',
 'A3_west Bohemia']

In [34]:
financial_trans = data_transformer.transform(financial)

In [35]:
trans_col = cont_var + target + bin_var  + transformed_nomcat

In [36]:
financial_df = pd.DataFrame(data = financial_trans, columns=trans_col)

In [37]:
financial_df

# it transformed all my integers into floats anyway ._.

Unnamed: 0,n_orders,total_debit,n_trans,trans_amount,latest_balance,district_id,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16,n_clients,loan_amount,loan_duration,loan_payments,account_age,client_age,loan_age,card_age,crime_rate_95,crime_rate_96,unemployment_change,crime_change,loan_status,gender,account_frequency_monthly,account_frequency_transaction,account_frequency_weekly,card_type_classic,card_type_gold,card_type_junior,card_type_none,A3_Prague,A3_central Bohemia,A3_east Bohemia,A3_north Bohemia,A3_north Moravia,A3_south Bohemia,A3_south Moravia,A3_west Bohemia
0,2.0,10638.7,478.0,3151514.0,42628.0,1.0,1204953.0,0.0,0.0,0.0,1.0,1.0,100.0,12541.0,0.2,0.43,167.0,85677.0,99107.0,2.0,80952.0,24.0,3373.0,6.0,54.0,5.0,0.0,0.071104,0.082250,0.23,0.011146,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,2523.2,303.0,1575532.0,10840.0,21.0,103347.0,87.0,16.0,7.0,1.0,7.0,67.0,9104.0,1.5,2.07,123.0,2299.0,2354.0,1.0,30276.0,12.0,2523.0,4.0,60.0,3.0,0.0,0.022245,0.022778,0.57,0.000532,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,4.0,10614.2,274.0,2958556.0,30199.0,68.0,228848.0,15.0,40.0,18.0,2.0,6.0,57.2,9893.0,4.0,4.72,96.0,5623.0,5887.0,1.0,30276.0,12.0,2523.0,3.0,37.0,2.0,0.0,0.024571,0.025724,0.72,0.001154,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,4.0,10305.5,130.0,948159.0,45905.0,20.0,70646.0,94.0,14.0,3.0,1.0,4.0,58.4,8547.0,2.6,3.64,120.0,1563.0,1542.0,1.0,318480.0,60.0,5308.0,2.0,47.0,1.0,0.0,0.022124,0.021827,1.04,-0.000297,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,4.0,9666.8,130.0,571952.0,44581.0,19.0,51428.0,50.0,11.0,3.0,1.0,4.0,52.7,8402.0,3.1,3.98,120.0,999.0,1099.0,1.0,110736.0,48.0,2307.0,2.0,59.0,1.0,0.0,0.019425,0.021370,0.88,0.001944,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
677,2.0,5325.0,75.0,559884.0,77353.0,7.0,94725.0,38.0,28.0,1.0,3.0,6.0,63.4,9920.0,2.2,2.87,130.0,4289.0,4846.0,1.0,39168.0,24.0,1632.0,2.0,20.0,1.0,0.0,0.045278,0.051159,0.67,0.005880,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
678,1.0,4674.0,146.0,1326829.0,63586.0,54.0,387570.0,0.0,0.0,0.0,1.0,1.0,100.0,9897.0,1.6,1.96,140.0,18721.0,18696.0,1.0,280440.0,60.0,4674.0,3.0,47.0,1.0,0.0,0.048304,0.048239,0.36,-0.000065,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
679,2.0,17884.0,304.0,3957378.0,33978.0,1.0,1204953.0,0.0,0.0,0.0,1.0,1.0,100.0,12541.0,0.2,0.43,167.0,85677.0,99107.0,2.0,419880.0,60.0,6998.0,4.0,54.0,4.0,0.0,0.071104,0.082250,0.23,0.011146,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
680,3.0,9274.3,378.0,2948097.0,75200.0,61.0,117897.0,139.0,28.0,5.0,1.0,6.0,53.8,8814.0,4.7,5.74,107.0,2112.0,2059.0,1.0,54024.0,12.0,4502.0,5.0,31.0,3.0,4.0,0.017914,0.017464,1.04,-0.000450,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


## Feature Selection

**Note**: It took a lot longer to get to this section than anticipated

### Variance Threshold
* can be applied to both continuous and categorical variables

In [38]:
# variance threshold

from sklearn.feature_selection import VarianceThreshold

In [39]:
# continuous data must be min max scalered

from sklearn.preprocessing import MinMaxScaler

In [40]:
scaler = MinMaxScaler()

financial_cont = financial_df[cont_var]

financial_cont_scaled = scaler.fit_transform(financial_cont)

In [41]:
selector = VarianceThreshold(threshold = 0.1)

selector.fit_transform(financial_cont_scaled)

array([[0.        , 1.        , 0.25      , 1.        ],
       [0.26315789, 0.        , 0.        , 0.5       ],
       [0.88157895, 0.        , 0.        , 0.25      ],
       ...,
       [0.        , 1.        , 1.        , 0.5       ],
       [0.78947368, 0.        , 0.        , 0.75      ],
       [0.86842105, 0.        , 0.25      , 0.5       ]])

In [42]:
selector.get_support()

array([False, False, False, False, False,  True, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False,  True, False,  True, False,  True, False, False, False,
       False, False, False, False])

In [43]:
top_var = list(financial_cont.columns[selector.get_support(indices = True)])

In [44]:
top_var

['district_id', 'n_clients', 'loan_duration', 'account_age']

### Correlation

* basic correlation overview

In [45]:
top_10_corr = financial_df.corr()['loan_status'].abs().sort_values(ascending = False)[:10]
top_10_corr

loan_status          1.000000
latest_balance       0.213418
n_clients            0.184021
loan_payments        0.182440
loan_amount          0.167525
card_type_none       0.150203
n_orders             0.148669
card_type_classic    0.139023
card_age             0.119925
loan_age             0.113935
Name: loan_status, dtype: float64

In [46]:
top_10_corr = list(top_10_corr.index)
top_10_corr

['loan_status',
 'latest_balance',
 'n_clients',
 'loan_payments',
 'loan_amount',
 'card_type_none',
 'n_orders',
 'card_type_classic',
 'card_age',
 'loan_age']

### ANOVA

* Trying different selection methods due to unsatisfactory model result.

In [47]:
from sklearn.feature_selection import f_classif, SelectKBest
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [48]:
# first need to train test split
fin_cont_all = financial_cont.dropna()
X_all = fin_cont_all
y_all = financial.dropna()[target]

In [49]:
X_all_train, X_all_test, y_all_train, y_all_test = train_test_split(X_all, y_all, test_size = 0.2, random_state = 42)

In [50]:
# need to scale

anovascaler = StandardScaler()

X_all_train_scaled = anovascaler.fit_transform(X_all_train)
X_all_test_scaled = anovascaler.transform(X_all_test)

In [51]:
anovaselector = SelectKBest(score_func = f_classif)
anovaselector.fit(X_all_train_scaled, y_all_train)

SelectKBest(k=10, score_func=<function f_classif at 0x7f1e73421620>)

In [52]:
financial_anova = financial_cont.loc[:,list(anovaselector.get_support())]

In [53]:
top_10_anova = list(financial_anova.columns)

In [54]:
top_10_anova

['n_orders',
 'total_debit',
 'latest_balance',
 'A14',
 'n_clients',
 'loan_amount',
 'loan_payments',
 'account_age',
 'loan_age',
 'card_age']

### Final Dataframe

* apply combination of feature selection methods

In [55]:
keep_var = list(set(top_var + top_10_corr + top_10_anova))
keep_var 

['card_type_classic',
 'latest_balance',
 'loan_age',
 'A14',
 'loan_amount',
 'loan_payments',
 'card_type_none',
 'card_age',
 'district_id',
 'n_clients',
 'loan_duration',
 'loan_status',
 'account_age',
 'total_debit',
 'n_orders']

In [56]:
financial_final = financial_df[keep_var]
financial_final.head()

Unnamed: 0,card_type_classic,latest_balance,loan_age,A14,loan_amount,loan_payments,card_type_none,card_age,district_id,n_clients,loan_duration,loan_status,account_age,total_debit,n_orders
0,0.0,42628.0,5.0,167.0,80952.0,3373.0,1.0,0.0,1.0,2.0,24.0,1.0,6.0,10638.7,2.0
1,0.0,10840.0,3.0,123.0,30276.0,2523.0,1.0,0.0,21.0,1.0,12.0,0.0,4.0,2523.2,1.0
2,0.0,30199.0,2.0,96.0,30276.0,2523.0,1.0,0.0,68.0,1.0,12.0,1.0,3.0,10614.2,4.0
3,0.0,45905.0,1.0,120.0,318480.0,5308.0,1.0,0.0,20.0,1.0,60.0,0.0,2.0,10305.5,4.0
4,0.0,44581.0,1.0,120.0,110736.0,2307.0,1.0,0.0,19.0,1.0,48.0,1.0,2.0,9666.8,4.0


### Preliminary Model Testing

Check how models run after feature selection

* Train test split
* Decision Trees
* Random Forest Classifier
* SVM

In [57]:
# train test split

from sklearn.model_selection import train_test_split

In [58]:
# set aside test

train, test = train_test_split(financial_final, test_size = 0.2, random_state = 42)

In [59]:
# make sure sampling is decent

test['loan_status'].value_counts()

1.0    121
0.0     16
Name: loan_status, dtype: int64

In [60]:
X_test = test.drop(columns = 'loan_status')
y_test = test.loan_status

In [61]:
X_train = train.drop(columns = 'loan_status')
y_train = train.loan_status

In [62]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.2, random_state = 42)

In [63]:
# decision trees

from sklearn.tree import DecisionTreeClassifier

In [64]:
dt_model = DecisionTreeClassifier(random_state = 42)
dt_model.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=42, splitter='best')

In [65]:
dt_model.score(X_val, y_val)

#hey that's not bad!

0.7339449541284404

In [66]:
dt_model.score(X_test, y_test)

#cool! the classifier seems to work!

0.8029197080291971

In [67]:
from sklearn.metrics import recall_score

In [68]:
y_train_pred = dt_model.predict(X_train)
y_val_pred = dt_model.predict(X_val)
y_test_pred = dt_model.predict(X_test)

In [69]:
recall_score(y_train, y_train_pred)


1.0

In [70]:
recall_score(y_val, y_val_pred)

0.7789473684210526

In [71]:
recall_score(y_test, y_test_pred)

0.8429752066115702

In [72]:
# so this feature selection seemed to work well, export dataset to start tuning models

train.to_csv('data/train.csv')
test.to_csv('data/test.csv')



In [73]:
#create datasets with different feature selections
#top_var + top_10_corr + top_10_anova
financial_varred = financial_df[top_var]
financial_varred['loan_status'] = financial_df['loan_status']
train, test = train_test_split(financial_varred, test_size = 0.2, random_state = 42)
train.to_csv('data/train_varred.csv')
test.to_csv('data/test_varred.csv')

In [74]:
financial_corr = financial_df[top_10_corr]
train, test = train_test_split(financial_corr, test_size = 0.2, random_state = 42)
train.to_csv('data/train_corr.csv')
test.to_csv('data/test_corr.csv')

In [75]:
financial_corr = financial_df[top_10_anova]
financial_anova['loan_status'] = financial_df['loan_status']
train, test = train_test_split(financial_anova, test_size = 0.2, random_state = 42)
train.to_csv('data/train_anova.csv')
test.to_csv('data/test_anova.csv')

In [76]:
# original dataset still best dataset