# Credit Card Approval Prediction Using ML

In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
import itertools

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the two raw tables: `data` holds the applicant attributes, `record` the
# monthly credit history (ID, MONTHS_BALANCE, STATUS). Both are joined on ID below.
data, record = (
    pd.read_csv(csv_path, encoding = 'utf-8')
    for csv_path in ('data/application_record.csv', 'data/credit_record.csv')
)

## Feature Engineering

In [3]:
# Smallest MONTHS_BALANCE recorded for each customer ID, kept as a one-column
# frame indexed by ID. `.min()` is used instead of `agg(min)`: passing the Python
# builtin to `agg` is deprecated in recent pandas, and this notebook silences
# warnings, so the deprecation would otherwise go unnoticed.
begin_month = (
    record.groupby('ID')['MONTHS_BALANCE']
    .min()
    .rename('begin_month')
    .to_frame()
)
# Left join: every applicant row is kept, with NaN where no credit record exists.
new_data = pd.merge(data, begin_month, how = 'left', on = 'ID')

In [4]:
# Mark every monthly record whose STATUS is one of '2'-'5' with 'YES';
# all other rows keep None so that a later count() ignores them.
# A single .loc assignment replaces the chained `record[col][mask] = ...` form,
# which writes to a temporary object and is silently lost under Copy-on-Write.
record['dep_value'] = None
record.loc[record['STATUS'].isin(['2', '3', '4', '5']), 'dep_value'] = 'YES'

In [5]:
# Per-customer count of non-null entries in every column of `record`;
# for dep_value this is the number of months that were marked above.
cpunt = record.groupby('ID').count()
# Build the label in one vectorised step instead of writing strings into the
# integer count column through chained indexing (unreliable, and the second
# mask was evaluated on a column already partly overwritten with strings).
cpunt['dep_value'] = np.where(cpunt['dep_value'] > 0, 'Yes', 'No')
cpunt = cpunt[['dep_value']]

# Inner join: only applicants that also appear in the credit records are kept.
new_data = pd.merge(new_data, cpunt, how = 'inner', on = 'ID')
# Numeric target: 1 for 'Yes', 0 for 'No'.
new_data['target'] = new_data['dep_value'].map({'Yes': 1, 'No': 0})

In [6]:
# Class balance of the per-customer flag: absolute counts first (printed),
# then the same counts as proportions (displayed as the cell result).
print(cpunt['dep_value'].value_counts())
cpunt['dep_value'].value_counts(normalize = True)

No     45318
Yes      667
Name: dep_value, dtype: int64


No     0.985495
Yes    0.014505
Name: dep_value, dtype: float64

<b>Rename columns</b>

In [7]:
# Replace the raw application column names with the short aliases that the
# following cells refer to (e.g. 'Gender', 'Car').
new_data = new_data.rename(
    columns = {
        'CODE_GENDER': 'Gender',
        'FLAG_OWN_CAR': 'Car',
        'FLAG_OWN_REALTY': 'Reality',  # alias spelling kept as-is
        'CNT_CHILDREN': 'ChldNo',
        'AMT_INCOME_TOTAL': 'inc',
        'NAME_EDUCATION_TYPE': 'edutp',
        'NAME_FAMILY_STATUS': 'famtp',
        'NAME_HOUSING_TYPE': 'houtp',
        'FLAG_EMAIL': 'email',
        'NAME_INCOME_TYPE': 'inctp',
        'FLAG_WORK_PHONE': 'wkphone',
        'FLAG_PHONE': 'phone',
        'CNT_FAM_MEMBERS': 'famsize',
        'OCCUPATION_TYPE': 'occyp',
    }
)

In [8]:
# Turn the literal string 'NULL' into a missing value, then drop every row that
# has any missing value. One assigned pass is sufficient: dropna() returns a new
# frame, so calling it without keeping the result has no effect.
new_data = new_data.mask(new_data == 'NULL').dropna()

In [9]:
# Columns that are not candidate predictors and therefore get no IV row.
namelist = ['FLAG_MOBIL', 'begin_month', 'dep_value', 'target', 'ID']

# One row per remaining column; the IV cell is filled in as each feature is scored.
ivtable = pd.DataFrame(new_data.columns, columns = ['variable'])
ivtable['IV'] = None
ivtable = ivtable.drop(index = ivtable.index[ivtable['variable'].isin(namelist)])

In [10]:
# Preview the IV table; the IV column is still empty at this point.
ivtable.head()

Unnamed: 0,variable,IV
1,Gender,
2,Car,
3,Reality,
4,ChldNo,
5,inc,


###  binary features

In [11]:
# Calculate information value
def calc_iv(df, feature, target , pr = False):
    """Compute the Information Value (IV) of ``feature`` against ``target``.

    For every distinct value of ``feature`` the function counts all rows, the
    rows where ``target == 0`` ("Good") and the rows where ``target == 1``
    ("Bad"), derives the Weight of Evidence (WoE) of that value and sums the
    per-value IV contributions.

    Side effects:
        * Missing values in ``df[feature]`` are replaced by the string
          ``'NULL'`` in the caller's frame (the column is reassigned on ``df``).
        * The total IV and the value counts of the feature are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both ``feature`` and ``target`` columns.
    feature : str
        Name of the column to score.
    target : str
        Name of the column compared against 0 (good) and 1 (bad).
    pr : bool, default False
        When true, additionally print the full per-value table and its IV.

    Returns
    -------
    iv : float
        Sum of the per-value IV contributions.
    data : pandas.DataFrame
        One row per feature value with the columns Variable, Value, All, Good,
        Bad, Share, Bad Rate, Distribution Good, Distribution Bad, WoE and IV,
        sorted by Variable and Value.
    """
    # Kept in place on purpose: existing callers may rely on the 'NULL' fill.
    df[feature] = df[feature].fillna('NULL')

    # The target masks do not depend on the feature value, so build them once.
    is_good = df[target] == 0
    is_bad = df[target] == 1

    lst = []
    # Each distinct value of the feature (e.g. Female / Male) gets one row.
    for val in df[feature].unique():
        is_val = df[feature] == val
        lst.append([
            feature,                     # Variable
            val,                         # Value
            is_val.sum(),                # All:  rows carrying this value
            (is_val & is_good).sum(),    # Good: rows with target == 0
            (is_val & is_bad).sum(),     # Bad:  rows with target == 1
        ])

    data = pd.DataFrame(lst, columns = ['Variable', 'Value', 'All', 'Good', 'Bad'])
    data['Share'] = data['All'] / data['All'].sum()
    data['Bad Rate'] = data['Bad'] / data['All']
    # The distributions are taken from the complements (All - Bad, All - Good)
    # rather than from the Good / Bad columns directly.
    data['Distribution Good'] = (data['All'] - data['Bad']) / (data['All'].sum() - data['Bad'].sum())
    data['Distribution Bad'] = (data['All'] - data['Good']) / (data['All'].sum() - data['Good'].sum())
    data['WoE'] = np.log(data['Distribution Good'] / data['Distribution Bad'])

    # An infinite WoE (one of the two distributions is zero) contributes nothing.
    data = data.replace({'WoE' : {np.inf : 0, -np.inf : 0}})

    data['IV'] = data['WoE'] * (data['Distribution Good'] - data['Distribution Bad'])

    data = data.sort_values(by = ['Variable', 'Value'], ascending = [True, True])
    data.index = range(len(data.index))

    if pr:
        print(data)
        print('IV = ', data['IV'].sum())

    iv = data['IV'].sum()
    # The placeholder is now filled with the feature name (it used to be
    # printed as a literal '{}').
    print('\nThis Variable {} IV is:'.format(feature), iv )
    print(df[feature].value_counts())
    return iv, data

In [12]:
def convert_dummy(df, feature, rank = 0):
    """Swap ``feature`` for its dummy columns, leaving out one reference level.

    The dummy columns are named ``<feature>_<value>``. The column belonging to
    the value at position ``rank`` of ``df[feature].value_counts()`` (position 0
    is the most frequent value) is omitted.

    Note: ``feature`` is removed from the passed-in ``df`` in place; the frame
    carrying the dummy columns is the returned one.

    Parameters
    ----------
    df : pandas.DataFrame
    feature : str
        Column to encode.
    rank : int, default 0
        Position, in frequency order, of the value whose dummy column is dropped.

    Returns
    -------
    pandas.DataFrame
        ``df`` without ``feature``, joined with the remaining dummy columns.
    """
    dummies = pd.get_dummies(df[feature], prefix = feature)
    reference_level = df[feature].value_counts().index[rank]
    dummies = dummies.drop(columns = feature + '_' + str(reference_level))
    # In-place removal: the caller's frame loses the original column too.
    df.drop(columns = feature, inplace = True)
    return df.join(dummies)

In [13]:
def get_category(df, col, binsum, labels , qcut = False):
    """Bin ``df[col]`` and return ``df`` with the bins added as ``gp_<col>``.

    Parameters
    ----------
    df : pandas.DataFrame
    col : str
        Column to bin.
    binsum : int or sequence
        Passed as ``q`` to ``pd.qcut`` when ``qcut`` is true, otherwise as
        ``bins`` to ``pd.cut``.
    labels : sequence or None
        Labels for the resulting bins, forwarded unchanged.
    qcut : bool, default False
        Use quantile-based bins (``pd.qcut``) instead of ``pd.cut``.

    Returns
    -------
    pandas.DataFrame
        A new frame: ``df`` joined with the object-dtype column ``gp_<col>``.
        The passed-in ``df`` is not modified.
    """
    if qcut:
        binned = pd.qcut(df[col], q = binsum, labels = labels)
    else:
        binned = pd.cut(df[col], bins = binsum, labels = labels)

    name = 'gp' + '_' + col
    # Store the bins as a plain object column. The misspelled `astpye` call
    # used here before raised AttributeError on every invocation.
    return df.join(binned.rename(name).astype(object))

<b>Gender</b>

In [14]:
# Encode gender as 0 (F) / 1 (M), then store its information value in ivtable.
# NOTE: `data` is rebound to the per-value IV table returned by calc_iv.
new_data['Gender'] = new_data['Gender'].replace({'F': 0, 'M': 1})
print(new_data['Gender'].value_counts())
iv, data = calc_iv(new_data, 'Gender', 'target')
ivtable.loc[ivtable['variable'].eq('Gender'), 'IV'] = iv
data.head()

0    15630
1     9504
Name: Gender, dtype: int64

This Variable {} IV is: 0.02520350452745081
0    15630
1     9504
Name: Gender, dtype: int64


Unnamed: 0,Variable,Value,All,Good,Bad,Share,Bad Rate,Distribution Good,Distribution Bad,WoE,IV
0,Gender,0,15630,15400,230,0.621867,0.014715,0.623179,0.545024,0.134005,0.010473
1,Gender,1,9504,9312,192,0.378133,0.020202,0.376821,0.454976,-0.188475,0.01473


<b>Having a Car or Not</b>

In [17]:
# Encode car ownership as 0 (N) / 1 (Y), then store its information value in ivtable.
# NOTE: `data` is rebound to the per-value IV table returned by calc_iv.
new_data['Car'] = new_data['Car'].replace({'N': 0, 'Y': 1})
print(new_data['Car'].value_counts())
iv, data = calc_iv(new_data, 'Car', 'target')
ivtable.loc[ivtable['variable'].eq('Car'), 'IV'] = iv
data.head()

0    14618
1    10516
Name: Car, dtype: int64

This Variable {} IV is: 4.54248124999671e-06
0    14618
1    10516
Name: Car, dtype: int64


Unnamed: 0,Variable,Value,All,Good,Bad,Share,Bad Rate,Distribution Good,Distribution Bad,WoE,IV
0,Car,0,14618,14373,245,0.581603,0.01676,0.58162,0.580569,0.00181,2e-06
1,Car,1,10516,10339,177,0.418397,0.016831,0.41838,0.419431,-0.00251,3e-06
