In [2]:
# General
import pandas as pd
import numpy as np
import matplotlib as mlp
import matplotlib.pyplot as plt
import seaborn as sns


# EDA
from pandas_profiling import ProfileReport

# With missforest we can use random forest to impute data. This is better than mean by far
# to avoid errors on loading missingpy
import sys
import sklearn.neighbors._base
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base
from missingpy import MissForest

# Outliers with isolation forest.
from sklearn.ensemble import IsolationForest

# Sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn import metrics

# Model to apply
import lightgbm as lgb

## Loading data

In [3]:
# loading data
application = pd.read_csv('../data/application_record.csv', sep = ',')
credit = pd.read_csv('../data/credit_record.csv', sep = ',')

## Pandas profiling

#uri code

In [4]:
#uri code
target_list = [1 if i in ['1','2','3','4','5'] else 0 for i in credit['STATUS'] ]
credit['TARGET'] = target_list
credit = credit.groupby('ID').sum()
target_list = [1 if i > 1 else 0 for i in credit['TARGET'] ]
credit['TARGET'] = target_list
data_frame = application.merge(credit, how='left', on = 'ID')

In [5]:
data_frame.head()

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,MONTHS_BALANCE,TARGET
0,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0,-120.0,0.0
1,5008805,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,,2.0,-105.0,0.0
2,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,-21474,-1134,1,0,0,0,Security staff,2.0,-435.0,0.0
3,5008808,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0,-10.0,0.0
4,5008809,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0,-120.0,0.0


# Label encoder

In [6]:
# taking a look to categorical variables
cat_vars = data_frame.select_dtypes(exclude=[np.number])
cat_vars.describe()

Unnamed: 0,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,OCCUPATION_TYPE
count,438557,438557,438557,438557,438557,438557,438557,304354
unique,2,2,2,5,5,5,6,18
top,F,N,Y,Working,Secondary / secondary special,Married,House / apartment,Laborers
freq,294440,275459,304074,226104,301821,299828,393831,78240


In [7]:
# process columns, apply LabelEncoder to categorical features
for c in cat_vars.columns:
    lbl = LabelEncoder() 
    lbl.fit(list(data_frame[c].values)) 
    data_frame[c] = lbl.transform(list(data_frame[c].values))


# Dropping data
Drop Monthly data
Drop telephone

In [None]:
data_frame = data_frame.drop(['FLAG_MOBIL', 'MONTHS_BALANCE'], axis = 1)


In [19]:
data_frame.head()

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,TARGET
0,5008804,1,1,1,0,427500.0,4,1,0,4,-12005,-4542,1,0,0,18,2.0,0.0
1,5008805,1,1,1,0,427500.0,4,1,0,4,-12005,-4542,1,0,0,18,2.0,0.0
2,5008806,1,1,1,0,112500.0,4,4,1,1,-21474,-1134,0,0,0,16,2.0,0.0
3,5008808,0,0,1,0,270000.0,0,4,3,1,-19110,-3051,0,1,1,14,1.0,0.0
4,5008809,0,0,1,0,270000.0,0,4,3,1,-19110,-3051,0,1,1,14,1.0,0.0


# Get only people with credits

In [28]:
# target to int
data_frame_credito = data_frame[data_frame['TARGET'].notna()]
data_frame_credito['TARGET'] = data_frame_credito['TARGET'].astype(int)

# cnt_fam_members to int
data_frame_credito = data_frame[data_frame['CNT_FAM_MEMBERS'].notna()]
data_frame_credito['CNT_FAM_MEMBERS'] = data_frame_credito['CNT_FAM_MEMBERS'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_frame_credito['TARGET'] = data_frame_credito['TARGET'].astype(int)


In [29]:
data_frame_credito.head()

Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,TARGET
0,5008804,1,1,1,0,427500.0,4,1,0,4,-12005,-4542,1,0,0,18,2,0.0
1,5008805,1,1,1,0,427500.0,4,1,0,4,-12005,-4542,1,0,0,18,2,0.0
2,5008806,1,1,1,0,112500.0,4,4,1,1,-21474,-1134,0,0,0,16,2,0.0
3,5008808,0,0,1,0,270000.0,0,4,3,1,-19110,-3051,0,1,1,14,1,0.0
4,5008809,0,0,1,0,270000.0,0,4,3,1,-19110,-3051,0,1,1,14,1,0.0


In [30]:
%store data_frame_credito

Stored 'data_frame_credito' (DataFrame)
