In [26]:
import pandas as pd
import lightgbm as lgb
from sklearn import preprocessing
import category_encoders as ce

In [2]:
all_train = pd.read_csv('train.csv')
testA = pd.read_csv('testA.csv')
all_data = pd.concat([all_train, testA]).drop(['id'], axis=1)

In [3]:
all_data['isDefault'].value_counts(dropna=False)

0.0    640390
NaN    200000
1.0    159610
Name: isDefault, dtype: int64

# data info

In [11]:
all_data.head().T

Unnamed: 0,0,1,2,3,4
loanAmnt,35000.0,18000.0,12000.0,11000.0,3000.0
term,5,5,5,3,3
interestRate,19.52,18.49,16.99,7.26,12.99
installment,917.97,461.9,298.17,340.96,101.07
grade,E,D,D,A,C
subGrade,E2,D2,D3,A4,C2
employmentTitle,320.0,219843.0,31698.0,46854.0,54.0
employmentLength,2 years,5 years,8 years,10+ years,
homeOwnership,2,0,0,1,1
annualIncome,110000.0,46000.0,74000.0,118000.0,29000.0


In [13]:
all_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000000 entries, 0 to 199999
Data columns (total 46 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   loanAmnt            1000000 non-null  float64
 1   term                1000000 non-null  int64  
 2   interestRate        1000000 non-null  float64
 3   installment         1000000 non-null  float64
 4   grade               1000000 non-null  object 
 5   subGrade            1000000 non-null  object 
 6   employmentTitle     999999 non-null   float64
 7   employmentLength    941459 non-null   object 
 8   homeOwnership       1000000 non-null  int64  
 9   annualIncome        1000000 non-null  float64
 10  verificationStatus  1000000 non-null  int64  
 11  issueDate           1000000 non-null  object 
 12  isDefault           800000 non-null   float64
 13  purpose             1000000 non-null  int64  
 14  postCode            999999 non-null   float64
 15  regionCode      

In [14]:
all_data.select_dtypes('int').columns

Index(['term', 'homeOwnership', 'verificationStatus', 'purpose', 'regionCode',
       'initialListStatus', 'applicationType'],
      dtype='object')

In [15]:
all_data.select_dtypes('object').columns

Index(['grade', 'subGrade', 'employmentLength', 'issueDate',
       'earliesCreditLine'],
      dtype='object')

In [16]:
all_data.select_dtypes('float').columns

Index(['loanAmnt', 'interestRate', 'installment', 'employmentTitle',
       'annualIncome', 'isDefault', 'postCode', 'dti', 'delinquency_2years',
       'ficoRangeLow', 'ficoRangeHigh', 'openAcc', 'pubRec',
       'pubRecBankruptcies', 'revolBal', 'revolUtil', 'totalAcc', 'title',
       'policyCode', 'n0', 'n1', 'n2', 'n3', 'n4', 'n5', 'n6', 'n7', 'n8',
       'n9', 'n10', 'n11', 'n12', 'n13', 'n14'],
      dtype='object')

In [17]:
date_cols = ['issueDate', 'earliesCreditLine']
cat_cols = ['term', 'homeOwnership', 'verificationStatus', 'purpose', 'regionCode', 'initialListStatus', 'applicationType', 'grade', 'subGrade', 'employmentLength', 'employmentTitle', 'postCode', 'policyCode', 'title']
num_cols = ['loanAmnt', 'interestRate', 'installment', 'annualIncome', 'dti', 'delinquency_2years', 'ficoRangeLow', 'ficoRangeHigh', 'openAcc', 'pubRec', 'pubRecBankruptcies', 'revolBal', 'revolUtil', 'totalAcc']
beh_cols = ['n0', 'n1', 'n2', 'n3', 'n4', 'n5', 'n6', 'n7', 'n8', 'n9', 'n10', 'n11', 'n12', 'n13', 'n14']

# EDA

## process date

In [4]:
date_cols = ['issueDate', 'earliesCreditLine']
# pd.to_datetime('2022-1-1')
all_data['issueDatetoDay'] = (pd.to_datetime('2022-1-1') - pd.to_datetime(all_data['issueDate']))
all_data['earliesCreditLinetoDay'] = (pd.to_datetime('2022-1-1') - pd.to_datetime(all_data['earliesCreditLine']))

## category encoder

In [5]:
cat_cols = ['term', 'homeOwnership', 'verificationStatus', 'purpose', 'regionCode', 'initialListStatus',
            'applicationType', 'grade', 'subGrade', 'employmentLength', 
            'employmentTitle', 'postCode', 'title', 
            'issueDatetoDay', 'earliesCreditLinetoDay']

In [12]:
for col in cat_cols:
    print(col, all_data[col].nunique())
    print(all_data[col].value_counts(dropna=False).sort_index(), '\n')

term 2
3    758733
5    241267
Name: term, dtype: int64 

homeOwnership 6
0    494678
1    397051
2    107910
3       223
4        40
5        98
Name: homeOwnership, dtype: int64 

verificationStatus 3
0    301300
1    387568
2    311132
Name: verificationStatus, dtype: int64 

purpose 14
0     580226
1      11369
2      65065
3      22021
4     219331
5      57780
6       5416
7       6730
8      10812
9      11560
10      7048
11       692
12      1702
13       248
Name: purpose, dtype: int64 

regionCode 51
0      34001
1       2065
2      38179
3      32124
4      17777
5      11930
6       4405
7      28233
8     145952
9      28509
10     24351
11     16277
12     23120
13     81350
14     82017
15      8331
16      5495
17     12334
18     21572
19     32627
20     11454
21     71163
22     22116
23     26246
24     15862
25      2595
26     23086
27     13149
28      5036
29      1944
30     35933
31      2842
32     15091
33      4766
34      2663
35     12160
36     14572
37

### label encoder

In [19]:
le_cols = ['term', 'grade', 'subGrade', 'employmentLength', 'issueDatetoDay', 'earliesCreditLinetoDay']

In [51]:
# all_data['employmentLength'].map({'10+ years':'9+ years'})
all_data['employmentLengthM'] = all_data['employmentLength'].replace({'< 1 year': '0 year', '10+ years':'9+ years',})

In [52]:
le = preprocessing.LabelEncoder()
le_series = le.fit_transform(all_data['employmentLengthM'])

In [55]:
print(all_data['employmentLengthM'].value_counts(dropna=False).sort_index())
pd.Series(le_series).value_counts().sort_index()

0 year       80226
1 year       65671
2 years      90565
3 years      80163
4 years      59818
5 years      62645
6 years      46582
7 years      44230
8 years      45168
9 years      37866
9+ years    328525
NaN          58541
Name: employmentLengthM, dtype: int64


0      80226
1      65671
2      90565
3      80163
4      59818
5      62645
6      46582
7      44230
8      45168
9      37866
10    328525
11     58541
dtype: int64

# baseline model

In [60]:
baseline_train = all_train.select_dtypes(['int', 'float'])

In [67]:
X_train = baseline_train[:700000].drop('isDefault', axis = 1)
y_train = baseline_train[:700000]['isDefault']

In [68]:
X_test = baseline_train[700000:].drop('isDefault', axis = 1)
y_test = baseline_train[700000:]['isDefault']

In [69]:
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

In [86]:
params = {'device_type': 'gpu',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

In [87]:
clf = lgb.train(params,
                lgb_train,
                num_boost_round=500,
                valid_sets=lgb_eval,
                verbose_eval=50,
                callbacks=[lgb.early_stopping(stopping_rounds=10)])

Training until validation scores don't improve for 10 rounds
[50]	valid_0's auc: 0.711989
[100]	valid_0's auc: 0.71822
[150]	valid_0's auc: 0.721612
[200]	valid_0's auc: 0.723417
[250]	valid_0's auc: 0.724592
[300]	valid_0's auc: 0.725209
Early stopping, best iteration is:
[290]	valid_0's auc: 0.725263
