In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
import warnings
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss
warnings.filterwarnings('ignore')

In [2]:
# Load the training set and the "A" test set from CSV files.
# Paths are relative, so both files must sit in the kernel's working directory.
train = pd.read_csv('train.csv')
testA = pd.read_csv('testA.csv')

## EDA

In [3]:
# Split the columns by dtype: every non-`object` column is considered numerical,
# whatever is left over (string-valued columns) is considered categorical.
numerical_fea = train.select_dtypes(exclude=['object']).columns.tolist()
category_fea = [col for col in train.columns if col not in numerical_fea]
print(numerical_fea, "\n")
print(category_fea)

['id', 'loanAmnt', 'term', 'interestRate', 'installment', 'employmentTitle', 'homeOwnership', 'annualIncome', 'verificationStatus', 'isDefault', 'purpose', 'postCode', 'regionCode', 'dti', 'delinquency_2years', 'ficoRangeLow', 'ficoRangeHigh', 'openAcc', 'pubRec', 'pubRecBankruptcies', 'revolBal', 'revolUtil', 'totalAcc', 'initialListStatus', 'applicationType', 'title', 'policyCode', 'n0', 'n1', 'n2', 'n3', 'n4', 'n5', 'n6', 'n7', 'n8', 'n9', 'n10', 'n11', 'n12', 'n13', 'n14']

['grade', 'subGrade', 'employmentLength', 'issueDate', 'earliesCreditLine']


In [4]:
# Decide, for every numerical feature, whether it is discrete or continuous.
def get_numerical_serial_fea(data, feas, max_discrete_unique=15):
    """Partition numerical columns into continuous and discrete ones.

    A column is called discrete when it has at most ``max_discrete_unique``
    distinct non-null values (``Series.nunique()``); otherwise it is continuous.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the columns named in ``feas``.
    feas : iterable of str
        Column names to classify.
    max_discrete_unique : int, default 15
        Largest number of distinct values a column may have and still count
        as discrete. The default reproduces the original hard-coded threshold.

    Returns
    -------
    tuple of (list, list)
        ``(continuous_columns, discrete_columns)``, each in the order of ``feas``.
    """
    numerical_serial_fea = []
    numerical_noserial_fea = []
    for fea in feas:
        if data[fea].nunique() <= max_discrete_unique:
            numerical_noserial_fea.append(fea)
        else:
            numerical_serial_fea.append(fea)
    return numerical_serial_fea, numerical_noserial_fea

# Partition the numerical columns of `train` into continuous and discrete lists.
numerical_serial_fea,numerical_noserial_fea = get_numerical_serial_fea(train,numerical_fea)

In [5]:
# Continuous numerical features.
print(numerical_serial_fea)
# Discrete numerical features, followed by the value distribution of each one.
print(numerical_noserial_fea)
for fea in numerical_noserial_fea:
    print(train[fea].value_counts())

['id', 'loanAmnt', 'interestRate', 'installment', 'employmentTitle', 'annualIncome', 'postCode', 'regionCode', 'dti', 'delinquency_2years', 'ficoRangeLow', 'ficoRangeHigh', 'openAcc', 'pubRec', 'revolBal', 'revolUtil', 'totalAcc', 'title', 'n0', 'n1', 'n2', 'n3', 'n4', 'n5', 'n6', 'n7', 'n8', 'n9', 'n10', 'n13', 'n14']
['term', 'homeOwnership', 'verificationStatus', 'isDefault', 'purpose', 'pubRecBankruptcies', 'initialListStatus', 'applicationType', 'policyCode', 'n11', 'n12']
3    606902
5    193098
Name: term, dtype: int64
0    395732
1    317660
2     86309
3       185
5        81
4        33
Name: homeOwnership, dtype: int64
1    309810
2    248968
0    241222
Name: verificationStatus, dtype: int64
0    640390
1    159610
Name: isDefault, dtype: int64
0     464096
4     175433
2      52129
5      46276
3      17579
9       9238
1       9106
8       8657
10      5652
7       5373
6       4354
12      1363
11       554
13       190
Name: purpose, dtype: int64
0.0     700076
1.0     

In [6]:
# Categorical (object-dtype) features, followed by the value distribution of each one.
print(category_fea)
for fea in category_fea:
    print(train[fea].value_counts())

['grade', 'subGrade', 'employmentLength', 'issueDate', 'earliesCreditLine']
B    233690
C    227118
A    139661
D    119453
E     55661
F     19053
G      5364
Name: grade, dtype: int64
C1    50763
B4    49516
B5    48965
B3    48600
C2    47068
C3    44751
C4    44272
B2    44227
B1    42382
C5    40264
A5    38045
A4    30928
D1    30538
D2    26528
A1    25909
D3    23410
A3    22655
A2    22124
D4    21139
D5    17838
E1    14064
E2    12746
E3    10925
E4     9273
E5     8653
F1     5925
F2     4340
F3     3577
F4     2859
F5     2352
G1     1759
G2     1231
G3      978
G4      751
G5      645
Name: subGrade, dtype: int64
10+ years    262753
2 years       72358
< 1 year      64237
3 years       64152
1 year        52489
5 years       50102
4 years       47985
6 years       37254
8 years       36192
7 years       35407
9 years       30272
Name: employmentLength, dtype: int64
2016-03-01    29066
2015-10-01    25525
2015-07-01    24496
2015-12-01    23245
2014-10-01    21461
        

## Feature Engineering

### 对datetime类型数据处理

In [7]:
# Replace issueDate (a 'YYYY-MM-DD' string) with the number of days elapsed since
# 2007-06-01, which the original author notes is the earliest date in the data set.
# Train and test go through the same code path, the subtraction is vectorised
# instead of a per-row apply, and the dtype guard makes the cell safe to re-run.
startdate = datetime.datetime.strptime('2007-06-01', '%Y-%m-%d')
for data in [train, testA]:
    if pd.api.types.is_numeric_dtype(data['issueDate']):
        # Already converted to day counts by a previous run of this cell.
        continue
    data['issueDate'] = (
        pd.to_datetime(data['issueDate'], format='%Y-%m-%d') - startdate
    ).dt.days

In [8]:
# Name of the target column. It is a label, not a feature, so it is dropped from
# the numerical feature list. The membership check keeps the cell re-runnable:
# an unconditional list.remove() raises ValueError the second time.
label = 'isDefault'
if label in numerical_fea:
    numerical_fea.remove(label)

### 对employmentLength类型数据处理

In [9]:
# Distribution of employmentLength, missing values included, ordered by label.
train['employmentLength'].value_counts(dropna=False).sort_index()

1 year        52489
10+ years    262753
2 years       72358
3 years       64152
4 years       47985
5 years       50102
6 years       37254
7 years       35407
8 years       36192
9 years       30272
< 1 year      64237
NaN           46799
Name: employmentLength, dtype: int64

In [10]:
# Parse employmentLength strings into small integers.
def employmentLength_to_int(s):
    """Convert an employment-length string such as ``'3 years'`` to ``np.int8(3)``.

    Parameters
    ----------
    s : str, number or missing value
        A string whose first whitespace-separated token is an integer
        (e.g. ``'10 years'``, ``'0 years'``), a missing value, or a value that
        has already been converted to a number.

    Returns
    -------
    np.int8 for strings; missing values and non-string values are returned
    unchanged, so applying the function twice to the same column is harmless.

    Raises
    ------
    ValueError
        If the first token of a string is not an integer literal
        (e.g. the raw ``'10+ years'``).
    """
    if pd.isnull(s):
        return s
    if not isinstance(s, str):
        # Already numeric (typically because the cell was re-run): nothing to parse.
        return s
    return np.int8(s.split()[0])
# Apply the same clean-up to both the train and the test frame.
# The result is assigned back explicitly: `data[col].replace(..., inplace=True)`
# mutates a temporary Series under pandas Copy-on-Write and would leave the
# frame unchanged, after which parsing '10+ years' / '< 1 year' fails.
for data in [train, testA]:
    data['employmentLength'] = (
        data['employmentLength']
        .replace({'10+ years': '10 years', '< 1 year': '0 years'})
        .apply(employmentLength_to_int)
    )

### 对earliesCreditLine类型数据处理

In [11]:
# Inspect the raw earliesCreditLine values before converting them.
train['earliesCreditLine']

0         Aug-2001
1         May-2002
2         May-2006
3         May-1999
4         Aug-1977
            ...   
799995    Aug-2011
799996    May-1989
799997    Jul-2002
799998    Jan-1994
799999    Feb-2002
Name: earliesCreditLine, Length: 800000, dtype: object

In [12]:
import calendar

def time2num(d):
    """Convert a ``'Mon-YYYY'`` string into the integer ``YYYYMM``.

    Example: ``'Aug-2001'`` -> ``200108``.

    Parameters
    ----------
    d : str
        Text whose first three characters are a month abbreviation as listed in
        ``calendar.month_abbr`` and whose last four characters are the year.

    Returns
    -------
    int
        Year * 100 + month number. The builtin ``int`` is used because the
        ``np.int`` alias was removed from NumPy and raises AttributeError.

    Raises
    ------
    ValueError
        If the month abbreviation is unknown or the year part is not numeric.
    """
    # calendar.month_abbr is 1-based (index 0 is the empty string), so the
    # list position is directly the month number.
    month_number = list(calendar.month_abbr).index(d[:3])
    return int('{}{:02d}'.format(d[-4:], month_number))

# Encode earliesCreditLine as a YYYYMM integer in both frames.
# Quick manual check: time2num(train['earliesCreditLine'][0])
for data in [train, testA]:
    data['earliesCreditLine'] = data['earliesCreditLine'].apply(time2num)

In [13]:
# Re-derive the numerical / categorical split now that the date and employment
# columns have been converted to numbers.
# NOTE(review): this rebuilds numerical_fea from scratch, so the target column
# that was removed from the list earlier is included again.
numerical_fea = list(train.select_dtypes(exclude=['object']))
category_fea = [name for name in list(train.columns) if name not in numerical_fea]
print(numerical_fea, "\n")
print(category_fea)

['id', 'loanAmnt', 'term', 'interestRate', 'installment', 'employmentTitle', 'employmentLength', 'homeOwnership', 'annualIncome', 'verificationStatus', 'issueDate', 'isDefault', 'purpose', 'postCode', 'regionCode', 'dti', 'delinquency_2years', 'ficoRangeLow', 'ficoRangeHigh', 'openAcc', 'pubRec', 'pubRecBankruptcies', 'revolBal', 'revolUtil', 'totalAcc', 'initialListStatus', 'applicationType', 'earliesCreditLine', 'title', 'policyCode', 'n0', 'n1', 'n2', 'n3', 'n4', 'n5', 'n6', 'n7', 'n8', 'n9', 'n10', 'n11', 'n12', 'n13', 'n14']

['grade', 'subGrade']


### 类型特征数据映射

In [14]:
# Ordinal-encode grade: 'A' -> 0, 'B' -> 1, ... 'G' -> 6 (same table for train and test).
for data in [train, testA]:
    data['grade'] = data['grade'].map(
        {letter: rank for rank, letter in enumerate('ABCDEFG')}
    )

In [17]:
# Work on copies from here on so the frames prepared above stay untouched.
train_x = train.copy()
testA_x = testA.copy()

In [18]:
# Print the number of distinct values of every column
# (the printed label '类型数' means "number of distinct values").
for f in train_x.columns:
    print(f, '类型数：', train_x[f].nunique())

id类型数：800000
loanAmnt类型数：1540
term类型数：2
interestRate类型数：641
installment类型数：72360
grade类型数：7
subGrade类型数：35
employmentTitle类型数：248683
employmentLength类型数：11
homeOwnership类型数：6
annualIncome类型数：44926
verificationStatus类型数：3
issueDate类型数：139
isDefault类型数：2
purpose类型数：14
postCode类型数：932
regionCode类型数：51
dti类型数：6321
delinquency_2years类型数：30
ficoRangeLow类型数：39
ficoRangeHigh类型数：39
openAcc类型数：75
pubRec类型数：32
pubRecBankruptcies类型数：11
revolBal类型数：71116
revolUtil类型数：1286
totalAcc类型数：134
initialListStatus类型数：2
applicationType类型数：2
earliesCreditLine类型数：720
title类型数：39644
policyCode类型数：1
n0类型数：39
n1类型数：33
n2类型数：50
n3类型数：50
n4类型数：46
n5类型数：65
n6类型数：107
n7类型数：70
n8类型数：102
n9类型数：44
n10类型数：76
n11类型数：5
n12类型数：5
n13类型数：28
n14类型数：31


In [30]:
# Build an ordinal code for every subGrade label in sorted order and print it.
# The mapping is no longer stored in a variable called `dict`, which shadowed
# the builtin type for the rest of the kernel session.
subgrade_codes = {
    name: code
    for code, name in enumerate(sorted(train_x['subGrade'].unique()))
}
print(subgrade_codes)

{'A1': 0, 'A2': 1, 'A3': 2, 'A4': 3, 'A5': 4, 'B1': 5, 'B2': 6, 'B3': 7, 'B4': 8, 'B5': 9, 'C1': 10, 'C2': 11, 'C3': 12, 'C4': 13, 'C5': 14, 'D1': 15, 'D2': 16, 'D3': 17, 'D4': 18, 'D5': 19, 'E1': 20, 'E2': 21, 'E3': 22, 'E4': 23, 'E5': 24, 'F1': 25, 'F2': 26, 'F3': 27, 'F4': 28, 'F5': 29, 'G1': 30, 'G2': 31, 'G3': 32, 'G4': 33, 'G5': 34}


In [31]:
# Ordinal-encode subGrade: 'A1' -> 0, 'A2' -> 1, ... 'G5' -> 34.
# Each of the 7 grade letters has 5 steps, so code = 5 * letter_rank + (step - 1).
for data in [train_x, testA_x]:
    data['subGrade'] = data['subGrade'].map({
        letter + str(step): 5 * rank + step - 1
        for rank, letter in enumerate('ABCDEFG')
        for step in range(1, 6)
    })

In [36]:
# Columns that will be handed to the model as categorical features,
# with their number of distinct values ('类型数') printed as a sanity check.
cate_features = ['term', 'grade', 'subGrade', 'homeOwnership', 'verificationStatus', 'purpose', 'regionCode', 'applicationType', 'initialListStatus', 'n11', 'n12']
for f in cate_features:
    print(f, '类型数：', train_x[f].nunique())

term类型数：2
grade类型数：7
subGrade类型数：35
homeOwnership类型数：6
verificationStatus类型数：3
purpose类型数：14
regionCode类型数：51
applicationType类型数：2
initialListStatus类型数：2
n11类型数：5
n12类型数：5


In [49]:
# Cast the categorical columns to pandas `category` dtype in both frames.
# NOTE(review): the categories are inferred separately for train and test, so
# the two frames can end up with different category sets if a level is missing
# from one of them -- confirm the model handles that as intended.
for c in cate_features:
    train_x[c] = train_x[c].astype('category')
    testA_x[c] = testA_x[c].astype('category')

In [50]:
# Verify the resulting column dtypes.
train_x.dtypes

id                       int64
loanAmnt               float64
term                  category
interestRate           float64
installment            float64
grade                 category
subGrade              category
employmentTitle        float64
employmentLength       float64
homeOwnership         category
annualIncome           float64
verificationStatus    category
issueDate                int64
isDefault                int64
purpose               category
postCode               float64
regionCode            category
dti                    float64
delinquency_2years     float64
ficoRangeLow           float64
ficoRangeHigh          float64
openAcc                float64
pubRec                 float64
pubRecBankruptcies     float64
revolBal               float64
revolUtil              float64
totalAcc               float64
initialListStatus     category
applicationType       category
earliesCreditLine        int64
title                  float64
policyCode             float64
n0      

## Build model

In [51]:
# Model inputs: every column except the row id, the target, the issue date and
# policyCode.
features = train_x.columns.drop(
    ['id', 'isDefault', 'issueDate', 'policyCode'], errors='ignore'
).tolist()

# Same feature columns, in the same order, for train and test.
x_train = train_x.loc[:, features]
x_test = testA_x.loc[:, features]

# Binary target.
y_train = train_x.loc[:, 'isDefault']

In [52]:
def cv_model(clf, train_x, train_y, test_x, clf_name, categorical_feats=''):
    """Train one of three boosting back-ends with 5-fold cross-validation.

    For every fold a model is fitted on the training part, evaluated with
    ROC-AUC on the held-out part, and used to predict the test set.

    Parameters
    ----------
    clf : module or class
        The object that provides the training API: a module exposing
        ``Dataset``/``train`` for ``"lgb"``, a module exposing
        ``DMatrix``/``train`` for ``"xgb"``, or an estimator class that is
        instantiated as ``clf(iterations=..., **params)`` for ``"cat"``.
    train_x : pandas.DataFrame
        Training features (indexed positionally with ``.iloc``).
    train_y : array-like
        Training labels, aligned with ``train_x`` by position.
    test_x : pandas.DataFrame
        Test features.
    clf_name : {"lgb", "xgb", "cat"}
        Selects the back-end branch.
    categorical_feats : list of str or str, default ''
        Forwarded as ``categorical_feature`` to ``clf.Dataset``; only used by
        the ``"lgb"`` branch.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        Out-of-fold predictions for every training row, and the test-set
        predictions averaged over all folds.

    Raises
    ------
    ValueError
        If ``clf_name`` is not one of the supported back-ends.
    """
    if clf_name not in ("lgb", "xgb", "cat"):
        # Fail early instead of hitting an undefined `val_pred` inside the loop.
        raise ValueError(
            "Unsupported clf_name: {!r} (expected 'lgb', 'xgb' or 'cat')".format(clf_name)
        )

    folds = 5
    seed = 2020
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)

    # KFold yields positional indices, so the labels are held as a plain array;
    # indexing a Series with `[]` would look the positions up as index labels.
    labels = np.asarray(train_y)

    oof_pred = np.zeros(train_x.shape[0])
    test_pred_mean = np.zeros(test_x.shape[0])

    cv_scores = []

    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, val_x = train_x.iloc[train_index], train_x.iloc[valid_index]
        trn_y, val_y = labels[train_index], labels[valid_index]

        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y, categorical_feature=categorical_feats)
            valid_matrix = clf.Dataset(val_x, label=val_y, categorical_feature=categorical_feats)

            # Earlier baseline settings, kept for reference:
            # params = {
            #     'boosting_type': 'gbdt',
            #     'objective': 'binary',
            #     'metric': 'auc',
            #     'min_child_weight': 5,
            #     'num_leaves': 2 ** 5,
            #     'lambda_l2': 10,
            #     'feature_fraction': 0.8,
            #     'bagging_fraction': 0.8,
            #     'bagging_freq': 4,
            #     'learning_rate': 0.1,
            #     'seed': 2020,
            #     'nthread': 28,
            #     'n_jobs':24,
            #     'silent': True,
            #     'verbose': -1,
            # }

            # Tuned parameters.
            params = {
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'metric': 'auc',
                'learning_rate': 0.01,
                'num_leaves': 14,
                'max_depth': 19,
                'min_data_in_leaf': 37,
                'min_child_weight':1.6,
                'bagging_fraction': 0.98,
                'feature_fraction': 0.69,
                'bagging_freq': 96,
                'reg_lambda': 9,
                'reg_alpha': 7,
                'min_split_gain': 0.4,
                'nthread': 8,
                'seed': 2020,
                'silent': True
            }

            # NOTE(review): `verbose_eval` / `early_stopping_rounds` are accepted
            # by older LightGBM releases only; newer ones expect callbacks.
            # Confirm against the installed version.
            model = clf.train(params, train_matrix, num_boost_round=15000, valid_sets=[train_matrix, valid_matrix], verbose_eval=1000,early_stopping_rounds=200)
            val_pred = model.predict(val_x, num_iteration=model.best_iteration)
            test_pred = model.predict(test_x, num_iteration=model.best_iteration)

            # print(list(sorted(zip(features, model.feature_importance("gain")), key=lambda x: x[1], reverse=True))[:20])

        elif clf_name == "xgb":
            train_matrix = clf.DMatrix(trn_x , label=trn_y)
            valid_matrix = clf.DMatrix(val_x , label=val_y)
            test_matrix = clf.DMatrix(test_x)

            # NOTE(review): 'tree_method': 'exact' together with 'device': 'gpu',
            # and the `ntree_limit` / `best_ntree_limit` usage below, look like
            # they target different XGBoost versions -- confirm before using
            # this branch.
            params = {'booster': 'gbtree',
                      'objective': 'binary:logistic',
                      'eval_metric': 'auc',
                      'gamma': 1,
                      'min_child_weight': 1.5,
                      'max_depth': 5,
                      'lambda': 10,
                      'subsample': 0.7,
                      'colsample_bytree': 0.7,
                      'colsample_bylevel': 0.7,
                      'eta': 0.04,
                      'tree_method': 'exact',
                      'seed': 2020,
                      'nthread': 36,
                      "silent": True,
                      'device': 'gpu'
                      }

            watchlist = [(train_matrix, 'train'),(valid_matrix, 'eval')]

            model = clf.train(params, train_matrix, num_boost_round=50000, evals=watchlist, verbose_eval=200, early_stopping_rounds=200)
            val_pred  = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
            test_pred = model.predict(test_matrix , ntree_limit=model.best_ntree_limit)

        else:  # clf_name == "cat"
            # Earlier settings, kept for reference:
            # params = {'learning_rate': 0.05, 'depth': 5, 'l2_leaf_reg': 10, 'bootstrap_type': 'Bernoulli',
            #           'od_type': 'Iter', 'od_wait': 50, 'random_seed': 11, 'allow_writing_files': False}
            # params = {'learning_rate' : 0.03, 'depth': 7, 'l2_leaf_reg': 3, 'bootstrap_type': 'Bernoulli',
            #           'od_type': 'Iter', 'od_wait': 50, 'random_seed': 11, 'task_type':'GPU', 'allow_writing_files': False}
            params = {'learning_rate':0.03, 'depth': 9, 'l2_leaf_reg': 3, 'bootstrap_type': 'Bernoulli',
                      'od_type': 'Iter', 'random_seed': 42, 'task_type': 'GPU', 'allow_writing_files': False}

            model = clf(iterations=20000, **params)
            model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                      cat_features=[], use_best_model=True, verbose=500)

            val_pred  = model.predict(val_x)
            test_pred = model.predict(test_x)

        oof_pred[valid_index] = val_pred
        # Accumulate so the result is the mean over all folds; a plain `=` here
        # would keep only the last fold's prediction divided by the fold count.
        test_pred_mean += test_pred / kf.n_splits
        cv_scores.append(roc_auc_score(val_y, val_pred))

        print(cv_scores)

    print("%s_score_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))
    return oof_pred, test_pred_mean

In [53]:
def lgb_model(x_train, y_train, x_test, cate_features):
    """Cross-validate the LightGBM back-end.

    Delegates to ``cv_model`` with the ``lgb`` module and the given categorical
    feature list, and returns its ``(train_predictions, test_predictions)`` pair.
    """
    oof, pred = cv_model(lgb, x_train, y_train, x_test, "lgb", cate_features)
    return oof, pred

def xgb_model(x_train, y_train, x_test):
    """Cross-validate the XGBoost back-end.

    Delegates to ``cv_model`` with the ``xgb`` module and returns its
    ``(train_predictions, test_predictions)`` pair.
    """
    oof, pred = cv_model(xgb, x_train, y_train, x_test, "xgb")
    return oof, pred

def cat_model(x_train, y_train, x_test):
    """Cross-validate the CatBoost back-end.

    Delegates to ``cv_model`` with ``CatBoostRegressor`` and returns its
    ``(train_predictions, test_predictions)`` pair.

    NOTE(review): a regressor is used although the target is binary -- confirm
    this is intentional.
    """
    oof, pred = cv_model(CatBoostRegressor, x_train, y_train, x_test, "cat")
    return oof, pred

In [None]:
# Run the cross-validated LightGBM training; yields predictions for the
# training rows and for the test rows.
lgb_train, lgb_test = lgb_model(x_train, y_train, x_test,  cate_features)  

In [None]:
# Final test predictions: the LightGBM output is used as-is (no blending).
rh_test =lgb_test

In [None]:
# Attach the predictions to the test frame under the target column name.
testA['isDefault'] = rh_test

In [None]:
# Write the submission file: one row per test id with its predicted isDefault value.
testA[['id','isDefault']].to_csv('lgb_sub.csv', index=False)