In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import warnings
import lightgbm as lgb
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb
warnings.filterwarnings('ignore')
pd.set_option('max_columns', None)

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
# reduce_mem_usage 函数通过调整数据类型，帮助我们减少数据在内存中占用的空间
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() 
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
#         else:
#             df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() 
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

In [3]:
data_path = './'
save_path = '/conent/drive/loan_default/'

In [3]:
data = pd.read_csv('./results/data_5.csv')

In [4]:
data = reduce_mem_usage(data)

Memory usage of dataframe is 499059728.00 MB
Memory usage after optimization is: 132442868.00 MB
Decreased by 73.5%


In [6]:
data.drop(columns=['Unnamed: 0'], inplace=True)

In [14]:
data['employmentTitle_bin'] = pd.qcut(data['employmentTitle'], 10)
data['postCode_bin'] = pd.qcut(data['postCode'], 10)

In [15]:
# data['verificationStatus'] = data['verificationStatus'].astype('category')
# data['applicationType'] = data['applicationType'].astype('category')
# data['purpose'] = data['purpose'].astype('category')
# data['initialListStatus'] = data['initialListStatus'].astype('category')
# data['employmentTitle'] = data['employmentTitle'].astype('category')
# data['postCode'] = data['postCode'].astype('category')
# data['title'] = data['title'].astype('category')
data['postCode_bin'].astype('category')
data['employmentTitle_bin'].astype('category')

0               (184.0, 754.0]
1           (20026.0, 59624.0]
2           (20026.0, 59624.0]
3               (-0.001, 54.0]
4           (20026.0, 59624.0]
                  ...         
959725    (247864.2, 378351.0]
959726      (20026.0, 59624.0]
959727    (247864.2, 378351.0]
959728           (54.0, 184.0]
959729          (184.0, 754.0]
Name: employmentTitle_bin, Length: 959730, dtype: category
Categories (10, interval[float64]): [(-0.001, 54.0] < (54.0, 184.0] < (184.0, 754.0] < (754.0, 2330.0] ... (20026.0, 59624.0] < (59624.0, 138434.4] < (138434.4, 247864.2] < (247864.2, 378351.0]]

In [16]:
data.head()

Unnamed: 0,id,term,installment,subGrade,employmentTitle,employmentLength,annualIncome,verificationStatus,issueDate,isDefault,purpose,postCode,dti,delinquency_2years,ficoRangeLow,openAcc,revolBal,revolUtil,totalAcc,initialListStatus,applicationType,earliesCreditLine,title,n0,n4,n5,n6,n8,n9,n11,n12,n13,n14,istest,employmentTitle_bin,Acc,postCode_bin,installment_median,installment_mean,dti_median,dti_mean,revolBal_median,revolBal_mean,revolUtil_median,revolUtil_mean,earliesCreditLine_median,earliesCreditLine_mean,ficoRangeLow_median,ficoRangeLow_mean,n6_median,n6_mean,totalAcc_median,totalAcc_mean,n9_median,n9_mean,n8_median,n8_mean,n5_median,n5_mean,issueDate_median,issueDate_mean,postCode_count,employmentTitle_count,title_count
0,0,5,918.0,22,320.0,2.0,110000.0,2,2353,1.0,1,137.0,17.046875,0.0,730.0,7.0,24178.0,48.90625,27.0,0,0,232,1.0,0.0,4.0,9.0,8.0,12.0,2.0,0.0,0.0,0.0,2.0,0,"(184.0, 754.0]",20.0,"(120.0, 157.0]",470.75,519.5,20.453125,20.90625,11424.0,16032.0,60.34375,58.625,234.0,248.75,675.0,682.0,7.0,8.828125,23.0,24.90625,5.0,6.007812,13.0,14.289062,7.0,7.707031,1988,2011.0,12126,12126,12126
1,2,5,298.25,18,31698.0,8.0,74000.0,2,1896,0.0,0,337.0,22.765625,0.0,675.0,11.0,4606.0,51.8125,27.0,0,0,175,0.0,0.0,0.0,0.0,21.0,5.0,3.0,0.0,0.0,0.0,4.0,0,"(20026.0, 59624.0]",16.0,"(263.0, 343.0]",405.0,475.75,19.96875,20.46875,10536.0,15184.0,58.1875,57.09375,233.0,247.875,680.0,683.0,7.0,8.695312,22.0,24.4375,5.0,5.90625,12.0,14.039062,7.0,7.589844,1957,1925.0,22267,22267,22267
2,3,3,341.0,4,46854.0,10.0,118000.0,1,1957,0.0,4,148.0,17.203125,0.0,685.0,9.0,9948.0,52.59375,28.0,1,0,259,4.0,6.0,4.0,16.0,4.0,21.0,6.0,0.0,0.0,0.0,1.0,0,"(20026.0, 59624.0]",19.0,"(120.0, 157.0]",373.25,442.25,15.53125,16.265625,12048.0,18288.0,40.3125,42.03125,260.0,279.0,715.0,718.5,7.0,8.5,25.0,26.234375,5.0,5.105469,14.0,15.429688,8.0,8.867188,1957,1938.0,28100,28100,28100
3,4,3,101.0625,12,54.0,0.0,29000.0,2,1744,0.0,10,301.0,32.15625,0.0,690.0,12.0,2942.0,32.0,27.0,0,0,520,11.0,1.0,2.0,4.0,9.0,15.0,7.0,0.0,0.0,0.0,4.0,0,"(-0.001, 54.0]",15.0,"(263.0, 343.0]",343.75,415.5,18.234375,18.8125,10776.0,15664.0,55.6875,54.84375,239.0,254.25,680.0,688.5,6.0,8.382812,23.0,24.484375,5.0,5.679688,13.0,14.273438,7.0,7.808594,1896,1930.0,44889,44889,44889
4,5,3,344.75,5,51727.0,7.0,39000.0,2,1348,0.0,9,512.0,17.140625,0.0,730.0,19.0,4047.0,31.09375,52.0,1,0,269,10.0,12.0,1.0,1.0,48.0,3.0,2.0,0.0,0.0,0.0,0.0,0,"(20026.0, 59624.0]",33.0,"(451.0, 555.0]",375.5,435.75,16.0,16.65625,12344.0,18592.0,44.1875,45.375,256.0,274.5,705.0,711.5,7.0,8.617188,24.0,26.09375,5.0,5.253906,14.0,15.28125,8.0,8.6875,1957,1958.0,35546,35546,35546


In [17]:
train = data[data['istest']==0].drop(columns=['istest'])
y_train = train['isDefault']
X_train = train.drop(columns=['isDefault', 'id'])
X_test = data[data.istest==1].drop(columns=['istest', 'isDefault'])
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)

(759730, 61)
(200000, 62)
(759730,)


In [22]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 200000 entries, 759730 to 959729
Data columns (total 62 columns):
 #   Column                    Non-Null Count   Dtype   
---  ------                    --------------   -----   
 0   id                        200000 non-null  int32   
 1   term                      200000 non-null  int8    
 2   installment               200000 non-null  float16 
 3   subGrade                  200000 non-null  int8    
 4   employmentTitle           200000 non-null  float32 
 5   employmentLength          200000 non-null  float16 
 6   annualIncome              200000 non-null  float32 
 7   verificationStatus        200000 non-null  int8    
 8   issueDate                 200000 non-null  int16   
 9   purpose                   200000 non-null  int8    
 10  postCode                  200000 non-null  float16 
 11  dti                       199939 non-null  float16 
 12  delinquency_2years        200000 non-null  float16 
 13  ficoRangeLow            

 31       |  0.7283   |  0.8779   |  0.2106   |  0.1255   |  1.735    |  9.13     |  35.2     |  0.03568  |  74.18    |
 
 |   iter    |  target   | baggin... | featur... | lambda_l1 | lambda_l2 | max_depth | min_ch... | min_sp... | num_le... |
 
 | 31 | 0.7327 | 0.8779 | 0.2106 | 0.1255 | 1.735 | 9.13 | 35.2 | 0.03568 | 83.21 |

In [19]:
from sklearn.model_selection import StratifiedKFold
# 分离数据集，方便进行交叉验证

# 10折交叉验证
folds = 5
seed = 2020
skf = StratifiedKFold(n_splits=folds)

In [20]:
# 微调版
"""使用lightgbm 5折交叉验证进行建模预测"""
cv_scores = []
for i, (train_index, valid_index) in enumerate(skf.split(X_train, y_train)):
    print('************************************ {} ************************************'.format(str(i+1)))
    X_train_split, y_train_split, X_val, y_val = X_train.iloc[train_index], y_train[train_index], X_train.iloc[valid_index], y_train[valid_index]
    
    train_matrix = lgb.Dataset(X_train_split, label=y_train_split)
    valid_matrix = lgb.Dataset(X_val, label=y_val)

    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',      
        
        'learning_rate': 0.01,
        'num_leaves': 83,
        'max_depth': 9,
        
        'min_child_weight':35.2,
        'min_split_gain':0.03568,
        'bagging_fraction':0.8779,
        'feature_fraction':0.2106,
        'lambda_l1':0.1255,
        'lambda_l2':1.735,
        
 
        'seed': 2020,
        'silent': False,
        'categorical_feature':['name:employmentTitle_bin', 'name:postCode_bin']
    }
    
    model = lgb.train(params, train_set=train_matrix, num_boost_round=15000, valid_sets=valid_matrix, verbose_eval=1000, early_stopping_rounds=200)
    val_pred = model.predict(X_val, num_iteration=model.best_iteration)
    
    cv_scores.append(roc_auc_score(y_val, val_pred))
    print(cv_scores)

print("lgb_scotrainre_list:{}".format(cv_scores))
print("lgb_score_mean:{}".format(np.mean(cv_scores)))
print("lgb_score_std:{}".format(np.std(cv_scores)))

************************************ 1 ************************************


LightGBMError: Could not find categorical_feature employmentTitle_bin in data file

In [42]:
X_test['isDefault'] = model.predict(X_test.drop(columns=['id']))
X_test = X_test[['id', 'isDefault']]
X_test.head()

Unnamed: 0,id,isDefault
800000,800000,0.113073
800001,800001,0.304508
800002,800002,0.562978
800003,800003,0.30302
800004,800004,0.317593


In [43]:
X_test.to_csv('./submit_1206_2.csv')

In [44]:
model.save_model('./model_1206_2.txt')

<lightgbm.basic.Booster at 0x7f0aa8a08810>

In [None]:
X_test[X_test['isDefault']>0.51].count()

In [45]:
pd.DataFrame({'fea_name':model.feature_name(), 'importance':model.feature_importance()}).sort_values(by='importance')

Unnamed: 0,fea_name,importance
36,remainRec_bin,131
29,n12,209
37,pubRec_bin,538
35,remainRec,552
0,term,643
32,employmentTitle_bin,706
14,pubRecBankruptcies,735
30,n13,752
19,applicationType,855
38,homeOwnership_0,1033


In [28]:
X_train.head()

Unnamed: 0,Unnamed0,term,installment,subGrade,employmentTitle,employmentLength,annualIncome,verificationStatus,purpose,postCode,dti,delinquency_2years,ficoRangeLow,openAcc,pubRec,pubRecBankruptcies,revolBal,revolUtil,totalAcc,initialListStatus,applicationType,earliesCreditLine,title,n0,n4,n5,n6,n8,n9,n11,n12,n13,n14,employmentTitle_bin,fico,Acc,remainRec,remainRec_bin,pubRec_bin,homeOwnership_0,homeOwnership_1,postCode_bin,n6_mean,dti_mean,revolUtil_mean,n9_mean,n4_mean,n11_mean,ficoRangeLow_mean
0,0,5,918.0,22,320.0,2.0,110000.0,2,1,137.0,17.046875,0.0,730.0,7.0,0.0,0.0,24178.0,48.90625,27.0,0,0,232,1.0,0.0,4.0,9.0,8.0,12.0,2.0,0.0,0.0,0.0,2.0,0,1,20.0,0.0,0,0,0,0,3.0,8.851562,20.609375,59.15625,6.007812,4.605469,0.001111,682.0
1,1,5,462.0,17,219843.0,5.0,46000.0,2,0,156.0,27.828125,0.0,700.0,13.0,0.0,0.0,15096.0,38.90625,18.0,1,0,223,1723.0,,10.0,,,,,,,,,0,1,5.0,0.0,0,0,1,0,3.0,8.570312,20.109375,57.5,5.875,4.554688,0.000898,683.5
2,2,5,298.25,18,31698.0,8.0,74000.0,2,0,337.0,22.765625,0.0,675.0,11.0,0.0,0.0,4606.0,51.8125,27.0,0,0,175,0.0,0.0,0.0,0.0,21.0,5.0,3.0,0.0,0.0,0.0,4.0,0,1,16.0,0.0,0,0,1,0,6.0,8.65625,20.234375,57.4375,5.882812,4.527344,0.000753,683.5
3,3,3,341.0,4,46854.0,10.0,118000.0,1,4,148.0,17.203125,0.0,685.0,9.0,0.0,0.0,9948.0,52.59375,28.0,1,0,259,4.0,6.0,4.0,16.0,4.0,21.0,6.0,0.0,0.0,0.0,1.0,0,1,19.0,0.0,0,0,0,1,3.0,8.515625,15.984375,41.71875,5.125,5.046875,0.000552,720.0
4,4,3,101.0625,12,54.0,0.0,29000.0,2,10,301.0,32.15625,0.0,690.0,12.0,0.0,0.0,2942.0,32.0,27.0,0,0,520,11.0,1.0,2.0,4.0,9.0,15.0,7.0,0.0,0.0,0.0,4.0,1,1,15.0,0.0,0,0,0,1,6.0,8.414062,18.671875,55.0625,5.679688,4.601562,0.000731,688.5


In [33]:
data.drop(columns=['Unnamed0']).to_csv('./data.csv', index=False)