In [1]:
import pandas as pd
import os
import lightgbm as lgb
import xgboost as xgb
from sklearn.linear_model import SGDRegressor, LinearRegression, Ridge
from sklearn.preprocessing import MinMaxScaler
import math
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss
import matplotlib.pyplot as plt
import time
import warnings
warnings.filterwarnings('ignore')

import gc

In [9]:
path = 'C:/ZhangLI/Codes/DataSet/零基础入门金融风控-贷款违约预测'
train = pd.read_csv(path+'/train.csv')
test = pd.read_csv(path+'/testA.csv')

In [10]:
test.columns

Index(['id', 'loanAmnt', 'term', 'interestRate', 'installment', 'grade',
       'subGrade', 'employmentTitle', 'employmentLength', 'homeOwnership',
       'annualIncome', 'verificationStatus', 'issueDate', 'purpose',
       'postCode', 'regionCode', 'dti', 'delinquency_2years', 'ficoRangeLow',
       'ficoRangeHigh', 'openAcc', 'pubRec', 'pubRecBankruptcies', 'revolBal',
       'revolUtil', 'totalAcc', 'initialListStatus', 'applicationType',
       'earliesCreditLine', 'title', 'policyCode', 'n0', 'n1', 'n2', 'n3',
       'n4', 'n5', 'n6', 'n7', 'n8', 'n9', 'n10', 'n11', 'n12', 'n13', 'n14'],
      dtype='object')

In [20]:
# concat data
data = pd.concat([train, testA], axis=0, ignore_index=True)
data['employmentLength'].replace(to_replace='10+ years', value='10 years', inplace=True)
data['employmentLength'].replace('< 1 year', '0 years', inplace=True)

def employmentLength_to_int(s):
    if pd.isnull(s):
        return s
    else:
        return np.int8(s.split()[0])
    
data['employmentLength'] = data['employmentLength'].apply(employmentLength_to_int)
data['earliesCreditLine'] = data['earliesCreditLine'].apply(lambda s: int(s[-4:]))

AttributeError: 'float' object has no attribute 'split'

In [21]:
# data explore
print(sorted(data['grade'].unique()))
print(sorted(data['subGrade'].unique()))
data['employmentLength'].value_counts(dropna=False).sort_index()
data['earliesCreditLine'].sample(5)

['A', 'B', 'C', 'D', 'E', 'F', 'G']
['A1', 'A2', 'A3', 'A4', 'A5', 'B1', 'B2', 'B3', 'B4', 'B5', 'C1', 'C2', 'C3', 'C4', 'C5', 'D1', 'D2', 'D3', 'D4', 'D5', 'E1', 'E2', 'E3', 'E4', 'E5', 'F1', 'F2', 'F3', 'F4', 'F5', 'G1', 'G2', 'G3', 'G4', 'G5']


840184    1989
899925    2003
807775    2003
85011     2005
870703    1999
Name: earliesCreditLine, dtype: int64

In [22]:
# 部分类别特征
cate_features = ['grade', 'subGrade', 'employmentTitle', 'homeOwnership', 'verificationStatus', 'purpose', 'postCode', 'regionCode', 
                 'applicationType', 'initialListStatus', 'title', 'policyCode']
for f in cate_features:
    print(f, '类型数：', data[f].nunique())

grade 类型数： 7
subGrade 类型数： 35
employmentTitle 类型数： 298101
homeOwnership 类型数： 6
verificationStatus 类型数： 3
purpose 类型数： 14
postCode 类型数： 935
regionCode 类型数： 51
applicationType 类型数： 2
initialListStatus 类型数： 2
title 类型数： 47903
policyCode 类型数： 1


In [23]:
# 类型数在2之上，又不是高维稀疏的
data = pd.get_dummies(data, columns=['grade', 'subGrade', 'homeOwnership', 'verificationStatus', 'purpose', 'regionCode'], drop_first=True)

In [24]:
data

Unnamed: 0,id,loanAmnt,term,interestRate,installment,employmentTitle,employmentLength,annualIncome,issueDate,isDefault,...,regionCode_41,regionCode_42,regionCode_43,regionCode_44,regionCode_45,regionCode_46,regionCode_47,regionCode_48,regionCode_49,regionCode_50
0,0,35000.0,5,19.52,917.97,320.0,2.0,110000.0,2014-07-01,1.0,...,0,0,0,0,0,0,0,0,0,0
1,1,18000.0,5,18.49,461.90,219843.0,5.0,46000.0,2012-08-01,0.0,...,0,0,0,0,0,0,0,0,0,0
2,2,12000.0,5,16.99,298.17,31698.0,8.0,74000.0,2015-10-01,0.0,...,0,0,0,0,0,0,0,0,0,0
3,3,11000.0,3,7.26,340.96,46854.0,10.0,118000.0,2015-08-01,0.0,...,0,0,0,0,0,0,0,0,0,0
4,4,3000.0,3,12.99,101.07,54.0,,29000.0,2016-03-01,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,999995,7000.0,3,11.14,229.64,330967.0,7.0,30000.0,2012-10-01,,...,0,0,0,0,0,0,0,0,0,0
999996,999996,6000.0,3,6.24,183.19,38930.0,1.0,56000.0,2015-10-01,,...,0,0,0,0,0,0,0,0,0,0
999997,999997,14000.0,5,15.88,339.57,282016.0,8.0,80000.0,2013-07-01,,...,0,0,0,0,0,0,0,0,0,0
999998,999998,8000.0,3,18.06,289.47,97.0,4.0,190000.0,2017-10-01,,...,0,0,0,0,0,0,0,0,0,0
