In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
from scipy.stats import randint, uniform
from scipy import linalg
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_predict, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.metrics import accuracy_score, r2_score, mean_squared_error
import xgboost as xgb
import random
%matplotlib inline

<a id='ecc'></a>
## Exploration, Cleaning, Creating Features

We have the following variables:

<ul>
<li><b><u>SeriousDlqin2yrs</u></b> (target): 两年内是否出现过 90 天及以上的逾期</li>
<li><b>age</b>: 年龄</li>
<li><b>RevolvingUtilizationOfUnsecuredLines</b>: 信用卡及个人信用额度的未偿余额总和 / 信用额度总和</li>
<li><b>DebtRatio</b>: 每月债务还款与生活费用支出之和除以月收入</li>
<li><b>NumberRealEstateLoansOrLines</b>: 不动产贷款或额度数量</li>
<li><b>NumberOfOpenCreditLinesAndLoans</b>: 未结清的贷款与信贷额度的数量</li>
<li><b>NumberOfTime30-59DaysPastDueNotWorse</b>: 30-59天逾期次数</li>
<li><b>NumberOfTime60-89DaysPastDueNotWorse</b>: 60-89天逾期次数</li>
<li><b>NumberOfTimes90DaysLate</b>: 90+天逾期次数</li>
<li><b>NumberOfDependents</b>: 家属人数（不包括自己）</li>
<li><b>MonthlyIncome</b>: 月收入</li>
</ul>

In [2]:
# Read the training CSV; its first column is used as the row index
# rather than being kept as a regular data column.
df = pd.read_csv(
    './GiveMeSomeCredit/cs-training.csv',
    index_col=0,
)
# Leave the preview as the cell's last expression so it renders as a table.
df.head()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
1,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
2,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
3,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
4,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
5,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [3]:
# Print each column's dtype and non-null count to reveal which columns have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150000 entries, 1 to 150000
Data columns (total 11 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   SeriousDlqin2yrs                      150000 non-null  int64  
 1   RevolvingUtilizationOfUnsecuredLines  150000 non-null  float64
 2   age                                   150000 non-null  int64  
 3   NumberOfTime30-59DaysPastDueNotWorse  150000 non-null  int64  
 4   DebtRatio                             150000 non-null  float64
 5   MonthlyIncome                         120269 non-null  float64
 6   NumberOfOpenCreditLinesAndLoans       150000 non-null  int64  
 7   NumberOfTimes90DaysLate               150000 non-null  int64  
 8   NumberRealEstateLoansOrLines          150000 non-null  int64  
 9   NumberOfTime60-89DaysPastDueNotWorse  150000 non-null  int64  
 10  NumberOfDependents                    146076 non-null  float64
dtype

### Balance