### 模型调优
使用网格搜索法对模型进行调优(调参时采用五折交叉验证的方式)，并进行模型评估

In [1]:
import pandas as pd 
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report
import xgboost as xgb
import lightgbm as lgb
import pandas as pd

In [2]:
df = pd.read_csv('../../dataset/LoanUsersData/data.csv', encoding='gbk')

# 数据处理
# 删除无关特征
df.drop(['Unnamed: 0', 'trade_no', 'bank_card_no', 'source', 'id_name'], inplace=True, axis=1)

# 数据类型转换
df['reg_preference_for_trad'].fillna('其他城市', inplace=True)
df['reg_preference_for_trad'].replace({
    '一线城市': 1,
    '二线城市': 2,
    '三线城市': 3,
    '境外': 4,
    '其他城市': 5
}, inplace=True)

# 处理日期类型数据
df.drop(['latest_query_time', 'loans_latest_time'], inplace=True, axis=1)

# 缺失值处理
df['student_feature'].fillna(0, inplace=True)
for i in df.columns:
    df[i].fillna(df[i].mode()[0], inplace=True)    # 用众数填充其他属性的缺失值

#### 评估方法一：切分数据集

In [3]:
# 切分数据集
y=df['status']
x=df.drop('status',axis=1)
x_train,x_test,y_train,y_test =train_test_split(x,y,test_size=0.3,random_state = 2018)   

features = x_train.columns 
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.fit_transform(x_test)

In [4]:
# 随机森林
clf_rf = RandomForestClassifier()
clf_rf.fit(x_train, y_train)
rf_y_pred = clf_rf.predict(x_test)

# 评估
ans = classification_report(y_test, rf_y_pred, digits=5)
print(ans)

              precision    recall  f1-score   support

           0    0.78891   0.95880   0.86560      1068
           1    0.65891   0.23677   0.34836       359

    accuracy                        0.77715      1427
   macro avg    0.72391   0.59779   0.60698      1427
weighted avg    0.75620   0.77715   0.73547      1427



#### 交叉验证

In [5]:
from sklearn.model_selection import cross_val_predict, cross_val_score

model_rf = RandomForestClassifier()
score_rf = cross_val_score(model_rf, x, y, cv=5)
pred_rf = cross_val_predict(model_rf, x, y, cv=5)
print(score_rf, score_rf.mean())
ans1 = classification_report(y, pred_rf, digits=5)
print(ans1)

[0.79705573 0.78969506 0.78338591 0.79915878 0.79789474] 0.7934380430571697
              precision    recall  f1-score   support

           0    0.80070   0.96462   0.87505      3561
           1    0.72845   0.28332   0.40797      1193

    accuracy                        0.79365      4754
   macro avg    0.76457   0.62397   0.64151      4754
weighted avg    0.78257   0.79365   0.75784      4754



#### 模型微调--网格搜索

In [6]:
from sklearn.model_selection import GridSearchCV
param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
{'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]
rf_model = RandomForestClassifier()
grid_search = GridSearchCV(rf_model, param_grid, cv=5)
grid_search.fit(x, y)

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

In [8]:
# 搜索结果
print('最高得分： %.5f' % grid_search.best_score_)
print('最优参数： n_estimators:{} \t max_features:{} \t bootstrap:{}'.format(grid_search.best_estimator_.n_estimators, \
                                                                 grid_search.best_estimator_.max_features, grid_search.best_estimator_.bootstrap))

最高得分： 0.78607
最优参数： n_estimators:30 	 max_features:8 	 bootstrap:True
