# 一起来打怪之 Credit Scoring 练习

-------
## >>>说明：
### 1. 答题步骤：
- 回答问题**请保留每一步**操作过程，请不要仅仅给出最后答案
- 请养成代码注释的好习惯

### 2. 解题思路：
- 为方便大家准确理解题目，在习题实战中有所收获，本文档提供了解题思路提示
- 解题思路**仅供参考**，鼓励原创解题方法
- 为督促同学们自己思考，解题思路内容设置为**注释**，请注意查看

### 3. 所用数据：
- 问题使用了多个数据库，请注意导入每个数据库后都先**查看和了解数据的基本性质**，后面的问题不再一一提醒

--------
## 操作题

### 信用卡欺诈项目

 #### 前期数据导入,预览及处理(此部分勿修改，涉及的数据文件无需复制移动)

In [1]:
import pandas as pd
pd.set_option('display.max_columns', 500)
import zipfile
with zipfile.ZipFile('KaggleCredit2.csv.zip', 'r') as z:
    f = z.open('KaggleCredit2.csv')
    data = pd.read_csv(f, index_col=0)
data.head()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,0.766127,45.0,2.0,0.802982,9120.0,13.0,0.0,6.0,0.0,2.0
1,0,0.957151,40.0,0.0,0.121876,2600.0,4.0,0.0,0.0,0.0,1.0
2,0,0.65818,38.0,1.0,0.085113,3042.0,2.0,1.0,0.0,0.0,0.0
3,0,0.23381,30.0,0.0,0.03605,3300.0,5.0,0.0,0.0,0.0,0.0
4,0,0.907239,49.0,1.0,0.024926,63588.0,7.0,0.0,1.0,0.0,0.0


In [2]:
# 检查数据维度
data.shape

(112915, 11)

In [3]:
# 查看数据缺失值情况
data.isnull().sum(axis=0)

SeriousDlqin2yrs                           0
RevolvingUtilizationOfUnsecuredLines       0
age                                     4267
NumberOfTime30-59DaysPastDueNotWorse       0
DebtRatio                                  0
MonthlyIncome                              0
NumberOfOpenCreditLinesAndLoans            0
NumberOfTimes90DaysLate                    0
NumberRealEstateLoansOrLines               0
NumberOfTime60-89DaysPastDueNotWorse       0
NumberOfDependents                      4267
dtype: int64

In [4]:
# 清除缺失值
data.dropna(inplace=True)
data.shapey = data['SeriousDlqin2yrs']
X = data.drop('SeriousDlqin2yrs', axis=1)

  This is separate from the ipykernel package so we can avoid doing imports until


In [5]:
# 取出对应的X和y
y = data['SeriousDlqin2yrs']
X = data.drop('SeriousDlqin2yrs', axis=1)
# 查看平均的欺诈率
y.mean()

0.06742876076872101

### 以下为操作题

#### 1.把数据切分成训练集和测试集

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
y_train.value_counts().plot(kind='bar', color=['blue', 'red'])

ImportError: matplotlib is required for plotting when the default backend "matplotlib" is selected.

In [8]:
#正负样本不均衡

In [9]:
y_test.value_counts()

0    20246
1     1484
Name: SeriousDlqin2yrs, dtype: int64

In [10]:
# 通过SeriousDlqin2yrs字段查看正负样本分布情况
# 提示：value_counts


# 绘制两种类别的柱状图
# 提示：dataframe可以直接plot(kind='bar')



#### 2.数据预处理之离散化

In [11]:
# 请对年龄按照3岁一个区间进行离散化
# 提示：可以先计算出分桶边界，再基于pandas的cut函数进行离散化(分箱、分桶)


In [12]:
X_train

Unnamed: 0,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
40266,0.052899,80.0,0.0,0.342892,5683.0,14.0,0.0,1.0,0.0,1.0
102291,0.314817,55.0,0.0,0.133092,11600.0,5.0,0.0,1.0,0.0,1.0
1310,0.000000,36.0,4.0,0.437850,6250.0,11.0,0.0,2.0,1.0,0.0
63327,0.261331,54.0,0.0,0.395710,5733.0,16.0,0.0,1.0,0.0,1.0
48272,0.029445,58.0,0.0,0.130216,13300.0,8.0,1.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...
57097,0.287522,30.0,0.0,0.221714,6778.0,10.0,0.0,2.0,0.0,0.0
79879,0.930403,38.0,0.0,0.204423,3345.0,7.0,0.0,0.0,0.0,2.0
107765,0.019931,75.0,0.0,0.004285,10500.0,7.0,0.0,0.0,0.0,0.0
898,0.087649,27.0,0.0,0.009995,2200.0,2.0,0.0,0.0,1.0,1.0


In [13]:
max(list([X_train['age'].max,X_test['age'].max()]))

TypeError: '>' not supported between instances of 'float' and 'method'

In [14]:
X_train['age'].max

<bound method NDFrame._add_numeric_operations.<locals>.max of 40266     80.0
102291    55.0
1310      36.0
63327     54.0
48272     58.0
          ... 
57097     30.0
79879     38.0
107765    75.0
898       27.0
16428     55.0
Name: age, Length: 86918, dtype: float64>

In [15]:
X_test['age'].max()

101.0

In [16]:
edges = list(range(0, int(X_test['age'].max())+4,3))

In [17]:
X_train['agegroup'] = pd.cut(X_train['age'], bins=edges,labels=False)

In [18]:
X_test['agegroup'] = pd.cut(X_test['age'], bins=edges,labels=False)

#### 3.数据预处理之独热向量编码

In [19]:
# 请对上述分箱后的年龄段进行独热向量编码
# 提示：使用pandas的get_dummies完成


In [20]:
one_hot = pd.get_dummies(X_train['agegroup'], prefix='agegroup')

X_train = pd.concat([X_train, one_hot], axis=1)

In [21]:
X_train

Unnamed: 0,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents,agegroup,agegroup_6.0,agegroup_7.0,agegroup_8.0,agegroup_9.0,agegroup_10.0,agegroup_11.0,agegroup_12.0,agegroup_13.0,agegroup_14.0,agegroup_15.0,agegroup_16.0,agegroup_17.0,agegroup_18.0,agegroup_19.0,agegroup_20.0,agegroup_21.0,agegroup_22.0,agegroup_23.0,agegroup_24.0,agegroup_25.0,agegroup_26.0,agegroup_27.0,agegroup_28.0,agegroup_29.0,agegroup_30.0,agegroup_31.0,agegroup_32.0,agegroup_33.0
40266,0.052899,80.0,0.0,0.342892,5683.0,14.0,0.0,1.0,0.0,1.0,26.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
102291,0.314817,55.0,0.0,0.133092,11600.0,5.0,0.0,1.0,0.0,1.0,18.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1310,0.000000,36.0,4.0,0.437850,6250.0,11.0,0.0,2.0,1.0,0.0,11.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
63327,0.261331,54.0,0.0,0.395710,5733.0,16.0,0.0,1.0,0.0,1.0,17.0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
48272,0.029445,58.0,0.0,0.130216,13300.0,8.0,1.0,1.0,0.0,1.0,19.0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57097,0.287522,30.0,0.0,0.221714,6778.0,10.0,0.0,2.0,0.0,0.0,9.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
79879,0.930403,38.0,0.0,0.204423,3345.0,7.0,0.0,0.0,0.0,2.0,12.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
107765,0.019931,75.0,0.0,0.004285,10500.0,7.0,0.0,0.0,0.0,0.0,24.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
898,0.087649,27.0,0.0,0.009995,2200.0,2.0,0.0,0.0,1.0,1.0,8.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [22]:
one_hot = pd.get_dummies(X_test['agegroup'], prefix='agegroup')

X_test = pd.concat([X_test, one_hot], axis=1)


#### 4.数据预处理之幅度缩放

In [23]:
# 请对连续值特征进行幅度缩放
# 提示：可以使用StandardScaler等幅度缩放器进行处理
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

#连续特征有
features_to_scale = ['RevolvingUtilizationOfUnsecuredLines', 'DebtRatio','MonthlyIncome']

# 使用 fit_transform 对选择的特征列进行幅度缩放
X_train[features_to_scale] = scaler.fit_transform(X_train[features_to_scale])


In [24]:
X_test[features_to_scale] = scaler.fit_transform(X_test[features_to_scale])

In [25]:
X_train.drop(["agegroup","age"], axis=1, inplace=True)
X_test.drop(["agegroup","age"], axis=1, inplace=True)

In [26]:
X_train.shape

(86918, 37)

In [27]:
X_test.shape

(21730, 37)

#### 5.使用logistic regression建模，并且输出一下系数，分析重要度。   

In [124]:
# 提示：fit建模，建完模之后可以取出coef属性
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()

lr.fit(X_train, y_train)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [126]:
#取出特征的coef
coefficients = lr.coef_
coefficients[0]

array([-0.0156334 ,  0.467622  ,  0.30798023, -0.07341862, -0.02302149,
        0.44146934, -0.19549518, -0.87026371,  0.09492283, -0.04851426,
        0.51266316,  0.69370554,  0.51137   ,  0.39410818,  0.40821275,
        0.26042991,  0.28325902,  0.12604619,  0.21490908,  0.13932627,
        0.16228096, -0.03894009, -0.13400387, -0.24467298, -0.60754337,
       -0.82244628, -0.69520512, -0.77884549, -0.794883  , -0.76060993,
       -0.42927367, -0.4495571 , -0.33327799, -0.09117529, -0.05859908,
        0.00995178, -0.00305238])

In [127]:
feaures_names = X_train.columns

data = list(zip(feaures_names, coefficients[0]))

fea_importance = pd.DataFrame(data,columns = ['feature','coef'])

fea_importance.sort_values(by='coef', ascending=False)
#将特征按coef排序

Unnamed: 0,feature,coef
11,agegroup_8.0,0.693706
10,agegroup_7.0,0.512663
12,agegroup_9.0,0.51137
1,NumberOfTime30-59DaysPastDueNotWorse,0.467622
5,NumberOfTimes90DaysLate,0.441469
14,agegroup_11.0,0.408213
13,agegroup_10.0,0.394108
2,DebtRatio,0.30798
16,agegroup_13.0,0.283259
15,agegroup_12.0,0.26043


#### 6.使用网格搜索交叉验证进行调参
调整penalty和C参数，其中penalty候选为"l1"和"l2"，C的候选为[1,10,100,500]

In [133]:
# 提示：先按照上面要求准备好网格字典，再使用GridSearchCV进行调参
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
lr = LogisticRegression()

lr.fit(X_train, y_train)

model = LogisticRegression(solver='liblinear',max_iter=1000)#未收敛，增加迭代次数.
##ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.因此换为liblinera solver


# 定义要尝试的超参数组合
param = {'C': [1,10,100,500], 'penalty': ['l1', 'l2']}

# 创建 GridSearchCV 对象
gsc_lr = GridSearchCV(estimator=model, param_grid=param)

# 在训练集上拟合模型
gsc_lr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


GridSearchCV(estimator=LogisticRegression(max_iter=1000, solver='liblinear'),
             param_grid={'C': [1, 10, 100, 500], 'penalty': ['l1', 'l2']})

In [137]:
# 输出最好的超参数
# 输出最好的模型

best_params = gsc_lr.best_params_

best_model = gsc_lr.best_estimator_


#### 7.在测试集上进行预测，计算 查准率/查全率/auc/混淆矩阵/f1值 等测试指标

In [35]:
# 提示：在测试集上预测可以使用predict
# 提示：各种指标可以在sklearn.metrics中查到各种评估指标，分别是accuracy_score、recall_score、auc、confusion_matrix、f1_score
from sklearn.metrics import accuracy_score,recall_score,auc,roc_curve,confusion_matrix,f1_score


# 在测试集上进行预测
y_pred = best_model.predict(X_test)

# 评估模型性能
accuracy = accuracy_score(y_test, y_pred)
recall_score = recall_score(y_test, y_pred)

confusion_matrix = confusion_matrix(y_test, y_pred)
f1_score = f1_score(y_test, y_pred)



fpr, tpr, thresholds = roc_curve(y_test, y_pred)

# 计算曲线下面积（AUC）
roc_auc = auc(fpr, tpr)

In [147]:
accuracy,recall_score,confusion_matrix,f1_score,roc_auc

(0.933087896916705,
 0.04514824797843666,
 array([[20209,    37],
        [ 1417,    67]]),
 0.08438287153652393,
 0.5216603632463556)

#### 8.更多优化
银行通常会有更严格的要求，因为欺诈带来的后果通常比较严重，一般我们会调整模型的标准。   

比如在logistic regression当中，一般我们的概率判定边界为0.5，但是我们可以把阈值设定低一些，来提高模型的“敏感度”   
试试看把阈值设定为0.3，再看看这个时候的混淆矩阵等评估指标。

In [36]:
def cal_eval(y_test,y_pred):
    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    co_matrix = confusion_matrix(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)



    fpr, tpr, thresholds = roc_curve(y_test, y_pred)

    # 计算曲线下面积（AUC）
    roc_auc = auc(fpr, tpr)
    
    print(f"acc:{accuracy},recall:{recall},f1:{f1},roc:{roc_auc}")

In [178]:
# 提示：thresholds = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
# 根据predict_proba的结果和threshold的比较确定结果，再评估各种结果指标
lr_pred_new = []
lr_pred_prob = best_model.predict_proba(X_test)

for i in lr_pred_prob:
    if i[0]>=0.3:
        lr_pred_new.append(0)
    else:
        lr_pred_new.append(1)

lr_pred_new = np.array(lr_pred_new)

In [183]:
from sklearn.metrics import accuracy_score,recall_score,auc,roc_curve,confusion_matrix,f1_score
cal_eval(y_test,lr_pred_new)##0.3

acc:0.9321675103543489,recall:0.02021563342318059,f1:0.03911342894393741,roc:0.5096138919857185


In [184]:
lr_pred_new = []
lr_pred_prob = best_model.predict_proba(X_test)

for i in lr_pred_prob:
    if i[0]>=0.9:
        lr_pred_new.append(0)
    else:
        lr_pred_new.append(1)

lr_pred_new = np.array(lr_pred_new)
cal_eval(y_test,lr_pred_new)##0.9

acc:0.8534744592728947,recall:0.4359838274932615,f1:0.28896828941491737,roc:0.6600298471655777


#### 9.尝试对不同特征的重要度进行排序，通过特征选择的方式，对特征进行筛选。并重新建模，观察此时的模型准确率等评估指标。

In [185]:
# 可以根据逻辑回归的系数绝对值大小进行排序，也可以基于树模型的特征重要度进行排序
# 特征选择可以使用RFE或者selectFromModel

coef_abs = np.abs(best_model.coef_[0])

# 获取排序后的索引
sorted_indices = np.argsort(coef_abs)[::-1]

# 获取排序后的特征名和对应的系数
sorted_features = X_train.columns[sorted_indices]
sorted_coefficients = best_model.coef_[0][sorted_indices]

# 创建 DataFrame 显示排序后的结果
sorted_df = pd.DataFrame({'Feature': sorted_features, 'Coefficient': sorted_coefficients})


In [191]:
sorted_df.head(15)

Unnamed: 0,Feature,Coefficient
0,agegroup_29.0,-0.941463
1,NumberOfTime60-89DaysPastDueNotWorse,-0.861895
2,agegroup_28.0,-0.835204
3,agegroup_26.0,-0.800776
4,agegroup_8.0,0.755337
5,agegroup_31.0,-0.733585
6,agegroup_25.0,-0.707014
7,agegroup_22.0,-0.631208
8,agegroup_24.0,-0.627627
9,agegroup_9.0,0.620013


In [193]:
new_feature = sorted_features[0:15]

In [203]:

##只使用排名前15的特征

model = LogisticRegression(solver='liblinear',max_iter=1000)#未收敛，增加迭代次数.
##ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.因此换为liblinera solver


# 定义要尝试的超参数组合
param = {'C': [1,10,100,500], 'penalty': ['l1', 'l2']}

# 创建 GridSearchCV 对象
gsc_lr = GridSearchCV(estimator=model, param_grid=param)

# 在训练集上拟合模型
gsc_lr.fit(X_train[new_feature], y_train)

GridSearchCV(estimator=LogisticRegression(max_iter=1000, solver='liblinear'),
             param_grid={'C': [1, 10, 100, 500], 'penalty': ['l1', 'l2']})

In [204]:
best_params = gsc_lr.best_params_

best_model = gsc_lr.best_estimator_

In [205]:
best_params,best_model

({'C': 1, 'penalty': 'l1'},
 LogisticRegression(C=1, max_iter=1000, penalty='l1', solver='liblinear'))

In [206]:
y_pred = best_model.predict(X_test[new_feature])

In [207]:
X_test = X_test.rename(columns=dict(zip(X_test.columns, X_train.columns)))

In [208]:
y_pred = best_model.predict(X_test[new_feature])

In [209]:
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [210]:
#只用前15个特征的结果
cal_eval(y_test,y_pred)

acc:0.9320294523699953,recall:0.012129380053908356,f1:0.02379378717779246,roc:0.5057930314277248


In [None]:
#特征全用时，分别为。(0.933087896916705,0.04514824797843666,0.08438287153652393,0.5216603632463556)
#特征用少时，有些大部分指标反而上升了。

#### 10.其他模型算法尝试
使用RandomForestClassifier/SVM/KNN等sklearn分类算法进行分类，尝试上述超参数调优算法过程。

In [None]:
# 随机森林
from sklearn.ensemble import RandomForestClassifier
# 支持向量机
from sklearn.svm import SVC
# K最近邻
from sklearn.neighbors import KNeighborsClassifier



## 随机森林

In [211]:
from sklearn.ensemble import RandomForestClassifier

In [222]:
# 定义要尝试的超参数组合，树模型的主要参数有树深。
param = {'max_depth':[1,2,3],'n_estimators':[50,100]}


rf = RandomForestClassifier(random_state = 42)
# 创建 GridSearchCV 对象
gsc_rf = GridSearchCV(estimator=rf, param_grid=param)

# 在训练集上拟合模型
gsc_rf.fit(X_train, y_train)

GridSearchCV(estimator=RandomForestClassifier(random_state=42),
             param_grid={'max_depth': [1, 2, 3], 'n_estimators': [50, 100]})

In [225]:
best_params = gsc_rf.best_params_

best_model = gsc_rf.best_estimator_

y_pred = best_model.predict(X_test)

cal_eval(y_test,y_pred)

acc:0.9317073170731708,recall:0.0,f1:0.0,roc:0.5


In [226]:
feature_importance = best_model.feature_importances_

In [227]:
feature_importance

array([0.12, 0.12, 0.06, 0.04, 0.06, 0.22, 0.04, 0.12, 0.  , 0.  , 0.  ,
       0.02, 0.04, 0.02, 0.04, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.02, 0.04, 0.04, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
       0.  , 0.  , 0.  , 0.  ])

In [229]:
feature_importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importance})

In [234]:
new_feature = feature_importance_df.sort_values(by='Importance', ascending=False)["Feature"][0:10]

### 使用部分特征

In [235]:
# 定义要尝试的超参数组合，树模型的主要参数有树深。
param = {'max_depth':[1,2,3],'n_estimators':[50,100]}


rf = RandomForestClassifier(random_state = 42)
# 创建 GridSearchCV 对象
gsc_rf = GridSearchCV(estimator=rf, param_grid=param)

# 在训练集上拟合模型
gsc_rf.fit(X_train[new_feature], y_train)

GridSearchCV(estimator=RandomForestClassifier(random_state=42),
             param_grid={'max_depth': [1, 2, 3], 'n_estimators': [50, 100]})

In [237]:
best_params = gsc_rf.best_params_

best_model = gsc_rf.best_estimator_

y_pred = best_model.predict(X_test[new_feature])

cal_eval(y_test,y_pred)

acc:0.9331339162448228,recall:0.0431266846361186,f1:0.08096141682479444,roc:0.5207483665203709


In [238]:
##指标均有提升

## SVM

In [29]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

In [38]:
# 定义要尝试的超参数组合，SVM主要参数有，正则项系数C，核函数，
param = {'C':[0.1, 1, 10],'kernel':['rbf','linear']}

svc = SVC(random_state=42,max_iter=100)#默认迭代次数太慢了 ，
         
# 创建 GridSearchCV 对象
gsc_svm = GridSearchCV(estimator=svc, param_grid=param)

# 在训练集上拟合模型
gsc_svm.fit(X_train, y_train)



GridSearchCV(estimator=SVC(max_iter=100, random_state=42),
             param_grid={'C': [0.1, 1, 10], 'kernel': ['rbf', 'linear']})

In [40]:
from sklearn.metrics import accuracy_score,recall_score,auc,roc_curve,confusion_matrix,f1_score

best_params = gsc_svm.best_params_

best_model = gsc_svm.best_estimator_

y_pred = best_model.predict(X_test)

cal_eval(y_test,y_pred)

acc:0.9320294523699953,recall:0.012803234501347708,f1:0.025082508250825083,roc:0.5061052624151509


In [39]:
best_params

{'C': 10, 'kernel': 'linear'}

## KNN

In [28]:
from sklearn.neighbors import KNeighborsClassifier

In [32]:
%%time
# 定义要尝试的超参数组合，KNN主要参数为K的值，
param = {'n_neighbors':[1,3]}

knn = KNeighborsClassifier(n_jobs=25,leaf_size=30)
         
# 创建 GridSearchCV 对象
gsc_knn = GridSearchCV(estimator=knn, param_grid=param)

# 在训练集上拟合模型
gsc_knn.fit(X_train, y_train)

CPU times: user 12min 37s, sys: 20min 54s, total: 33min 32s
Wall time: 3min 9s


GridSearchCV(estimator=KNeighborsClassifier(n_jobs=25),
             param_grid={'n_neighbors': [1, 3]})

In [None]:
#KNN算的极其慢

In [34]:
X_test = X_test.rename(columns=dict(zip(X_test.columns, X_train.columns)))

In [37]:
from sklearn.metrics import accuracy_score,recall_score,auc,roc_curve,confusion_matrix,f1_score

best_params = gsc_knn.best_params_

best_model = gsc_knn.best_estimator_

y_pred = best_model.predict(X_test)

cal_eval(y_test,y_pred)

acc:0.9267832489645651,recall:0.15296495956873316,f1:0.22200488997555015,roc:0.5682339368623079
