In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

# 1、获取
column_name = ['Sample code number', 'Clump Thickness', 'Uniformity of Cell Size', 
              'Uniformity of Cell Shape', 'Marginal Adhesion', 'Single EpithelialCell Size', 
              'Bare Nuclei', 'Bland Chromatin', 'Normal Nucleoli', 'Mitoses', 'Class']
data = pd.read_csv("./datasets/breast-cancer-wisconsin.data", names=column_name)
# 2、缺失值处理
data = data.replace(to_replace="?", value=np.nan)
data.dropna(inplace=True)
# 3、筛选特征值和目标值
x = data.iloc[:, 1:-1]
y = data["Class"]
# 4、划分
x_train, x_test, y_train, y_test = train_test_split(x, y)
# 5、标准化
transfer = StandardScaler()
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)
# 6、逻辑回归预估器
estimator = LogisticRegression()
estimator.fit(x_train, y_train)
# 7、模型评估（回归系数和偏置）
print(estimator.coef_)
print(estimator.intercept_)
y_predict = estimator.predict(x_test)
print("y_predict: ", y_predict)
print("直接对比真实值和预测值：", y_test == y_predict)
score = estimator.score(x_test, y_test)
print("准确率为：", score)
# 8、精确率，召回率，F1-score
report = classification_report(y_test, y_predict, labels=[2, 4], target_names=["良性", "恶性"])
print(report)
# 9、转成 0，1 值（类别为0代表反例，为1代表正例）
y_true = np.where(y_test > 3, 1, 0)
# 10、roc 曲线，auc 指标
roc_auc_score(y_true, y_predict)

[[1.22306621 0.74596399 1.0500805  0.67801833 0.48602955 1.33883294
  0.62083724 0.425345   0.64248875]]
[-0.6568643]
y_predict:  [4 2 2 2 2 4 2 2 4 2 4 2 2 2 4 2 2 2 4 2 2 4 4 2 4 4 2 2 4 2 4 4 2 2 2 2 4
 4 2 4 2 2 2 2 2 2 2 2 2 2 4 4 2 2 2 2 4 2 2 4 2 2 2 4 2 2 2 2 2 2 2 4 2 2
 2 4 2 2 4 2 4 2 2 2 2 2 2 2 4 2 2 4 2 2 2 2 2 2 2 2 2 4 4 2 2 2 2 2 2 4 4
 2 4 4 2 4 2 4 2 4 4 2 4 2 4 2 2 2 4 2 2 2 4 2 2 2 2 2 4 2 4 2 2 2 4 2 4 2
 2 2 2 2 4 2 4 4 2 4 4 2 2 2 2 4 4 4 2 2 4 2 2]
直接对比真实值和预测值： 570    True
529    True
280    True
589    True
463    True
       ... 
278    True
563    True
416    True
422    True
121    True
Name: Class, Length: 171, dtype: bool
准确率为： 0.9707602339181286
              precision    recall  f1-score   support

          良性       0.99      0.97      0.98       120
          恶性       0.93      0.98      0.95        51

    accuracy                           0.97       171
   macro avg       0.96      0.97      0.97       171
weighted avg       0.97      0.97      0.9

0.973529411764706