In [1]:
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression, SGDRegressor, Ridge, LogisticRegression, Lasso
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, classification_report, roc_auc_score
import joblib
import pandas as pd
import numpy as np

# 线性回归

### 线性回归（正规方程）预测房价

In [None]:
# 1.获取数据
lb = load_boston()

# 2.数据集划分
x_train, x_test, y_train, y_test = train_test_split(lb.data, lb.target, test_size=0.25, random_state=1)
print(x_train.shape)
print('\n')

# 3.特征工程-标准化
std = StandardScaler()
x_train = std.fit_transform(x_train)
x_test = std.transform(x_test)
# 发现误差太大，对目标值也进行标准化试试
std_y = StandardScaler()
# -1代表把剩余的元素都堆到一维
y_train = std_y.fit_transform(y_train.reshape(-1, 1))
y_test = std_y.transform(y_test.reshape(-1, 1))

# 4.机器学习-线性回归（正规方程）
lr = LinearRegression()
lr.fit(x_train, y_train)

# 5.模型评估
# 获取系数等值
y_predict = lr.predict(x_test)
# 预测测试集的房子价格，通过inverse得到真正的房子价格
y_lr_predict = std_y.inverse_transform(y_predict)
print('预测值为：', y_lr_predict)
print('\n')
print('回归系数：', lr.coef_)
print('\n')
# 评价
# 均方误差
error = mean_squared_error(y_test, y_predict)
print('误差为：', error)

# 6.保存训练好的模型
# # 模型中保存了w的值，也保存了模型结构
joblib.dump(lr, "./tmp/test.pkl")

### 加载保存的模型

In [9]:
model = joblib.load('./tmp/test.pkl')
y_predict = model.predict(x_test)
print("保存的模型预测的结果：", y_predict)
print("正规方程的均方误差：", mean_squared_error(y_test, y_predict))

print("正规方程inverse后的均方误差：", mean_squared_error(std_y.inverse_transform(y_test),
                                               std_y.inverse_transform(y_predict)))

保存的模型预测的结果： [[ 1.12620955]
 [ 0.62994234]
 [-0.47955756]
 [-0.08002168]
 [-0.38323459]
 [-0.26734514]
 [ 1.11558027]
 [-0.48011678]
 [ 0.26773583]
 [ 0.50610896]
 [ 0.54872518]
 [ 0.69878929]
 [-0.12984488]
 [ 0.51624959]
 [ 0.11609798]
 [-0.16307075]
 [-0.58671359]
 [ 1.72804157]
 [ 0.91761907]
 [-1.56015899]
 [-0.16601029]
 [-0.68746111]
 [ 0.31332585]
 [ 0.27297733]
 [ 1.01697482]
 [-1.27028638]
 [-0.95672557]
 [-0.62211389]
 [ 1.5267197 ]
 [-0.8563123 ]
 [-0.12405138]
 [-0.91970532]
 [ 2.28757241]
 [-0.50574043]
 [-0.05595243]
 [-0.21806897]
 [-0.54345359]
 [ 0.52264682]
 [-1.40720286]
 [-0.26284251]
 [ 0.21619076]
 [-0.14338071]
 [ 0.79988591]
 [-0.65772411]
 [-0.33180076]
 [-0.87514574]
 [ 1.91418761]
 [-0.47664284]
 [ 0.43517699]
 [-0.1950607 ]
 [ 0.30927175]
 [ 0.24009869]
 [ 0.30063331]
 [ 0.50569088]
 [-1.94512422]
 [ 0.20018782]
 [-1.30384514]
 [ 0.50366068]
 [-0.6220835 ]
 [ 1.47453167]
 [-0.31823582]
 [ 0.57109939]
 [-0.64702253]
 [-0.35840699]
 [-1.27347275]
 [ 1.08939349

### 线性回归（梯度下降）

In [10]:
# eta0:学习率的初始值;penalty是惩罚，分为L1和L2;alpha：值越高，正则化力度越强
sgd = SGDRegressor(eta0=0.008, penalty='l1', alpha=0.005)
sgd.fit(x_train, y_train)
print('回归系数：', sgd.coef_)
# 预测测试集的房子价格
y_sgd_predict = std_y.inverse_transform(sgd.predict(x_test).reshape(-1, 1))
y_predict = sgd.predict(x_test)
# print("梯度下降测试集里面每个房子的预测价格：", y_sgd_predict)
print("梯度下降的均方误差：", mean_squared_error(y_test, y_predict))
print("梯度下降的原始房价量纲均方误差：", mean_squared_error(std_y.inverse_transform(y_test), y_sgd_predict))

回归系数： [-0.09104545  0.07654483 -0.02494045  0.07728752 -0.16730119  0.27213477
 -0.00268789 -0.23188622  0.0854081  -0.02286256 -0.21872819  0.06659854
 -0.41975611]
梯度下降的均方误差： 0.2775881613022312
梯度下降的原始房价量纲均方误差： 22.0330120203174


  y = column_or_1d(y, warn=True)


# 岭回归

In [11]:
lb = load_boston()
x_train, x_test, y_train, y_test = train_test_split(lb.data, lb.target, test_size=0.25, random_state=1)
print(x_train.shape)
print(y_train.shape)
std_x = StandardScaler()
x_train = std_x.fit_transform(x_train)
x_test = std_x.transform(x_test)
std_y = StandardScaler()
y_train = std_y.fit_transform(y_train.reshape(-1, 1))
y_test = std_y.transform(y_test.reshape(-1, 1))
print(y_train.shape)

# 岭回归预测房价
rd = Ridge(alpha=0.005)
rd.fit(x_train, y_train)
print(rd.coef_)
y_predict = rd.predict(x_test)
print(y_predict.shape)
print('岭回归的均方误差：', mean_squared_error(y_test, y_predict))
print('岭回归的原始房价量纲均方误差：', mean_squared_error(std_y.inverse_transform(y_test), std_y.inverse_transform(y_predict)))

(379, 13)
(379,)
(379, 1)
[[-0.1202571   0.15043046  0.02949882  0.07470592 -0.28040931  0.22171843
   0.02189818 -0.35273027  0.29933554 -0.20275251 -0.23911308  0.06305132
  -0.45258164]]
(127, 1)
岭回归的均方误差： 0.2758838478395715
岭回归的原始房价量纲均方误差： 21.89773550552293



    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np

        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_ho

# Lasso回归

In [10]:
# Lasso回归预测房价

ls = Lasso(alpha=0.005)
ls.fit(x_train, y_train)
print(ls.coef_)
y_predict = ls.predict(x_test)
print(y_predict.shape)
print(y_test.shape)
y_ls_predict = std_y.inverse_transform(ls.predict(x_test).reshape(-1,1))
print('Lasso回归的均方误差：', mean_squared_error(y_test, y_predict))
print('Lasso回归的原始房价量纲均方误差：', mean_squared_error(std_y.inverse_transform(y_test), y_ls_predict))

[-0.10585206  0.12092465  0.          0.07514437 -0.24539391  0.23083067
  0.         -0.32460134  0.20595329 -0.11439876 -0.23114772  0.05849662
 -0.44241315]
(127,)
(127, 1)
Lasso回归的均方误差： 0.2792786693271289
Lasso回归的原始房价量纲均方误差： 22.16719275575755


# 逻辑回归

In [20]:
# 逻辑回归做二分类进行癌症预测

# 构造列标签名字
column = ['Sample code number', 'Clump Thickness', 'Uniformity of Cell Size', 'Uniformity of Cell Shape',
          'Marginal Adhesion', 'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin', 'Normal Nucleoli',
          'Mitoses', 'Class']

# 读取数据
data = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data",
    names=column)
print(data.shape)
print('\n')
print(data.head())
print('\n')
print(data.describe())
print('\n')
print(data.info)

# 缺失值处理
# 数据中空值用?表示的，替换为nan
data = data.replace(to_replace='?', value=np.nan)
# 删除，哪一行有空值，就删除对应的样本
data = data.dropna()
print(data.shape)
print('\n')

# 确定特征值、目标值
x = data.iloc[:,1:10]
print(x.head())
print('\n')
y = data['Class']
print(y.head())

# 分割数据
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=22)

# 特征工程（标准化）
sd = StandardScaler()
x_train = sd.fit_transform(x_train)
x_test = sd.transform(x_test)

# 机器学习（逻辑回归）
lr = LogisticRegression()
# 训练
lr.fit(x_train, y_train)

# 模型评估
y_predict = lr.predict(x_test)
print(y_predict)
print('准确率：', lr.score(x_test,y_test))
print('\n')
# macro avg:平均值; weighted avg:加权平均值
print('召回率：', classification_report(y_test, y_predict, labels=[2,4],target_names=['良性','恶行']))
print()
print('AUC指标：', roc_auc_score(y_test, y_predict))

(699, 11)


   Sample code number  Clump Thickness  Uniformity of Cell Size  \
0             1000025                5                        1   
1             1002945                5                        4   
2             1015425                3                        1   
3             1016277                6                        8   
4             1017023                4                        1   

   Uniformity of Cell Shape  Marginal Adhesion  Single Epithelial Cell Size  \
0                         1                  1                            2   
1                         4                  5                            7   
2                         1                  1                            2   
3                         8                  1                            3   
4                         1                  3                            2   

  Bare Nuclei  Bland Chromatin  Normal Nucleoli  Mitoses  Class  
0           1                3              