### scikit-learn 中的回归问题

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn import datasets

In [2]:
# Load the Boston housing data as a feature matrix and a target vector.
boston = datasets.load_boston()
X, y = boston.data, boston.target

# Keep only samples whose target is below 50.0. The mask is built once from
# the unfiltered targets and applied to both arrays so they stay aligned.
below_cap = y < 50.0
X, y = X[below_cap], y[below_cap]

In [3]:
# Confirm the dimensions of the feature matrix after filtering.
X.shape

(490, 13)

In [4]:
# Split into train/test sets with the project-local playML helper.
# NOTE(review): presumably `seed` fixes the shuffle so the split is
# reproducible -- confirm against playML.model_selection.
from playML.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    seed=666,
)

### scikit-learn 中的线性回归

In [5]:
from sklearn.linear_model import LinearRegression

# Create an unfitted linear regression model with default settings.
lin_reg = LinearRegression()

In [6]:
# Fit the linear model on the training split.
lin_reg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [7]:
# Coefficients learned by the fitted linear model.
lin_reg.coef_

array([-1.20354261e-01,  3.64423279e-02, -3.61493155e-02,  5.12978140e-02,
       -1.15775825e+01,  3.42740062e+00, -2.32311760e-02, -1.19487594e+00,
        2.60101728e-01, -1.40219119e-02, -8.35430488e-01,  7.80472852e-03,
       -3.80923751e-01])

In [8]:
# Intercept term learned by the fitted linear model.
lin_reg.intercept_

34.117399723229624

In [9]:
# Predictions of the fitted linear model for the test split.
lin_reg.predict(X_test)

array([18.08015868, 25.52447165, 12.98271397, 32.89477638, 24.17839745,
        2.66600125, 26.64297716, 32.23866352, 13.96590659, 24.0465123 ,
       14.92963   , 10.57419644, 30.28539981, 16.28302365, 23.67843428,
       25.63288299, 18.68105783, 24.01767076, 28.77234863, 26.9404495 ,
       12.87158142, 27.23259283, 26.07726096, 23.41270932, 20.80570812,
       31.96527196, 14.93177657, 20.94927605, 12.93149157, 29.8004438 ,
       35.29188752,  4.99369317, 13.10904465, 35.54982047, 16.00603155,
       21.53889058, 12.46701001, 29.12202629, 27.3433202 , 24.04852968,
       14.39961539, 23.61075774, 10.89223868, 22.38043687, 18.62604579,
       16.41773634, 24.43040765, 33.06929581, 19.19757395, 27.03634216,
       18.05693565, 14.90744724, 25.08683225, 16.09610653, 21.7469388 ,
       16.32259928, 24.25418684, 11.75290906, 27.91347808, 31.06610342,
       20.17028271, 24.99229322, 25.99180978, 12.11816691, 16.57739596,
       27.30354042, 22.26700274, 21.72088347, 31.5072238 , 14.09

In [10]:
# Score the linear model on the test split (R^2 for scikit-learn regressors).
lin_reg.score(X_test, y_test)

0.8129794056212811

### kNN Regressor

In [11]:
from sklearn.neighbors import KNeighborsRegressor
# In scikit-learn the default value of k (n_neighbors) is 5.
knn_reg = KNeighborsRegressor()

In [12]:
# Fit the default kNN regressor on the training split.
knn_reg.fit(X_train, y_train)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                    weights='uniform')

In [13]:
# Test-split score of the default kNN regressor, for comparison with the
# linear model scored earlier.
knn_reg.score(X_test, y_test)

0.5865412198300899

In [17]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the kNN regressor, split into two sub-grids:
#   * uniform weighting: only the neighbour count k is searched;
#   * distance weighting: k is searched together with the Minkowski power p.
param_grid = [
    {
        'weights': ['uniform'],
        'n_neighbors': list(range(1, 11)),
    },
    {
        'weights': ['distance'],
        'n_neighbors': list(range(1, 11)),
        'p': list(range(1, 6)),
    },
]

knn_reg = KNeighborsRegressor()
# cv is pinned to 3 folds explicitly. The recorded run relied on the library
# default (3 folds, with a deprecation warning), and that default differs in
# newer scikit-learn releases, which would silently change the scores.
# n_jobs=-1 uses all available cores; verbose=2 prints search progress.
grid_search = GridSearchCV(knn_reg, param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 3 folds for each of 60 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    0.5s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30,
                                           metric='minkowski',
                                           metric_params=None, n_jobs=None,
                                           n_neighbors=5, p=2,
                                           weights='uniform'),
             iid='warn', n_jobs=-1,
             param_grid=[{'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                          'weights': ['uniform']},
                         {'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                          'p': [1, 2, 3, 4, 5], 'weights': ['distance']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=2)

In [18]:
# Estimator refit with the best parameter combination found by the search.
grid_search.best_estimator_

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=5, p=1,
                    weights='distance')

In [19]:
# Best cross-validated score found by the search (not a test-set score).
grid_search.best_score_

0.6340477954176972

In [20]:
# Index of the best-scoring candidate among the searched parameter combinations.
grid_search.best_index_

30

In [21]:
# Grid search selects its parameters via cross-validation.
grid_search.best_params_

{'n_neighbors': 5, 'p': 1, 'weights': 'distance'}

In [22]:
# Only this value is the R^2 measured on the held-out test split; the
# grid search's best_score_ comes from cross-validation on the training data.
grid_search.best_estimator_.score(X_test, y_test)

0.7044357727037996