<img src='pics/otus.png'>

# KNN, подбор параметров модели

In [None]:
import pandas as pd
import numpy as np
from sklearn import datasets
import matplotlib
import matplotlib.pyplot as plt


%matplotlib inline
plt.rcParams["figure.figsize"] = [12, 8]



## KNN в задаче классификации

In [None]:
X, y = datasets.make_blobs(n_samples=100, random_state=4, centers=2, cluster_std=2)
X[:5], y[:5]

In [None]:
def get_class_colour(class_label):
    """Map a binary class label to a matplotlib colour name.

    Any truthy label yields 'green'; any falsy label yields 'blue'.
    """
    if class_label:
        return 'green'
    return 'blue'

In [None]:
def plot_points(X, y, new_points=None, new_prediction=None, nearest_points=None, file_name=None):
    """Scatter-plot a labelled 2-D data set with optional overlays.

    Parameters
    ----------
    X : array indexed as X[:, 0] / X[:, 1]
        Training points; the first two columns are used as coordinates.
    y : iterable
        One label per row of X, coloured through ``get_class_colour``.
    new_points : array, optional
        Query points, drawn as opaque black markers.
    new_prediction : iterable, optional
        Predicted labels for ``new_points``; when given, the query points
        are drawn again in their predicted class colour. Requires
        ``new_points`` to be given as well.
    nearest_points : array, optional
        Points to highlight in red (e.g. the neighbours that were found).
    file_name : str, optional
        If truthy, the figure is also saved to this path.
    """
    plt.scatter(X[:, 0], X[:, 1], c=[get_class_colour(y_i) for y_i in y], s=100, edgecolor='black', alpha=0.3)

    if new_points is not None:
        plt.scatter(new_points[:, 0], new_points[:, 1], c='black', s=100, edgecolor='black')

    if new_prediction is not None:
        # Drawn on top of the black markers, so predicted colours win.
        plt.scatter(new_points[:, 0], new_points[:, 1], c=[get_class_colour(y_i) for y_i in new_prediction], s=100, edgecolor='black')

    if nearest_points is not None:
        plt.scatter(nearest_points[:, 0], nearest_points[:, 1], c='red', s=100, edgecolor='black')

    plt.title("Classification problem \n What is the color for the new (x1, x2) pair?")
    plt.xlabel("x1 (feature)")
    plt.ylabel("x2 (feature)")
    plt.gca().set_aspect('equal', adjustable='box')

    if file_name:
        # Was ``filename`` (undefined name) -> NameError whenever a path was passed.
        plt.savefig(file_name)

In [None]:
plot_points(X, y)

In [None]:
X_new = np.array([[12, 6]])

In [None]:
plot_points(X, y, new_points=X_new)

In [None]:
from scipy.spatial.distance import euclidean

def find_nearest_point_index(x_new, X):
    """Return the row index of the point in X closest to x_new.

    Distances are Euclidean. On ties the lowest index wins, because a
    candidate only replaces the current best when it is strictly closer.

    Parameters
    ----------
    x_new : array-like
        A single query point, either a flat vector or a one-row matrix
        such as ``np.array([[12, 6]])``.
    X : array-like of shape (n_samples, n_features)
        Candidate points; must contain at least one row.

    Returns
    -------
    int
        Index of the nearest row of X.

    Raises
    ------
    IndexError
        If X has no rows.
    """
    # scipy's ``euclidean`` is defined for 1-D vectors; flatten a
    # one-row query instead of relying on implicit squeezing.
    x_new = np.ravel(x_new)
    X = np.asarray(X)

    d_min = euclidean(x_new, X[0])
    min_idx = 0
    # ``range`` works on Python 2 and 3; ``xrange`` exists only on Python 2.
    for idx in range(1, X.shape[0]):
        d = euclidean(x_new, X[idx])
        if d < d_min:
            d_min = d
            min_idx = idx
    return min_idx


min_idx = find_nearest_point_index(X_new, X)

In [None]:
plot_points(X, y, new_points=X_new, nearest_points=X[[min_idx]])

In [None]:
def predict_class(x_new, X, y):
    """1-NN prediction: return the label of the training point nearest to x_new."""
    return y[find_nearest_point_index(x_new, X)]

y_new = predict_class(X_new, X, y)

In [None]:
plot_points(X, y, new_points=X_new, new_prediction=[y_new], nearest_points=X[[min_idx]])

In [None]:
X_new = np.array([[12, -2]])
min_idx = find_nearest_point_index(X_new, X)
y_new = predict_class(X_new, X, y)

In [None]:
plot_points(X, y, new_points=X_new, new_prediction=[y_new], nearest_points=X[[min_idx]])

In [None]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X, y)
y_pred = knn.predict(X_new)

In [None]:
plot_points(X, y, new_points=X_new, new_prediction=[y_pred])

In [None]:
X_new = np.c_[np.random.randint(5, 15, 10), np.random.randint(-2, 8, 10)]

In [None]:
plot_points(X, y, new_points=X_new, new_prediction=knn.predict(X_new))

In [None]:
from scipy.spatial.distance import cdist
from collections import Counter

def predict_class(x_new, X, y, k=1):
    """Predict labels for query points by majority vote of the k nearest neighbours.

    Parameters
    ----------
    x_new : array-like of shape (n_queries, n_features)
        Query points, one per row.
    X : array-like of shape (n_samples, n_features)
        Training points.
    y : array-like of length n_samples
        Training labels (must be hashable, they are counted with ``Counter``).
    k : int, default 1
        Number of neighbours that vote; must satisfy 1 <= k <= n_samples.

    Returns
    -------
    numpy.ndarray
        One predicted label per query row. When several labels share the
        top vote count, ``Counter.most_common`` picks among them by order
        of appearance in the selected neighbours, which is not sorted by
        distance.

    Raises
    ------
    ValueError
        If k is outside [1, n_samples].
    """
    X = np.asarray(X)
    y = np.asarray(y)
    n_samples = X.shape[0]
    if not 1 <= k <= n_samples:
        raise ValueError(
            "k must be between 1 and the number of training points (%d), got %r"
            % (n_samples, k)
        )

    dist = cdist(x_new, X)
    # Partitioning around position k - 1 puts the k smallest distances
    # into the first k columns. The previous ``kth=k`` needed one extra
    # column and therefore failed for k == n_samples.
    nearest_idx = np.argpartition(dist, k - 1, axis=1)[:, :k]
    nearest_y = y[nearest_idx]
    return np.array([Counter(row).most_common(1)[0][0] for row in nearest_y])


In [None]:
y_pred = predict_class(X_new, X, y, k=3)

In [None]:
plot_points(X, y, new_points=X_new, new_prediction=y_pred)

In [None]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X, y)
y_pred = knn.predict(X_new)
plot_points(X, y, new_points=X_new, new_prediction=y_pred)

In [None]:
X, y = datasets.make_blobs(n_samples=100, random_state=5, centers=2)
X[:5], y[:5]
plot_points(X, y)

In [None]:

h = .02
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
x_pred = np.c_[xx.ravel(), yy.ravel()]


In [None]:
def plot_knn_classifier(k):
    """Draw the decision regions of a k-NN classifier fitted on the notebook's X, y.

    Reads the module-level ``X``, ``y`` and the prediction grid
    (``xx``, ``yy``, ``x_pred``); ``k`` is passed as ``n_neighbors``.
    """
    model = KNeighborsClassifier(n_neighbors=k).fit(X, y)
    # Predict on the flattened grid, then fold back to the grid's shape.
    grid_labels = model.predict(x_pred).reshape(xx.shape)
    palette = plt.cm.Paired

    plt.figure(1, figsize=(12, 8))
    plt.pcolormesh(xx, yy, grid_labels, cmap=palette)
    plt.scatter(X[:, 0], X[:, 1], c=y, edgecolors='k', cmap=palette)

    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    # Empty tick tuples hide the tick marks on both axes.
    for set_ticks in (plt.xticks, plt.yticks):
        set_ticks(())
    plt.show()


In [None]:
plot_knn_classifier(1)

In [None]:
plot_knn_classifier(30)

In [None]:
plot_knn_classifier(90)

## Нормализация в knn

In [None]:
X, y = datasets.make_blobs(n_samples=100, random_state=4, centers=2, cluster_std=2)
X[:, 1] *= 1000000000
X[:5], y[:5]

In [None]:
def plot_points_scaled(X, y, new_points=None, new_prediction=None, nearest_points=None, file_name=None):
    """Scatter-plot a labelled 2-D data set with equal axis scaling.

    The aspect is set with ``adjustable='datalim'``, so the data limits
    (not the axes box) are adjusted to keep both axes on the same scale.

    Parameters
    ----------
    X : array indexed as X[:, 0] / X[:, 1]
        Points to plot.
    y : iterable
        One label per row of X, coloured through ``get_class_colour``.
    new_points, new_prediction, nearest_points : optional
        Accepted so the signature matches ``plot_points``; this function
        does not use them.
    file_name : str, optional
        If truthy, the figure is also saved to this path.
    """
    # Select/create the figure first so the scatter, title and labels
    # all go to the same figure, and figsize is not requested for a
    # figure that already exists.
    plt.figure(1, figsize=(12, 8))
    plt.scatter(X[:, 0], X[:, 1], c=[get_class_colour(y_i) for y_i in y], s=100, edgecolor='black', alpha=0.3)
    plt.title("Classification problem \n What is the color for the new (x1, x2) pair?")
    plt.xlabel("x1 (feature)")
    plt.ylabel("x2 (feature)")
    plt.gca().set_aspect('equal', adjustable='datalim')
    if file_name:
        # Was ``filename`` (undefined name) -> NameError whenever a path was passed.
        plt.savefig(file_name)

In [None]:
plot_points_scaled(X, y)

In [None]:
from sklearn.metrics import accuracy_score
# 5-NN on the raw features; the model is scored on the same data it
# was fitted on, so this is training accuracy.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X, y)
y_pred = knn.predict(X)
# accuracy_score is documented as (y_true, y_pred); a single-argument
# print(...) behaves the same on Python 2 and 3.
print(accuracy_score(y, y_pred))

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Same 5-NN experiment after min-max scaling of the features; again
# scored on the data the model was fitted on (training accuracy).
knn = KNeighborsClassifier(n_neighbors=5)
X_scaled = MinMaxScaler().fit_transform(X)
knn.fit(X_scaled, y)
y_pred = knn.predict(X_scaled)
# accuracy_score is documented as (y_true, y_pred); a single-argument
# print(...) behaves the same on Python 2 and 3.
print(accuracy_score(y, y_pred))

### Другой пример 

<img src="pics/knn_cls.png">
<img src="pics/knn_cls_1.png">

## KNN в задаче регрессии

In [None]:
X = 10 * np.random.rand(100, 1).reshape(-1, 1)
y = X + np.random.randn(100, 1)

In [None]:
def plot_points_regression(X, y, new_points=None, new_prediction=None, nearest_points=None, file_name=None):
    """Scatter-plot a 1-feature regression data set with optional predictions.

    Parameters
    ----------
    X : array indexed as X[:, 0]
        Feature values; the first column is used as the horizontal axis.
    y : array-like
        Target values, plotted on the vertical axis.
    new_points : array-like, optional
        Feature values of query points.
    new_prediction : array-like, optional
        Predicted targets for ``new_points``; drawn in green when
        ``new_points`` is given.
    nearest_points : optional
        Accepted for signature parity with ``plot_points``; not used here.
    file_name : str, optional
        If truthy, the figure is also saved to this path.
    """
    plt.scatter(X[:, 0], y, c='blue', s=100, edgecolor='black', alpha=0.3)

    if new_points is not None:
        plt.scatter(new_points, new_prediction, c='green', s=100, edgecolor='black')

    plt.title("Regression problem \n What is the y value for the new x?")
    plt.xlabel("x1 (feature)")
    # The vertical axis shows the regression target, not a second feature.
    plt.ylabel("y (target)")
    plt.gca().set_aspect('equal', adjustable='box')

    if file_name:
        # Was ``filename`` (undefined name) -> NameError whenever a path was passed.
        plt.savefig(file_name)

In [None]:
plot_points_regression(X, y)

In [None]:
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor(n_neighbors=3)
knn.fit(X, y)
x_new = 10 * np.random.rand(10, 1).reshape(-1, 1)
y_pred = knn.predict(x_new)

In [None]:
plot_points_regression(X, y, new_points=x_new, new_prediction=y_pred)

In [None]:
x_new = (30 * np.random.rand(100, 1) - 10).reshape(-1, 1)

In [None]:
knn = KNeighborsRegressor(n_neighbors=1)
knn.fit(X, y)
y_pred = knn.predict(x_new)
plot_points_regression(X, y, new_points=x_new, new_prediction=y_pred)

In [None]:
knn = KNeighborsRegressor(n_neighbors=30)
knn.fit(X, y)
y_pred = knn.predict(x_new)
plot_points_regression(X, y, new_points=x_new, new_prediction=y_pred)

In [None]:
knn = KNeighborsRegressor(n_neighbors=100)
knn.fit(X, y)
y_pred = knn.predict(x_new)
plot_points_regression(X, y, new_points=x_new, new_prediction=y_pred)

## Выбор параметров модели

<img src="pics/vtt.png">


Testing set — в самом начале работы откладываем часть данных и не используем её до внедрения модели в боевое окружение. Дальше работаем с разбиением оставшихся данных на две части, которые вместо train и validation называем train и test.

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

In [None]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape 

In [None]:
from sklearn.metrics import mean_squared_error
# Train/test mean squared error of a k-NN regressor for each k in 1..49.
neighbors = range(1, 50)
errors_train, errors_test = [], []
for k in neighbors:
    knn = KNeighborsRegressor(n_neighbors=k).fit(X_train, y_train)
    train_mse = mean_squared_error(knn.predict(X_train), y_train)
    errors_train.append(train_mse)
    test_mse = mean_squared_error(knn.predict(X_test), y_test)
    errors_test.append(test_mse)

# One curve per split, both against the number of neighbours.
plt.plot(neighbors, errors_train, label='train', color='blue')
plt.plot(neighbors, errors_test, label='test', color='red')
plt.legend()

In [None]:
X, y = datasets.make_blobs(n_samples=100, random_state=4, centers=2, cluster_std=2)
X[:5], y[:5]
plot_points(X, y)

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape 


In [None]:
# Sweep the number of neighbours and record train/test accuracy.
neighbors = range(1, 60)
# NOTE: the list names are kept for compatibility with the rest of the
# notebook, but they hold accuracy (higher is better), not error.
errors_train = []
errors_test = []
for k in neighbors:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    # accuracy_score is documented as (y_true, y_pred).
    errors_train.append(accuracy_score(y_train, knn.predict(X_train)))
    errors_test.append(accuracy_score(y_test, knn.predict(X_test)))

plt.plot(neighbors, errors_train, color='blue', label='train')
plt.plot(neighbors, errors_test, color='red', label='test')
# Label the axes so the curve is not mistaken for an error curve.
plt.xlabel('n_neighbors (k)')
plt.ylabel('accuracy')
plt.legend()

In [None]:
X, y = datasets.make_blobs(n_samples=300, random_state=4, centers=2, cluster_std=2)
X[:5], y[:5]
plot_points(X, y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape 


In [None]:
from sklearn.metrics import accuracy_score

# Sweep the number of neighbours and record train/test accuracy.
neighbors = range(1, 100)
# NOTE: the list names are kept for compatibility with the rest of the
# notebook, but they hold accuracy (higher is better), not error.
errors_train = []
errors_test = []
for k in neighbors:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    # accuracy_score is documented as (y_true, y_pred).
    errors_train.append(accuracy_score(y_train, knn.predict(X_train)))
    errors_test.append(accuracy_score(y_test, knn.predict(X_test)))

plt.plot(neighbors, errors_train, color='blue', label='train')
plt.plot(neighbors, errors_test, color='red', label='test')
# Label the axes so the curve is not mistaken for an error curve.
plt.xlabel('n_neighbors (k)')
plt.ylabel('accuracy')
plt.legend()

## Кросс-валидация

In [None]:
m = 100
np.random.seed(8)
X = 6 * np.random.rand(m, 1) - 3
y = 0.5 * X**2 + X + 2 + np.random.randn(m, 1)
X_new = np.linspace(-3, 3, 100).reshape(100, 1)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from itertools import cycle
# Endless round-robin over matplotlib's single-letter colour codes.
_style_cycle = cycle('bgrcmk')


def cycol():
    """Return the next colour code from the 'bgrcmk' cycle."""
    # The builtin next() works on Python 2.6+ and 3; the bound ``.next``
    # method used before exists only on Python 2 iterators.
    return next(_style_cycle)


# Fit a scaled polynomial regression for several degrees and overlay the
# fitted curves to show how the fit changes as the degree grows.
for degree in range(1, 30, 5):
    style = cycol()
    polybig_features = PolynomialFeatures(degree=degree, include_bias=False)
    std_scaler = StandardScaler()
    lin_reg = LinearRegression()
    polynomial_regression = Pipeline([
            ("poly_features", polybig_features),
            ("std_scaler", std_scaler),
            ("lin_reg", lin_reg),
        ])
    polynomial_regression.fit(X, y)
    y_newbig = polynomial_regression.predict(X_new)
    plt.plot(X_new, y_newbig, style, label=str(degree))

# Raw observations on top of the fitted curves.
plt.plot(X, y, "b.", linewidth=3)
plt.legend(loc="upper left")
plt.xlabel("$x_1$", fontsize=18)
plt.ylabel("$y$", rotation=0, fontsize=18)
plt.axis([-3, 3, 0, 10])
plt.savefig('pics/regression_poly_overfit.pdf')
plt.show()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)


errors_train = []
errors_test = []
degree = range(1, 20, 1)
for d in degree:
    
    polybig_features = PolynomialFeatures(degree=d, include_bias=False)
    std_scaler = StandardScaler()
    lr = LinearRegression()
    polynomial_regression = Pipeline([
            ("poly_features", polybig_features),
            ("std_scaler", std_scaler),
            ("lr", lr),
        ])
    polynomial_regression.fit(X_train, y_train)
    errors_train.append(mean_squared_error(polynomial_regression.predict(X_train), y_train))
    errors_test.append(mean_squared_error(polynomial_regression.predict(X_test), y_test))
    
plt.plot(degree, errors_train, color='blue', label='train')
plt.plot(degree, errors_test, color='red', label='test')
plt.legend()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
from sklearn.pipeline import make_pipeline
reg = make_pipeline(
    PolynomialFeatures(degree=2, include_bias=False),
    StandardScaler(), 
    LinearRegression()
)
reg.fit(X_train, y_train)
mean_squared_error(reg.predict(X_test), y_test)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=2)
reg = make_pipeline(
    PolynomialFeatures(degree=2, include_bias=False),
    StandardScaler(), 
    LinearRegression()
)
reg.fit(X_train, y_train)
mean_squared_error(reg.predict(X_test), y_test)

<img src='pics/cv.png'>

In [None]:
from sklearn.model_selection import cross_val_score
# 5-fold cross-validation of a degree-2 polynomial linear regression.
reg = make_pipeline(
    PolynomialFeatures(degree=2, include_bias=False),
    StandardScaler(),
    LinearRegression()
)
# With scoring='neg_mean_squared_error' each score is the negated MSE of
# one fold, so values closer to zero are better.
scores = cross_val_score(reg, X, y, cv=5, scoring='neg_mean_squared_error')
# Single-argument print(...) with %-formatting gives the same output on
# Python 2 and 3 (the original used Python-2-only print statements).
print(scores)
print('mean %s' % scores.mean())
print('std %s' % scores.std())

In [None]:
from sklearn.linear_model import Ridge
# 5-fold cross-validation of a degree-2 polynomial Ridge regression
# (Ridge with its default regularisation strength).
reg = make_pipeline(
    PolynomialFeatures(degree=2, include_bias=False),
    StandardScaler(),
    Ridge()
)
# With scoring='neg_mean_squared_error' each score is the negated MSE of
# one fold, so values closer to zero are better.
scores = cross_val_score(reg, X, y, cv=5, scoring='neg_mean_squared_error')
# Single-argument print(...) with %-formatting gives the same output on
# Python 2 and 3 (the original used Python-2-only print statements).
print(scores)
print('mean %s' % scores.mean())
print('std %s' % scores.std())

## GridSearch

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'ridge__alpha': [1e-8, 1e-7, 1e-6, 1e-5, 0.0001, 0.001, 0.01, 0.1, 1, 10]},
]

reg = make_pipeline(
    PolynomialFeatures(degree=2, include_bias=False),
    StandardScaler(), 
    Ridge()
)

grid_search = GridSearchCV(reg, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X, y)

In [None]:
grid_search.best_params_

In [None]:
grid_search.best_estimator_

In [None]:
grid_search.best_score_

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    {
        'ridge__alpha': [1e-8, 1e-7, 1e-6, 1e-5, 0.0001, 0.001, 0.01, 0.1, 1, 10],
        'polynomialfeatures__degree': [1, 2, 3, 4, 5]
    },
]

reg = make_pipeline(
    PolynomialFeatures(include_bias=False),
    StandardScaler(), 
    Ridge()
)

grid_search = GridSearchCV(reg, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X, y)

In [None]:
grid_search.best_params_

In [None]:
grid_search.best_score_

In [None]:
from sklearn.model_selection import RandomizedSearchCV

param_grid = {
        'ridge__alpha': [1e-8, 1e-7, 1e-6, 1e-5, 0.0001, 0.001, 0.01, 0.1, 1, 10],
        'polynomialfeatures__degree': [1, 2, 3, 4, 5]
    }

reg = make_pipeline(
    PolynomialFeatures(include_bias=False),
    StandardScaler(), 
    Ridge()
)

rand_search = RandomizedSearchCV(reg, param_grid, cv=5, scoring='neg_mean_squared_error')
rand_search.fit(X, y)

In [None]:
rand_search.best_estimator_

In [None]:
rand_search.best_score_

In [None]:
rand_search.best_params_

Перед запуском в бой нужно проверить модель на отложенной выборке, но не менять параметры по результатам этой проверки!