![scikit-learn logo](images/sklearn_algorithms.png)

In [1]:
from sklearn.neighbors import KNeighborsClassifier

# fit() returns the estimator itself, so the model can be built and trained in one expression
knn = KNeighborsClassifier(n_neighbors=5).fit(X, y)

# predict the class of one new observation described by four feature values
knn.predict([[3, 5, 4, 2]])

In [None]:
from sklearn.linear_model import LinearRegression, LogisticRegression

# classification: train a logistic regression model and predict on X_test
logreg = LogisticRegression()
logreg.fit(X, y)
logreg.predict(X_test)

# regression: `linreg` was read below without ever being created, which raises
# NameError on a fresh kernel, so build and fit it before inspecting it
# NOTE(review): fitted on the same X, y as logreg above — confirm the intended training data
linreg = LinearRegression()
linreg.fit(X, y)

# learned intercept and one coefficient per feature
print(linreg.intercept_)
print(linreg.coef_)

# pair the feature names with the coefficients
list(zip(feature_cols, linreg.coef_))

In [None]:
import numpy as np
from sklearn import metrics

# classification: share of predictions that equal the true labels
print(metrics.accuracy_score(y, y_pred))

# regression: mean absolute error followed by mean squared error
print(metrics.mean_absolute_error(true, pred))
print(metrics.mean_squared_error(true, pred))

# RMSE is the square root of the MSE that scikit-learn reports
print(np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

In [None]:
# train_test_split lives in sklearn.model_selection; the sklearn.cross_validation
# module used previously was deprecated in 0.18 and removed in 0.20
from sklearn.model_selection import train_test_split

# hold out 40% of the rows for testing; the fixed random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=4)

In [None]:
# try K = 1 through 25 and record the testing accuracy obtained for each value
k_range = list(range(1, 26))
scores = []
for k in k_range:
    # fit() returns the estimator, so construction and training are chained
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    scores.append(metrics.accuracy_score(y_test, y_pred))

In [None]:
import pandas as pd

import seaborn as sns

# allow plots to appear within the notebook
%matplotlib inline

sns.pairplot(data, x_vars=['TV','Radio','Newspaper'], y_vars='Sales', size=7, aspect=0.7, kind='reg')

![scatterplot](images/scatterplot.png)

In [None]:
# Build the feature matrix X and the response y from `data`.
# Each selection is shown twice on purpose: the second statement of each pair
# is an equivalent spelling and simply rebinds the same name.

# create a Python list of feature names
feature_cols = ['TV', 'Radio', 'Newspaper']

# indexing a DataFrame with a list of column names selects those columns
X = data[feature_cols]

# equivalent command with the list written inline
X = data[['TV', 'Radio', 'Newspaper']]


# indexing with a single column name selects one column (a Series)
y = data['Sales']

# equivalent attribute access; only works when the column name is a valid
# Python identifier (no spaces) and does not clash with a DataFrame attribute
y = data.Sales

In [None]:
# cross_val_score lives in sklearn.model_selection; sklearn.cross_validation
# was deprecated in 0.18 and removed in 0.20
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

# search for an optimal value of K for KNN
k_range = list(range(1, 31))
k_scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    # ten accuracy values, one per fold; keep their mean for this K
    scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
    k_scores.append(scores.mean())
print(k_scores)

# 10-fold cross-validation with two features (excluding Newspaper)
feature_cols = ['TV', 'Radio']
X = data[feature_cols]

# `lm` was used without being defined anywhere in the notebook
# NOTE(review): assumed to be a plain LinearRegression — confirm
lm = LinearRegression()

# the scorer is named 'neg_mean_squared_error' (the un-negated name was removed);
# it returns negated MSE, so flip the sign before taking the root to get RMSE
print(np.sqrt(-cross_val_score(lm, X, y, cv=10, scoring='neg_mean_squared_error')).mean())

In [None]:
# GridSearchCV lives in sklearn.model_selection; sklearn.grid_search was
# deprecated in 0.18 and removed in 0.20
from sklearn.model_selection import GridSearchCV

# evaluate every K from 1 to 30 with 10-fold cross-validated accuracy
k_range = list(range(1, 31))
param_grid = dict(n_neighbors=k_range)
grid = GridSearchCV(knn, param_grid, cv=10, scoring='accuracy')
grid.fit(X, y)

# grid_scores_ no longer exists; cv_results_ is a dict of parallel arrays
# holding one entry per parameter combination
cv_results = grid.cv_results_

# first candidate: its parameters, its score on each fold, and the mean of those scores
print(cv_results['params'][0])
print([float(cv_results['split%d_test_score' % fold][0]) for fold in range(grid.n_splits_)])
print(cv_results['mean_test_score'][0])

# best mean score, the parameters that achieved it, and the model refitted with them
print(grid.best_score_)
print(grid.best_params_)
print(grid.best_estimator_)

In [None]:
# candidate values for the two hyper-parameters being searched
k_range = list(range(1, 31))
weight_options = ['uniform', 'distance']

# map each parameter name to its candidates; the search tries every combination
param_grid = {'n_neighbors': k_range, 'weights': weight_options}
grid = GridSearchCV(estimator=knn, param_grid=param_grid, cv=10, scoring='accuracy')
grid.fit(X, y)

In [None]:
# RandomizedSearchCV lives in sklearn.model_selection; sklearn.grid_search
# was deprecated in 0.18 and removed in 0.20
from sklearn.model_selection import RandomizedSearchCV

# evaluate 10 sampled parameter combinations; random_state fixes which ones are drawn
param_dist = dict(n_neighbors=k_range, weights=weight_options)
rand = RandomizedSearchCV(knn, param_dist, cv=10, scoring='accuracy', n_iter=10, random_state=5)
rand.fit(X, y)
# grid_scores_ no longer exists; cv_results_ holds the per-candidate results
rand.cv_results_
print(rand.best_score_)
print(rand.best_params_)

# run the search 20 more times; no random_state is passed here, so each run
# draws its own sample of candidates, and the best score of each run is collected
best_scores = []
for _ in range(20):
    rand = RandomizedSearchCV(knn, param_dist, cv=10, scoring='accuracy', n_iter=10)
    rand.fit(X, y)
    best_scores.append(round(rand.best_score_, 3))
print(best_scores)

In [None]:
# scikit-learn puts actual classes on the rows and predicted classes on the columns
confusion = metrics.confusion_matrix(y_test, y_pred_class)
print(confusion)

# Name the four cells of the matrix: the metric calculations further down the
# notebook read TP/TN/FP/FN, which were never assigned anywhere.
# Assumes a two-class problem with the positive class labelled 1.
TP = confusion[1, 1]
TN = confusion[0, 0]
FP = confusion[0, 1]
FN = confusion[1, 0]

![confusion matrix](images/confusion_matrix.png)

In [None]:
# Classification Accuracy: what share of all predictions is correct?
# computed by hand from the confusion-matrix counts, then via scikit-learn
print(float(TP + TN) / (TP + TN + FP + FN))
print(metrics.accuracy_score(y_test, y_pred_class))

In [None]:
# Classification Error: what share of all predictions is wrong?
# computed by hand from the counts, then as the complement of accuracy
print(float(FP + FN) / (TP + TN + FP + FN))
print(1 - metrics.accuracy_score(y_test, y_pred_class))

In [None]:
# Sensitivity (also called "True Positive Rate" or "Recall"): of the
# observations that are actually positive, what share is predicted positive?
print(float(TP) / (TP + FN))
print(metrics.recall_score(y_test, y_pred_class))

In [None]:
# Specificity: of the observations that are actually negative, what share is
# predicted negative?
print(float(TN) / (TN + FP))

In [None]:
# False Positive Rate: of the observations that are actually negative, what
# share is wrongly predicted positive?
print(float(FP) / (TN + FP))

In [None]:
# Precision: of the observations predicted positive, what share is actually
# positive? Computed by hand from the counts, then via scikit-learn.
print(float(TP) / (TP + FP))
print(metrics.precision_score(y_test, y_pred_class))

In [None]:
# histogram of predicted probabilities
%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams['font.size'] = 14

plt.hist(y_pred_prob, bins=8)
plt.xlim(0, 1)
plt.title('Histogram of predicted probabilities')
plt.xlabel('Predicted probability of diabetes')
plt.ylabel('Frequency')

![histogram](images/histogram.png)

In [None]:
from sklearn.preprocessing import binarize

# label an observation 1 when its predicted probability is greater than 0.3, else 0;
# binarize works on 2D input, hence the wrapping list and the [0] that unwraps the
# single row again; `threshold` must be passed by keyword in current scikit-learn
y_pred_class = binarize([y_pred_prob], threshold=0.3)[0]

In [None]:
# ROC curve: true positive rate plotted against false positive rate, one point
# per threshold returned by roc_curve
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_prob)
plt.plot(fpr, tpr)
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.0)
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')
plt.title('ROC curve for diabetes classifier')
plt.grid(True)

![roc](images/roc.png)

In [None]:
def evaluate_threshold(threshold):
    """Print the sensitivity and specificity at a given decision threshold.

    Reads the notebook-level ``fpr``, ``tpr`` and ``thresholds`` arrays and
    reports the values stored at the last position whose threshold is
    strictly greater than ``threshold``.
    """
    above = thresholds > threshold
    print('Sensitivity:', tpr[above][-1])
    print('Specificity:', 1 - fpr[above][-1])

evaluate_threshold(0.5)
evaluate_threshold(0.3)

In [None]:
# AUC is the fraction of the ROC plot's area that lies underneath the curve
print(metrics.roc_auc_score(y_true=y_test, y_score=y_pred_prob))