In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Lecture 11 - Model Validation
You might remember this code block from last class

In [None]:
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier as KNN
from matplotlib.colors import ListedColormap

In [None]:
# Load the iris data and keep only the first two features so that the
# decision regions can be drawn in a 2-D plane.
iris = datasets.load_iris()
X = iris.data[:, :2]
y = iris.target
h = .02  # step size in the mesh

# Create color maps: light shades for the decision regions, bold shades for
# the training points.
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

# The mesh [x_min, x_max]x[y_min, y_max] depends only on X and h, not on the
# number of neighbours, so it is built once instead of on every iteration.
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
mesh_points = np.c_[xx.ravel(), yy.ravel()]  # one (x, y) row per mesh node

for n_neighbors in [1, 5, 10, 15, 50, 100]:

    # Create an instance of the neighbours classifier and fit the data.
    clf = KNN(n_neighbors=n_neighbors)
    clf.fit(X, y)

    # Plot the decision boundary: predict a class for every mesh node and
    # put the result back into the shape of the mesh for a color plot.
    Z = clf.predict(mesh_points).reshape(xx.shape)
    plt.figure()
    plt.pcolormesh(xx, yy, Z, cmap=cmap_light)

    # Plot also the training points
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.title("3-Class classification (k = %i)" % (n_neighbors))

plt.show()

Which of the above plots are overfitting? underfitting? 
<br/><br/><br/>

## Cross-Validation
We've touched on the idea of overfitting a few times now, but I want to explore it again to really emphasize the importance of this method. This is probably the **most important step** of any machine learning / data science process. It checks that your model can generalize (is not overfit) and makes sure that you're not wasting a client's money or time.

Before we proceed, let's import some data

In [None]:
from sklearn import datasets
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; the same helpers now live in `sklearn.model_selection`. The fallback
# keeps the notebook runnable on very old installations.
try:
    from sklearn.model_selection import cross_val_score, train_test_split
except ImportError:  # scikit-learn < 0.18
    from sklearn.cross_validation import cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsClassifier as KNN
# Seed NumPy's global random state so the notebook's random operations are
# reproducible.
np.random.seed(30)

In [None]:
# Load the digits dataset and unpack its feature matrix and label vector.
digits = datasets.load_digits()
X, y = digits.data, digits.target

In [None]:
# Render a single sample as an image and report its label.
digit_idx = 37  # index of the sample to render
digit_pixels = X[digit_idx].reshape(8, 8)  # row of X reshaped to an 8x8 grid
plt.figure(1, figsize=(3, 3))
plt.imshow(digit_pixels, cmap=plt.cm.gray_r, interpolation='nearest')
plt.show()
print("Corresponding digit", y[digit_idx])

### Split test data from the training data
You may be asking yourself why are we separating a test set from the training set - isn't this what cross-validation already handles? 

We are basically getting the ability to assess how well our model will generalize. We'll use cross-validation (making the train-validation split of this training set) to evaluate subtle changes in our algorithm. Then we will use the test set to get an accuracy measurement for how well we expect our model to perform against new data.

In [None]:
# split dataset using train_test_split

In [None]:
# `sklearn.cross_validation` was removed in scikit-learn 0.20; its helpers now
# live in `sklearn.model_selection`. Fall back only for very old versions.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:  # scikit-learn < 0.18
    from sklearn.cross_validation import cross_val_score

In [None]:
# Evaluate a KNN model using cross_val_score

As you can see, `cross_val_score` outputs the validation accuracy values for each fold in the k-fold.

Let's make this into something a little bit more readable

In [None]:
def cv_stats(cv_score):
    """
    Summarize a set of cross-validation scores.

    Parameters
    ----------
    cv_score : array-like
        Per-fold scores, e.g. the output of cross_val_score.

    Returns
    -------
    tuple
        ``(mean, std)`` of the scores, computed with ``np.mean`` and
        ``np.std`` respectively.
    """
    return np.mean(cv_score), np.std(cv_score)
cv_stats(clf_score)

Now we're moving. How can we use this to our advantage?

**To pick our model hyperparameters of course!**

A good rule of thumb for picking model hyperparameters, as mentioned in last class, is to vary by order of magnitude at first, then explore the local neighborhood of values that seem relevant.

In [None]:
# iterate through a range of values for k and evaluate how well KNN does on average
# start off with the lowest value possible, then multiply by 2 with every iteration


Not only can we evaluate how different hyperparameters affect the accuracy of a classifier, but we're also able to compare *different learning methods* as well!

Let's try logistic regression. First let's find the best parameter for C, the regularization term of logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression, RidgeClassifier

In [None]:
# iterate through a range of penalty values and evaluate how well LogisticRegression does on average

And let's do the same with RidgeClassifier. This algorithm basically implements Ridge Regression, a regularized form of Linear Regression. 

It basically attempts to limit large sizes of weights. If this is a little daunting, don't worry too much about it. A good explanation is located  [here](https://www.quora.com/What-is-Ridge-Regression-in-laymans-terms) if you are still curious

In [None]:
# iterate through a range of penalty values and evaluate how well RidgeClassifier does on average

Let's choose the best model and evaluate its result 

In [None]:
clf= # best classifier here w/ best parameters
# Fit the chosen classifier on the training split and predict the held-out set.
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
# Accuracy is the fraction of test samples predicted correctly. Taking the
# mean of the boolean match array stays vectorized and always yields a float
# (no risk of integer division).
accuracy = np.mean(predictions == y_test)
accuracy

## Introduction to Confusion Matrices
"How can we understand what types of mistakes a learned model makes? "

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:

import itertools
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print and plot a confusion matrix.

    Parameters
    ----------
    cm : numpy.ndarray
        Square 2-D confusion matrix; rows are true labels and columns are
        predicted labels (matching the axis labels drawn below).
    classes : sequence
        Tick labels, one per class, used on both axes.
    normalize : bool, default False
        If True, every row is divided by its sum before printing and
        plotting, so each cell is the fraction of that true class.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap used for the heat-map.

    Returns
    -------
    None
        The matrix is printed and drawn on the current matplotlib figure.
    """
    # Normalize BEFORE drawing so that the heat-map, the cell text and the
    # text-colour threshold are all based on the same values.
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Rows without any true samples stay at 0 instead of becoming NaN.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Integer counts are shown as-is; fractional values get two decimals so
    # the cell annotations stay readable.
    fmt = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'
    # Cells darker than half of the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
# Compare the true test labels with the model's predictions and plot the
# resulting confusion matrix using the digit class names as tick labels.
cnf_matrix = confusion_matrix(y_true=y_test, y_pred=predictions)
plot_confusion_matrix(cnf_matrix, classes=digits.target_names)

# Review of Models
1. Linear Regression
2. Logistic Regression
3. K-means
4. K-Nearest Neighbors
<br /><br /><br />


## Linear Regression
Answers the question - "What's the best way to draw a line through our points?"

In [None]:
from sklearn import datasets
# `sklearn.cross_validation` was removed in scikit-learn 0.20; its helpers now
# live in `sklearn.model_selection`. Fall back only for very old versions.
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18
    from sklearn.cross_validation import train_test_split
from sklearn import linear_model
import matplotlib.pyplot as plt

In [None]:
# NOTE(review): load_boston is no longer shipped with recent scikit-learn
# releases — confirm the installed version still provides it.
boston = datasets.load_boston()
X, y = boston.data, boston.target

# Hold out 20% of the samples as a test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
# Initialize linear regression

#### Scoring our linear regression

If you remember from the linear regression lecture and from any stats class you've taken, the coefficient of determination ($R^2$) allows us to calculate how well our predictor does in fitting the data. 
The formula is as so

$$R^2 = 1 - \frac{\sum_i (y_i - f(x_i))^2}{\sum_i (y_i-\bar{y})^2},$$
where $f(x_i)$ is our model's prediction for the $i$th datapoint $x_i$ and $\bar{y}$ is the mean of the observed values.

The great thing about sklearn's API is that a lot of these scoring measures are already built in. Using our trained linear regressor, we can quickly score it using the `score` method, like so:

In [None]:
train_r2 = # fill out linreg
test_r2 = # fill out linreg

# These values are R^2 scores of a regressor, not classification accuracies,
# so label them accordingly.
print("Train R^2 : ", train_r2)
print("Test R^2 : ", test_r2)

We can graphically examine how well our Linear Regressor performs by plotting the true y values, and those predicted by the linear regressor

In [None]:
y_pred = # predict on test set

# Scatter the predictions against the true test targets; the dashed red
# diagonal marks where prediction equals truth.
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred)
target_lo, target_hi = y.min(), y.max()
ax.plot([target_lo, target_hi], [target_lo, target_hi], 'r--', lw=4)
ax.set(xlabel='Test',
       ylabel='Predicted',
       title='Comparison of Actual vs Predicted Values')
plt.show()

## Logistic Regression
"Model the probability that some class occurs"

In [None]:
# Load the breast-cancer dataset and unpack features / labels.
cancer = datasets.load_breast_cancer()
X, y = cancer.data, cancer.target

# Fix the global NumPy seed before splitting so the split is reproducible.
np.random.seed(133)  # set seed=40 if you want an example where test accuracy is greater than train
# Hold out 20% of the samples as a test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)



In [None]:
# initialize and fit logistic regression

In [None]:
train_acc = # score Logistic Regression on train set
test_acc = # score Logistic Regression on test set


print("Train accuracy : ", train_acc)
print("Test accuracy : ", test_acc)

## k - Nearest Neighbors
" Find the `k` closest points in the training set that match the input datapoint "

In [None]:
from sklearn.neighbors import KNeighborsClassifier as KNN

In [None]:
# initialize KNN here

In [None]:
train_acc = # score KNN on train set
test_acc = # score KNN on test set


print("Train accuracy : ", train_acc)
print("Test accuracy : ", test_acc)

## K- means
Find `k` clusters in the data based on how similar the datapoints are to each other

In [None]:
from sklearn.cluster import KMeans
from matplotlib import cm

In [None]:
# Generate 1000 synthetic samples; the first element of the result is used
# as the feature matrix and the second as the labels.
blobs = datasets.make_blobs(n_samples=1000)
X, y = blobs[0], blobs[1]



In [None]:
# setup KMeans 

In [None]:
y_pred = # predict KMeans

In [None]:
plt.figure()
# Iterate over the cluster ids that were actually predicted (rather than the
# true labels in `y`), so every cluster is drawn whatever value of k was used.
cluster_ids = np.unique(y_pred)
colors = cm.rainbow(np.linspace(0.0, 1.0, cluster_ids.size))
for cluster_id, color in zip(cluster_ids, colors):
    this_X = X[y_pred == cluster_id]
    # `color=` (not `c=`) makes it unambiguous that a single RGBA colour is
    # meant for all points of the cluster.
    plt.scatter(this_X[:, 0], this_X[:, 1], color=color, alpha=0.5,
                label="Class %s" % cluster_id)
plt.legend(loc="best")
plt.title("Data")
