# Linear classifiers with scikit-learn
- logistic regression
- SVM

# 1. Applying logistic regression and SVM

In [None]:
# Review KNN classification
# - using Large Movie Review Dataset
# The variables X_train, X_test, y_train, and y_test are already 
# loaded into the environment. The X variables contain features 
# based on the words in the movie reviews, and the y variables 
# contain labels for whether the review sentiment is positive (+1) 
# or negative (-1).

from sklearn.neighbors import KNeighborsClassifier

# Train a k-nearest-neighbours classifier with every hyperparameter left
# at its default; fit() hands back the fitted estimator
knn = KNeighborsClassifier().fit(X_train, y_train)

# Classify the test features and report the label of the first example
pred = knn.predict(X_test)[0]
print("Prediction for test example 0:", pred)
# Prediction for test example 0: 1.0

In [None]:
# Comparing models
# Compare k nearest neighbors classifiers with k=1 and k=5 on the
# handwritten digits data set, which is already loaded into the 
# variables X_train, y_train, X_test, and y_test. 
# You can set k with the n_neighbors parameter when creating the 
# KNeighborsClassifier object, which is also already imported into 
# the environment.

# Which model has a higher test accuracy?
In [2]: knn = KNeighborsClassifier(n_neighbors=1)

In [3]: knn.fit(X_train, y_train)
Out[3]: 
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=1, p=2,
           weights='uniform')

In [4]: knn.score(X_test, y_test)
Out[4]: 0.9888888888888889

In [5]: knn = KNeighborsClassifier(n_neighbors=5)

In [6]: knn.fit(X_train, y_train)
Out[6]: 
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [7]: knn.score(X_test, y_test)
Out[7]: 0.9933333333333333

# n_neighbors = 5

## LogisticRegression

In [None]:
# LogisticRegression: train on the training split, then predict and
# score on the test split
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression().fit(X_train, y_train)
lr.predict(X_test)
lr.score(X_test, y_test)

In [None]:
# LogReg example 2: fit and score on scikit-learn's built-in wine dataset
import sklearn.datasets
from sklearn.linear_model import LogisticRegression

wine = sklearn.datasets.load_wine()
lr = LogisticRegression()
lr.fit(wine.data, wine.target)  # was `lr.ft`, which raises AttributeError
# Accuracy measured on the same data the model was trained on
lr.score(wine.data, wine.target)
# 0.972

# Predicted class probabilities for the first sample (one column per
# class) -- these are probabilities, not confidence intervals
lr.predict_proba(wine.data[:1])
# array([[ 9.951e-01, 4.357e-03, 5.339e-04]])

## LinearSVC for SVM

In [None]:
# LinearSVC: linear support vector classifier on the wine dataset
import sklearn.datasets
from sklearn.svm import LinearSVC

wine = sklearn.datasets.load_wine()
svm = LinearSVC()
svm.fit(wine.data, wine.target)  # was `svm.ft`, which raises AttributeError
# Accuracy measured on the same data the model was trained on
svm.score(wine.data, wine.target)
# 0.893

## SVC - uses nonlinear SVM by default

In [None]:
# SVC: kernel (nonlinear by default) support vector classifier on the
# wine dataset
import sklearn.datasets
from sklearn.svm import SVC

wine = sklearn.datasets.load_wine()
svm = SVC()
svm.fit(wine.data, wine.target)  # was `svm.ft`, which raises AttributeError
# Accuracy measured on the same data the model was trained on
svm.score(wine.data, wine.target)
# 1.

### Examples

In [None]:
# examples: logistic regression and SVM
# For each classifier, print out the training and validation accuracy.

from sklearn import datasets
from sklearn.linear_model import LogisticRegression
# train_test_split was used below without ever being imported
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

digits = datasets.load_digits()
# Random split of the digits data into a training part and a held-out part
Xtrain, Xtest, ytrain, ytest = train_test_split(digits.data, digits.target)

# Apply logistic regression and print scores (training first, then held-out)
lr = LogisticRegression()
lr.fit(Xtrain, ytrain)
print(lr.score(Xtrain, ytrain))
print(lr.score(Xtest, ytest))

# Apply SVM and print scores (training first, then held-out)
svm = SVC()
svm.fit(Xtrain, ytrain)
print(svm.score(Xtrain, ytrain))
print(svm.score(Xtest, ytest))

# Output recorded when these notes were written:
# 0.9977728285077951
# 0.9444444444444444
# 1.0
# 0.26666666666666666

In [None]:
# Sentiment analysis for movie reviews
# In this exercise you'll explore the probabilities outputted by 
# logistic regression on a subset of the Large Movie Review Dataset. 
# The variables X and y are already loaded into the environment. 
# X contains features based on the number of times words appear in 
# the movie reviews, and y contains labels for whether the review 
# sentiment is positive (+1) or negative (-1).

# Build the logistic regression model and train it on the review data
lr = LogisticRegression().fit(X, y)

# A glowing review: turn the text into features, then report the
# probability the model assigns to the positive class
review1 = "LOVED IT! This movie was amazing. Top 10 this year."
review1_features = get_features(review1)
print("Review:", review1)
review1_proba = lr.predict_proba(review1_features)
print("Probability of positive review:", review1_proba[0, 1])

# A poor review, handled the same way
review2 = "Total junk! I'll never watch a film by that director again, no matter how good the reviews."
review2_features = get_features(review2)
print("Review:", review2)
review2_proba = lr.predict_proba(review2_features)
print("Probability of positive review:", review2_proba[0, 1])

# Review: LOVED IT! This movie was amazing. Top 10 this year.
# Probability of positive review: 0.8079007873616059
# Review: Total junk! I'll never watch a film by that director again, no matter how good the reviews.
# Probability of positive review: 0.5855117402793947

# note: "good" in the second review throws it off

### Linear classifiers
- decision boundaries: linear, nonlinear
- linearly separable data


## Visualizing decision boundaries


In [None]:
# Visualize the decision boundaries of various classifier types. 
# A subset of scikit-learn's built-in wine dataset is already 
# loaded into X along with binary labels in y.

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier

# One default-configured instance of each classifier type
classifiers = [
    Model()
    for Model in (LogisticRegression, LinearSVC, SVC, KNeighborsClassifier)
]

# Train every classifier on the same data
for clf in classifiers:
    clf.fit(X, y)

# Draw the decision boundaries of all four classifiers
plot_4_classifiers(X, y, classifiers)
plt.show()

# 2. Loss functions

## Linear classifiers: the coefficients
- prediction equations

In [None]:
# Dot products
# Two length-3 integer vectors
x = np.arange(0, 3)
x
# out: array([0,1,2])

y = np.arange(3, 6)
y
# out: array([3,4,5])

# element-wise product
np.multiply(x, y)
# out: array([0,4,10])

# dot product = sum of the element-wise products
(x * y).sum()
# out: 14
# convenient notation: dot product using @
x @ y
# out: 14

### Linear classifier prediction
- raw model output = coefficients * features + intercept
- Linear classifier prediction:
    - compute raw model output
    - check the sign (which side of decision boundary)
        - if positive, predict one class
        - if negative, predict the other class
- This is the same for logistic regression and linear SVM
    - 'fit' is different but 'predict' is the same
    - difference in 'fit' in loss functions

In [None]:
# How Logistic Regression makes predictions
lr = LogisticRegression()
lr.fit(X, y)
lr.predict(X)[10]
# out: 0
lr.predict(X)[20]
# out: 1

# Raw model output = coefficients . features + intercept.
# The dot product must be written with `@`; the original `&` is the
# bitwise-and operator, which is not defined for float arrays.
lr.coef_ @ X[10] + lr.intercept_  # raw model output for example 10
# out: array([-33.78572166])
# it's negative so predict other class
# for example 20, positive, so predict one class
lr.coef_ @ X[20] + lr.intercept_  # raw model output for example 20
# out: array([0.08050621])


In [None]:
# Changing the model coefficients
# coefficients determine slope of boundary
# intercept shifts the boundary

# Observe the effects of changing the coefficients of a linear
# classifier. A 2D dataset is already loaded into the environment
# as X and y, along with a linear classifier object model.

# Set the coefficients
# The notes record that the coefficients were changed from (0, 1) to
# (-1, 1) and the intercept from 0 to -3; the code now uses those final
# values instead of the starting ones.
model.coef_ = np.array([[-1,1]])
model.intercept_ = np.array([-3])

# Plot the data and decision boundary
plot_classifier(X,y,model)

# Print the number of errors (training points whose prediction differs
# from the true label)
num_err = np.sum(y != model.predict(X))
print("Number of errors:", num_err)

## What is a loss function?
- sklearn's LinearRegression minimizes a loss
- minimization is with respect to coefficients or parameters of model
- note: model.score() isn't necessarily the loss function
- The loss is used to fit the model on the data, while the score is used to see how we're doing

Classification errors: the 0-1 loss
- The squared error/loss isn't appropriate for Classification problems since y are categories (not numbers)
- a natural loss for classification problem is the number of errors
- This is the '0-1 loss': 0 for correct prediction, 1 for incorrect prediction
- By summing the results, you get the total number of incorrect predictions
- But this loss is hard to minimize, so logreg and SVM don't use this.


## Minimizing a loss function
- using scipy.optimize.minimize

In [None]:
from scipy.optimize import minimize

# Minimise x**2 when the search already starts at the minimiser
minimize(fun=np.square, x0=0).x
#out: array([0.])

# Same objective, starting from x0=2: the result is numerically zero
minimize(fun=np.square, x0=2).x
#out: array([-1.88846401e-08])

In [None]:
# Train a model on the Boston housing price data set, which is 
# already loaded into the variables X and y. For simplicity, we 
# won't include an intercept in our regression model.

# The squared error, summed over training examples
# Several ways to get the number of training examples, 
# such as y.size, len(y), or len(X)
def my_loss(w):
    """Return the squared error of weight vector ``w``, summed over all examples.

    Reads the module-level ``X`` (features) and ``y`` (targets). The
    prediction for example ``i`` is the dot product ``w @ X[i]``; no
    intercept term is added.
    """
    # Built-in sum starts at 0 and adds the per-example errors in index order
    return sum((w @ X[i] - y[i]) ** 2 for i in range(y.size))

# Numerically search for the w that minimises my_loss, using the first
# row of X as the starting point
w_fit = minimize(my_loss, x0=X[0]).x
print(w_fit)

# Reference: scikit-learn's LinearRegression, also without an intercept
lr = LinearRegression(fit_intercept=False)
lr.fit(X, y)
print(lr.coef_)

# [-9.16299112e-02  4.86754828e-02 -3.77698794e-03  2.85635998e+00
#  -2.88057050e+00  5.92521269e+00 -7.22470732e-03 -9.67992974e-01
#   1.70448714e-01 -9.38971600e-03 -3.92421893e-01  1.49830571e-02
#  -4.16973012e-01]
# [-9.16297843e-02  4.86751203e-02 -3.77930006e-03  2.85636751e+00
#  -2.88077933e+00  5.92521432e+00 -7.22447929e-03 -9.67995240e-01
#   1.70443393e-01 -9.38925373e-03 -3.92425680e-01  1.49832102e-02
#  -4.16972624e-01]

## Loss function diagrams
- raw model output
- 0-1 loss diagram
- linear regression loss diagram - using least squares
    - the raw model output is the prediction here
    - the squared error doesn't make sense here
- logistic loss diagram
    - think of it as "smooth version of 0-1 loss"
- Hinge loss diagram for SVM

### Comparing the logistic and hinge losses
- log_loss
- hinge_loss

In this exercise you'll create a plot of the logistic and hinge losses using their mathematical expressions, which are provided to you. The loss function diagram from the video is shown on the right.

In [None]:
# Mathematical functions for logistic and hinge losses
# Feel free to ignore if you're not interested
def log_loss(raw_model_output):
    """Return the logistic loss ``log(1 + exp(-raw_model_output))``.

    Works element-wise on scalars and arrays. The value is computed as
    ``logaddexp(0, -z)``, i.e. ``log(exp(0) + exp(-z))``. That is the
    same quantity, but it avoids evaluating ``exp(-z)`` on its own, which
    overflows to ``inf`` for very negative inputs.
    """
    return np.logaddexp(0, -raw_model_output)
def hinge_loss(raw_model_output):
    """Return the hinge loss ``max(0, 1 - raw_model_output)``, element-wise."""
    # How far the raw output falls short of 1; negative means no loss
    shortfall = 1 - raw_model_output
    return np.maximum(shortfall, 0)

# 1000 evenly spaced raw model outputs between -2 and 2
grid = np.linspace(-2, 2, num=1000)

# One curve per loss function, labelled for the legend
for loss_fn, loss_name in ((log_loss, 'logistic'), (hinge_loss, 'hinge')):
    plt.plot(grid, loss_fn(grid), label=loss_name)
plt.legend()
plt.show()

## Implementing logistic regression
This is very similar to the earlier exercise where you implemented linear regression "from scratch" using scipy.optimize.minimize. However, this time we'll minimize the logistic loss and compare with scikit-learn's LogisticRegression (we've set C to a large value to disable regularization; more on this in Chapter 3!). 

The log_loss function from the previous exercise is already defined in your environment, and the sklearn breast cancer prediction dataset (first 10 features, standardized) is loaded into the variables X and y.

In [None]:
# logistic regression is just minimizing the loss function 
# we've been looking at. 

# The logistic loss, summed over training examples
def my_loss(w):
    """Return the logistic loss of weight vector ``w``, summed over all examples.

    Reads the module-level ``X`` (features) and ``y`` (labels) and calls
    the module-level ``log_loss``. For example ``i`` the raw model output
    ``w @ X[i]`` is multiplied by the label ``y[i]`` before the loss is
    applied.
    """
    # Built-in sum starts at 0 and adds the per-example losses in index order
    return sum(log_loss((w @ X[i]) * y[i]) for i in range(y.size))

# Numerically search for the w that minimises my_loss, using the first
# row of X as the starting point
w_fit = minimize(my_loss, x0=X[0]).x
print(w_fit)

# Reference: scikit-learn's LogisticRegression without an intercept and
# with a very large C
lr = LogisticRegression(fit_intercept=False, C=1000000)
lr.fit(X, y)
print(lr.coef_)

# [ 1.03592182 -1.65378492  4.08331342 -9.40923002 -1.06786489  0.07892114
#  -0.85110344 -2.44103305 -0.45285671  0.43353448]
# [[ 1.03731085 -1.65339037  4.08143924 -9.40788356 -1.06757746  0.07895582
#   -0.85072003 -2.44079089 -0.45271     0.43334997]]

# 3. Logistic regression and regularization
- Regularization combats overfitting by making the coefficients smaller

In sklearn, hyperparameter C is the inverse of the regularization strength
- larger c means less regularization
- smaller c means more regularization (make coefficients smaller)


In [None]:
# How does regularization affect TRAINING accuracy?

# Build and train both models in one step each:
# large C  -> weak regularization
lr_weak_reg = LogisticRegression(C=100).fit(X_train, y_train)
# small C  -> strong regularization
lr_strong_reg = LogisticRegression(C=0.01).fit(X_train, y_train)

# Training accuracy of each model; the weakly regularized one scores higher
lr_weak_reg.score(X_train, y_train)
# 1.0
lr_strong_reg.score(X_train, y_train)
# 0.92



Regularized loss = original loss + large coefficient penalty
- more regularization: lower training accuracy
- Regularization reduces training accuracy due to adding a penalty to the original loss function


In [None]:
# How does regularization affect TEST accuracy?

# compute test accuracy (scores are taken on X_test / y_test)
# Here the model with stronger regularization gets the higher test accuracy
lr_weak_reg.score(X_test, y_test)
# 0.86
lr_strong_reg.score(X_test, y_test)
# 0.88

# Regularization improves test accuracy.

More regularization (almost always) means higher TEST accuracy
- it's like a compromise on the coefficient
- regularization makes you "fit less", so overfit less

### L1 vs L2 regularization
Linear regularization - Ridge and Lasso
- Lasso = linear regression with L1 regularization
- Ridge = linear regression with L2 regularization
- For other models like logistic regression we just say L1, L2, etc

In [None]:
# L1 regularization. The solver is named explicitly because not every
# LogisticRegression solver supports an L1 penalty; 'liblinear' does.
lr_L1 = LogisticRegression(penalty='l1', solver='liblinear')
lr_L2 = LogisticRegression() # penalty='l2' by default (L2 regularization)
lr_L1.fit(X_train, y_train)
lr_L2.fit(X_train, y_train)
# plot coefficients for both models
plt.plot(lr_L1.coef_.flatten())
plt.plot(lr_L2.coef_.flatten())

# L1 regularization set many features to 0, so did feature selection
# L2 regularization shrinks coefficients smaller

### Regularized logistic regression
In Chapter 1 you used logistic regression on the handwritten digits data set. Here, we'll explore the effect of L2 regularization. The handwritten digits dataset is already loaded, split, and stored in the variables X_train, y_train, X_valid, and y_valid. The variables train_errs and valid_errs are already initialized as empty lists.

In [None]:
# Regularization strengths to try. Kept in a named list because the plot
# below needs the same values for its x-axis (C_values was previously
# used there without being defined).
C_values = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

# Loop over values of C
for C_value in C_values:
    # Create LogisticRegression object and fit
    lr = LogisticRegression(C=C_value)
    lr.fit(X_train, y_train)

    # Evaluate error rates (1 - accuracy) and append to lists
    train_errs.append(1.0 - lr.score(X_train, y_train))
    valid_errs.append(1.0 - lr.score(X_valid, y_valid))

# Plot results of the training and testing error as a
# function of the regularization parameter, C
plt.semilogx(C_values, train_errs, C_values, valid_errs)
plt.legend(("train", "validation"))
plt.show()

# Looking at the plot, what's the best value of C?
# 10e-1 ?

### Logistic regression and feature selection
In this exercise we'll perform feature selection on the movie review sentiment data set using L1 regularization. 

The features and targets are already loaded for you in X_train and y_train. We'll search for the best value of C using scikit-learn's GridSearchCV, which was covered in the prerequisite course.

In [None]:
# Specify L1 regularization. The solver is named explicitly because not
# every LogisticRegression solver supports an L1 penalty; 'liblinear' does.
lr = LogisticRegression(penalty='l1', solver='liblinear')

# Instantiate the GridSearchCV object and run the search
searcher = GridSearchCV(lr, {'C':[0.001, 0.01, 0.1, 1, 10]})
searcher.fit(X_train, y_train)

# Report the best parameters
print("Best CV params", searcher.best_params_)

# Find the number of nonzero coefficients (selected features)
best_lr = searcher.best_estimator_
coefs = best_lr.coef_
print("Total number of features:", coefs.size)
print("Number of selected features:", np.count_nonzero(coefs))

# Best CV params {'C': 1}
# Total number of features: 2500
# Number of selected features: 1219

### Identifying the most positive and negative words
In this exercise we'll try to interpret the coefficients of a logistic regression fit on the movie review sentiment data set. The model object is already instantiated and fit for you in the variable lr.

In addition, the words corresponding to the different features are loaded into the variable vocab. For example, since vocab[100] is "think", that means feature 100 corresponds to the number of times the word "think" appeared in that movie review.

In [None]:
# Rank the feature indices by coefficient value, lowest coefficient first,
# and keep a reversed copy for highest-first access
inds_ascending = lr.coef_.flatten().argsort()
inds_descending = inds_ascending[::-1]

# The five words with the largest coefficients
print("Most positive words: ", end="")
print(*(vocab[inds_descending[rank]] for rank in range(5)), sep=", ", end=", ")
print("\n")

# The five words with the smallest coefficients
print("Most negative words: ", end="")
print(*(vocab[inds_ascending[rank]] for rank in range(5)), sep=", ", end=", ")
print("\n")

# Most positive words: favorite, superb, noir, knowing, loved, 

# Most negative words: disappointing, waste, worst, boring, lame,

## Logistic Regression and Probabilities
- interpreting raw model output as a (predicted) probability

With regularization:
- coefficients are smaller, which means less confident predictions (decreased probabilities)
- also affects orientation of the boundary (like slope)

How are these probabilities computed?
- logistic regression predictions: sign of raw model output
- logistic regression probabilities: "squashed" raw model output
    - between 0 and 1

Looks like an ECDF...


### - with more Regularization > smaller coefficients > less confident predictions > 
- raw model outputs closer to zero > probabilities closer to 0.5 after the raw model output is squashed through the sigmoid function

### a. Regularization and probabilities
In this exercise, you will observe the effects of changing the regularization strength on the predicted probabilities. A 2D binary classification dataset is already loaded into the environment as X and y.

In [None]:
# Compute maximum predicted probability

# Choose the regularization strength via C and train in one step
model = LogisticRegression(C=1).fit(X, y)

# Plot the classifier together with its predicted probabilities
plot_classifier(X, y, model, proba=True)

# Largest probability the model assigns to any class on the training points
prob = model.predict_proba(X)
print("Maximum predicted probability", prob.max())
# Maximum predicted probability 0.9761229966765974

# Create a model with C=0.1 and examine how the plot and probabilities 
# change.
# Maximum predicted probability 0.8990965659596716
# Smaller C leads to:
# - decreased probabilities (less confident predictions)
# - shifted boundary

### Visualizing easy and difficult examples
In this exercise, you'll visualize the examples that the logistic regression model is most, and least, confident about by looking at the largest, and smallest, predicted probabilities. 

The handwritten digits dataset is already loaded into the variables X and y. The show_digit function takes in an integer index and plots the corresponding image, with some extra information displayed above the image.

In [None]:
# Train logistic regression on the digits data
lr = LogisticRegression().fit(X, y)

# Per-example class probabilities
proba = lr.predict_proba(X)

# Confidence of an example = its largest class probability; order the
# example indices from least to most confident
proba_inds = proba.max(axis=1).argsort()

# Last index: the most confident (least ambiguous) digit
show_digit(proba_inds[-1], lr)

# First index: the least confident (most ambiguous) digit
show_digit(proba_inds[0], lr)

## Multi-class logistic regression (2+ classes)
2 methods:
1. Combining binary classifiers with one-vs-rest strategy
2. Multinomial/softmax/cross entropy loss

In [None]:
# one-vs-rest strategy: each binary model is trained on
# "is this example class k?" labels
for binary_lr, class_label in ((lr0, 0), (lr1, 1), (lr2, 2)):
    binary_lr.fit(X, y == class_label)

# raw model output of each binary model for the first example
lr0.decision_function(X)[0]
# 6.124
lr1.decision_function(X)[0]
# -5.429
lr2.decision_function(X)[0]
# -7.532

# Use largest raw model output
# Which is lr0. It's more confident that the class is 0 than others.

#Check answer:
# One-vs-rest is the default strategy of sklearn logistic regression
lr.fit(X, y)
lr.predict(X)[0]
# 0

One-vs-rest vs Multinomial/softmax/cross entropy loss

One-vs-rest
- fit a binary classifier for each class
- predict with all, take largest output
- pro: simple, modular
- con: not directly optimizing accuracy
- common for SVMs as well
- can produce probabilities
Multinomial/softmax/cross entropy loss
- fit a single classifier for all classes (one time fit)
- prediction directly outputs best class
- con: more complicated, new code
- pro: tackle the problem directly
- possible for SVMs, but less common
- can produce probabilities

### Model coefficients for multi-class

In [None]:
# one-vs-rest by default
lr_ovr = LogisticRegression()
lr_ovr.fit(X,y)
lr_ovr.coef_.shape
# (3,13)
# for 3 classes: 1 coeff per feature per class and 1 intercept per class
lr_ovr.intercept_.shape
# (3,)

# multinomial
# solver specifies algorithm to minimize loss
lr_mn = LogisticRegression(multi_class="multinomial",solver="lbfgs")
lr_mn.fit(X,y)
lr_mn.coef_.shape
# (3,13)
# inspect the multinomial model here (the original line re-checked lr_ovr)
lr_mn.intercept_.shape
# (3,)

### Practice counting coefficients
If you fit a logistic regression model on a classification problem with 3 classes and 100 features, how many coefficients would you have, including intercepts?

3x100 + 3 = 303 coefficients

### Fitting multi-class logistic regression
In this exercise, you'll fit the two types of multi-class logistic regression, one-vs-rest and softmax/multinomial, on the handwritten digits data set and compare the results. 

The handwritten digits dataset is already loaded and split into X_train, y_train, X_test, and y_test.

In [None]:
# One-vs-rest logistic regression: build and train in one step
lr_ovr = LogisticRegression().fit(X_train, y_train)

ovr_train_acc = lr_ovr.score(X_train, y_train)
print("OVR training accuracy:", ovr_train_acc)
ovr_test_acc = lr_ovr.score(X_test, y_test)
print("OVR test accuracy    :", ovr_test_acc)

# Softmax (multinomial) logistic regression: build and train in one step
lr_mn = LogisticRegression(multi_class="multinomial", solver="lbfgs").fit(X_train, y_train)

mn_train_acc = lr_mn.score(X_train, y_train)
print("Softmax training accuracy:", mn_train_acc)
mn_test_acc = lr_mn.score(X_test, y_test)
print("Softmax test accuracy    :", mn_test_acc)

# OVR training accuracy: 0.9948032665181886
# OVR test accuracy    : 0.9644444444444444
# Softmax training accuracy: 1.0
# Softmax test accuracy    : 0.9688888888888889

# the accuracies for both methods are similar on this data set

### Visualizing multi-class logistic regression
In this exercise we'll continue with the two types of multi-class logistic regression, but on a toy 2D data set specifically designed to break the one-vs-rest scheme. 

The data set is loaded into X_train and y_train. The two logistic regression objects, lr_mn and lr_ovr, are already instantiated (with C=100), fit, and plotted. Notice that lr_ovr never predicts the dark blue class... yikes! Let's explore why this happens by plotting one of the binary classifiers that it's using behind the scenes.

In [None]:
# Training accuracy of the two already-fitted multi-class models
softmax_acc = lr_mn.score(X_train, y_train)
print("Softmax     training accuracy:", softmax_acc)
ovr_acc = lr_ovr.score(X_train, y_train)
print("One-vs-rest training accuracy:", ovr_acc)

# Binary labels for "class 1 vs. rest"
is_class_1 = y_train == 1

# Build and train the binary classifier on those labels, then plot it
lr_class_1 = LogisticRegression(C=100).fit(X_train, is_class_1)
plot_classifier(X_train, is_class_1, lr_class_1)

# Softmax     training accuracy: 0.996
# One-vs-rest training accuracy: 0.916

# Although it didn't work well here,
# in general, one-vs-rest often works well.

### One-vs-rest SVM
As motivation for the next and final chapter on support vector machines, we'll repeat the previous exercise with a non-linear SVM. 

Once again, the data is loaded into X_train, y_train, X_test, and y_test . Instead of using LinearSVC, we'll now use scikit-learn's SVC object, which is a non-linear "kernel" SVM (much more on what this means in Chapter 4!). Again, your task is to create a plot of the binary classifier for class 1 vs. rest.

In [None]:
# We'll use SVC instead of LinearSVC from now on
from sklearn.svm import SVC

# Binary labels for "class 1 vs. rest"
is_class_1 = y_train == 1

# Build and train the binary SVM on those labels, then plot it
svm_class_1 = SVC().fit(X_train, is_class_1)
plot_classifier(X_train, is_class_1, svm_class_1)

# 4. Support Vector Machines - SVM
- hinge loss and L2 regularization

What is an SVM?
- linear classifiers
- trained using: hinge loss and L2 regularization
What are support vectors?
- support vector - a training example NOT in the flat part of the loss diagram
- support vector - an example that is incorrectly classified OR correctly classified but close to the boundary
- if an example is not a support vector, removing it has no effect on the model
- Having a small number of support vectors makes kernel SVMs really fast
Max-margin viewpoint
- the SVM maximizes the "margin" for linearly separable datasets
- Margin: distance from the boundary to the closest points

### Effect of removing examples - remove non support vectors
Support vectors are defined as training examples that influence the decision boundary. In this exercise, you'll observe this behavior by removing non support vectors from the training set. 

The wine quality dataset is already loaded into X and y (first two features only). (Note: we specify lims in plot_classifier so that the two plots are forced to use the same axis limits and can be compared directly.)

In [None]:
# Fit a linear-kernel SVM on the full data and plot it
svm = SVC(kernel="linear").fit(X, y)
plot_classifier(X, y, svm, lims=(11,15,0,6))

# Indices of the training examples that are support vectors
support_idx = svm.support_
print("Number of original examples", len(X))
print("Number of support vectors", len(support_idx))

# Reduced data set containing only the support vectors
X_small = X[support_idx]
y_small = y[support_idx]

# Fit a second linear SVM on the reduced data and plot it with the same
# axis limits
svm_small = SVC(kernel="linear").fit(X_small, y_small)
plot_classifier(X_small, y_small, svm_small, lims=(11,15,0,6))

# Number of original examples 178
# Number of support vectors 81

## Kernel SVMs
- Transform your features
    ie. transformed feature = (original feature)**2
- Fitting a linear model in a transformed space corresponds to fitting a nonlinear model in the original space


In [2]:
# kernel SVMs
from sklearn.svm import SVC
# "rbf" is the default kernel; it is spelled out here for clarity
svm = SVC(kernel="rbf", gamma=1)

# gamma controls the smoothness of the boundary


### GridSearchCV warm-up
In the video we saw that increasing the RBF kernel hyperparameter gamma increases training accuracy. In this exercise we'll search for the gamma that maximizes cross-validation accuracy using scikit-learn's GridSearchCV. 

A binary version of the handwritten digits dataset, in which you're just trying to predict whether or not an image is a "2", is already loaded into the variables X and y.

In [None]:
# RBF SVM with default settings (so C stays at its default of 1)
svm = SVC()

# Candidate gamma values for the search
parameters = {'gamma': [0.00001, 0.0001, 0.001, 0.01, 0.1]}

# Build the cross-validated grid search and run it
searcher = GridSearchCV(estimator=svm, param_grid=parameters)
searcher.fit(X, y)

# Report the best parameters
print("Best CV params", searcher.best_params_)

# Best CV params {'gamma': 0.001}

# Larger values of gamma are better for training accuracy, but 
# cross-validation helped us find something different (and better!).

### Jointly tuning gamma and C with GridSearchCV
In the previous exercise the best value of gamma was 0.001 using the default value of C, which is 1. 

In this exercise you'll search for the best combination of C and gamma using GridSearchCV. As in the previous exercise, the 2-vs-not-2 digits dataset is already loaded, but this time it's split into the variables X_train, y_train, X_test, and y_test. Even though cross-validation already splits the training set into parts, it's often a good idea to hold out a separate test set to make sure the cross-validation results are sensible.

In [None]:
# RBF SVM with default settings
svm = SVC()

# Grid of candidate values: every C is combined with every gamma
parameters = {
    'C': [0.1, 1, 10],
    'gamma': [0.00001, 0.0001, 0.001, 0.01, 0.1],
}

# Build the cross-validated grid search and run it on the training split
searcher = GridSearchCV(estimator=svm, param_grid=parameters)
searcher.fit(X_train, y_train)

# Best parameter combination and its cross-validation score
print("Best CV params", searcher.best_params_)
print("Best CV accuracy", searcher.best_score_)

# Accuracy of the best combination on the held-out test split
test_acc = searcher.score(X_test, y_test)
print("Test accuracy of best grid search hypers:", test_acc)

# Best CV params {'gamma': 0.0001, 'C': 10}
# Best CV accuracy 0.9988864142538976
# Test accuracy of best grid search hypers: 0.9988876529477196

## Comparing logistic regression and SVM
Logistic regression
- is a linear classifier
- can use with kernels, but SLOW
- Outputs meaningful probabilities (more NATURAL)
- Can be extended to multi-class
- all data points affect fit
- L2 or L1 regularization

SVM
- also a linear classifier
- can use with kernels, and FAST
- Does not naturally output probabilities (NOT natural)
- can be extended to multi-class
- special property: Only "support vectors" affect fit
- conventionally just L2 regularization

### Review: logistic regression
in sklearn:
- linear_model.LogisticRegression
Key hyperparameters:
- C (inverse regularization strength)
- penalty (type of regularization - L1 and L2)
- multi_class (type of multi-class)
- ...and more


### Review: SVM in sklearn
- svm.LinearSVC and svm.SVC
Key hyperparameters:
- C (inverse regularization strength)
- kernel (type of kernel)
- gamma (inverse RBF smoothness)
    - smaller gamma leads to smoother/simpler boundaries
    - bigger gamma leads to more complex boundaries

### SGDClassifier (SGD=stochastic gradient descent)
- SGDClassifier: scales well to large datasets
    - ***** One advantage of SGDClassifier is that it's very fast - this would have taken a lot longer with LogisticRegression or LinearSVC.
- you just need to specify the 'loss'

Note:
- SGDClassifier hyperparameter 'alpha' is like '1/C'
    - bigger alpha > more regularization

In [None]:
# SGDClassifier: the loss argument selects which linear model is trained
from sklearn.linear_model import SGDClassifier  # was misspelled `skelearn`
# NOTE(review): newer scikit-learn releases spell the logistic loss
# 'log_loss' -- confirm against the installed version
logreg = SGDClassifier(loss='log')    # logistic regression
linsvm = SGDClassifier(loss='hinge')  # linear SVM

### Using SGDClassifier
In this final coding exercise, you'll do a hyperparameter search over the regularization type, regularization strength, and the loss (logistic regression vs. linear SVM) using SGDClassifier.

In [None]:
# random_state=0 keeps the results reproducible
linear_classifier = SGDClassifier(random_state=0)

# Search space: regularization strength (alpha), hinge vs. log loss,
# and L1 vs. L2 regularization
parameters = {
    'alpha': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1],
    'loss': ['hinge', 'log'],
    'penalty': ['l1', 'l2'],
}

# Build the 10-fold cross-validated grid search and run it
searcher = GridSearchCV(estimator=linear_classifier, param_grid=parameters, cv=10)
searcher.fit(X_train, y_train)

# Best parameter combination, its cross-validation score, and its
# accuracy on the held-out test split
print("Best CV params", searcher.best_params_)
print("Best CV accuracy", searcher.best_score_)
test_acc = searcher.score(X_test, y_test)
print("Test accuracy of best grid search hypers:", test_acc)

# Best CV params {'alpha': 0.0001, 'loss': 'hinge', 'penalty': 'l1'}
# Best CV accuracy 0.94351630867144
# Test accuracy of best grid search hypers: 0.9592592592592593
