# Linear classifiers with scikit-learn
- logistic regression
- SVM

# 1. Applying logistic regression and SVM

In [None]:
# Review KNN classification
# - using the Large Movie Review Dataset
# The variables X_train, X_test, y_train, and y_test are already 
# loaded into the environment. The X variables contain features 
# based on the words in the movie reviews, and the y variables 
# contain labels for whether the review sentiment is positive (+1) 
# or negative (-1).

from sklearn.neighbors import KNeighborsClassifier

# Build a k-nearest-neighbours classifier with default hyperparameters
# and train it on the review features in a single chained call
# (fit returns the fitted estimator itself).
knn = KNeighborsClassifier().fit(X_train, y_train)

# Classify every test review, then keep only the label of the first one
test_predictions = knn.predict(X_test)
pred = test_predictions[0]
print("Prediction for test example 0:", pred)
# Prediction for test example 0: 1.0

In [None]:
# Comparing models
# Compare k nearest neighbors classifiers with k=1 and k=5 on the
# handwritten digits data set, which is already loaded into the 
# variables X_train, y_train, X_test, and y_test. 
# You can set k with the n_neighbors parameter when creating the 
# KNeighborsClassifier object, which is also already imported into 
# the environment.

# Which model has a higher test accuracy?
In [2]: knn = KNeighborsClassifier(n_neighbors=1)

In [3]: knn.fit(X_train, y_train)
Out[3]: 
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=1, p=2,
           weights='uniform')

In [4]: knn.score(X_test, y_test)
Out[4]: 0.9888888888888889

In [5]: knn = KNeighborsClassifier(n_neighbors=5)

In [6]: knn.fit(X_train, y_train)
Out[6]: 
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [7]: knn.score(X_test, y_test)
Out[7]: 0.9933333333333333

# Answer: n_neighbors = 5 has the higher test accuracy (0.9933 vs. 0.9889 for n_neighbors = 1)

## LogisticRegression

In [None]:
# LogisticRegression
from sklearn.linear_model import LogisticRegression

# Create the classifier and train it on the training split in one step
# (fit returns the fitted estimator itself)
lr = LogisticRegression().fit(X_train, y_train)

# Predicted labels for the test split
lr.predict(X_test)

# Accuracy of those predictions against the true test labels
lr.score(X_test, y_test)

In [None]:
# LogReg example 2
import sklearn.datasets
from sklearn.linear_model import LogisticRegression

# Load scikit-learn's built-in wine dataset
wine = sklearn.datasets.load_wine()

# Train logistic regression on the full dataset
lr = LogisticRegression()
lr.fit(wine.data, wine.target)

# Accuracy measured on the same data the model was trained on
lr.score(wine.data, wine.target)
# out: 0.972

# Predicted class probabilities for the first sample (one value per class).
# These express the model's confidence in each class; they are not
# statistical confidence intervals.
lr.predict_proba(wine.data[:1])
# out: array([[ 9.951e-01, 4.357e-03, 5.339e-04]])

## LinearSVC for SVM

In [None]:
# LinearSVC
import sklearn.datasets
from sklearn.svm import LinearSVC

# Load scikit-learn's built-in wine dataset
wine = sklearn.datasets.load_wine()

# Train a linear SVM on the full dataset
svm = LinearSVC()
svm.fit(wine.data, wine.target)

# Accuracy measured on the same data the model was trained on
svm.score(wine.data, wine.target)
# out: 0.893

## SVC - uses nonlinear SVM by default

In [None]:
# SVC
import sklearn.datasets
from sklearn.svm import SVC

# Load scikit-learn's built-in wine dataset
wine = sklearn.datasets.load_wine()

# Train an SVC with its default settings on the full dataset
svm = SVC()
svm.fit(wine.data, wine.target)

# Accuracy measured on the same data the model was trained on, so a
# perfect score here says nothing about performance on unseen data
svm.score(wine.data, wine.target)
# out: 1.0

### Examples

In [None]:
# examples: logistic regression and SVM
#For each classifier, print out the training and validation accuracy.

from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Load the digits data and split it into training and validation parts.
# No random_state is passed, so the split (and the scores) vary per run.
digits = datasets.load_digits()
Xtrain, Xtest, ytrain, ytest = train_test_split(digits.data, digits.target)

# Apply logistic regression and print training, then validation, accuracy
lr = LogisticRegression()
lr.fit(Xtrain, ytrain)
print(lr.score(Xtrain, ytrain))
print(lr.score(Xtest, ytest))

# Apply SVM and print training, then validation, accuracy
svm = SVC()
svm.fit(Xtrain, ytrain)
print(svm.score(Xtrain, ytrain))
print(svm.score(Xtest, ytest))

# 0.9977728285077951
# 0.9444444444444444
# 1.0
# 0.26666666666666666

In [None]:
# Sentiment analysis for movie reviews
# In this exercise you'll explore the probabilities outputted by 
# logistic regression on a subset of the Large Movie Review Dataset. 
# The variables X and y are already loaded into the environment. 
# X contains features based on the number of times words appear in 
# the movie reviews, and y contains labels for whether the review 
# sentiment is positive (+1) or negative (-1).

# Create the logistic-regression sentiment model and train it on X, y
lr = LogisticRegression().fit(X, y)

# A glowing review: convert the text to features, then report the
# probability the model assigns to the positive class (column 1)
review1 = "LOVED IT! This movie was amazing. Top 10 this year."
review1_features = get_features(review1)
print("Review:", review1)
review1_proba = lr.predict_proba(review1_features)
print("Probability of positive review:", review1_proba[0, 1])

# A poor review, scored the same way
review2 = "Total junk! I'll never watch a film by that director again, no matter how good the reviews."
review2_features = get_features(review2)
print("Review:", review2)
review2_proba = lr.predict_proba(review2_features)
print("Probability of positive review:", review2_proba[0, 1])

# Review: LOVED IT! This movie was amazing. Top 10 this year.
# Probability of positive review: 0.8079007873616059
# Review: Total junk! I'll never watch a film by that director again, no matter how good the reviews.
# Probability of positive review: 0.5855117402793947

# note: "good" in the second review throws it off

### Linear classifiers
- decision boundaries: linear, nonlinear
- linearly separable data


## Visualizing decision boundaries


In [None]:
# Visualize the decision boundaries of various classifier types. 
# A subset of scikit-learn's built-in wine dataset is already 
# loaded into X along with binary labels in y.

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier

# One instance of each classifier type, all with default hyperparameters
classifiers = [
    LogisticRegression(),
    LinearSVC(),
    SVC(),
    KNeighborsClassifier(),
]

# Train every classifier on the same data
for clf in classifiers:
    clf.fit(X, y)

# Draw the decision boundaries of the four fitted models
plot_4_classifiers(X, y, classifiers)
plt.show()

# 2. Loss functions

## Linear classifiers: the coefficients
- prediction equations

In [None]:
# Dot products
# Two small integer vectors to work with
x = np.arange(0, 3)
x
# out: array([0, 1, 2])

y = np.arange(3, 6)
y
# out: array([3, 4, 5])

# Multiplying two arrays with * works element by element
x * y
# out: array([0, 4, 10])

# The dot product is the sum of those element-wise products
np.sum(x * y)
# out: 14
# The @ operator computes the same dot product in one step
x @ y
# out: 14

### Linear classifier prediction
- raw model output = coefficients · features + intercept (a dot product, not an element-wise product)
- Linear classifier prediction:
    - compute raw model output
    - check the sign (which side of decision boundary)
        - if positive, predict one class
        - if negative, predict the other class
- This is the same for logistic regression and linear SVM
    - 'fit' is different but 'predict' is the same
    - the difference in 'fit' comes from their different loss functions

In [None]:
# How Logistic Regression makes predictions
lr = LogisticRegression()
lr.fit(X, y)
lr.predict(X)[10]
# out: 0
lr.predict(X)[20]
# out: 1

# Raw model output = coefficients @ features + intercept.
# The dot product needs the @ operator; & is bitwise AND, which binds more
# loosely than + and is not defined for float arrays.
lr.coef_ @ X[10] + lr.intercept_  # raw model output for example 10
# out: array([-33.78572166])
# The raw output is negative, so example 10 is predicted as class 0

lr.coef_ @ X[20] + lr.intercept_  # raw model output for example 20
# out: array([0.08050621])
# The raw output is positive, so example 20 is predicted as class 1


In [None]:
# Changing the model coefficients
# coefficients determine slope of boundary
# intercept shifts the boundary

# Observe the effects of changing the coefficients of a linear
# classifier. A 2D dataset is already loaded into the environment
# as X and y, along with a linear classifier object model.

# Set the coefficients: changed from the starting values [[0, 1]] to
# [[-1, 1]], which changes the slope of the decision boundary
model.coef_ = np.array([[-1, 1]])
# Set the intercept: changed from the starting value 0 to -3, which
# shifts the decision boundary
model.intercept_ = np.array([-3])

# Plot the data and decision boundary
plot_classifier(X, y, model)

# Print the number of errors: count the examples whose predicted label
# differs from the true label
num_err = np.sum(y != model.predict(X))
print("Number of errors:", num_err)

## What is a loss function?
- sklearn's LinearRegression minimizes a loss (the sum of squared errors)
- minimization is with respect to coefficients or parameters of model
- note: model.score() isn't necessarily the loss function
- The loss is used to fit the model on the data, while the score is used to see how we're doing

Classification errors: the 0-1 loss
- The squared error loss isn't appropriate for classification problems, since the y values are categories (not numbers)
- A natural loss for a classification problem is the number of errors
- This is the '0-1 loss': 0 for a correct prediction, 1 for an incorrect prediction
- Summing these values over all examples gives the total number of incorrect predictions
- But the 0-1 loss is hard to minimize directly (it is not smooth), so logistic regression and SVMs use other loss functions instead.


## Minimizing a loss function
- using scipy.optimize.minimize

In [None]:
from scipy.optimize import minimize

# Minimize np.square starting from an initial guess of 0
minimize(np.square, x0=0).x
#out: array([0.])

# Starting from 2 instead, the optimizer ends up at (numerically) zero
minimize(np.square, x0=2).x
#out: array([-1.88846401e-08])

In [None]:
# Train a model on the Boston housing price data set, which is 
# already loaded into the variables X and y. For simplicity, we 
# won't include an intercept in our regression model.

# The squared error, summed over training examples
def my_loss(w):
    """Return the squared-error loss of the coefficient vector ``w``.

    The loss is the sum, over every training example, of the squared
    difference between the linear prediction ``w @ X[i]`` and the true
    target ``y[i]``. ``X`` and ``y`` are read from the enclosing
    (module-level) scope; no intercept term is used.
    """
    total = 0
    # Walk the feature rows and their targets in lockstep, one example at a time
    for features, target in zip(X, y):
        residual = w @ features - target
        total = total + residual ** 2
    return total

from sklearn.linear_model import LinearRegression

# Find the w that makes my_loss(w) smallest. X[0] only serves as the
# optimizer's initial guess: it has one entry per feature, so it has the
# right length for a coefficient vector.
w_fit = minimize(my_loss, X[0]).x
print(w_fit)

# Compare with scikit-learn's LinearRegression coefficients, fitted without
# an intercept so that it solves the same problem as my_loss
lr = LinearRegression(fit_intercept=False).fit(X, y)
print(lr.coef_)

# [-9.16299112e-02  4.86754828e-02 -3.77698794e-03  2.85635998e+00
#  -2.88057050e+00  5.92521269e+00 -7.22470732e-03 -9.67992974e-01
#   1.70448714e-01 -9.38971600e-03 -3.92421893e-01  1.49830571e-02
#  -4.16973012e-01]
# [-9.16297843e-02  4.86751203e-02 -3.77930006e-03  2.85636751e+00
#  -2.88077933e+00  5.92521432e+00 -7.22447929e-03 -9.67995240e-01
#   1.70443393e-01 -9.38925373e-03 -3.92425680e-01  1.49832102e-02
#  -4.16972624e-01]

# 3. Logistic regression

# 4. Support Vector Machines - SVM