In [1]:
# dataset loader
from sklearn import datasets

# model training and evalutation utilities 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold # this is one way to generate folds
from sklearn.model_selection import KFold

# models
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression
from sklearn import linear_model

# toy data
X, y = datasets.load_iris(return_X_y=True)
X.shape, y.shape


((150, 4), (150,))

### What you should learn/be aware of based on this lecture
Key sklearn functions:

train_test_split
cross_validate
Fold generators: KFold and StratifiedKFold
Scoring functions per last lecture and how to pass to cross_validate
How to compare different models by looping over them with cross_validate, GridSearchCV, or RandomizedSearchCV
Not covered today but you should check out:

confusion_matrix and classification_report (helpful to evaluate models)

### A simple "split, train, evaluate" example

In [2]:
# split the data with 50% in each set
X1, X2, y1, y2 = train_test_split(X, y, random_state=0,
                                  train_size=0.5)

# fit the model on one set of data
# ignore the model I choose here, its not important what
model = KNeighborsClassifier(n_neighbors=1)
model.fit(X1, y1) # fit on the "training data" X1 and  y1

# evaluate the model on the second set of data
y2_model = model.predict(X2) # using X2 (out-of-sample data), predict y2
accuracy_score(y2, y2_model) # see how close y2 is to prediction (fraction of all pred that are exactly right)

0.9066666666666666

## Want to do k-fold? It's like repeating the above. In pseudo code, it looks like:
1. Break the X and y data into $k$ subsamples
2. For each subsample, fit the model, predict OOS, score predictions, and save those
Ok?

### K-Fold in Python: The explicit way, and the wrapped way
Watch me do the explicit way

In [3]:
# you can take quick notes here, but I'm not going to write this code slow enough to copy
# the point here is to illustrate

Now try the wrapper below! We are going to see how to use that function to:

1. try multiple models
2. try different sets of X variables
3. try different ways to specific folds

In [4]:
# try the function here

In [5]:
# try here with diff scores

All the metrics it can compute out of the box are here: https://scikit-learn.org/stable/modules/model_evaluation.html

Notice that many of these were discussed in our last lecture!

Warning/Note: the metric names on that link and what you put in the scoring dictionary don't seem to match up.

### question:

In [16]:
cross_validate(model, X, y)

{'fit_time': array([0.00071812, 0.00062728, 0.00076103, 0.00049591, 0.00042796]),
 'score_time': array([0.00398207, 0.00260782, 0.00196385, 0.00183177, 0.00141215]),
 'test_score': array([0.96666667, 0.96666667, 0.93333333, 0.93333333, 1.        ])}

In [15]:
# answer here
#Using 5 folds, what is the average (across the folds) out-of-sample (training) F1?
cross_validate(model,X, y, scoring=['accuracy', 'r2', 'precision_macro'])

{'fit_time': array([0.00078011, 0.00046897, 0.00041771, 0.00096273, 0.00047112]),
 'score_time': array([0.0061748 , 0.00281191, 0.00308228, 0.00249815, 0.002285  ]),
 'test_accuracy': array([0.96666667, 0.96666667, 0.93333333, 0.93333333, 1.        ]),
 'test_r2': array([0.95, 0.95, 0.9 , 0.9 , 1.  ]),
 'test_precision_macro': array([0.96969697, 0.96969697, 0.94444444, 0.93333333, 1.        ])}

In [17]:
cross_validate(model, X, y, scoring='f1_macro')['test_score'].mean()

0.9598319029897976

## Exploring the cross_validate parameters
### The model parameter

In [20]:
# by changing the model parameter, you can adj the type of model and the models parameters
cross_validate(SVC(gamma='auto'), X, y, scoring='f1_macro')
cross_validate(SVC(C=5), X, y, scoring='f1_macro')

{'fit_time': array([0.00057077, 0.00050998, 0.00051308, 0.00051403, 0.00056291]),
 'score_time': array([0.00075221, 0.00089288, 0.00070381, 0.00068688, 0.00079107]),
 'test_score': array([0.96658312, 1.        , 1.        , 0.96658312, 1.        ])}

### question
try to use a regression model, (you can't use f1 on this, so evaulate on r2)

In [24]:
# answer here
cross_validate(LinearRegression(), X, y, scoring='r2')['test_score'].mean()

0.32256072489000853

linear_model submodule contains lots of useful alternate options

In [22]:

# for example:
linear_model.Lasso
linear_model.Ridge
linear_model.LogisticRegression

linear_model.LassoCV() # Returns a Lasso (L1 Regularization) linear model with picking the best model by cross validation
linear_model.RidgeCV() # Returns a Ridge (L2 Regularization) linear model with picking the best model by cross validation
linear_model.LogisticRegressionCV() # return best logit model by CV

LogisticRegressionCV(Cs=10, class_weight=None, cv=None, dual=False,
                     fit_intercept=True, intercept_scaling=1.0, l1_ratios=None,
                     max_iter=100, multi_class='auto', n_jobs=None,
                     penalty='l2', random_state=None, refit=True, scoring=None,
                     solver='lbfgs', tol=0.0001, verbose=0)

Looping over models

In [27]:
# set up models to try
models = []
models.append(('svc_1', SVC(gamma='auto') ))
models.append(('svc_2', SVC(C=5) ))
models.append(('neighbor',  KNeighborsClassifier(n_neighbors=1)))
models[0][1]

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [28]:
# loop and print
for name, model in models:
    scores = cross_validate(model, X, y, scoring='accuracy')
    print('%s: %.3f (%.3f)' % (name.ljust(10), 
                                   scores['test_score'].mean(), 
                                   scores['test_score'].std()
                                   )
         )

svc_1     : 0.980 (0.016)
svc_2     : 0.987 (0.016)
neighbor  : 0.960 (0.025)


In [None]:

gridsearchCV
randomizesearchCV

### The X parameter
You can loop over Xs



In [14]:

# define a smaller X and a bigger X
X_small = X[:,:2] # just first two columns

from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=3, include_bias=False)
X3 = poly.fit_transform(X)

# set up Xs to try
right here!

# loop and print
right here!

SyntaxError: invalid syntax (<ipython-input-14-03903a534cc6>, line 9)

### Xs and Models

### CV parameter and folds
Just watch.

### Links, resoruces, and next week
Only two resources needed

1. sklearn docs are GREAT https://scikit-learn.org/stable/user_guide.html
2. Python Data Science Handbook (note some module calls are obsolete, so you might need to update code) https://jakevdp.github.io/PythonDataScienceHandbook/index.html

Next week:

1. preprocessing
2. data transformations
3. feasture selection