In [1]:
# dataset loader
from sklearn import datasets

# model training and evalutation utilities 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold # this is one way to generate folds
from sklearn.model_selection import KFold

# models
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression
from sklearn import linear_model

# toy data
X,y = datasets.load_iris(return_X_y=True)
X.shape, y.shape

((150, 4), (150,))

# What you should learn/be aware of based on this lecture
Key sklearn functions:

train_test_split
cross_validate
Fold generators: KFold and StratifiedKFold
Scoring functions per last lecture and how to pass to cross_validate
How to compare different models by looping over them with cross_validate, GridSearchCV, or RandomizedSearchCV
Not covered today but you should check out:

confusion_matrix and classification_report (helpful to evaluate models)

# A simple "split, train, evaluate" example

In [2]:

# split the data with 50% in each set
X1, X2, y1, y2 = train_test_split(X, y, random_state=0,
                                  train_size=0.5)

# fit the model on one set of data
# ignore the model I choose here, its not important what
model = KNeighborsClassifier(n_neighbors=1)
model.fit(X1, y1) # fit on the "training data" X1 and  y1

# evaluate the model on the second set of data
y2_model = model.predict(X2) # using X2 (out-of-sample data), predict y2
accuracy_score(y2, y2_model) # see how close y2 is to prediction (fraction of all pred that are exactly right)

0.9066666666666666

## Want to do k-fold? It's like repeating the above. In pseudo code, it looks like:
Break the X and y data into $k$ subsamples
For each subsample, fit the model, predict OOS, score predictions, and save those
Ok?

## K-Fold in Python: The explicit way, and the wrapped way
Watch me do the explicit way

Now try the wrapper below! We are going to see how to use that function to:

try multiple models
try different sets of X variables
try different ways to specific folds

In [26]:
# try the function here
cross_validate(model, X,y,cv=5)

{'fit_time': array([0.00120187, 0.00051403, 0.00052929, 0.00082898, 0.00038576]),
 'score_time': array([0.00237322, 0.00157785, 0.00252175, 0.00181794, 0.00195813]),
 'test_score': array([0.93333333, 0.96666667, 0.93333333, 0.93333333, 0.96666667])}

In [25]:
# try here with diff scores

cross_validate(model, X,y, scoring=['accuracy','r2','precision_macro'],cv=5)

{'fit_time': array([0.00114274, 0.00137186, 0.00044703, 0.00044417, 0.00046325]),
 'score_time': array([0.00746536, 0.00851607, 0.00569797, 0.00726891, 0.01046681]),
 'test_accuracy': array([0.93333333, 0.96666667, 0.93333333, 0.93333333, 0.96666667]),
 'test_r2': array([0.9 , 0.95, 0.9 , 0.9 , 0.95]),
 'test_precision_macro': array([0.93333333, 0.96969697, 0.94444444, 0.93333333, 0.96969697])}


All the metrics it can compute out of the box are here: https://scikit-learn.org/stable/modules/model_evaluation.html

Notice that many of these were discussed in our last lecture!

Warning/Note: the metric names on that link and what you put in the scoring dictionary don't seem to match up.

### question:
using 5 folds, what is the average (across the folders) out-of-sample (training) F1?

In [24]:
cross_validate(model,X,y,scoring='f1_macro',cv=5)  ['test_score'].mean()


0.9464985696564643

## Exploring the cross_validate parameters

### The model parameter

In [23]:
#change the model
#by changing the model parameter, you can adjust the type of model and the models parameters
cross_validate(SVC(gamma='auto'),X,y,scoring='f1_macro',cv=5)
cross_validate(SVC(C=5,gamma='scale'),X,y,scoring='f1_macro',cv=5)

  'precision', 'predicted', average, warn_for)


{'fit_time': array([0.00092411, 0.00069594, 0.00079298, 0.0009191 , 0.00194287]),
 'score_time': array([0.00124979, 0.00079584, 0.00075817, 0.00123382, 0.00181603]),
 'test_score': array([0.96658312, 1.        , 0.93333333, 0.96658312, 1.        ])}

### question :

try to use a regression model, (you can't use f1 on this, so evaluate on r2)

In [8]:
cross_validate(LinearRegression(),X,y,scoring='r2')  #['score_time'].mean()
#using the LinearRegression model from L16



{'fit_time': array([0.23697114, 0.00094104, 0.00088501]),
 'score_time': array([0.00060868, 0.00054765, 0.00045896]),
 'test_score': array([0., 0., 0.])}

**linear_model submodule contains lots of useful alternate options**

In [9]:
# for example:
linear_model.Lasso
linear_model.Ridge
linear_model.LogisticRegression

linear_model.LassoCV() # Returns a Lasso (L1 Regularization) linear model with picking the best model by cross validation
linear_model.RidgeCV() # Returns a Ridge (L2 Regularization) linear model with picking the best model by cross validation
linear_model.LogisticRegressionCV() # return best logit model by CV

LogisticRegressionCV(Cs=10, class_weight=None, cv='warn', dual=False,
                     fit_intercept=True, intercept_scaling=1.0, l1_ratios=None,
                     max_iter=100, multi_class='warn', n_jobs=None,
                     penalty='l2', random_state=None, refit=True, scoring=None,
                     solver='lbfgs', tol=0.0001, verbose=0)

**Looping over models**

In [10]:
# set up models to try
models = []
models.append(('svc_1', SVC(gamma='auto') ))
models.append(('svc_2', SVC(C=5, gamma='scale') ))
models.append(('neighbor1',  KNeighborsClassifier(n_neighbors=1)))

models[0][1]


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [11]:
# loop and print
for name, model in models:
    scores = cross_validate(model, X, y, scoring='accuracy', cv=5)
    print('%s: %.3f (%.3f)' % (name.ljust(10), 
                                   scores['test_score'].mean(), 
                                   scores['test_score'].std()
                                   )
         )

svc_1     : 0.980 (0.016)
svc_2     : 0.987 (0.016)
neighbor1 : 0.960 (0.025)


- helps you pick the exact perameters within the model!

**gridsearchCV
randomizedsearchCV**

## The X Parameter

**You can loop over Xs**

In [12]:
# define a smaller X and a bigger X
X_small = X[:,:2] # just first two columns

from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=3, include_bias=False)
X3 = poly.fit_transform(X)



In [15]:
# set up Xs to try
Xs =[]
Xs.append( ('X',X    )    )
Xs.append( ('X_small',X_small    )    )
Xs.append( ('X3',X3    )    )

# loop and print
model = KNeighborsClassifier(n_neighbors=1)

for X_name, X in Xs:
    scores = cross_validate(model, X, y, scoring='accuracy',cv=5)
    print('%s:%.3f (%.3f)' % (X_name.ljust(10),
                             scores['test_score'].mean(),
                             scores['test_score'].std()
                             )
         )

X         :0.960 (0.025)
X_small   :0.727 (0.061)
X3        :0.947 (0.016)


## Xs and Models 

In [19]:
for X_name, X in Xs:
    for name, model in models:
        scores = cross_validate(model, X, y, scoring='accuracy',cv=5)
        print('%s+ %s:%.3f (%.3f)' % (name.ljust(10),
                              X_name.ljust(10),
                             scores['test_score'].mean(),
                             scores['test_score'].std()
                             )
         )

svc_1     + X         :0.980 (0.016)
svc_2     + X         :0.987 (0.016)
neighbor1 + X         :0.960 (0.025)
svc_1     + X_small   :0.820 (0.058)
svc_2     + X_small   :0.813 (0.054)
neighbor1 + X_small   :0.727 (0.061)
svc_1     + X3        :0.527 (0.077)
svc_2     + X3        :0.973 (0.025)
neighbor1 + X3        :0.947 (0.016)


## CV parameter and folds
Just watch

In [27]:
cross_validate(model, X,y,scoring='accuracy',cv=5)


{'fit_time': array([0.00077534, 0.00075817, 0.00042486, 0.00038385, 0.00041986]),
 'score_time': array([0.00293279, 0.00296998, 0.00177503, 0.00194383, 0.00182223]),
 'test_score': array([0.93333333, 0.96666667, 0.93333333, 0.93333333, 0.96666667])}

In [36]:
y=["a", 'a','a','b','b','b','c','c','c'] #silly formatting purposes

In [37]:
kf=KFold(n_splits=3)
kf=KFold(n_splits=3,shuffle=True,random_state=1) #must give state!

for train, test in kf.split(y): #for each fold,
    print("train: %s test: %s"  % (str(train).ljust(32), test)) #but here, just show
    print("       %s       %s"  % (str([y[j] for j in train]).ljust(32),  [y[j] for j in test]))
    print() #blank line
    
#ffold --> splits it BY INDEX


train: [0 1 3 4 5 7]                    test: [2 6 8]
       ['a', 'a', 'b', 'b', 'b', 'c']         ['a', 'c', 'c']

train: [2 3 4 5 6 8]                    test: [0 1 7]
       ['a', 'b', 'b', 'b', 'c', 'c']         ['a', 'a', 'c']

train: [0 1 2 6 7 8]                    test: [3 4 5]
       ['a', 'a', 'a', 'c', 'c', 'c']         ['b', 'b', 'b']



In [41]:
skf = StratifiedKFold(n_splits=3)
#skf = StratifiedKFold(n_splits=3,shuffle=True,random_state=1) #now random
X = y #skf needs an X and y variable

for train, test in skf.split(X,y): #for each fold,
    print("train: %s test: %s"  % (str(train).ljust(32), test)) #but here, just show
    print("       %s       %s"  % (str([y[j] for j in train]).ljust(32),  [y[j] for j in test]))
    print() #blank line
    

train: [1 2 4 5 7 8]                    test: [0 3 6]
       ['a', 'a', 'b', 'b', 'c', 'c']         ['a', 'b', 'c']

train: [0 2 3 5 6 8]                    test: [1 4 7]
       ['a', 'a', 'b', 'b', 'c', 'c']         ['a', 'b', 'c']

train: [0 1 3 4 6 7]                    test: [2 5 8]
       ['a', 'a', 'b', 'b', 'c', 'c']         ['a', 'b', 'c']



In [47]:
#reload the X and y variables (we just overwrote)
X,y= datasets.load_iris(return_X_y=True)
model = KNeighborsClassifier(n_neighbors=1)
cross_validate(model, X, y, cv=StratifiedKFold(n_splits=3)   ,  scoring = 'accuracy')

#set up folds to try
folds=[]
folds.append(('kf', KFold(n_splits=3)   ))
folds.append(('kf_rand', KFold(n_splits=3,shuffle=True,random_state=1)  ))
folds.append(('skf_3', StratifiedKFold(n_splits=3)))
folds.append(('skf_3_rand', StratifiedKFold(n_splits=3,shuffle=True,random_state=1)))
folds.append(('skf_5', StratifiedKFold(n_splits=5)))

model = KNeighborsClassifier(n_neighbors=1)
#loop and print
for fold_name, fold in folds:
    scores = cross_validate(model,X,y,cv=fold,scoring='accuracy')
    print('%s: %.3f (%.3f)' % (fold_name.ljust(10),
                                  scores['test_score'].mean(),
                                  scores['test_score'].std()
                                  )
         )
    

kf        : 0.000 (0.000)
kf_rand   : 0.947 (0.025)
skf_3     : 0.967 (0.033)
skf_3_rand: 0.960 (0.016)
skf_5     : 0.960 (0.025)



# Links, resources, and next week
Only two resources needed

- sklearn docs are GREAT https://scikit-learn.org/stable/user_guide.html
- Python Data Science Handbook (note some module calls are obsolete, so you might need to update code) https://jakevdp.github.io/PythonDataScienceHandbook/index.html

Next week:

- preprocessing
- data transformations
- feasture selection