In [None]:
# IGNORE THIS CELL WHICH CUSTOMIZES LAYOUT AND STYLING OF THE NOTEBOOK !
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import warnings

import matplotlib.pyplot as plt

# Silence every FutureWarning for the rest of the session.
warnings.filterwarnings("ignore", category=FutureWarning)
# Silence UserWarnings whose message matches the regular expression below.
warnings.filterwarnings(
    "ignore",
    message="X does not have valid feature names, but [a-zA-Z]+ was fitted with feature names",
    category=UserWarning,
)

# Replace `warnings.filterwarnings` itself by a function that accepts any
# arguments and does nothing: every later call to it, from this notebook or
# from imported code, has no effect anymore.
# NOTE(review): presumably meant to keep libraries from re-enabling the
# warnings silenced above -- confirm, because it also disables any later
# legitimate use of `warnings.filterwarnings` in this kernel.
warnings.filterwarnings = lambda *a, **kw: None
from IPython.core.display import HTML

# Read the custom styling with a context manager so that the file handle is
# closed deterministically instead of being left to the garbage collector.
with open("custom.html", "r") as custom_html_file:
    custom_html = custom_html_file.read()

# Has to stay the last expression of the cell, otherwise Jupyter does not
# render the HTML.
HTML(custom_html)

# Chapter 3: Overfitting, underfitting and cross-validation

## What are overfitting and underfitting?

Let us recall the `LogisticRegression`-based beer classifier we used in the first script. We've discovered that setting the hyperparameter `C=2` gave us good results (`C` controls `regularization`, lower `C` means higher `regularization` and vice-versa):

In [None]:
import pandas as pd

# Load the beer data set and show its dimensions.
beer_data = pd.read_csv("data/beers.csv")
print(beer_data.shape)

# The label sits in the last column, all columns before it are features.
labels = beer_data.iloc[:, -1]
input_features = beer_data.iloc[:, :-1]

from sklearn.linear_model import LogisticRegression

# Train the classifier on all samples of this data set ...
classifier = LogisticRegression(C=2)
classifier.fit(input_features, labels)

# ... and score it on the very same samples it has been trained on.
predicted_labels = classifier.predict(input_features)
n_correct = sum(predicted_labels == labels)
print(f"{n_correct / len(labels) * 100:.2f} % labeled correctly")

Here to train (fit) the model we only used 225 samples from the original data set of 300 beers.

But if the above classifier works well, it should also show the same performance on the left out 75 beers.

Let us check this on the left out data:

In [None]:
# Load the beers that were left out of training and print the table's shape.
eval_data = pd.read_csv("data/beers_eval.csv")
print(eval_data.shape)

In [None]:
# Same column layout as the training data: features first, label last.
eval_labels = eval_data.iloc[:, -1]
eval_features = eval_data.iloc[:, :-1]

# Apply the already trained classifier to the left out samples and score it.
predicted_labels = classifier.predict(eval_features)
n_correct = sum(predicted_labels == eval_labels)
print(f"{n_correct / len(eval_labels) * 100:.2f} % labeled correctly")

<div style="font-size:150%; font-weight: bold;">
           
WHAT HAPPENED????
<br/>
<br/>
Why is the accuracy on new data much lower?
<br/>
<br/>
Answer: OVERFITTING !!

</div>

We observed a phenomenon called **"overfitting"**.


<img src="./images/2qky90.jpg" width=30% />

### Overfitting

To explain the concept of "overfitting" let's use the circle data set:

In [None]:
# Load the 2D circle data set: feature columns first, label in the last column.
data = pd.read_csv("data/circle.csv")
features = data.iloc[:, :-1]
labels = data.iloc[:, -1]

# Use the same label -> color mapping as the decision surface plots of this
# notebook (label 0 -> steelblue, label 1 -> chocolate), so that a sample keeps
# its color in every figure.
COLORS = ["steelblue", "chocolate"]

plt.figure(figsize=(4, 4))
plt.scatter(
    features.iloc[:, 0],
    features.iloc[:, 1],
    c=[COLORS[label] for label in labels],
    marker="o",
);

We mentioned before that classifiers depend on (hyper)parameters (like `C`) which can be tuned to improve performance.

Let us try to find out the purpose of the `gamma` parameter of `SVC` classifier:

In [None]:
# utility functions copy-pasted from previous script

import matplotlib.pyplot as plt
import numpy as np


def plot_points(features_2d, labels, plt=plt, marker="o"):
    """Scatter-plot 2D samples, colored by their class label.

    Parameters
    ----------
    features_2d : array-like with at least two columns
        Sample coordinates; the first two columns are used as x and y.
        NumPy arrays, nested lists and DataFrames are accepted.
    labels : iterable of int
        Class label per sample, used as index into the color palette:
        0 -> "steelblue", 1 -> "chocolate".
    plt : object, optional
        Plotting target providing a ``scatter`` method.
    marker : str, optional
        Marker style forwarded to ``scatter``.
    """
    # Convert first: positional slicing like `obj[:, 0]` is not supported by
    # DataFrames, for NumPy arrays the conversion is a no-op.
    points = np.asarray(features_2d)
    palette = ("steelblue", "chocolate")
    colors = [palette[label] for label in labels]
    plt.scatter(points[:, 0], points[:, 1], color=colors, marker=marker)


def train_and_plot_decision_surface(
    name, classifier, features_2d, labels, preproc=None, plt=plt, marker="o", N=300
):
    """Fit ``classifier`` on 2D samples and plot its decision surface.

    The classifier is trained on the (optionally preprocessed) samples, the
    number of correctly classified training samples is printed, and the
    predictions on an ``N x N`` grid spanning the bounding box of the samples
    are drawn as a faint background below the samples themselves.

    Parameters
    ----------
    name : str
        Label used in the printed summary and as plot title.
    classifier : object
        Estimator providing ``fit(X, y)`` and ``predict(X)``. Its grid
        predictions are cast to bool: zero is drawn in "steelblue",
        everything else in "chocolate".
    features_2d : array-like with two columns
        Coordinates of the training samples.
    labels : sequence
        Class label per sample.
    preproc : object, optional
        Transformer providing ``fit_transform`` and ``transform``. It is
        fitted on the training samples only.
    plt : object, optional
        Plotting target providing ``scatter`` and ``title``.
    marker : str, optional
        Marker style of the background grid points.
    N : int, optional
        Number of grid points per axis.
    """
    features_2d = np.array(features_2d)

    # Regular grid covering the bounding box of the samples.
    xmin, ymin = features_2d.min(axis=0)
    xmax, ymax = features_2d.max(axis=0)

    x = np.linspace(xmin, xmax, N)
    y = np.linspace(ymin, ymax, N)
    points = np.array(np.meshgrid(x, y)).T.reshape(-1, 2)

    if preproc is not None:
        # Fit the preprocessing on the training samples only and apply this
        # very same transformation to the grid. Fitting a second time on the
        # grid would map grid and samples into different coordinate systems.
        features_for_classifier = preproc.fit_transform(features_2d)
        points_for_classifier = preproc.transform(points)
        name += " (w/ preprocessing)"
    else:
        features_for_classifier = features_2d
        points_for_classifier = points

    classifier.fit(features_for_classifier, labels)
    predicted = classifier.predict(features_for_classifier)

    print(name + ":\t", sum(predicted == labels), "/", len(labels), "correct")

    # Background: one faint dot per grid point, colored by predicted class.
    classes = np.array(classifier.predict(points_for_classifier), dtype=bool)
    for mask, color in ((~classes, "steelblue"), (classes, "chocolate")):
        plt.scatter(
            points[mask][:, 0],
            points[mask][:, 1],
            color=color,
            marker=marker,
            s=1,
            alpha=0.05,
        )

    # Draw the samples in their original coordinates (the grid is drawn in
    # original coordinates as well) and on the same plotting target.
    plot_points(features_2d, labels, plt=plt)
    plt.title(name)

In [None]:
from sklearn.svm import SVC

# Load the circle data set: features first, label in the last column.
df = pd.read_csv("data/circle.csv")
labels = df.iloc[:, -1]
features = df.iloc[:, :-1]

# One classifier per gamma value we want to compare.
gamma_values = (18, 9, 0.1)
classifiers = [SVC(gamma=gamma_value) for gamma_value in gamma_values]

plt.figure(figsize=(21, 6))

# Decision surfaces side by side, one subplot per classifier.
n_plots = len(classifiers)
for position, clf in enumerate(classifiers, start=1):
    plt.subplot(1, n_plots, position)
    title = f"gamma = {clf.gamma}"
    train_and_plot_decision_surface(title, clf, features, labels)

#### Observation

The parameter `gamma` of `SVC` has an effect on the flexibility/complexity of the decision surface. A large value allows a very flexible / "irregular" decision surface, for smaller values the surface gets smoother / "stiffer" / "more regular" (allowing more misclassifications).

Classifiers with a smooth decision surface are also called **simple** models, those with a flexible decision surface **complex** models.

We see here also 

- that the smallest `gamma` value produces a classifier which seems to get the idea of a "circle", 
- whereas the large `gamma` value adapts the classifier more to the training data samples.

Let's try an even larger `gamma` value:

In [None]:
# A gamma value far beyond the ones compared above.
clf = SVC(gamma=90)

plt.figure(figsize=(6, 6))
train_and_plot_decision_surface(f"gamma = {clf.gamma}", clf, features, labels)

The plot above shows an extreme example for the previously mentioned effect of overfitting.

- If we evaluate performance of this classifier on the training data set we get an **accuracy of `~100%`**

- But the classifier totally fails to learn the concept of a circle, and you can easily imagine how bad this classifier performs on new and unseen data.


<div class="alert alert-block alert-warning">
<p style="font-weight: bold;"><i class="fa fa-warning"></i>&nbsp; Definitions</p>

<ul>

<li><strong>Overfitting</strong>: The classifier overfits if it too closely fits to/learns detail or noise in the training data instead of learning the underlying concept. Thus, the classifier does not generalize well and shows much worse performance on previously unseen new data.</li>
<br/>
<li><strong>Generalization</strong>: An ability of a classifier to learn the concept behind data. A classifier generalizes well if it shows similar performance on training and on new data.</li>
<br/>
<li><strong>Robust classifier</strong>: A classifier which is not or only very little susceptible to overfitting when learning some data, i.e. a classifier which usually generalizes well.</li>


</ul>
 
</div>




#### More "probabilistic" definition

- Our data is generated by a (usually unknown) model.
- We have only samples from this model.
- A classifier tries to approximate the underlying model based on the given samples.

In this context the observed bad generalization performance of the classifier can be explained by computing a model which is too far away from the original model.

The following graphics depicts our explanations: 

- The more "complex" a model gets the better it fits the training data. Thus accuracy on the training data improves.
- At a certain point the model is too adapted to the training data and gets worse and worse when evaluated later on previously unseen new data.


<img src="./images/accuracy_training_vs_eval.svg" width=50%/>  

### Underfitting

The other extreme is called **underfitting**: the classifier's decision boundary deviates too far from the boundary in the training data, which produces a classifier that does not perform well even on the training data.

We can demonstrate this by choosing a "too small" value of `gamma`

In [None]:
# With a small gamma the classifier tries to build a "safe", "perfect" circle.
clf = SVC(gamma=0.06)

plt.figure(figsize=(6, 6))
train_and_plot_decision_surface(f"gamma = {clf.gamma}", clf, features, labels)

## Diagnosing and solving the overfitting problem

### How did we fall for overfitting? 

<div class="alert alert-block alert-warning">

<div style="font-size:150%;">
    <i class="fa fa-info-circle"></i>
    <center>
Our fundamental mistake was to evaluate the performance <br/>of the classifier on the training data.

</center>
</div>
</div>

Repeat:

<div class="alert alert-block alert-warning">



<div style="font-size:150%;">
     <i class="fa fa-info-circle"></i>
    <center>
Our fundamental mistake was to evaluate the performance <br/>of the classifier on the training data.

</center>
</div>
</div>


### How can we do better?

There is no classifier which works out of the box in all situations. Depending on the "geometry" / "shape" of the data, classification algorithms and their settings can make a big difference.

In our previous 2D examples we were able to visualize the data and classification results, this is not possible for higher dimensional data.

The general way to handle this situation is as follows: 

- split our data into a learning data set and a test data set


- train the classifier on the learning data set


- assess performance of the classifier on the test data set.

## Cross-validation

The procedure called *cross-validation* goes a step further in data splitting: In this procedure the full dataset is split into learn-/test-sets in various ways. Statistics of the achieved test performance are then computed to assess the future performance of the classifier.

A common approach is **K-fold cross-validation**:

K-fold cross-validation has an advantage that we do not leave out part of our data from training. This is useful when we do not have a lot of data.

<img src="./images/305azk.jpg" title="made at imgflip.com" width=40%/>

### Example: 4-fold cross validation

For 4-fold cross validation we split our data set into four equal sized partitions P1, P2, P3 and P4.

We:

- hold out `P1`: train the classifier on `P2 + P3 + P4`, compute accuracy `m1` on `P1`.

<img src="./images/cross_val_0.svg" />

-  hold out `P2`: train the classifier on `P1 + P3 + P4`, compute accuracy `m2` on `P2`.

<img src="./images/cross_val_1.svg" />

-  hold out `P3`: train the classifier on `P1 + P2 + P4`, compute accuracy `m3` on `P3`.

<img src="./images/cross_val_2.svg" />

-  hold out `P4`: train the classifier on `P1 + P2 + P3`, compute accuracy `m4` on `P4`.

<img src="./images/cross_val_3.svg" />

Finally we can compute the average of `m1` .. `m4` as the final measure for accuracy.

Some advice:

- This can be done on the original data or on randomly shuffled data. It is recommended to shuffle the data first, as there might be some unknown underlying ordering in your dataset

- Usually one uses 3- to 10-fold cross validation, depending on the amount of data available.

### Variant: randomized cross validation

A randomized variant works like this:

- Perform $n$ iterations:

   - draw a fraction $p$ (e.g. 80%) from your full data set without replacement for the training data set.
   - use the remaining fraction $1 - p$ as evaluation data set
   - train classifier and compute performance score(s).
  

### Cross validation with scikit-learn

In [None]:
import pandas as pd

# Training part and left out part of the beer data.
beer = pd.read_csv("data/beers.csv")
beer_eval = pd.read_csv("data/beers_eval.csv")

# Cross validation does its own splitting, so we can use all data at once.
all_beer = pd.concat([beer, beer_eval])

# Last expression of the cell: displays (n_rows, n_columns).
all_beer.shape

Let's use the familiar _accuracy_ score: a percentage of correctly classified samples. (More about other ways of assessing quality of a classifier in one of the following scripts.)


In [None]:
from sklearn.utils import shuffle

# Shuffle with a fixed seed, so that the result is reproducible.
all_beer = shuffle(all_beer, random_state=42)

labels = all_beer.iloc[:, -1]
features = all_beer.iloc[:, :-1]

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

classifier = LogisticRegression(C=2)

# 4-fold cross validation, scored the same way we evaluated classifiers up to
# now: "accuracy", i.e. the share of correctly classified samples.
scores = cross_val_score(classifier, features, labels, cv=4, scoring="accuracy")

for fold_number, score in enumerate(scores, start=1):
    print("Fold", fold_number, "score:", score)

The `cross_val_score` as used in the previous code example works internally as follows:

- split training data in four chunks
- learn `classifier` on chunk `2, 3, 4`, apply classifier to chunk `1` and compute score `s1`
- learn `classifier` on chunk `1, 3, 4`, apply classifier to chunk `2` and compute score `s2`
- learn `classifier` on chunk `1, 2, 4`, apply classifier to chunk `3` and compute score `s3`
- learn `classifier` on chunk `1, 2, 3`, apply classifier to chunk `4` and compute score `s4`

`cross_val_score` finally returns `[s1, s2, s3, s4]`.

In [None]:
# Summary statistics of the per-fold test scores.
m = scores.mean()
s = scores.std()

# Interval of two standard deviations around the mean.
low = m - 2 * s
high = m + 2 * s

print("mean test score is {:.3f}".format(m))
print("std dev of test score is {:.3f}".format(s))
# and, assuming normality of the scores: mean +/- 2 standard deviations
# cover about 95% (95.4%) of a normal distribution
print(
    "true test score is with 95% probability between {:.3f} and {:.3f}".format(
        low, high
    )
)

## Exercise section

1. Play with the previous examples.
2. Try out different number of cross validation folds for the beer data. What happens with the score?

In [None]:
import pandas as pd

# Use all beer data, cross validation does the splitting.
beer = pd.read_csv("data/beers.csv")
beer_eval = pd.read_csv("data/beers_eval.csv")
all_beer = pd.concat([beer, beer_eval])

from sklearn.utils import shuffle

# Shuffle with a fixed seed, so that the result is reproducible.
all_beer = shuffle(all_beer, random_state=42)

labels = all_beer.iloc[:, -1]
features = all_beer.iloc[:, :-1]

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

classifier = LogisticRegression(C=2)

# Same classifier and data, only the number of folds varies.
fold_counts = [2, 5, 10, 25, 50, 150]
for k in fold_counts:
    scores = cross_val_score(classifier, features, labels, cv=k, scoring="accuracy")
    mean_score, std_score = scores.mean(), scores.std()
    print(f"{k:3d}-fold accuracy score is {mean_score:.3f} +/- {std_score:.3f}")

#
# Q: What happens with the score?
#
# Mean score increases, very slightly from a certain number of folds (here, 25),
# and variance of the score increases significantly.
#
# Intuitively, with very high number of folds models become similar across folds,
# as they fit a big common set of samples, whereas single misclassifications in
# the small testing sets result in much smaller accuracies, increasing variance.
#
#

<div class="alert alert-block alert-info">
<p style="font-weight: bold;"><i class="fa fa-info-circle"></i>&nbsp;Rule of thumb</p>
<p>Prefer 5- or 10-fold cross validation.</p>
</div>

### Optional exercises

1. Split the dataset `data/spiral.csv` in 300 features/labels for training and 100 features/labels for evaluation. Find a good classifier which reaches 100% accuracy on the training samples, then evaluate the trained classifier on the remaining 100 samples.

In [None]:
import pandas as pd
from sklearn.svm import SVC

df = pd.read_csv("data/spiral.csv")
n_train = 300

# The first n_train rows are used for learning, all remaining rows for
# evaluation. In both parts the label is the last column.
learn_rows = df.iloc[:n_train]
eval_rows = df.iloc[n_train:]

features_learn, labels_learn = learn_rows.iloc[:, :-1], learn_rows.iloc[:, -1]
features_eval, labels_eval = eval_rows.iloc[:, :-1], eval_rows.iloc[:, -1]

clf = SVC(gamma=3, C=90)
clf.fit(features_learn, labels_learn)

# Accuracy on the samples the classifier has seen during training ...
predicted = clf.predict(features_learn)
n_correct = sum(predicted == labels_learn)
print(f"training accuracy: {n_correct * 100 / len(predicted):3.1f}%")

# ... and on the samples it has never seen.
predicted = clf.predict(features_eval)
n_correct = sum(predicted == labels_eval)
print(f"testing accuracy: {n_correct * 100 / len(predicted):3.1f}%")

## Some reasons for overfitting and how you might fight it.

###  Small / insufficient data sets.

The classifier fails to "grab the concept" because the "concept" is not represented strongly enough in the data set. 

Possible solutions:

- Get more data.
- Augment your data by creating artificial/synthetic data (e.g. for images: shift / scale / rotate images) if feasible.


### Unsuitable classifier / classifier parameters used

This is what we observed in the example before.

Possible solutions:

- Optimize parameters using cross-validation.

- Evaluate other classification algorithms.

###  Noisy / uninformative features

A classifier can in some situations use noisy or uninformative features to explain noise in the training data. In such cases feature noise contributes to "artificially" good results on the training data.

Possible solutions:

- Use feature selection techniques:<br/><br/>

    - Inspect your data to detect noisy or uninformative features.
        - See e.g. [removing features with low variance in scikit-learn](https://scikit-learn.org/stable/modules/feature_selection.html#removing-features-with-low-variance)<br/><br/>

    - Try learning classifier with some features excluded.
        - This can be automated, see [recursive feature elimination in scikit-learn](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html#sklearn.feature_selection.RFE).
        - Random forest classifiers learn in such a way (more about them later) and hence support feature exclusion directly.<br/><br/>

    - Penalize for using many features (prefer simpler models).
        - So called *sparse* learning methods do that (more about them later) and they can be used only for data pre-processing step, see [L1-based feature selection in scikit-learn](https://scikit-learn.org/stable/modules/feature_selection.html#l1-based-feature-selection)<br/><br/>

- Use dimension reduction techniques like `PCA` (more about this later).

### Strongly correlated / redundant features

In case the data set contains strongly, but not 100% correlated features, their (weighted) difference might be considered as random data. The effect is then similar to having noisy or uninformative features.

Possible solutions:

- Same as for noisy or uninformative features: feature selection or dimension reduction techniques.


The following code demonstrates the effect of noise and redundant features:

In [None]:
beer_data = pd.read_csv("data/beers.csv")

# All columns up to the last one are features, the last one is the label.
# We take explicit copies of the feature tables because new columns are added
# to them below; assigning columns to a slice of another DataFrame may
# trigger pandas' SettingWithCopyWarning.
input_features = beer_data.iloc[:, :-1].copy()
input_labels = beer_data.iloc[:, -1]

eval_data = pd.read_csv("data/beers_eval.csv")

eval_features = eval_data.iloc[:, :-1].copy()
eval_labels = eval_data.iloc[:, -1]


def assess(classifier, input_features, eval_features):
    """Print the accuracy of a fitted classifier on training and evaluation data.

    The true labels are not passed in: they are read from the notebook-level
    variables ``input_labels`` and ``eval_labels`` defined in this cell.

    Parameters
    ----------
    classifier : object
        Already fitted estimator providing ``predict``.
    input_features : DataFrame
        Features of the training samples, row-aligned with ``input_labels``.
    eval_features : DataFrame
        Features of the evaluation samples, row-aligned with ``eval_labels``.
    """
    for dataset_name, dataset_features, true_labels in (
        ("training", input_features, input_labels),
        ("evaluation", eval_features, eval_labels),
    ):
        predicted_labels = classifier.predict(dataset_features)
        accuracy = sum(predicted_labels == true_labels) / len(true_labels) * 100
        print(f"{accuracy:.2f} % labeled correctly on {dataset_name} dataset")


from sklearn.linear_model import LogisticRegression

# The classifier used in this cell is an SVC, so import it here instead of
# relying on an import made by some earlier cell.
from sklearn.svm import SVC

classifier = SVC(C=2, gamma=2)

classifier.fit(input_features, input_labels)

print("ORIGINAL DATA")
assess(classifier, input_features, eval_features)

print()
print("WITH ADDED NOISY FEATURES")
np.random.seed(5)

# Derive the number of samples from the data instead of hard-coding it.
n_input = len(input_features)
n_eval = len(eval_features)

# Extend original data by adding new features:
#
# 1. alcohol_content with some random noise added
# 2. pure random noise
#
# to both training data
input_features["redundant"] = input_features.loc[:, "alcohol_content"] + 1 * (
    np.random.random((n_input,)) - 0.5
)
input_features["noise"] = 0.1 * (np.random.random((n_input,)) - 0.5)
# and evaluation data
eval_features["redundant"] = eval_features.loc[:, "alcohol_content"] + 1 * (
    np.random.random((n_eval,)) - 0.5
)
eval_features["noise"] = 0.1 * (np.random.random((n_eval,)) - 0.5)

# Re-train on the extended features and assess again.
classifier.fit(input_features, input_labels)

assess(classifier, input_features, eval_features)

You can see above that the classifier yields better accuracy on the extended training data set. But you also can see that the performance on the extended evaluation data set is worse than before.



<div class="alert alert-block alert-info">
<p style="font-weight: bold;"><i class="fa fa-info-circle"></i>&nbsp;About applicability to regression</p>

<p>We're talking here about overfitting, underfitting and cross-validation in context of classification/classifiers, but these problems or methods, and related workarounds, apply in general to supervised learning methods, so also to regression methods about which we will learn later on.</p>
</div>

## Training the final classifier

Cross-validation was helpful to determine and tune a good classifier. But how do we eventually build the classifier we want to use later "in production" ?

A common procedure is:

- Split your data 80% to 20% (or another fraction) from the beginning.


- Use the 80% fraction for determining and tuning a classifier.


- Train the final classifier on the 80% part.


- Finally use the 20% fraction for a final validation of the classifier's accuracy.

<img src="./images/cross_eval_and_test.svg">

Comment: Literature is not consistent in terms. Sometimes the terms "validation data set" and "test data set" are interchanged.

### Demonstration

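We demonstrate what we've explained before using scikit-learn's handwritten digits dataset (a small collection of 8 x 8 pixel images of digits, similar in spirit to the well-known MNIST dataset).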

In [None]:
import numpy as np
from sklearn.datasets import load_digits
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.svm import SVC

digits = load_digits()
print("digits data set shape:", digits.images.shape)

labels = digits.target
n_samples = len(labels)

# Flatten every 8 x 8 image into one row of 64 values:
# N_SAMPLES x 8 x 8  ->  N_SAMPLES x 64
features = digits.images.reshape(n_samples, -1)
print("feature matrix shape:", features.shape)

In [None]:
import matplotlib.pyplot as plt

# The first sample as 8 x 8 image ...
plt.imshow(digits.images[0], cmap="gray")
plt.title(f"image for figure {labels[0]}")
plt.axis("off")

# ... and, in a separate figure, the same sample as one flattened row.
fig = plt.figure(figsize=(12, 4))
ax = plt.imshow(features[0].reshape(1, -1), cmap="gray")
plt.axis("off")
plt.title("image flattened");

We introduce the `train_test_split` function from `sklearn.model_selection` in the following example.

It splits features and labels in a given proportion. Usually this is randomized, so that you get different results for every function invocation. To get the same result every time we use `random_state=..` (with arbitrary number) below:

In [None]:
# SHUFFLE AND SPLIT DATA 80:20
# with fixed randomization
from sklearn.model_selection import train_test_split

# Notes:
# - shuffling is the default of `train_test_split` (`shuffle=True`), so we do
#   not need to request it explicitly
# - `stratify=labels` preserves the class proportions of the original dataset
#   in both parts of the split
split = train_test_split(
    features, labels, test_size=0.2, stratify=labels, random_state=42
)
features_crosseval, features_validation, labels_crosseval, labels_validation = split


def report(labels):
    """Print the sample count and the share of each of the classes 0 to 9.

    Parameters
    ----------
    labels : array-like
        Class labels; has to support an elementwise ``labels == number``
        comparison, as NumPy arrays do.
    """
    n_total = len(labels)
    print(f"number of all samples: {n_total}")
    for number in range(10):
        # share of samples labeled `number`, rounded to three decimals
        proportion = round(sum(labels == number) / n_total, 3)
        print(f"proportion of images for class {number}: {proportion}")
    print()


# Class proportions of the full data and of both parts of the split.
for heading, subset_labels in (
    ("# Whole dataset ", labels),
    ("# Cross-validation dataset ", labels_crosseval),
    ("# Validation dataset ", labels_validation),
):
    print(heading)
    report(subset_labels)

As you can see the splits maintained the distribution of all classes `0` to `9`.



Moreover, we introduce the explicit specification of a cross-validation method: `StratifiedKFold` from `sklearn.model_selection`. 

`StratifiedKFold` allows us to split data during cross validation in the same way as we did with `train_test_split`, i.e. 

1. with data shuffling before the split, and 
2. **preserving class-proportions of samples**. 



In [None]:
from sklearn.model_selection import StratifiedKFold

# 10 folds, shuffled with a fixed seed before splitting.
cross_validator = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

C, gamma = 1, 0.1
classifier = SVC(C=C, gamma=gamma)

# `cv` now receives the splitter object instead of a number of folds.
test_scores = cross_val_score(
    classifier,
    features_crosseval,
    labels_crosseval,
    cv=cross_validator,
    scoring="accuracy",
)
mean_score, std_score = test_scores.mean(), test_scores.std()
print(
    f"score = {mean_score:.3f} +/- {std_score:.3f}, C = {C:.1e},  gamma = {gamma:.1e}"
)

We can now try to use this approach to tune the **hyper-parameters** `C` and `gamma` of the `SVC` classifier.

Remember:
1. A classifier learns parameters
2. Hyper-parameters control how a classifier learns.

In [None]:
# FIND A "BEST" CLASSIFIER
# with fixed randomization

# Called with an integer, `cross_val_score(.., cv=n)` splits without
# shuffling (for classifiers it implicitly uses `StratifiedKFold(n_splits=n)`,
# otherwise `KFold(n_splits=n)`). We pass an explicit, shuffling splitter.
from sklearn.model_selection import StratifiedKFold

cross_validator = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

print("OPTIMIZE HYPERPARAMETERS")
# grid of classifier hyperparameters to try
SVC_C_values = (1e-1, 1, 10)
SVC_gamma_values = (0.0001, 0.001, 0.01, 0.1)

# one (mean score, std dev of score, C, gamma) tuple per grid point
results = []

for C in SVC_C_values:
    for gamma in SVC_gamma_values:
        classifier = SVC(C=C, gamma=gamma)
        test_scores = cross_val_score(
            classifier,
            features_crosseval,
            labels_crosseval,
            cv=cross_validator,
            scoring="accuracy",
        )
        mean_score, std_score = test_scores.mean(), test_scores.std()
        print(
            f"score = {mean_score:.3f} +/- {std_score:.3f}, "
            f"C = {C:.1e},  gamma = {gamma:.1e}"
        )
        results.append((mean_score, std_score, C, gamma))

# Tuples are compared entry by entry, starting with the first one. Thus `max`
# picks the tuple with the highest mean test score.
best_result = max(results)
best_score_mean, best_score_std, best_C, best_gamma = best_result

print()
print("BEST RESULT CROSS VALIDATION")
print(
    f"score = {best_score_mean:.3f} +/- {best_score_std:.3f}, "
    f"C = {best_C:.1e},  gamma = {best_gamma:.1e}"
)

Finally we evaluate our tuned classifier on the validation data set:

In [None]:
# EVALUATE CLASSIFIER ON VALIDATION DATASET

# Train a fresh classifier with the best hyperparameters found above on the
# complete cross-validation part ...
classifier = SVC(C=best_C, gamma=best_gamma)
classifier.fit(features_crosseval, labels_crosseval)

# ... and score it on the validation part, which was not touched so far.
predicted = classifier.predict(features_validation)
n_correct = sum(predicted == labels_validation)
final_accuracy = n_correct / len(labels_validation)

print("VALIDATION")
print(f"score = {final_accuracy:.3f}")

## Exercise section 

1. Run the previous examples.

2. Can you determine a better pair of `C` and `gamma`? Change folds number to `5` and `20`.

In [None]:
# grid of classifier hyperparameters to try, identical for every fold count
SVC_C_values = np.arange(10, 110, 20)
SVC_gamma_values = np.arange(0.75, 1.5, 0.25)

for k in (5, 10, 20):

    print()
    print()
    print(f"#### {k} folds")

    cross_validator = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

    print("OPTIMIZE HYPERPARAMETERS")

    # one (mean score, std dev of score, C, gamma) tuple per grid point
    results = []

    for C in SVC_C_values:
        for gamma in SVC_gamma_values:
            classifier = SVC(C=C, gamma=gamma)
            test_scores = cross_val_score(
                classifier,
                features_crosseval,
                labels_crosseval,
                cv=cross_validator,
                scoring="accuracy",
            )
            mean_score, std_score = test_scores.mean(), test_scores.std()
            print(
                f"score = {mean_score:.3f} +/- {std_score:.3f}, "
                f"C = {C:5.1f},  gamma = {gamma:4.2f}"
            )
            results.append((mean_score, std_score, C, gamma))

    # Tuples are compared entry by entry, starting with the first one. Thus
    # `max` picks the tuple with the highest mean test score.
    best_result = max(results)
    best_score_mean, best_score_std, best_C, best_gamma = best_result

    print()
    print("BEST RESULT CROSS VALIDATION")
    print(
        f"score = {best_score_mean:.3f} +/- {best_score_std:.3f}, "
        f"C = {best_C:.1f},  gamma = {best_gamma:.2f}"
    )

    # EVALUATE CLASSIFIER ON VALIDATION DATASET

    classifier = SVC(C=best_C, gamma=best_gamma)
    classifier.fit(features_crosseval, labels_crosseval)

    predicted = classifier.predict(features_validation)
    n_correct = sum(predicted == labels_validation)
    final_accuracy = n_correct / len(labels_validation)

    print()
    print("VALIDATION")
    print(f"score = {final_accuracy:.3f}")

<div class="alert alert-block alert-info">
<p style="font-weight: bold;"><i class="fa fa-info-circle"></i>&nbsp; Recommendation</p>

<p>The result of cross validation depends on the order of the data set, the validation data set and number of cross validation folds.

As a consequence we might find different optimal settings for a classifier.

So don't try to squeeze out minimal performance improvements!</p>
</div>

### Optional exercises

1. Run cross-validation for the `LogisticRegression` applied to the beer data set. Try different `C` and `penalty` values. To use the `l1` penalty you must change the solver to `liblinear`, e.g. `LogisticRegression(..., solver="liblinear")`.

2. Run cross-validation for the `SVC` classifier applied to the `"data/spiral.csv"` data set. Try different `C` and `gamma` values.

3. Implement same strategy for the iris data set introduced in script 1 (`sklearn.datasets.load_iris`).

In [None]:
# SOLUTION
# 1.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

# Use all beer data: features first, label in the last column.
beer = pd.read_csv("data/beers.csv")
beer_eval = pd.read_csv("data/beers_eval.csv")
all_beer = pd.concat([beer, beer_eval])

labels = all_beer.iloc[:, -1]
features = all_beer.iloc[:, :-1]

# Stratified 80:20 split with fixed randomization.
split = train_test_split(
    features, labels, test_size=0.2, stratify=labels, random_state=42
)
features_crosseval, features_validation, labels_crosseval, labels_validation = split

# TRY random_state 43 OR 1 INSTEAD OF 42!

cross_validator = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

print("OPTIMIZE SETTINGS")

# one (mean score, std dev of score, C, penalty) tuple per setting
results = []

for penalty in ("l1", "l2"):
    for C in (1, 10, 100, 1000):
        classifier = LogisticRegression(C=C, penalty=penalty, solver="liblinear")
        test_scores = cross_val_score(
            classifier,
            features_crosseval,
            labels_crosseval,
            cv=cross_validator,
            scoring="accuracy",
        )
        mean_score, std_score = test_scores.mean(), test_scores.std()
        print(
            f"score = {mean_score:.3f} +/- {std_score:.3f}, "
            f"C = {C:6.1f},  penalty = {penalty}"
        )
        results.append((mean_score, std_score, C, penalty))

# Tuples are compared entry by entry, so `max` looks at the mean score first.
best_result = max(results)
best_score_mean, best_score_std, best_C, best_penalty = best_result

print()
print("BEST RESULT CROSS VALIDATION")
print(
    f"score = {best_score_mean:.3f} +/- {best_score_std:.3f}, "
    f"C = {best_C:6.1f},  penalty = {best_penalty}"
)


# Train with the best settings on the cross-validation part and score the
# result on the validation part.
classifier = LogisticRegression(C=best_C, penalty=best_penalty, solver="liblinear")
classifier.fit(features_crosseval, labels_crosseval)

predicted = classifier.predict(features_validation)
n_correct = sum(predicted == labels_validation)
final_accuracy = n_correct / len(labels_validation)

print()
print("VALIDATION")
print(f"score = {final_accuracy:.3f}")

# There is some diff in validation score. Looks like LogisticRegression is not
# as robust as SVC for beer data.

In [None]:
# SOLUTION
# 2.
# Spiral data: features first, label in the last column.
df = pd.read_csv("data/spiral.csv")

labels = df.iloc[:, -1]
features = df.iloc[:, :-1]

# Stratified 80:20 split with fixed randomization.
split = train_test_split(
    features, labels, test_size=0.2, stratify=labels, random_state=42
)
features_crosseval, features_validation, labels_crosseval, labels_validation = split

cross_validator = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

print("OPTIMIZE SETTINGS")

SVC_C_values = (10, 20, 100, 500)  # (5, 10, 20, 30, 35, 40)
SVC_gamma_values = (0.1, 1, 10)  # (2, 3, 4, 5)

# one (mean score, std dev of score, C, gamma) tuple per grid point
results = []

for C in SVC_C_values:
    for gamma in SVC_gamma_values:
        classifier = SVC(C=C, gamma=gamma)
        test_scores = cross_val_score(
            classifier,
            features_crosseval,
            labels_crosseval,
            cv=cross_validator,
            scoring="accuracy",
        )
        mean_score, std_score = test_scores.mean(), test_scores.std()
        print(
            f"score = {mean_score:.3f} +/- {std_score:.3f}, "
            f"C = {C:5.1f},  gamma = {gamma:5.1f}"
        )
        results.append((mean_score, std_score, C, gamma))

# Tuples are compared entry by entry, so `max` looks at the mean score first.
best_result = max(results)
best_score_mean, best_score_std, best_C, best_gamma = best_result

print()
print("BEST RESULT CROSS VALIDATION")
print(
    f"score = {best_score_mean:.3f} +/- {best_score_std:.3f}, "
    f"C = {best_C:.1f},  gamma = {best_gamma:.1f}"
)

# Train with the best settings on the cross-validation part and score the
# result on the validation part.
classifier = SVC(C=best_C, gamma=best_gamma)
classifier.fit(features_crosseval, labels_crosseval)

predicted = classifier.predict(features_validation)
n_correct = sum(predicted == labels_validation)
final_accuracy = n_correct / len(labels_validation)

print()
print("VALIDATION")
print(f"score = {final_accuracy:.3f}")

In [None]:
# SOLUTION
# 3.
from sklearn.datasets import load_iris

data = load_iris()

labels = data.target
features = data.data

# Stratified 80:20 split with fixed randomization.
split = train_test_split(
    features, labels, test_size=0.2, stratify=labels, random_state=42
)
features_crosseval, features_validation, labels_crosseval, labels_validation = split

cross_validator = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

print("OPTIMIZE SETTINGS")

SVC_C_values = (1, 2, 5, 10)
SVC_gamma_values = (0.5, 1, 2)

# one (mean score, std dev of score, C, gamma) tuple per grid point
results = []

for C in SVC_C_values:
    for gamma in SVC_gamma_values:
        classifier = SVC(C=C, gamma=gamma)
        test_scores = cross_val_score(
            classifier,
            features_crosseval,
            labels_crosseval,
            cv=cross_validator,
            scoring="accuracy",
        )
        mean_score, std_score = test_scores.mean(), test_scores.std()
        print(
            f"score = {mean_score:.3f} +/- {std_score:.3f}, "
            f"C = {C:5.1f},  gamma = {gamma:5.1f}"
        )
        results.append((mean_score, std_score, C, gamma))

# Tuples are compared entry by entry, so `max` looks at the mean score first.
best_result = max(results)
best_score_mean, best_score_std, best_C, best_gamma = best_result

print()
print("BEST RESULT CROSS VALIDATION")
print(
    f"score = {best_score_mean:.3f} +/- {best_score_std:.3f}, "
    f"C = {best_C:.1f},  gamma = {best_gamma:.1f}"
)

# Train with the best settings on the cross-validation part and score the
# result on the validation part.
classifier = SVC(C=best_C, gamma=best_gamma)
classifier.fit(features_crosseval, labels_crosseval)

predicted = classifier.predict(features_validation)
n_correct = sum(predicted == labels_validation)
final_accuracy = n_correct / len(labels_validation)

print()
print("VALIDATION")
print(f"score = {final_accuracy:.3f}")

# Here, SVC is robust

Copyright (C) 2019-2022 ETH Zurich, SIS ID