<a href="https://colab.research.google.com/github/stepthom/869_course/blob/main/classification/slides_classification_cv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Classification Cross Validation

- Stephen W. Thomas
- Used for MMA 869, MMAI 869, and GMMA 869

In [1]:
import datetime

# Record the wall-clock time at which this notebook run was started.
run_started_at = datetime.datetime.now()
print(run_started_at)

2022-05-11 19:02:04.538578


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold, ShuffleSplit
from sklearn.pipeline import Pipeline

In [3]:
import sklearn

# Report the installed scikit-learn version so results can be tied to it.
version_message = 'The scikit-learn version is {}.'.format(sklearn.__version__)
print(version_message)

The scikit-learn version is 1.0.2.


# Load Data

In [4]:
# Load the German Credit CSV directly from the course's GitHub repository.
df = pd.read_csv('https://raw.githubusercontent.com/stepthom/869_course/main/data/GermanCredit.csv')

# Encode the target as integers: 'Good' -> 1, 'Bad' -> 0.
# Series.map() with a dict turns every label that is not a key into NaN, so
# check for that explicitly instead of letting NaN targets reach the models.
class_codes = df['Class'].map({'Good': 1, 'Bad': 0})
unmapped = class_codes.isna()
if unmapped.any():
    raise ValueError(
        "Unexpected values in 'Class' column: {}".format(
            sorted(df.loc[unmapped, 'Class'].astype(str).unique())
        )
    )
df['Class'] = class_codes

# Column summary first (printed), then the first rows as the cell's rich output.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 62 columns):
 #   Column                                  Non-Null Count  Dtype
---  ------                                  --------------  -----
 0   Duration                                1000 non-null   int64
 1   Amount                                  1000 non-null   int64
 2   InstallmentRatePercentage               1000 non-null   int64
 3   ResidenceDuration                       1000 non-null   int64
 4   Age                                     1000 non-null   int64
 5   NumberExistingCredits                   1000 non-null   int64
 6   NumberPeopleMaintenance                 1000 non-null   int64
 7   Telephone                               1000 non-null   int64
 8   ForeignWorker                           1000 non-null   int64
 9   Class                                   1000 non-null   int64
 10  CheckingAccountStatus.lt.0              1000 non-null   int64
 11  CheckingAccountSta

Unnamed: 0,Duration,Amount,InstallmentRatePercentage,ResidenceDuration,Age,NumberExistingCredits,NumberPeopleMaintenance,Telephone,ForeignWorker,Class,...,OtherInstallmentPlans.Bank,OtherInstallmentPlans.Stores,OtherInstallmentPlans.None,Housing.Rent,Housing.Own,Housing.ForFree,Job.UnemployedUnskilled,Job.UnskilledResident,Job.SkilledEmployee,Job.Management.SelfEmp.HighlyQualified
0,6,1169,4,4,67,2,1,0,1,1,...,0,0,1,0,1,0,0,0,1,0
1,48,5951,2,2,22,1,1,1,1,0,...,0,0,1,0,1,0,0,0,1,0
2,12,2096,2,3,49,1,2,1,1,1,...,0,0,1,0,1,0,0,1,0,0
3,42,7882,2,4,45,1,2,1,1,1,...,0,0,1,0,0,1,0,0,1,0
4,24,4870,3,4,53,2,2,1,1,0,...,0,0,1,0,0,1,0,0,1,0


In [5]:
# Separate the frame into a target vector (the 'Class' column) and a feature
# matrix (every other column), both as plain NumPy arrays.
target_column = 'Class'
y = df[target_column].to_numpy()
X = df.drop(columns=[target_column]).to_numpy()

# Defining and Assessing Candidate Pipelines

In [6]:
# Candidate 1: an entropy-based decision tree (max depth 5) with no
# preprocessing steps in front of it.
clf1 = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=5,
    min_samples_split=5,
    random_state=0,
)
pipe1 = Pipeline([("clf", clf1)])

# Evaluate with cv=10, scoring each fold by macro-averaged F1.
scores = cross_val_score(pipe1, X, y, scoring="f1_macro", cv=10)

# Print the per-fold scores to three decimals, then their mean and std.
three_decimals = {'float': '{: 0.3f}'.format}
with np.printoptions(formatter=three_decimals):
    print("CV scores: {}".format(scores))
print("Estimated F1 Score: {:0.3f} +/- {:0.2f}".format(scores.mean(), scores.std()))

CV scores: [ 0.682  0.600  0.597  0.655  0.566  0.543  0.667  0.693  0.505  0.592]
Estimated F1 Score: 0.610 +/- 0.06


In [7]:
# Candidate 2: a deeper entropy-based decision tree (max depth 10), fed by a
# default SimpleImputer followed by a StandardScaler.
clf2 = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=10,
    min_samples_split=5,
    random_state=0,
)
pipe2 = Pipeline([
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler()),
    ("clf", clf2),
])

# Evaluate with cv=10, scoring each fold by macro-averaged F1.
scores = cross_val_score(pipe2, X, y, scoring="f1_macro", cv=10)

# Print the per-fold scores to three decimals, then their mean and std.
three_decimals = {'float': '{: 0.3f}'.format}
with np.printoptions(formatter=three_decimals):
    print("CV scores: {}".format(scores))
print("Estimated F1 Score: {:0.3f} +/- {:0.2f}".format(scores.mean(), scores.std()))

CV scores: [ 0.632  0.705  0.679  0.670  0.536  0.566  0.611  0.693  0.595  0.594]
Estimated F1 Score: 0.628 +/- 0.05


In [8]:
# Candidate 3: a random forest with library-default hyperparameters (only the
# seed is fixed), fed by a default SimpleImputer followed by a StandardScaler.
clf3 = RandomForestClassifier(random_state=0)
pipe3 = Pipeline([
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler()),
    ("clf", clf3),
])

# Evaluate with cv=10, scoring each fold by macro-averaged F1.
scores = cross_val_score(pipe3, X, y, scoring="f1_macro", cv=10)

# Print the per-fold scores to three decimals, then their mean and std.
three_decimals = {'float': '{: 0.3f}'.format}
with np.printoptions(formatter=three_decimals):
    print("CV scores: {}".format(scores))
print("Estimated F1 Score: {:0.3f} +/- {:0.2f}".format(scores.mean(), scores.std()))

CV scores: [ 0.752  0.616  0.608  0.662  0.699  0.602  0.608  0.736  0.714  0.689]
Estimated F1 Score: 0.669 +/- 0.05


# Comparing Different Validation Techniques

In [9]:
# Score every candidate pipeline under three validation strategies, each at
# K = 2, 5 and 10, and print mean +/- std of the macro-F1 scores.
pipes = [("pipe1", pipe1), ("pipe2", pipe2), ("pipe3", pipe3)]
fold_counts = [2, 5, 10]

# Each strategy pairs its report line with a factory that turns K into the
# `cv` argument handed to cross_val_score:
#   * plain integer K
#   * RepeatedKFold: K folds, repeated 10 times with a fixed seed
#   * ShuffleSplit: here K is the number of random splits (n_splits)
# NOTE(review): per the scikit-learn docs an integer `cv` is stratified for
# classifiers, whereas RepeatedKFold and ShuffleSplit are not - confirm that
# this difference between the strategies is intended.
strategies = [
    ("{}, K={:3d} CV, F1 Score: {:0.3f} +/- {:0.2f}",
     lambda n: n),
    ("{}, K={:3d} CV Repeated, F1 Score: {:0.3f} +/- {:0.2f}",
     lambda n: RepeatedKFold(n_splits=n, n_repeats=10, random_state=62)),
    ("{}, K={:3d} Shuffle, F1 Score: {:0.3f} +/- {:0.2f}",
     lambda n: ShuffleSplit(n_splits=n, random_state=62)),
]

for pipe_name, pipe in pipes:
    for template, make_cv in strategies:
        for k in fold_counts:
            splitter = make_cv(k)
            scores = cross_val_score(pipe, X, y, cv=splitter, scoring="f1_macro")
            print(template.format(pipe_name, k, np.mean(scores), np.std(scores)))



pipe1, K=  2 CV, F1 Score: 0.577 +/- 0.01
pipe1, K=  5 CV, F1 Score: 0.616 +/- 0.03
pipe1, K= 10 CV, F1 Score: 0.610 +/- 0.06
pipe1, K=  2 CV Repeated, F1 Score: 0.601 +/- 0.04
pipe1, K=  5 CV Repeated, F1 Score: 0.595 +/- 0.05
pipe1, K= 10 CV Repeated, F1 Score: 0.593 +/- 0.06
pipe1, K=  2 Shuffle, F1 Score: 0.594 +/- 0.02
pipe1, K=  5 Shuffle, F1 Score: 0.587 +/- 0.04
pipe1, K= 10 Shuffle, F1 Score: 0.594 +/- 0.04
pipe2, K=  2 CV, F1 Score: 0.627 +/- 0.02
pipe2, K=  5 CV, F1 Score: 0.642 +/- 0.03
pipe2, K= 10 CV, F1 Score: 0.628 +/- 0.05
pipe2, K=  2 CV Repeated, F1 Score: 0.622 +/- 0.03
pipe2, K=  5 CV Repeated, F1 Score: 0.621 +/- 0.04
pipe2, K= 10 CV Repeated, F1 Score: 0.618 +/- 0.05
pipe2, K=  2 Shuffle, F1 Score: 0.569 +/- 0.03
pipe2, K=  5 Shuffle, F1 Score: 0.578 +/- 0.02
pipe2, K= 10 Shuffle, F1 Score: 0.590 +/- 0.05
pipe3, K=  2 CV, F1 Score: 0.659 +/- 0.01
pipe3, K=  5 CV, F1 Score: 0.657 +/- 0.02
pipe3, K= 10 CV, F1 Score: 0.669 +/- 0.05
pipe3, K=  2 CV Repeated, F1 Score