In [1]:
# import for data frame
import pandas as pd

# import for the one supervised learning model
from sklearn.linear_model import LogisticRegression

# import for all cross validation strategy
from sklearn.model_selection import cross_val_score

# import for stratified k-fold
from sklearn.model_selection import StratifiedKFold

# import for leave-one-out cross validation
from sklearn.model_selection import LeaveOneOut

# import for shuffle split cross validation
from sklearn.model_selection import ShuffleSplit

In [2]:
# Read the red-wine quality data set into a DataFrame.
# sep=";" is required because this CSV separates its fields with semicolons.
data = pd.read_csv(
    filepath_or_buffer="winequality-red.csv",
    sep=";",
)
# Preview the first five rows to confirm the columns were parsed as expected.
data.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [3]:
# Target vector y: the 'quality' column on its own.
y = data['quality']
# Feature matrix X: every column of the frame except 'quality'.
X = data.drop('quality', axis=1)

# Show both shapes, one per line, to confirm X and y have the same number of rows.
print(X.shape, y.shape, sep="\n")

(1599, 11)
(1599,)


In [4]:
# LOGISTIC REGRESSION CLASSIFIER
# This single model instance is the estimator passed to every cross_val_score
# call in the notebook.
# The liblinear solver is selected, and max_iter is raised to 1000 so that the
# fit does not emit a convergence warning.
logreg = LogisticRegression(
    max_iter=1000,
    solver='liblinear',
)

Standard k-fold cross validation with default folds

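Parameters: the logreg model, the feature matrix X and target y, and accuracy as the scoring method. No cv value is passed, so scikit-learn's default number of folds is used. Note: because logreg is a classifier, scikit-learn builds these folds with StratifiedKFold (without shuffling) rather than plain KFold.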

In [5]:
# K-FOLD CROSS VALIDATION WITH THE DEFAULT NUMBER OF FOLDS
# No cv argument is passed, so cross_val_score falls back to its default.
# The book mentions a default of 3 folds, however scikit-learn uses 5
# (https://scikit-learn.org/dev/modules/generated/sklearn.model_selection.cross_val_score.html).
# Per the same page, an integer/None cv combined with a classifier produces
# stratified folds (StratifiedKFold), not plain KFold.
scores = cross_val_score(estimator=logreg, X=X, y=y, scoring='accuracy')
# Report the accuracy averaged over all folds.
print(
    "Default k-fold Cross-validation Mean score: {}".format(scores.mean())
)

Default k-fold Cross-validation Mean score: 0.5691183385579938


Standard k-fold cross validation with 10 folds

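Parameters: the logreg model, the feature matrix X and target y, cv=10 to set k to 10 folds, and accuracy as the scoring method. As with any integer cv on a classifier, scikit-learn stratifies these folds by class.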

In [6]:
# K-FOLD CROSS VALIDATION WITH 10 FOLDS CHOSEN EXPLICITLY
# cv=10 requests ten folds instead of the library default. With a classifier,
# scikit-learn turns an integer cv into stratified folds (StratifiedKFold).
scores = cross_val_score(estimator=logreg, X=X, y=y, scoring='accuracy', cv=10)
# Report the accuracy averaged over the ten folds.
print(
    "Choosing 10-fold Cross-validation Mean score: {}".format(scores.mean())
)

Choosing 10-fold Cross-validation Mean score: 0.5759787735849057


Stratified k fold cross validation with 10 folds

The skf object holds a StratifiedKFold splitter: n_splits=10 sets k, shuffle=True shuffles the samples before they are split into folds, and random_state=0 fixes that shuffle for reproducibility.

Parameters: the logreg model, the feature matrix X and target y, cv=skf to split with the StratifiedKFold object, and accuracy as the scoring method.

In [7]:
# STRATIFIED K-FOLD CROSS VALIDATION WITH 10 FOLDS
# syntax obtained from https://www.geeksforgeeks.org/stratified-k-fold-cross-validation/
# The splitter shuffles the samples before forming the 10 folds; random_state=0
# pins that shuffle so reruns produce the same folds.
skf = StratifiedKFold(
    n_splits=10,
    random_state=0,
    shuffle=True,
)
scores = cross_val_score(estimator=logreg, X=X, y=y, scoring='accuracy', cv=skf)
# Report the accuracy averaged over the ten folds.
print(
    "Stratified k-fold Cross-validation Mean score: {}".format(scores.mean())
)

Stratified k-fold Cross-validation Mean score: 0.5791037735849057


Leave one out cross validation

Parameters: the logreg model, the feature matrix X and target y, cv=loo to split with the LeaveOneOut object, and accuracy as the scoring method.

In [8]:
# LEAVE-ONE-OUT CROSS VALIDATION
# A LeaveOneOut splitter is passed as cv, so the model is scored on one
# held-out sample at a time; the mean below averages those per-sample scores.
loo = LeaveOneOut()
scores = cross_val_score(estimator=logreg, X=X, y=y, scoring='accuracy', cv=loo)
print(
    "Leave-one-out Cross-validation Mean score: {}".format(scores.mean())
)

Leave-one-out Cross-validation Mean score: 0.5784865540963102


Shuffle split cross validation

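The shuffle_split object is a ShuffleSplit splitter: n_splits=10 sets the number of random train/test splits, each split uses 50% of the samples for training and 50% for testing, and random_state=0 makes the splits reproducible.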

Parameters: the logreg model, the feature matrix X and target y, cv=shuffle_split to supply the number of splits and the train/test sizes, and accuracy as the scoring method.

In [9]:
# SHUFFLE-SPLIT CROSS VALIDATION
# Ten random splits, each using half of the samples for training and half
# for testing; random_state=0 keeps the splits the same on every run.
shuffle_split = ShuffleSplit(
    n_splits=10,
    train_size=0.5,
    test_size=0.5,
    random_state=0,
)
scores = cross_val_score(estimator=logreg, X=X, y=y, scoring='accuracy', cv=shuffle_split)
# Report the accuracy averaged over the ten splits.
print(
    "Shuffle-split Cross-validation Mean score: {}".format(scores.mean())
)

Shuffle-split Cross-validation Mean score: 0.592125


In [12]:
#import for reproducible k-fold cross validation results
from sklearn.model_selection import KFold

In [18]:
# K-FOLD CROSS VALIDATION WITH 5 SHUFFLED, REPRODUCIBLE FOLDS
# n_splits=5 gives five folds; shuffle=True shuffles the samples before they are
# split, and random_state=0 fixes that shuffle so the results are reproducible.
kfold = KFold(n_splits=5, shuffle=True, random_state=0)
# scoring='accuracy' is spelled out to match the other cross_val_score calls in
# this notebook. For a classifier the default scorer is also accuracy, so the
# printed values are unchanged.
# Unlike the other cells, this one prints the individual per-fold scores
# rather than their mean.
print("Choosing 5-fold Cross-validation scores: {}".format(
    cross_val_score(logreg, X, y, cv=kfold, scoring='accuracy')))

Choosing 5-fold Cross-validation scores: [0.628125   0.603125   0.534375   0.575      0.56112853]


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=9881b3a1-451c-4191-9eb2-185d7f860490' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>