In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

In [None]:
# Seed passed as random_state wherever the notebook needs reproducible randomness
SEED: int = 42
# Number of cross-validation folds
FOLDS: int = 5

### Load data and split into train and test sets
Use the Breast Cancer dataset, where the features are used to predict whether a tumor is malignant or benign.
Check for class imbalance, then split the data into training and test sets.

In [None]:
# Load the features as a DataFrame and the binary target as a Series
X, y = load_breast_cancer(return_X_y=True, as_frame=True)

# Class balance check: count the samples per label and report the smaller share
num_class_0 = y.eq(0).sum()
num_class_1 = y.eq(1).sum()
print('Number of data points belonging to class 0:', num_class_0)
print('Number of data points belonging to class 1:', num_class_1)
minority_class = min(num_class_0, num_class_1)
print(f'''
Data has {(minority_class / X.shape[0]):.2%} in the minority class.
''')

# Hold out 25% of the rows as the test set; seeded so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
  X, y, test_size=0.25, random_state=SEED
)

# Dataset dimensions, followed by per-feature summary statistics as cell output
print(X.shape)
X.describe()

### Prepare cross validation and models
Since the dataset is small, we use only 5 folds.
All models are applied with default hyperparameters for simplicity.
Quick description of models used:
- Ridge Regression: linear model with an L2 penalty (regularization) to reduce overfitting
- Support Vector Machine: finds hyperplane that maximizes margin between classes
- Decision Tree: applies decision rules based on features to split data points into groups
- Random Forest: aggregates results from multiple decision trees on randomly selected subsets of data
- K-Nearest Neighbors: classifies data points based on *k* nearest neighbors
- Gaussian Naive Bayes: Naive Bayes with the assumption that the features within each class follow a Gaussian distribution

In [None]:
# Shuffled K-fold splitter; seeding it keeps the fold assignment reproducible
kf = KFold(n_splits=FOLDS, shuffle=True, random_state=SEED)
# Feature standardizer (zero mean, unit variance per feature)
scaler = StandardScaler()
# Candidate classifiers keyed by the display name used in printouts and the plot legend
models = {
  "Ridge Regression": RidgeClassifier(),
  "SVM": SVC(),
  # Tree-based models are randomized; seed them so that a fresh
  # Restart & Run All reproduces the same accuracies and plot.
  "Decision Tree": DecisionTreeClassifier(random_state=SEED),
  "Random Forest": RandomForestClassifier(random_state=SEED),
  "KNN": KNeighborsClassifier(),
  "GaussianNB": GaussianNB(),
}

#### Cross-validation function
scikit-learn also provides `cross_validate` and `cross_val_score` (in `sklearn.model_selection`) for this purpose; the function below is a manual implementation.

In [None]:
def cross_validate(model, kfold, data, labels, scaler=None) -> list:
  """Run K-fold cross-validation and return the validation score of each fold.

  For every split the scaler is fitted on the training fold only and then
  applied unchanged to the validation fold, so no statistics of the
  validation data leak into preprocessing.

  Args:
    model: estimator exposing ``fit(X, y)`` and ``score(X, y)``. It is
      re-fitted in place on every fold.
    kfold: splitter exposing ``split(data, labels)`` that yields
      ``(train_indices, validation_indices)`` pairs of positional indices.
    data: feature table supporting positional ``.iloc`` indexing.
    labels: target values supporting positional ``.iloc`` indexing.
    scaler: optional transformer exposing ``fit_transform`` and ``transform``.
      Defaults to a fresh ``StandardScaler`` so that no module-level scaler
      object is mutated.

  Returns:
    List with one ``model.score`` value per fold, in split order (for
    scikit-learn classifiers this is the mean accuracy).
  """
  if scaler is None:
    scaler = StandardScaler()

  accuracies = []
  for tr_index, val_index in kfold.split(data, labels):
    X_tr, y_tr = data.iloc[tr_index], labels.iloc[tr_index]
    X_val, y_val = data.iloc[val_index], labels.iloc[val_index]

    # Preprocess after splitting to prevent data leakage: learn the scaling
    # statistics from the training fold only and reuse them for validation.
    X_tr = scaler.fit_transform(X_tr)
    X_val = scaler.transform(X_val)

    model.fit(X_tr, y_tr)
    accuracies.append(model.score(X_val, y_val))

  return accuracies

### Perform cross validation and plot results
The metric used is accuracy. Afterwards, pick the best-performing model.

In [None]:
# Cross-validate every candidate model on the training set, report its mean
# accuracy, and draw one accuracy-per-fold line per model on a shared axes.
fig, ax = plt.subplots(figsize=(10, 6))
fold_ids = range(FOLDS)
for name, estimator in models.items():
  fold_scores = cross_validate(estimator, kf, X_train, y_train)
  print(f"Mean accuracy for {name}:", np.mean(fold_scores))
  ax.plot(fold_ids, fold_scores, label=name)
ax.set(
  title="Accuracies of each fold across different classifiers",
  xlabel="Folds",
  ylabel="Accuracy",
)
ax.legend()
ax.grid(True)
plt.show()

### Hyperparameter optimization
Based on the performance of the previous cross validation, we assume SVM to be the best model.

In [None]:
# Chain scaling and the SVM so that scaling is fitted as part of the estimator
pipeline = Pipeline(steps=[('scaler', scaler), ('model', models['SVM'])])
# Search grid; the 'model__' prefix routes each parameter to the 'model' step
parameters = {
  'model__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
  'model__C': [0.1, 1, 10],
}
# Exhaustive search over the grid, scored by accuracy with FOLDS-fold CV
clf = GridSearchCV(
  estimator=pipeline,
  param_grid=parameters,
  scoring='accuracy',
  cv=FOLDS,
)

### Fit on whole training set and predict on whole test set

In [None]:
# Run the grid search on the training set; fit() returns the fitted search
# object, so the prediction on the held-out test set is chained onto it.
y_pred = clf.fit(X_train, y_train).predict(X_test)
print(clf.best_params_)
# Last expression of the cell: accuracy of the predictions on the test set
accuracy_score(y_test, y_pred)