In [None]:
# model implementation
from ml.models.linear import LogisticRegression
from ml.algorithms.optimization import GradientDescent

# model selection
from ml.model_selection import GridSearchCV
from ml.functions.metrics.classification import F1Score
from ml.stats import ClassificationStats

# data manipulation
from ml.dataset import load_csv, split_train_test
from ml.algorithms.normalization import MinMaxScaler

import numpy as np
import matplotlib.pyplot as plt

# Fix the seed of NumPy's global random number generator so repeated runs
# draw the same random numbers.
# NOTE(review): presumably this is what makes the shuffled split and shuffled
# folds below repeatable, which assumes the `ml` package draws from NumPy's
# global RNG — confirm.
np.random.seed(25)

## Q1

In [None]:
# Read the breast-cancer CSV into X and y (features and labels, going by the
# naming — confirm against load_csv).
dataset_path = './datasets/breastcancer.csv'
X, y = load_csv(dataset_path)

# Shuffled hold-out split. 0.8 is presumably the fraction kept for training —
# TODO confirm against split_train_test.
X_train, y_train, X_test, y_test = split_train_test(
    X,
    y,
    0.8,
    shuffle=True,
)

## Q1.a
### Logistic regression using Gradient Descent and grid search with k-fold cross-validation

In [None]:
# Grid search over logistic-regression hyperparameters; per-candidate statistics
# are produced by ClassificationStats.
grid = GridSearchCV(stats_generator=ClassificationStats)

# Every (regularization, learning rate) pair is one candidate. Regularization
# varies slowest, so candidate indices are assigned in the same order as a
# nested loop with regularization on the outside.
hyperparameters = [
    (regularization_term, learning_rate)
    for regularization_term in [0.01, 0.001]
    for learning_rate in [0.1, 0.05, 0.01]
]

# enumerate() supplies the candidate index instead of a hand-maintained counter,
# so the printed index cannot drift from the order in which candidates are added.
for i, (regularization_term, learning_rate) in enumerate(hyperparameters):
    grid.add(LogisticRegression, {
        "solver": GradientDescent(epochs=4000, learning_rate=learning_rate, regularization=regularization_term, metrics=F1Score()),
        "dataScaler": MinMaxScaler})

    print(f"Grid search candidate {i}: Logistic regression, learning rate={learning_rate}, regularization={regularization_term}, score=F1-score")

# Evaluate all candidates on the training split only, with 10 shuffled folds,
# scored by F1. score_minimize=False: presumably the highest score wins — confirm
# against GridSearchCV.search.
best = grid.search(X_train, y_train, num_folds=10, score=F1Score(), score_minimize=False, shuffle=True)

In [None]:
# Report the winning candidate and its hyperparameters, which are read off the
# solver object stored in the best candidate's parameters.
best_params = grid.best_model_params
solver = best_params['solver']
print(
    "Best candidate:", grid.index_of_best,
    "- Learning rate:", solver.learning_rate,
    "- Regularization term:", solver.regularization,
)

# Rebuild the best model from its parameters and fit it on the full training
# split. fit() returns the values plotted below (presumably the per-epoch
# training F1, given the solver's metrics=F1Score() — confirm).
model = grid.best_model(**best_params)
training_score = model.fit(X_train, y_train)

plt.plot(training_score)
plt.title("Training F1-score of best model")
plt.show()

## Q1.b
### Logistic regression: hyperparameter evaluation across grid-search candidates

In [None]:
# For each classification metric, plot the mean (marker) and standard deviation
# (error bar) of the values recorded for every grid-search candidate, one panel
# per metric. The best candidate is drawn again with a red error bar.
metric_names = ['f1', 'recall', 'precision', 'binary_accuracy']
num_candidates = len(grid.candidates)
candidate_ids = np.arange(num_candidates)
best_id = grid.index_of_best

fig, axes = plt.subplots(1, len(metric_names), figsize=(30, 4))
fig.suptitle("Logistic Regression hyperparameter evaluation", fontsize=20)
fig.subplots_adjust(top=0.7)  # leave room for the suptitle above the panels

for ax, metric in zip(axes, metric_names):
    recorded = [grid.stats[c].stats["values"][metric] for c in range(num_candidates)]
    means = [np.mean(values) for values in recorded]
    stds = [np.std(values) for values in recorded]

    ax.set_title(f"Mean {metric}", fontsize=20)
    ax.errorbar(candidate_ids, means, stds, fmt='ok', lw=3)
    # Overdraw the best candidate; ecolor only recolours the error bar, the
    # marker itself stays black because of fmt='ok'.
    ax.errorbar(best_id, means[best_id], stds[best_id], ecolor='red', fmt='ok', lw=3)

    # Tick labels are the 0-based candidate indices, the same numbering used by
    # the "Grid search candidate {i}" printout and by grid.index_of_best, so a
    # candidate has the same number in the text output and in the plot.
    ax.set_xticks(candidate_ids)
    ax.set_xticklabels(candidate_ids)
    ax.set_xlabel("Candidate", fontsize=16)
    ax.set_ylabel(metric, fontsize=16)

plt.show()