In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from math import log, exp

# Boosting algorithm


### Load and preprocess data

In [None]:
# Read the sonar CSV (the file has no header row)
dataset = pd.read_csv('sonar.all-data', header=None)
# Every column except the last one is a feature
X = dataset.iloc[:, :-1].to_numpy()
# The last column is the class label; boosting needs labels in {-1, +1},
# so 'M' is encoded as +1 and any other label as -1
y = np.where(dataset.iloc[:, -1].to_numpy() == 'M', 1, -1)
print("Number of samples: ", X.shape[0])
print('Number of features: ', X.shape[-1])
print(X)
print(y)

### Boosting train and predict functions


**Note 1**:

in scikit-learn, all supervised estimators implement a ``fit(X, y)`` method and a ``predict(X)`` method with ``X`` being unlabeled observations and  ``y`` being labels. 

Therefore, the ``Classifier`` parameter can be any sklearn class implementing a supervised classifier.

(See *The problem solved in supervised learning* section in the supervised learning tutorial from the [sklearn documentation](https://scikit-learn.org/stable/tutorial/statistical_inference/supervised_learning.html).)

**Note 2**:

Some sklearn classifiers (such as [DecisionTreeClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html), [SVM](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html), [LogisticRegression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html), etc.) have a ``sample_weight`` parameter in their ``fit`` and ``score`` methods, making it easy to implement a user-defined boosting algorithm.

In [None]:
def boosting_train(
    X_train, 
    y_train, 
    n_clfs,           # number of classifier
    Classifier,       # Python class of classifier
    clfs_args = {},   # Specific python class of classifier's arguments
):
    """
    Adaboost training algorithm

    Trains 'n_clfs' weak learners one after the other. Each learner is
    fitted on the same samples, re-weighted so that the samples the previous
    learner got wrong count more.

    Parameters
    ----------
    X_train : array-like, one row per training sample.
    y_train : array-like of labels, expected to be -1 or +1.
    n_clfs : number of weak learners to train.
    Classifier : class whose instances expose ``fit(X, y, sample_weight=...)``
        and ``predict(X)``.
    clfs_args : keyword arguments forwarded to ``Classifier(...)``. The
        mapping is only unpacked, never modified; ``None`` means no argument.

    Returns
    -------
    (clfs, alphas) : the list of fitted weak learners and the list of their
        voting weights, both of length 'n_clfs'.

    Raises
    ------
    ValueError : if the training set is empty.
    """
    clfs = []
    alphas = []

    y_train = np.asarray(y_train)
    n = len(X_train)
    if n == 0:
        raise ValueError("boosting_train needs at least one training sample")
    clf_kwargs = dict(clfs_args) if clfs_args else {}

    # Initialize weights to 1/n
    weights = np.full(n, 1.0 / n)
    # Keeps the weighted error strictly inside (0, 1) so that the logarithm
    # below never sees 0 or a division by zero
    eps = 1e-10

    for t in range(n_clfs):
        # -------------------------
        # Train a new classifier
        # -------------------------

        # Train a weak learner using the training data and the sample weights
        clf = Classifier(**clf_kwargs)
        clf.fit(X_train, y_train, sample_weight=weights)
        y_pred = np.asarray(clf.predict(X_train))
        misclassified = y_pred != y_train

        # Compute weighted training error (the weights always sum to 1)
        error = float(np.sum(weights[misclassified]))

        # Compute alpha_t (and avoid math errors)
        error = min(max(error, eps), 1.0 - eps)
        alpha = 0.5 * log((1.0 - error) / error)

        clfs.append(clf)
        alphas.append(alpha)

        # -------------------------
        # Update weights
        # -------------------------

        # Misclassified samples are scaled by exp(+alpha), the others by
        # exp(-alpha); the result is renormalised into a distribution
        weights = weights * np.exp(alpha * np.where(misclassified, 1.0, -1.0))
        weights = weights / weights.sum()

    # Return the list of trained classifiers composing the boosting classifier
    # with their corresponding weights 'alphas'
    return clfs, alphas

def boosting_predict(
    clfs,       # list of classifiers composing the boosting classifier
    alphas,     # Weights associated with each classifier in 'clfs'
    X_test,
):
    """
    Adaboost predict algorithm

    Each classifier in 'clfs' votes with its ``predict(X_test)`` output
    (expected to be -1 or +1), scaled by the matching entry of 'alphas'.
    The predicted label of a sample is the sign of the summed votes.

    Returns a numpy array with one label in {-1, +1} per row of 'X_test'.
    A summed vote of exactly zero (including the empty-ensemble case) is
    resolved to +1 so that the output never contains 0.
    """
    votes = np.zeros(len(X_test), dtype=float)
    for clf, alpha in zip(clfs, alphas):
        votes += alpha * np.asarray(clf.predict(X_test), dtype=float)
    y_pred = np.where(votes >= 0, 1, -1)
    return y_pred

### Cross validation


You can use the [KFold](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html) from sklearn to split your datasets into k folds. 

In [None]:
def KFold_split(X, y, num_folds, seed=int(666)):
    """
    Cut 'X' and 'y' into 'num_folds' shuffled train/validation folds.

    The splitter shuffles with 'seed' as random state. Four lists are
    returned (X train, X validation, y train, y validation); entry k of each
    list holds the rows selected for the k-th fold.
    """
    splitter = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    # Materialise the (train indices, validation indices) pair of every fold
    fold_indices = list(splitter.split(X, y))

    X_train_folds = [X[train_idxs] for train_idxs, _ in fold_indices]
    X_val_folds = [X[val_idxs] for _, val_idxs in fold_indices]
    y_train_folds = [y[train_idxs] for train_idxs, _ in fold_indices]
    y_val_folds = [y[val_idxs] for _, val_idxs in fold_indices]

    return X_train_folds, X_val_folds, y_train_folds, y_val_folds

In [None]:
def evaluate_boosting(
    X_train_val, 
    y_train_val, 
    num_folds, 
    n_clfs,                                    # number of classifier
    Classifier = DecisionTreeClassifier,       # Python class of classifier
    clfs_args = {"max_depth" : 1},             # Specific python class of classifier's arguments
    seed=int(666),
):
    """
    Perform a cross validation of the boosting algorithm

    The data is split into 'num_folds' folds with ``KFold_split``. For every
    fold a boosting classifier of 'n_clfs' weak learners is trained with
    ``boosting_train`` on the training part and scored (accuracy) on the
    validation part using ``boosting_predict``.

    Returns
    -------
    boosting_clfs : list with, for each fold, the list of trained weak learners.
    boosting_alphas : list with, for each fold, the list of learner weights.
    boosting_scores : list with, for each fold, the validation accuracy.
    """
    X_train_folds, X_val_folds, y_train_folds, y_val_folds = KFold_split(
        X_train_val, y_train_val, num_folds, seed=seed,
    )

    boosting_clfs = []
    boosting_alphas = []
    boosting_scores = []
    folds = zip(X_train_folds, X_val_folds, y_train_folds, y_val_folds)
    for X_train, X_val, y_train, y_val in folds:
        # Pass a copy so the shared default dictionary can never be altered
        clfs, alphas = boosting_train(
            X_train, y_train, n_clfs, Classifier, dict(clfs_args),
        )
        y_val_pred = boosting_predict(clfs, alphas, X_val)

        boosting_clfs.append(clfs)
        boosting_alphas.append(alphas)
        boosting_scores.append(accuracy_score(y_val, y_val_pred))

    return boosting_clfs, boosting_alphas, boosting_scores

### Test boosting on the sonar dataset


In [None]:
num_folds = 5
# Extract a test set (35% of the samples) that cross validation never sees:
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.35, shuffle=True, random_state=42)


# For each hyper-parameter instance, do KFold cross validation:
for n_clfs in [1, 10, 50, 100, 200, 300, 400]:
    boosting_clfs, boosting_alphas, boosting_scores = evaluate_boosting(
        X_train_val,
        y_train_val,
        num_folds,
        n_clfs,
    )
    print('Trees: %d' % n_clfs)
    print('Validation scores: ', [round(s, 3) for s in boosting_scores])
    print('Mean validation accuracy: %.3f' % (sum(boosting_scores)/len(boosting_scores)))
    # Test accuracy computed with the boosting classifier trained with the first k-fold
    # (accuracy_score expects the true labels first, then the predictions)
    test_acc = accuracy_score(
        y_test,
        boosting_predict(boosting_clfs[0], boosting_alphas[0], X_test),
    )
    print('Test set accuracy: %.3f' %test_acc)