In [1]:
import numpy as np
import math
from copy import copy
import sklearn.datasets
from sklearn.svm import SVC
from __future__ import division

In [2]:
X,y = sklearn.datasets.make_hastie_10_2()
X_train = X[0:8000,:]
y_train = y[0:8000]
X_test = X[8000:,:]
y_test = y[8000:]

# Exercise 1
Implement the AdaBoost ensemble algorithm by completing the following code:

In [3]:
class AdaBoostExample:
    def __init__(self, weakModel, T):
        return None # to be completed

    def fit(self, X, y):
        return None # to be completed

    def predict(self, X):
        return None # to be completed

In the implementation you are free to assume:
- that the problem is a binary classification problem with labels in $\{-1, +1\}$.
- that the weakModel can fit a weighted sample set by means of the call `weakModel.fit(X,y,sample_weight=w)` where `w` is a vector of length $|y|$.

Test your implementation on the dataset loaded above and using an SVC with a polynomial kernel. 

In [4]:
weakModel = SVC(kernel="poly", degree=3)
adaboost = AdaBoost(weakModel, 100)
y_train_ = c.predict(X_train)
y_test_ = c.predict(X_test)

NameError: name 'AdaBoost' is not defined

and evaluate the AdaBoost performances as usual by calculating the classification error. 

**Note 1**:  
since the labels are bound to be in ${+1, -1}$, the classification error can be easily computed as:
$$
   error(y,y') = \frac{1}{2} - \frac{y^T \times y'}{2N},
$$
where $N$ is the total number of examples. The formula can be derived noticing that $y^T \times y'$ calculates the number $N_c$ of examples correctly classified  minus the number $N_{\bar c}$ of examples incorrectly classified. We have then $y^T \times y' = N_c - N_{\bar c}$ and by noticing that $N = N_c + N_{\bar c}$:
$$
   N - y^T \times y' = 2 N_{\bar c} \Rightarrow \frac{N - y^T \times y'}{2 N} = \frac{N_{\bar c}}{N} = error(y,y')
$$

**Note 2**:
do not forget to deepcopy your base model before fitting it to the new data

**Note 3**:
The SVC model allows specifying weights, but it *does not* work weights are normalized (it works well when the weights are larger). The following class takes normalized weights and denormalize them before passing them to the SVC classifier:

```python
    class SVC_:
        def __init__(self, kernel="rbf", degree="3"):
            self.svc = SVC(kernel=kernel, degree=degree)

        def fit(self, X,y,sample_weight=None):
            if sample_weight is not None:
                sample_weight = sample_weight * len(X)

            self.svc.fit(X,y,sample_weight=sample_weight)
            return self

        def predict(self, X):
            return self.svc.predict(X)
```

## Impementation

In [5]:
# https://github.com/scikit-learn/scikit-learn/blob/f0ab589f/sklearn/ensemble/weight_boosting.py#L297
# https://github.com/jaimeps/adaboost-implementation/blob/master/adaboost.py
# https://github.com/xiaoyubai/AdaBoost/blob/master/AdaBoostBinary.py
# https://github.com/simsicon/AdaBoostTrees/blob/master/boost.py
# https://github.com/prateekbhat91/Decision-Tree/blob/master/ensemble.py
# https://github.com/quqixun/MLAlgorithms/blob/master/AdaBoost/src/AdaBoostTree.py
# http://rob.schapire.net/papers/explaining-adaboost.pdf


In [6]:
class SVC_:
    def __init__(self, kernel="rbf", degree="3"):
            self.svc = SVC(kernel=kernel, degree=degree)

    def fit(self, X,y,sample_weight=None):
        if sample_weight is not None:
            sample_weight = np.multiply(sample_weight, len(X))

        self.svc.fit(X,y,sample_weight=sample_weight)
        return self

    def predict(self, X):
        return self.svc.predict(X)

In [15]:
class Adaboost(object):

    def __init__(self, n_estimators, base_estimator):
        self.n_estimators = n_estimators
        self.estimator = base_estimator
        self.estimators = None
        self.alphas = None
        self.test_errors = None
        self.train_errors = None

        return

    def fit(self, X, y):
        self.estimators = []
        self.alphas = []
        self.train_errors = []

        train_num = len(X)
        
        ws_pred_train = np.zeros(train_num)
        
        weights = np.ones(train_num) / train_num

        for m in range(self.n_estimators):
            estimator = copy(self.estimator)
            
            estimator.fit(X, y, sample_weight = weights)

            self.estimators.append(estimator)

            y_predicted = estimator.predict(X)
            
            if False:
                for i in range(len(y_predicted)):
                    if y_predicted[i] > 0:
                        y_predicted[i] = 1
                    else:
                        y_predicted[i] = -1

            incorrect = [int(i) for i in (y_predicted != y)]
            #print("incorrect", incorrect)
            error = np.dot(weights, incorrect)
            #print("error", error)
            alpha = np.log((1 - error) / error) / 2.0
            #print("alpha", alpha)
            self.alphas.append(alpha)
            exp = [np.exp(-1 * alpha * y[i] * y_predicted[i]) for i in range(train_num)]
            Z = np.dot(weights, exp)
            #print("z", Z)
            weights = [w / Z * e for w, e in zip(weights, exp)]
            
            ws_pred_train += alpha * y_predicted
            train_error_rate = sum(np.sign(ws_pred_train) != y) / len(ws_pred_train)
            self.train_errors.append(train_error_rate)
            
            stamp_iteration = 10
            if m % stamp_iteration == 0:
                print("Train error rate: ", train_error_rate)
            
            #print("-------------------------")

        return

    def predict(self, X, y):
        self.test_errors = []
        test_num = len(X)
        ws_pred_test = np.zeros(test_num)
        
        predicted = np.zeros(len(X))
        i = 0
        for clf, alpha in zip(self.estimators, self.alphas):
            predicted += alpha * clf.predict(X)
            
            ws_pred_test += alpha * predicted
            test_error_rate = sum(np.sign(ws_pred_test) != y) / len(ws_pred_test)
            self.test_errors.append(test_error_rate)
            
            stamp_iteration = 10
            if i % stamp_iteration == 0:
                print("Test error rate: ", test_error_rate)
            i += 1
            
        return np.sign(predicted)

In [16]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

weak_classifier = DecisionTreeClassifier(criterion="entropy", max_depth=3, random_state=42)
# weak_classifier = SVC(kernel="poly", degree=3)
# weak_classifier = SVC_(kernel="poly", degree=3)

clf_boosted = Adaboost(200, weak_classifier)

clf_boosted.fit(X_train, y_train)

y_predicted = clf_boosted.predict(X_test, y_test)
print("Accuracy of test set:", accuracy_score(y_predicted, y_test))

('Train error rate: ', 0.382625)
('Train error rate: ', 0.174125)
('Train error rate: ', 0.15525)
('Train error rate: ', 0.11375)
('Train error rate: ', 0.092625)
('Train error rate: ', 0.083125)
('Train error rate: ', 0.07325)
('Train error rate: ', 0.067875)
('Train error rate: ', 0.0585)
('Train error rate: ', 0.050375)
('Train error rate: ', 0.048375)
('Train error rate: ', 0.045125)
('Train error rate: ', 0.041375)
('Train error rate: ', 0.039375)
('Train error rate: ', 0.034125)
('Train error rate: ', 0.033375)
('Train error rate: ', 0.0295)
('Train error rate: ', 0.028625)
('Train error rate: ', 0.0245)
('Train error rate: ', 0.02425)
('Test error rate: ', 0.3835)
('Test error rate: ', 0.27975)
('Test error rate: ', 0.19375)
('Test error rate: ', 0.16175)
('Test error rate: ', 0.1455)
('Test error rate: ', 0.1315)
('Test error rate: ', 0.123)
('Test error rate: ', 0.11725)
('Test error rate: ', 0.112)
('Test error rate: ', 0.1045)
('Test error rate: ', 0.09975)
('Test error rate

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score

bdt = AdaBoostClassifier(SVC(kernel="poly", degree=3), n_estimators = 100, algorithm = "SAMME") 
bdt.fit(X_train, y_train)
y_pred = bdt.predict(X_test)
print("Accuracy: ",  accuracy_score(y_test, y_pred))
scores = cross_val_score(bdt, X_test, y_test)
print(scores.mean())

bdt2 = AdaBoostClassifier(n_estimators = 100) 
bdt2.fit(X_train, y_train)
y_pred = bdt2.predict(X_test)
print("Accuracy: ",  accuracy_score(y_test, y_pred))
scores2 = cross_val_score(bdt2, X_test, y_test)
print(scores2.mean())

# Exercise 2

Write a weak learner to be used with the AdaBoost algorithm you just wrote. The weak learner that you will implement shall work as follows:

- creates a random linear model by generating the needed weight vector $\mathbf{w}$ at random; each weight shall be sampled from U(-1,1);
- it evaluates the weighted loss $\epsilon_t$ on the given dataset and flip the linear model if $\epsilon_t > 0.5$
- at prediction time it predicts +1 if $\mathbf{x} \cdot \mathbf{w} > 0$ it predicts -1 otherwise.

In [17]:
class RandomLinearModelExample:
    def loss(self, y, y_, w):
        return None # to be completed
        
    def fit(self,X,y,sample_weight=None):
        return None # to be completed        
        
    def predict(self,X):
        return None # to be completed

Learn an AdaBoost model using the RandomLinearModel weak learner printing every $K$ iteratins the weighted error and the current error of the ensemble (you are free to choose $K$ so to make your output just frequent enough to let you know what is happening but without flooding the console with messages). Evaluate the training and test error of the final ensemble model.

In [18]:
rs = RandomSplit()
a = AdaBoost(rs,10000)
a.fit(X_train,y_train)

y_train_ = a.predict(X_train)
y_test_ = a.predict(X_test)

NameError: name 'RandomSplit' is not defined

Write few paragraphs about what you think about the experiment and about the results you obtained. 

## Implementation

In [45]:
class RandomLinearModel:
    
    def __init__(self, dim):
        self.classes = dim
        self.weights = None

    def loss(self, y, y_pred, sample_weight):
        return (np.dot(sample_weight, (y_pred - y) ** 2)) / self.classes
        
    def fit(self, X, y, sample_weight = None):
        self.weights = np.random.uniform(-1, 1, self.classes)
        y_pred = np.dot(X, self.weights)
        loss_value = self.loss(y, y_pred, sample_weight)
        if loss_value > 0.5:
            self.weights = self.weights * (-1)
        
        print("Loss value: " + str(loss_value))
        
    def predict(self, X):
        y_pred = np.dot(X, self.weights)
        y_pred[y_pred > 0] = 1
        y_pred[y_pred <= 0 ] = -1
        return y_pred

In [46]:
# https://github.com/eriklindernoren/ML-From-Scratch/blob/master/mlfromscratch/supervised_learning/regression.py
# https://www.stat.cmu.edu/~cshalizi/350/lectures/18/lecture-18.pdf
# http://www.seas.ucla.edu/~vandenbe/133A/lectures/ls-fitting.pdf
# guardare pure la slide di Esposito perché c'è una che si chiama "Using least squares for classification"
# https://github.com/HarryGogonis/Python-Linear-Least-Squares-Classifier/blob/master/LLS.py
# https://www.google.com/search?client=firefox-b&ei=_XSmW6CeKsjClwTaiYOQAQ&q=Using+least+squares+for+classification+python&oq=Using+least+squares+for+classification+python&gs_l=psy-ab.3..33i22i29i30k1.261904.265020.0.265163.7.7.0.0.0.0.297.626.0j2j1.3.0....0...1c.1.64.psy-ab..4.3.624....0.W2dr0PTiWFY
# https://github.com/nandhiniramanan5/Classifiers-implemented/blob/master/classalgorithms.py

In [48]:
from sklearn import linear_model
weak_classifier2 = RandomLinearModel(X_train.shape[1])
# weak_classifier2 = linear_model.LogisticRegression()
# weak_classifier2 = linear_model.SGDClassifier(max_iter=1000)
# weak_classifier2 = linear_model.LinearRegression()
# weak_classifier2 = LinearRegression()

clf_boosted2 = Adaboost(200, weak_classifier2)

clf_boosted2.fit(X_train, y_train)
       
y_predicted2 = clf_boosted2.predict(X_test, y_test)
print("Accuracy of test set:", accuracy_score(y_predicted2, y_test))

Loss value: 0.26816365943092213
('Train error rate: ', 0.489125)
Loss value: 0.5274647568104193
Loss value: 0.4414358098818612
Loss value: 0.43803991480600535
Loss value: 0.4123761272274395
Loss value: 0.4832130664093028
Loss value: 0.4825867486396775
Loss value: 0.422024851657356
Loss value: 0.5188613229919884
Loss value: 0.5760004917927922
Loss value: 0.38823530455503147
('Train error rate: ', 0.480125)
Loss value: 0.3787635599914093
Loss value: 0.34499156059612224
Loss value: 0.403098448080641
Loss value: 0.34241833295494367
Loss value: 0.5445295632028184
Loss value: 0.6679994827990549
Loss value: 0.44040639809876153
Loss value: 0.4410782009086508
Loss value: 0.41788940852883305
Loss value: 0.35099673500591955
('Train error rate: ', 0.47825)
Loss value: 0.46928031867571063
Loss value: 0.43412451091504173
Loss value: 0.30480259546642324
Loss value: 0.5242675372571759
Loss value: 0.48716994679067704
Loss value: 0.5838161834702075
Loss value: 0.3899075192646517
Loss value: 0.2586487383