# Ensemble methods. Exercises


In this section we have only two exercise:

1. Find the best three classifier in the stacking method using the classifiers from scikit-learn package.

2. Build arcing arc-x4 method. 

In [1]:
%store -r data_set
%store -r labels
%store -r test_data_set
%store -r test_labels
%store -r unique_labels

## Exercise 1: Find the best three classifier in the stacking method

Please use the following classifiers:

* Linear regression,
* Nearest Neighbors,
* Linear SVM,
* Decision Tree,
* Naive Bayes,
* QDA.

In [2]:
import numpy as np
import itertools as it
import warnings
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [3]:
def build_classifiers(): 
    
    # Linear regression
    linear_regression = LinearRegression()
    linear_regression.fit(data_set, labels)  
    
    # Nearest Neighbours
    neighbors = KNeighborsClassifier()
    neighbors.fit(data_set, labels)

    # Linear SVC
    svc = SVC()
    svc.fit(data_set, labels)
    
    # Decision tree
    decision_tree = DecisionTreeClassifier()
    decision_tree.fit(data_set, labels)

    # Gaussian Naive-Bayes
    gauss = GaussianNB()
    gauss.fit(data_set, labels)
    
    # Quadratic Discriminant Analysis
    qda = QuadraticDiscriminantAnalysis()
    qda.fit(data_set, labels)
    
    return linear_regression, neighbors, decision_tree, svc, gauss, qda

In [4]:
def build_stacked_classifier(classifiers, stacked_classifier):
    output = [] 
    classifiers_list = []
    
    # Combinations of 3 classfiers
    classifiers_combinations = it.combinations(classifiers, 3)
    
    # Outputs for each combination
    for combination in classifiers_combinations:
        tmp_output = []
        tmp_combination = []
        for classifier in combination:
            tmp_output.append(classifier.predict(data_set)) 
            tmp_combination.append(classifier)
        tmp_output = np.array(tmp_output).reshape((130,3))
        output.append(tmp_output)  # ten set of data, for each of 10 combination 3-elements from 5 elements set
        classifiers_list.append(tmp_combination)
        
    # Combination fit array
    combination_fit = []
    for combination_output in output:
        #stacked_classifier = DecisionTreeClassifier()
        combination_fit.append(stacked_classifier.fit(combination_output.reshape((130,3)), labels.reshape((130,))))
 
    # Predictions for test dataset using each classifier combination
    test_predictions_list = []   
    for i in range(len(combination_fit)):
        classifiers = classifiers_list[i]
        stacked_classifier = combination_fit[i]
        test_set = []
        
        for classifier in classifiers:
            test_set.append(classifier.predict(test_data_set))
        test_set = np.array(test_set).reshape((len(test_set[0]),3))
        predicted = stacked_classifier.predict(test_set)
        test_predictions_list.append(predicted)
    
    # Prediction accuracy for each combination
    accuracy_list = []
    for prediction in test_predictions_list:
        accuracy_list.append(accuracy_score(test_labels, prediction))
        accuracy = accuracy_score(test_labels, prediction) ##
        #print(accuracy)
    
    # Selecting best accuracy combination
    best_prediction_index = np.argmax(accuracy_list)
    
    best_classifiers = classifiers_list[best_prediction_index]
    best_stacked_classifier = combination_fit[best_prediction_index]
    best_predicted = best_stacked_classifier.predict(test_set)
    
    return best_classifiers, best_predicted, best_stacked_classifier

In [5]:
# Running all functions
warnings.filterwarnings('ignore')


# Using KNeighborsClassifier as stacked classifier
stacked_classifier = KNeighborsClassifier()
classifiers = build_classifiers()

best_classifiers, best_predicted, best_stacked_classifier = build_stacked_classifier(classifiers, stacked_classifier)
accuracy = accuracy_score(test_labels, best_predicted)

print("Stacked classifier --> KNeighborsClassifier")
print("Set of 3 stacked classifiers with highest accuracy:")
for classifier in best_classifiers:
    print("-->",type(classifier).__name__)
print("Accuracy value for this set =",accuracy,"\n")


# Using SVC as stacked classifier
stacked_classifier = SVC()
classifiers = build_classifiers()

best_classifiers, best_predicted, best_stacked_classifier = build_stacked_classifier(classifiers, stacked_classifier)
accuracy = accuracy_score(test_labels, best_predicted)

print("Stacked classifier --> SVC")
print("Set of 3 stacked classifiers with highest accuracy:")
for classifier in best_classifiers:
    print("-->",type(classifier).__name__)
print("Accuracy value for this set =",accuracy,"\n")


# Using DecisionTreeClassifier as stacked classifier
stacked_classifier = DecisionTreeClassifier()
classifiers = build_classifiers()

best_classifiers, best_predicted, best_stacked_classifier = build_stacked_classifier(classifiers, stacked_classifier)
accuracy = accuracy_score(test_labels, best_predicted)

print("Stacked classifier --> DecisionTreeClassifier")
print("Set of 3 stacked classifiers with highest accuracy:")
for classifier in best_classifiers:
    print("-->",type(classifier).__name__)
print("Accuracy value for this set =",accuracy,"\n")


# Using GaussianNB as stacked classifier
stacked_classifier = GaussianNB()
classifiers = build_classifiers()

best_classifiers, best_predicted, best_stacked_classifier = build_stacked_classifier(classifiers, stacked_classifier)
accuracy = accuracy_score(test_labels, best_predicted)

print("Stacked classifier --> GaussianNB")
print("Set of 3 stacked classifiers with highest accuracy:")
for classifier in best_classifiers:
    print("-->",type(classifier).__name__)
print("Accuracy value for this set =",accuracy,"\n")


# Using QuadraticDiscriminantAnalysis as stacked classifier
stacked_classifier = QuadraticDiscriminantAnalysis()
classifiers = build_classifiers()

best_classifiers, best_predicted, best_stacked_classifier = build_stacked_classifier(classifiers, stacked_classifier)
accuracy = accuracy_score(test_labels, best_predicted)

print("Stacked classifier --> QuadraticDiscriminantAnalysis")
print("Set of 3 stacked classifiers with highest accuracy:")
for classifier in best_classifiers:
    print("-->",type(classifier).__name__)
print("Accuracy value for this set =",accuracy,"\n")



Stacked classifier --> KNeighborsClassifier
Set of 3 stacked classifiers with highest accuracy:
--> LinearRegression
--> KNeighborsClassifier
--> DecisionTreeClassifier
Accuracy value for this set = 0.0 

Stacked classifier --> SVC
Set of 3 stacked classifiers with highest accuracy:
--> LinearRegression
--> KNeighborsClassifier
--> DecisionTreeClassifier
Accuracy value for this set = 0.75 

Stacked classifier --> DecisionTreeClassifier
Set of 3 stacked classifiers with highest accuracy:
--> KNeighborsClassifier
--> DecisionTreeClassifier
--> QuadraticDiscriminantAnalysis
Accuracy value for this set = 0.8 

Stacked classifier --> GaussianNB
Set of 3 stacked classifiers with highest accuracy:
--> LinearRegression
--> KNeighborsClassifier
--> DecisionTreeClassifier
Accuracy value for this set = 1.0 

Stacked classifier --> QuadraticDiscriminantAnalysis
Set of 3 stacked classifiers with highest accuracy:
--> KNeighborsClassifier
--> DecisionTreeClassifier
--> GaussianNB
Accuracy value for 

## Exercise 2: 

Use the boosting method and change the code to fullfilt the following requirements:

* the weights should be calculated as:
$w_{n}^{(t+1)}=\frac{1+ I(y_{n}\neq h_{t}(x_{n})}{\sum_{i=1}^{N}1+I(y_{n}\neq h_{t}(x_{n})}$,
* the prediction is done with a voting method.

In [6]:
# Loading necessary libraries
import numpy as np
from sklearn.tree import DecisionTreeClassifier

# Preparing dataset
def generate_data(sample_number, feature_number, label_number):
    data_set = np.random.random_sample((sample_number, feature_number))
    labels = np.random.choice(label_number, sample_number)
    return data_set, labels

In [7]:
labels = 2
dimension = 2
test_set_size = 1000
train_set_size = 5000
train_set, train_labels = generate_data(train_set_size, dimension, labels)
test_set, test_labels = generate_data(test_set_size, dimension, labels)

In [8]:
# Initializing weights
number_of_iterations = 10
weights = np.ones((test_set_size,)) / test_set_size

In [9]:
def train_model(classifier, weights):
    return classifier.fit(X=test_set, y=test_labels, sample_weight=weights)

In [10]:
def calculate_accuracy_vector(predicted, labels):
    result = []
    for i in range(len(predicted)):
        if predicted[i] == labels[i]:
            result.append(0)
        else:
            result.append(1)
    return result

In [11]:
def calculate_error(model):
    predicted = model.predict(test_set)
    I = calculate_accuracy_vector(predicted, test_labels)
    Z = np.sum(I)
    return (1+Z)/1.0

Fill the two functions below:

* the weights should be calculated as:
$w_{n}^{(t+1)}=\frac{1+ I(y_{n}\neq h_{t}(x_{n})}{\sum_{i=1}^{N}1+I(y_{n}\neq h_{t}(x_{n})}$,

In [12]:
def set_new_weights(model):
    predicted = model.predict(test_set)
    I = calculate_accuracy_vector(predicted, test_labels)
    Norm = np.sum(np.add(1,I))
    new_weights = np.add(1,I)/Norm
    return new_weights 

Train the classifier with the code below:

In [13]:
classifier = DecisionTreeClassifier(max_depth = 1, random_state = 1)
classifier.fit(X = train_set, y = train_labels)
alphas = []
classifiers = []
for iteration in range(number_of_iterations):
    model = train_model(classifier, weights)
    weights = set_new_weights(model)
    classifiers.append(model)

# First 10 elements in weights list
print(weights[:10])

[0.00065617 0.00131234 0.00131234 0.00131234 0.00131234 0.00065617
 0.00065617 0.00131234 0.00131234 0.00131234]


Set the validation data set:

In [14]:
validate_x, validate_label = generate_data(1, dimension, labels)

print("Validate_x =",validate_x,"Validate_label =",validate_label)

Validate_x = [[0.91968433 0.52183191]] Validate_label = [1]


Fill the prediction code:

In [15]:
def get_prediction(x, test_labels):
    predictions = []
    test_unique_labels = np.unique(test_labels)
    votes_count = np.zeros(len(test_unique_labels))
    for i in range (len(classifiers)):
        predicted = classifiers[i].predict(x)
        print("Classifier nr.",i,"predicted:",predicted[0])
        for j in range (len(test_unique_labels)):
            if test_unique_labels[j] == predicted[0]:
                votes_count[j] = votes_count[j] + 1
    print("Votes for 0 :",int(votes_count[0]),",votes for 1 :",int(votes_count[1]))
        
    if votes_count[np.argmax(votes_count)] / (len(classifiers)) > 0.5:
        return test_unique_labels[np.argmax(votes_count)]
    else:
        print("Voting ended with a tie.")

Test it:

In [16]:
prediction = get_prediction(validate_x, test_labels)

print("\nValidation point =",validate_x,"Validation point label =",validate_label[0])
print("Predicted label for Validation point =",prediction)

Classifier nr. 0 predicted: 1
Classifier nr. 1 predicted: 1
Classifier nr. 2 predicted: 1
Classifier nr. 3 predicted: 1
Classifier nr. 4 predicted: 1
Classifier nr. 5 predicted: 1
Classifier nr. 6 predicted: 1
Classifier nr. 7 predicted: 1
Classifier nr. 8 predicted: 1
Classifier nr. 9 predicted: 1
Votes for 0 : 0 ,votes for 1 : 10

Validation point = [[0.91968433 0.52183191]] Validation point label = 1
Predicted label for Validation point = 1


In [17]:
print("Sidenote: Since validation_point and validation_label are randomly generated independently, comparing validation_label and prediction doesn't make sense")

Sidenote: Since validation_point and validation_label are randomly generated independently, comparing validation_label and prediction doesn't make sense
