In this task, you are provided with three classes of images, cars, bikes and people in real world settings. You are provided with code for obtaining features for these images (specifically histogram of gradients (HoG) features). You need to implement a boosting based classifier that can be used to classify the images.

In [1]:
import numpy as np
import cv2
import glob
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA



In [2]:
pca=PCA(n_components=0.9)

#Image feature extraction code
def obtain_dataset(folder_name):
    # assuming 128x128 size images and HoGDescriptor length of 34020
    hog_feature_len=34020
    hog = cv2.HOGDescriptor()
    
    #code for obtaining hog feature for one image file name
    files = glob.glob(folder_name+'/*')
    #print(files)
    X = []
    y = []
    for i, file in enumerate(files):
        images = glob.glob(file+'/*.png')
        for image in images:
            im = cv2.imread(image)
            h = hog.compute(im)
            X.append(h[:,0])
            y.append(i)
            #print(shape(h))
    # use this to read all images in the three directories and obtain the set of features X and train labels Y
    # you can assume there are three different classes in the image dataset
    Xraw = np.array(X)
    X = pca.fit_transform(Xraw)
    y = np.array(y)
    return (X,y) 

# Boosting classifier

1. use the previous decison tree to build a decision stump
2. modify the decision tree to build a weighted decision tree

In [3]:
def cal_entropy(labels, weights):
    n_labels = len(labels)
    classes = np.unique(labels)
    entropy = 0
    for k in classes:
        p = np.sum(weights[labels==k])
        entropy = entropy - p*np.log(p)
    return entropy

In [4]:
  
def find_split(X, y, weights):
    start_entropy = cal_entropy(y, weights)
    best = {'infogain': -np.inf}
    N, D = X.shape
    #feat_idxs = np.random.choice(D, replace=False)
    #loop for each feature
    for i in range(D):
        #loop of each unique value of each feature
        for threshold in np.unique(X[:,i]):
            left_indices = np.argwhere(X[:, i] <= threshold).flatten()
            right_indices = np.argwhere(X[:, i] > threshold).flatten()
            nl = float(len(left_indices))
            nr = float(len(right_indices))
            n = nl+nr
            infogain = start_entropy - nl/n * cal_entropy(y[left_indices], weights[left_indices]) -nr/n *cal_entropy(y[right_indices], weights[right_indices])
            #error =  (left_weighted_mistakes + right_weighted_mistakes ) / n
            if infogain>best['infogain']:
                best={'feature':i,
                      'split': threshold,
                      'infogain': infogain,
                      'left_indices': left_indices,
                      'right_indices': right_indices,
                     }
            #print(left_indices)
    return best

def build_tree(X, y, weights, max_depth=2):
    if max_depth == 1 or (y==y[0]).all():
        classes, counts = np.unique(y, return_counts = True)
        return {'leaf': True, 'class': classes[np.argmax(counts)]}

    else:
        move = find_split(X, y, weights)
        left = build_tree(X[move['left_indices']], y[move['left_indices']], weights[move['left_indices']], max_depth-1)
        right = build_tree(X[move['right_indices']], y[move['right_indices']], weights[move['left_indices']], max_depth-1)
        return {'leaf': False,
                'feature': move['feature'],
                'split': move['split'],
                'left': left,
                'right': right,
                'infogain': move['infogain']
               }

def predict_one(nodedict, OneX):
    if nodedict['leaf']:
        return nodedict['class']
    else:
        if OneX[nodedict['feature']]<=nodedict['split']:
            return predict_one(nodedict['left'], OneX)
        else:
            return predict_one(nodedict['right'], OneX)

def predict(tree,Xs):
    return [predict_one(tree, X) for X in Xs]



In [5]:
class BoostingClassifier:

    def __init__(self, numBoostingIters=40):
        '''
        clfs : List object containing individual DecisionTree classifiers, in order of creation during boosting
        betas : List of beta values, in order of creation during boosting
        '''
        self.clfs = []  #list of trees
        self.betas = [] #coefficients
        self.numBoostingIters = numBoostingIters
        #self.maxTreeDepth = maxTreeDepth
        self.K = 0 #number of labels
        self.classes = []  #labels



    def fit(self, X, y, random_state=None):
        '''
        Arguments:
            X : n-by-d 
            y : n-by-1 
        '''
        self.models = []
        self.classes = np.unique((y))
        self.K = len(self.classes)
        n,d = X.shape
        weight = np.full((n,),1/n)
        for i in range(self.numBoostingIters):
            #***********
            #clf = DecisionTreeClassifier(max_depth=self.maxTreeDepth).fit(X,y,sample_weight=weight)
            clf=build_tree(X, y, weight)
            #*******
            prediction = predict(clf, X)
            #********
            e = 1 - sum(weight[y==prediction])/sum(weight)
            
            beta = np.log((1-e)/e) + np.log(self.K - 1)
            match = prediction==y
            weight[~match] *= np.exp(beta)
            weight /= weight.sum()
            self.clfs.append(clf)
            self.betas.append(beta)



    def predict(self, X):
        '''
        Arguments:
            X is an n-by-d ndarray
        Returns:
            an n-by-1 ndarray of the predictions
        '''
        n = len(X)
        pred = np.zeros((n,self.K))
        i = 0
        for beta,clf in zip(self.betas, self.clfs):
            #yp = predict(clf, X).astype(int)
            yp = predict(clf, X)
            pred[range(n),yp] += beta
            i += 1
        pred = np.argmax(pred,axis=1)
        return pred

In [6]:
train_folder_name='image_dataset'
(X_train, Y_train) = obtain_dataset(train_folder_name)
Xtrain, Xtest, ytrain, ytest = train_test_split(X_train, Y_train, train_size=0.8, random_state=123)

In [7]:
#BoostedDT
bc = BoostingClassifier()
bc.fit(Xtrain, ytrain)
y_pred = bc.predict(Xtest)
print('accuracy', accuracy_score(ytest, y_pred))

accuracy 0.4


In [8]:
#compare with sklearn
from sklearn.ensemble import AdaBoostClassifier
abc = AdaBoostClassifier(n_estimators=50,
                         learning_rate=1)
# Train Adaboost Classifer
model = abc.fit(Xtrain, ytrain)

#Predict the response for test dataset
ypred = model.predict(Xtest)
print('accuracy', accuracy_score(ytest, ypred))

accuracy 0.43333333333333335
