# Applying Traditional Classifiers with the data obtained after using CNN

In this notebook we load all the data obtained by a fully connected layer, after training the CNN. We receive for each image 1024 values.

For knowing the best values to apply to the classifiers we did GridSearch. There is the KNN, SVC, SVC Linear (which takes a more reasonable time than SVC RBF), Gradient Boosting, Random Forest and Adaboost.

We can not see all the experiments in the notebook. There are too many. But we can see the final and best results in the confusions matrices saved in out/confusion_matrix

In [28]:
#If you want to run the code using GPU
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
import keras
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.32
set_session(tf.Session(config=config))


In [2]:
# to run on CPU, execute this lines before import keras and tensorflow
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"] = ""

In [2]:
#Imports if you want to compute the data (than you have to use
#keras and tensorflow). If you want to run the code with loaded data
#there is no need to run this
from cnn.inceptionV3.inception_v3 import InceptionV3
from keras.preprocessing import image
from keras.models import Model
from keras.layers import Dense, GlobalAveragePooling2D
from keras.optimizers import SGD
from keras import backend as K


Using TensorFlow backend.


In [1]:
import logging
import sys
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import svm
from sklearn import cross_validation
from sklearn import metrics
from sklearn import grid_search
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn import ensemble
from sklearn.metrics import f1_score

from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import precision_recall_fscore_support
import itertools



In [4]:
# import our scripts
SCRIPTS_PATH = '/../../scripts'
sys.path.insert(0, os.getcwd()+SCRIPTS_PATH)
#print sys.path
from dataset import Dataset

# set the log
logging.basicConfig(level=logging.DEBUG, format='[%(asctime)s] %(message)s',
                    datefmt='%d/%m/%Y %H:%M:%S')


def preprocess_input(x):
    x = x/255.
    x = x - 0.5
    x = x*2.
    return x

#Freeze the n selected layers, and work with the other layers           
def freeze_n_first_layers(n_not_train, model, optimizer=SGD(lr=0.0001, momentum=0.9),
                          loss='sparse_categorical_crossentropy'):

    for layer in model.layers[:n_not_train]:
        layer.trainable = False
    for layer in model.layers[n_not_train:]:
        layer.trainable = True

    # we need to recompile the model for these modifications to take effect
    # we use SGD with a low learning rate
    model.compile(optimizer, loss)

#Build the model of the CNN
def buildModel(load_weights = False):
    # create the base pre-trained model
    if load_weights == False:
        base_model = InceptionV3(weights='imagenet', include_top=False)
    else:
        base_model = InceptionV3(weights=None, include_top=False)

    # add a global spatial average pooling layer
    x = base_model.output
    x = GlobalAveragePooling2D()(x)
    # let's add a fully-connected layer
    x = Dense(1024, activation='relu')(x)
    # and a logistic layer with 2 classes food/no_food
    predictions = Dense(NUM_CLASSES, activation='softmax')(x)

    # this is the model we will train
    print 'Loading model...\n'
    model = Model(input=base_model.input, output=predictions)

    if load_weights:
        model.load_weights(WEIGHTS_PATH)

    return model

In [5]:
def get_data_interlayer(model, val_ds):
    #We want to obtain the intermediate layer, to apply Classifiers to them. In layer_output we append the layer from all the batches
    #to obtain the intermediate layer from the given dataset and not just from a batch
    layer_output = []
    #In label we obtain the y_true from all the given dataset
    label = []
    # Obtain the output of the intermediate layers 
    inter_layer_model = K.function([model.layers[0].input, K.learning_phase()], [model.layers[-2].output])
    
    num_validations = val_ds.num_batches()

    for iter_val in range(num_validations):
        print 'iter_val: ', iter_val+1, 'out of: ', val_ds.num_batches()
        # Get new batch and preprocess the image according to the model
        X, y = val_ds.get_Xy(SIZE)
        X = preprocess_input(X)
        
        layer_output.append(inter_layer_model([X, 0])[0])
        label+=y
    return layer_output, label

In [33]:
#Save the confusion matrix, with a lot of information in it (Accuracy, Accuracy normalized, Precision, Recall, F1)
def save_confusion_matrix(y_true, y_pred, name_to_save, title='Confusion matrix ', info=''):

    # create the confusion matrix and normalize it
    cm = 1.0*confusion_matrix(y_true, y_pred, labels=range(len(set(y_true))))
    cm_norm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    inf = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    print '\tPrecision:', inf[0],'\t Recall:', inf[1]
    print "\t" +'F1 score:', inf[2]
    # prepare the plot
    fig = plt.figure()
    plt.matshow(cm_norm)
    tick_marks = np.arange(cm_norm.shape[0])
    plt.xticks(tick_marks, rotation=45)
    plt.yticks(tick_marks)
    if (MULTITASK):
        for i, j in itertools.product(range(cm_norm.shape[0]), range(cm_norm.shape[1])):
            """
            plt.text(j, i, str(int(cm[i,j]))+' / '+str(int(sum(cm[i]))),
                     horizontalalignment="center",
                     color="white" if cm_norm[i, j] < 0.5 else "black")
            """
            plt.text(j, i, str(round(cm_norm[i,j]*100, 2))+'%',
                     horizontalalignment="center",
                     color="white" if cm_norm[i, j] < 0.5 else "black")
    plt.title(title)
    plt.colorbar()
    plt.ylabel('True label')
    plt.xlabel('Predicted label \n '+info+'\n F1 score: '+str(round(inf[2],2))+'\n Precision: '+str(round(inf[0], 2))+', Recall: '+str(round(inf[1], 2)))

    # save the figure and close so it never gets displayed
    plt.savefig(CONF_MATRIX+name_to_save+'.png')
    plt.close(fig)

In [34]:
# Creating or loading the data
def get_data_model(load=False):
    if (load):
        #Load files
        X_train = np.load('npdata/X_train'+TITLE+'.npy')
        y_train = np.load('npdata/y_train'+TITLE+'.npy')
        X_val = np.load('npdata/X_val'+TITLE+'.npy')
        y_val = np.load('npdata/y_val'+TITLE+'.npy')
        X_test = np.load('npdata/X_test'+TITLE+'.npy')
        y_test = np.load('npdata/y_test'+TITLE+'.npy')
    else:
        # ------------------- Data
        # prepare the training and validation dataset
        train_ds = Dataset(TRAIN_TXT, NUM_TRAIN_IMG, BATCH_SIZE, CLASSES, 
                           data_augmentation=False, multitask = MULTITASK)
        val_ds = Dataset(VAL_TXT, NUM_VAL_IMG, BATCH_SIZE, CLASSES, 
                         data_augmentation=False, multitask = MULTITASK)
        test_ds = Dataset(TEST_TXT, NUM_TEST_IMG, BATCH_SIZE, CLASSES, 
                         data_augmentation=False, multitask = MULTITASK)

        # ------------------- Model
        # load the desired model and visualize it
        model = buildModel(load_weights=True)

        # ------------------- Train more Layers
        # we chose to train the top 2 inception blocks, i.e. we will freeze
        # the first 172 layers and unfreeze the rest:
        if (MULTITASK):
            freeze_n_first_layers(-1, model,loss = 'binary_crossentropy')
        else:
            freeze_n_first_layers(172, model, SGD(lr=0.0001, momentum=0.9))
        
        #Training
        X_train, y_train = get_data_interlayer(model, train_ds)
        X_train = np.concatenate(X_train)
        y_train = np.asarray(y_train)

        np.save('npdata/X_train'+TITLE, X_train)
        np.save('npdata/y_train'+TITLE, y_train)
        
        #Validation
        X_val, y_val = get_data_interlayer(model, val_ds)
        X_val = np.concatenate(X_val)
        y_val = np.asarray(y_val)
        
        np.save('npdata/X_val'+TITLE, X_val)
        np.save('npdata/y_val'+TITLE, y_val)
        
        #Test
        X_test, y_test = get_data_interlayer(model, test_ds)
        X_test = np.concatenate(X_test)
        y_test = np.asarray(y_test)
        
        ## Save test dataset
        np.save('npdata/X_test'+TITLE, X_test)
        np.save('npdata/y_test'+TITLE, y_test)

    return X_train, X_val, X_test, y_train, y_val, y_test
    #return X_val, X_test, y_val, y_test

#Select which classifier to use. 
def accuracy_classifiers(X_train, X_test, y_train, y_test, not_grid_search=False,
                         adaboost=False, gradient_boosting=False, random_forest=False,
                         svc=False, linear_svc=False, k_neighbors=False, name_task='', class_weight=''):  
        if (adaboost):
            Adaboost_GridSearch(X_train, X_test, y_train, y_test, name_task=name_task)
        if (gradient_boosting):
            GradientBoosting_GridSearch(X_train, X_test, y_train, y_test, name_task=name_task)
        if(random_forest):
            RandomForest_GridSearch(X_train, X_test, y_train, y_test, name_task=name_task)
        if(svc):
            SVC_GridSearch(X_train, X_test, y_train, y_test, name_task=name_task, class_weight=class_weight)
        if(linear_svc):
            LinearSVC_GridSearch(X_train, X_test, y_train, y_test, name_task=name_task, class_weight=class_weight)
        if(k_neighbors):
            KNeighbors_GridSearch(X_train, X_test, y_train, y_test, name_task=name_task)
            
        if (not_grid_search):
            otherClassifiers(X_train, X_test, y_train, y_test, name_task=name_task)
        

In [36]:
#Computing accuracy with different classifiers. Without GridSearch run some classifiers, with default parameters.
#Save a an image for each classifier with the confusion matrix and information
def otherClassifiers(X_train, X_test, y_train, y_test, grid_search=False, name_task=''):
    names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", 
         "Decision Tree", "Random Forest", "AdaBoost",
         "Naive Bayes", "Gradient Boosting"]
    classifiers = [
        KNeighborsClassifier(3),
        SVC(kernel="linear", C=0.025),
        #SVC(gamma=2, C=1),
        DecisionTreeClassifier(),
        RandomForestClassifier(n_estimators=10, max_features=1),
        #AdaBoostClassifier(),
        #GaussianNB(),
        GradientBoostingClassifier()]
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    for name, clf in zip(names, classifiers):    
        # Standard parameter
        clf.fit(X_train, y_train)
        yhat = clf.predict(X_test)
        # Recall, f1, precision
        acc = metrics.accuracy_score(yhat, y_test)
        acc_norm = normalized_accuracy(y_test, yhat)
        print '\n \n',name
        print '\tAccuracy:',acc, '\t Accuracy normalized:', acc_norm
        save_confusion_matrix(y_test, yhat, TITLE+'_'+name+'_'+name_task+'_'+TITLE+'_noGrid_'+str(round(acc,5)), 
                          title=str(name)+' '+name_task, 
                          info='Acc.: '+ str(round(acc*100,2))+',  Acc. norm.:'+str(round(acc_norm*100,2)))
        

In [37]:
#Computes de normalized accuracy. Given the true label and the y predicted, returns the normalized accuracy
def normalized_accuracy(y_true, y_pred):
    cm = 1.0*confusion_matrix(y_true, y_pred, labels=range(len(set(y_true))))
    acc = []
    for i, row in enumerate(cm):
        acc.append(row[i]/sum(row))
    return sum(acc)/len(acc)

In [38]:
#Given a binary matrix with labels (ex. [[1,0,0], [0, 0, 1]]) returns correspondent vector no binary ([0,2])
def to_one_vector(y, pos):
    return np.asarray([lbl[pos[0]:pos[1]].argmax() for lbl in y])    
    

## GridSearch

#### Adaboost

In [10]:
#Adaboost Classifier applying GridSearchCV. In param_grid, you can choose the parameters to do in GridSearch
def Adaboost_GridSearch(X, X_test, y, y_test, name_task):
    name = 'AdaBoost'
    print  name
    param_grid = {"algorithm" : ["SAMME.R","SAMME"], "n_estimators" : [1, 10, 50, 100, 200, 500]}
    # KFold
    nfolds = 5
    kf = cross_validation.KFold(n=X.shape[0], n_folds=nfolds, shuffle=True, random_state=0)

    yhat = np.empty_like(y)
    acc = np.empty(nfolds)
    acc_norm = np.empty(nfolds)
    i = 0

    best_acc = 0

    for train_index, test_index in kf:
        X_train, X_val = X[train_index], X[test_index]
        y_train, y_val = y[train_index], y[test_index]

        print "Iteration KFold:", i+1,'/',nfolds

        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        # Standard parameters
        clf = ensemble.AdaBoostClassifier()

        # We can change the scoring "average_precision", "recall", "f1"
        clf = grid_search.GridSearchCV(clf, param_grid, scoring='f1_weighted', n_jobs=4, verbose=1)
        clf.fit(X_train,y_train.ravel())

        X_val = scaler.transform(X_val)
        yhat[test_index] = clf.predict(X_val)

        acc[i] = metrics.accuracy_score(yhat[test_index], y_val)
        acc_norm[i] = normalized_accuracy(y_val, yhat[test_index])
        #We save the parameters with the ones we obtain better accuract
        if (acc_norm[i]>best_acc):
            best_acc = acc_norm[i]
            best_param = clf.best_params_
            
            
        print "\t" + str(clf.best_params_), acc[i]
        print
        i=i+1

    print 'Mean accuracy: '+ str(np.mean(acc))
    
    # Once we have the best parameters for the training dataset, we obtain the accuraacy for the test dataset with the best parameters obtained. 
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X)
    X_test = scaler.fit_transform(X_test)

    clf2 = svm.LinearSVC(loss=best_param['loss'], C=best_param['C'])

    clf2.fit(X_train, y)
    yhat = clf2.predict(X_test)

    acc = metrics.accuracy_score(yhat, y_test)
    acc_norm = normalized_accuracy(y_test, yhat)
    print best_param
    print '\tAccuracy:',acc, '\t Accuracy normalized:', acc_norm
    save_confusion_matrix(y_test, yhat, TITLE+'_'+name+'_'+name_task+'_'+TITLE+'_'+class_weight+'_'+str(round(acc,5)), 
                      title=str(name)+'_'+name_task, 
                      info='Acc.: '+ str(round(acc*100,2))+',  Acc. norm.:'+str(round(acc_norm*100,2)))


#### Gradient Boosting

In [2]:
#Gradient Boosting applying GridSearchCV. In param_grid, you can choose the parameters to do in GridSearch
def GradientBoosting_GridSearch(X, X_test, y, y_test, name_task):
    name = 'GradientBoosting'
    print  name
    param_grid = {'n_estimators':[50, 100], 'learning_rate': [0.1, 0.01],
                    #'max_depth': [3, 4, 6],
                    'min_samples_leaf': [1, 3, 5]
                    # 'max_features': [1.0, 0.3, 0.1]
                    }

    # KFold
    nfolds = 3
    kf = cross_validation.KFold(n=X.shape[0], n_folds=nfolds, shuffle=True, random_state=0)

    yhat = np.empty_like(y)
    acc = np.empty(nfolds)
    acc_norm = np.empty(nfolds)
    i = 0

    best_acc = 0

    for train_index, test_index in kf:
        X_train, X_val = X[train_index], X[test_index]
        y_train, y_val = y[train_index], y[test_index]

        print "Iteration KFold:", i+1,'/',nfolds

        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        # Standard parameters
        clf = ensemble.GradientBoostingClassifier()

        # We can change the scoring "average_precision", "recall", "f1"
        clf = grid_search.GridSearchCV(clf, param_grid, scoring='f1_weighted', n_jobs=4, verbose=1)
        clf.fit(X_train,y_train.ravel())

        X_val = scaler.transform(X_val)
        yhat[test_index] = clf.predict(X_val)

        acc[i] = metrics.accuracy_score(yhat[test_index], y_val)
        acc_norm[i] = normalized_accuracy(y_val, yhat[test_index])
        #We save the parameters with the ones we obtain better accuract
        if (acc_norm[i]>best_acc):
            best_acc = acc_norm[i]
            best_param = clf.best_params_
            
        print "\t" + str(clf.best_params_), acc[i]
        print
        i=i+1

    print 'Mean accuracy: '+ str(np.mean(acc))

    
    #If you want to directly do grid search and find the best parameters, and with those predict and see the accuracy of the test
    #dataset, run also this
    # Once we have the best parameters for the training dataset, we obtain the accuraacy for the test dataset with the best parameters obtained. 
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X)
    X_test = scaler.fit_transform(X_test)

    clf2 = ensemble.GradientBoostingClassifier(n_estimators=best_param['n_estimators'], learning_rate=best_param['learning_rate'], 
                                              min_samples_leaf=best_param['min_samples_leaf'])
       
    clf2.fit(X_train, y)
    yhat = clf2.predict(X_test)

    acc = metrics.accuracy_score(yhat, y_test)
    acc_norm = normalized_accuracy(y_test, yhat)
    print best_param
    print '\tAccuracy:',acc, '\t Accuracy normalized:', acc_norm
    save_confusion_matrix(y_test, yhat, TITLE+'_'+name+'_'+name_task+'_'+TITLE+'_'+'_'+str(round(acc,5)), 
                      title=str(name)+'_'+name_task, 
                      info='Acc.: '+ str(round(acc*100,2))+',  Acc. norm.:'+str(round(acc_norm*100,2)))


#### Random Forest

In [11]:
# Random Forest applying GridSearchCV. In param_grid, you can choose the parameters to do in GridSearch
def RandomForest_GridSearch(X, X_test, y, y_test, name_task):
    name = 'RandomForest'
    print  name
    param_grid = { 
        'n_estimators': [10, 50, 100, 200, 300, 500],
        'criterion':['gini', 'entropy'],
        'max_features': ['auto', 'sqrt', 'log2']
    }
   
    # KFold
    nfolds = 5
    kf = cross_validation.KFold(n=X.shape[0], n_folds=nfolds, shuffle=True, random_state=0)

    yhat = np.empty_like(y)
    acc = np.empty(nfolds)
    acc_norm = np.empty(nfolds)
    i = 0

    best_acc = 0

    for train_index, test_index in kf:
        X_train, X_val = X[train_index], X[test_index]
        y_train, y_val = y[train_index], y[test_index]

        print "Iteration KFold:", i+1,'/',nfolds

        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        # Standard parameters
        clf = ensemble.RandomForestClassifier()

        # We can change the scoring "average_precision", "recall", "f1"
        clf = grid_search.GridSearchCV(clf, param_grid, scoring='f1_weighted', n_jobs=4, verbose=1)
        clf.fit(X_train,y_train.ravel())

        X_val = scaler.transform(X_val)
        yhat[test_index] = clf.predict(X_val)

        acc[i] = metrics.accuracy_score(yhat[test_index], y_val)
        acc_norm[i] = normalized_accuracy(y_val, yhat[test_index])
        #We save the parameters with the ones we obtain better accuract
        if (acc_norm[i]>best_acc):
            best_acc = acc_norm[i]
            best_param = clf.best_params_
            
        print "\t" + str(clf.best_params_), acc[i]
        print
        i=i+1

    print 'Mean accuracy: '+ str(np.mean(acc))
    
    # Once we have the best parameters for the training dataset, we obtain the accuraacy for the test dataset with the best parameters obtained. 
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X)
    X_test = scaler.fit_transform(X_test)

    clf2 = svm.LinearSVC(loss=best_param['loss'], C=best_param['C'])

    clf2.fit(X_train, y)
    yhat = clf2.predict(X_test)

    acc = metrics.accuracy_score(yhat, y_test)
    acc_norm = normalized_accuracy(y_test, yhat)
    print best_param
    print '\tAccuracy:',acc, '\t Accuracy normalized:', acc_norm
    save_confusion_matrix(y_test, yhat, TITLE+'_'+name+'_'+name_task+'_'+TITLE+'_'+class_weight+'_'+str(round(acc,5)), 
                      title=str(name)+'_'+name_task, 
                      info='Acc.: '+ str(round(acc*100,2))+',  Acc. norm.:'+str(round(acc_norm*100,2)))


#### SVC

In [12]:
#SVC applying GridSearchCV. In param_grid, you can choose the parameters to do in GridSearch
def SVC_GridSearch(X, X_test, y, y_test, name_task, class_weight=''):
    name = 'SVC'
    print  name
    param_grid = [{'kernel': ['rbf'], 'gamma': [0.1, 0.001, 0.0001, 'auto'], 'C': [0.01, 0.025, 0.5, 1, 10, 100], 'class_weight':[class_weight]}]
    # KFold
    nfolds = 5
    kf = cross_validation.KFold(n=X.shape[0], n_folds=nfolds, shuffle=True, random_state=0)

    yhat = np.empty_like(y)
    acc = np.empty(nfolds)
    acc_norm = np.empty(nfolds)
    i = 0

    best_acc = 0

    for train_index, test_index in kf:
        X_train, X_val = X[train_index], X[test_index]
        y_train, y_val = y[train_index], y[test_index]

        print "Iteration KFold:", i+1,'/',nfolds

        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        # Standard parameters
        clf = SVC()

        # We can change the scoring "average_precision", "recall", "f1"
        clf = grid_search.GridSearchCV(clf, param_grid, scoring='f1_weighted', n_jobs=4, verbose=1)
        clf.fit(X_train,y_train.ravel())

        X_val = scaler.transform(X_val)
        yhat[test_index] = clf.predict(X_val)

        acc[i] = metrics.accuracy_score(yhat[test_index], y_val)
        acc_norm[i] = normalized_accuracy(y_val, yhat[test_index])
        #We save the parameters with the ones we obtain better accuract
        if (acc_norm[i]>best_acc):
            best_acc = acc_norm[i]
            best_param = clf.best_params_
            
        print "\t" + str(clf.best_params_), acc[i]
        print
        i=i+1

    print 'Mean accuracy: '+ str(np.mean(acc))
    
    # Once we have the best parameters for the training dataset, we obtain the accuraacy for the test dataset with the best parameters obtained. 
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X)
    X_test = scaler.fit_transform(X_test)

    #if (best_param['kernel']=='rbf'):
    clf2 = SVC(kernel=best_param['kernel'], gamma=best_param['gamma'], 
                                                   C=best_param['C'], class_weight=best_param['class_weight'])
    #else:
    #    clf2 = SVC(kernel=best_param['kernel'], C=best_param['C'])    
        
    clf2.fit(X_train, y)
    yhat = clf2.predict(X_test)

    acc = metrics.accuracy_score(yhat, y_test)
    acc_norm = normalized_accuracy(y_test, yhat)
    print best_param
    print '\tAccuracy:',acc, '\t Accuracy normalized:', acc_norm
    save_confusion_matrix(y_test, yhat, TITLE+'_'+name+'_'+name_task+'_'+TITLE+'_'+class_weight+'_'+str(round(acc,5)), 
                      title=str(name)+'_'+name_task, 
                      info='Acc.: '+ str(round(acc*100,2))+',  Acc. norm.:'+str(round(acc_norm*100,2)))


#### Linear SVC

In [3]:
#Linear SVC applying GridSearchCV. In param_grid, you can choose the parameters to do in GridSearch
def LinearSVC_GridSearch(X, X_test, y, y_test, name_task, class_weight):
    name = 'LinearSVC'
    print  name
    param_grid = [{'C': [0.001, 0.01, 0.1, 0.5, 1, 10, 100], 'loss': ['hinge', 'squared_hinge'], 'class_weight':[class_weight]}]
                 #{'C': [0.001, 0.01, 0.1, 0.5, 1, 10, 100, 1000], 'loss': ['hinge', 'squared_hinge'], 'class_weight':{0:1.0, 1:5582/96. ,2: 5582/138. , 3:1., 4:5582/1409,5: 1.0,6:5582/4173 }}]
    print class_weight
    # KFold
    nfolds = 3
    kf = cross_validation.KFold(n=X.shape[0], n_folds=nfolds, shuffle=True, random_state=0)

    yhat = np.empty_like(y)
    acc = np.empty(nfolds)
    acc_norm = np.empty(nfolds)
    i = 0

    best_acc = 0

    for train_index, test_index in kf:
        X_train, X_val = X[train_index], X[test_index]
        y_train, y_val = y[train_index], y[test_index]

        print "Iteration KFold:", i+1,'/',nfolds

        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        # Standard parameters
        clf = svm.LinearSVC()

        # We can change the scoring "average_precision", "recall", "f1"
        clf = grid_search.GridSearchCV(clf, param_grid, scoring='f1_weighted', n_jobs=3, verbose=1)
        clf.fit(X_train,y_train.ravel())

        X_val = scaler.transform(X_val)
        yhat[test_index] = clf.predict(X_val)

        acc[i] = metrics.accuracy_score(yhat[test_index], y_val)
        acc_norm[i] = normalized_accuracy(y_val, yhat[test_index])
        #We save the parameters with the ones we obtain better accuract
        if (acc_norm[i]>best_acc):
            best_acc = acc_norm[i]
            best_param = clf.best_params_
            
        print "\t" + str(clf.best_params_), acc[i]
        print
        i=i+1

    print 'Mean accuracy: '+ str(np.mean(acc))

    #If you want to directly do grid search and find the best parameters, and with those predict and see the accuracy of the test
    #dataset, run also this
    # Once we have the best parameters for the training dataset, we obtain the accuraacy for the test dataset with the best parameters obtained. 
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X)
    X_test = scaler.fit_transform(X_test)

    clf2 = svm.LinearSVC(loss=best_param['loss'], C=best_param['C'])

    clf2.fit(X_train, y)
    yhat = clf2.predict(X_test)

    acc = metrics.accuracy_score(yhat, y_test)
    acc_norm = normalized_accuracy(y_test, yhat)
    print best_param
    print '\tAccuracy:',acc, '\t Accuracy normalized:', acc_norm
    save_confusion_matrix(y_test, yhat, TITLE+'_'+name+'_'+name_task+'_'+TITLE+'_'+class_weight+'_'+str(round(acc,5)), 
                      title=str(name)+'_'+name_task, 
                      info='Acc.: '+ str(round(acc*100,2))+',  Acc. norm.:'+str(round(acc_norm*100,2)))


#### K-Neighbors

In [4]:
#KNN applying GridSearchCV. In param_grid, you can choose the parameters to do in GridSearch
def KNeighbors_GridSearch(X, X_test, y, y_test, name_task):
    name = 'KNeighbors'
    print  name
    #p=1 is manhattan distance, p=2 is euclidean_distance
    param_grid = [{'n_neighbors': [3, 5, 8, 10, 15, 20], 'metric': ['euclidean', 'chebyshev', 'minkowski'], 'weights':['distance']}]

    # KFold
    nfolds = 3
    kf = cross_validation.KFold(n=X.shape[0], n_folds=nfolds, shuffle=True, random_state=0)

    yhat = np.empty_like(y)
    acc = np.empty(nfolds)
    acc_norm = np.empty(nfolds)
    i = 0

    best_acc = 0

    for train_index, test_index in kf:
        X_train, X_val = X[train_index], X[test_index]
        y_train, y_val = y[train_index], y[test_index]

        print "Iteration KFold:", i+1,'/',nfolds

        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        # Standard parameters
        clf = KNeighborsClassifier(n_jobs=1)

        # We can change the scoring "average_precision", "recall", "f1"
        clf = grid_search.GridSearchCV(clf, param_grid, scoring='f1_weighted', n_jobs=4, verbose=1)
        clf.fit(X_train,y_train.ravel())

        X_val = scaler.transform(X_val)
        yhat[test_index] = clf.predict(X_val)

        acc[i] = metrics.accuracy_score(yhat[test_index], y_val)
        acc_norm[i] = normalized_accuracy(y_val, yhat[test_index])
        #We save the parameters with the ones we obtain better accuract
        if (acc_norm[i]>best_acc):
            best_acc = acc_norm[i]
            best_param = clf.best_params_
            
            
        print "\t" + str(clf.best_params_), acc[i]
        print
        i=i+1

    print 'Mean accuracy: '+ str(np.mean(acc))
      
    # Once we have the best parameters for the training dataset, we obtain the accuraacy for the test dataset with the best parameters obtained. 
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X)
    X_test = scaler.fit_transform(X_test)

    clf2 = KNeighborsClassifier(n_neighbors=best_param['n_neighbors'], 
                                               metric=best_param['metric'], weights=best_param['weights'])

    clf2.fit(X_train, y)
    yhat = clf2.predict(X_test)

    acc = metrics.accuracy_score(yhat, y_test)
    acc_norm = normalized_accuracy(y_test, yhat)
    print best_param
    print '\tAccuracy:',acc, '\t Accuracy normalized:', acc_norm
    save_confusion_matrix(y_test, yhat, TITLE+'_'+name+'_'+name_task+'_'+TITLE+'_'+str(round(acc,5)), 
                      title=str(name)+'_'+name_task, 
                      info='Acc.: '+ str(round(acc,2))+',  Acc. norm.:'+str(round(acc_norm,2)))


In [None]:
"""A different dataset
### Dataset Food related objects EGO
###food_related_objects_EGO
TITLE = 'Food_related_objects_EGO' 
DATASET_FOLDER = '../../data/datasets/food_related_objects_EGO'
WEIGHTS_NAME = 'weights/weights_85-30_85-45_25_1040.h5'
NUM_CLASSES = 2
NUM_TRAIN_IMG = 2610 # just to check if the dataset was correctly loaded
NUM_VAL_IMG = 1045   # just to check if the dataset was correctly loaded
NUM_TEST_IMG = 1597   # just to check if the dataset was correctly loaded
BATCH_SIZE = 32      # Depends on the available GPU memory
MULTITASK=False
##### GENERAL SETTINGS
TRAIN_TXT = DATASET_FOLDER+'/train.txt'
VAL_TXT = DATASET_FOLDER+'/val.txt'
TEST_TXT=DATASET_FOLDER+'/test.txt'
CLASSES = DATASET_FOLDER+'/classesID.txt'
OUTPUTS = 'outputs/'
WEIGHTS_PATH = OUTPUTS + WEIGHTS_NAME
NAME_SAVED_MODEL = OUTPUTS + 'weights/weights_'
CONF_MATRIX = OUTPUTS + 'confusion_matrices/cm_'
ACCU_LOSS = OUTPUTS + 'accuracy_loss/acc_loss_'
MODEL_INPUT_WIDTH = 299
MODEL_INPUT_HEIGHT = 299
SIZE = [MODEL_INPUT_WIDTH, MODEL_INPUT_HEIGHT]

X_train, X_val, X_test, y_train, y_val, y_test = get_data_model(True)
X_train = np.vstack((X_train, X_val))
y_train =  np.hstack((y_train, y_val))
accuracy_classifiers(X_train, X_test, y_train, y_test, linear_svc=True)
"""

In [13]:
"""
FOR DOING EXPERIMENTS (small dataset)
TITLE = 'multitask_test' 
DATASET_FOLDER = '../../data/datasets/Test_multitask'
WEIGHTS_NAME = 'weights/weights_86-73_86-73_e99_i8799_multitask.h5'
NUM_CLASSES = 7
NUM_TRAIN_IMG = 100 # just to check if the dataset was correctly loaded
NUM_VAL_IMG = 100   # just to check if the dataset was correctly loaded
NUM_TEST_IMG = 100   # just to check if the dataset was correctly loaded
BATCH_SIZE = 32      # Depends on the available GPU memory
MULTITASK = True
MULTITASK_LBL = ["Food", "Table", "Social"]

##### GENERAL SETTINGS
TRAIN_TXT = DATASET_FOLDER+'/train.txt'
VAL_TXT = DATASET_FOLDER+'/val.txt'
TEST_TXT=DATASET_FOLDER+'/test.txt'
CLASSES = DATASET_FOLDER+'/classesID.txt'
OUTPUTS = 'outputs/'
WEIGHTS_PATH = OUTPUTS + WEIGHTS_NAME
NAME_SAVED_MODEL = OUTPUTS + 'weights/weights_'
CONF_MATRIX = OUTPUTS + 'confusion_matrices/cm_'
ACCU_LOSS = OUTPUTS + 'accuracy_loss/acc_loss_'
MODEL_INPUT_WIDTH = 299
MODEL_INPUT_HEIGHT = 299
SIZE = [MODEL_INPUT_WIDTH, MODEL_INPUT_HEIGHT]
"""

## Dataset

Run one of the datasets or insert anotherone. You can decide which one to run.

#### Multitask Dataset

In [1]:
#All the dataset multitask
TITLE = 'multitask_CNN_' 
DATASET_FOLDER = '../../data/datasets/multitask_dataset'
WEIGHTS_NAME = 'weights/weights_70-19_72-67_42-58_e11_i5951_multitask_weighted_15.h5'
NUM_CLASSES = 7
NUM_TRAIN_IMG = 31708 # just to check if the dataset was correctly loaded
NUM_VAL_IMG = 6795   # just to check if the dataset was correctly loaded
NUM_TEST_IMG = 6794   # just to check if the dataset was correctly loaded

BATCH_SIZE = 16      # Depends on the available GPU memory
MULTITASK = True
MULTITASK_LBL = ["Food", "Table", "Social"]

##### GENERAL SETTINGS
TRAIN_TXT = DATASET_FOLDER+'/train.txt'
VAL_TXT = DATASET_FOLDER+'/val.txt'
TEST_TXT=DATASET_FOLDER+'/test.txt'
CLASSES = DATASET_FOLDER+'/classesID.txt'
OUTPUTS = 'outputs/'
WEIGHTS_PATH = OUTPUTS + WEIGHTS_NAME
NAME_SAVED_MODEL = OUTPUTS + 'weights/weights_'
CONF_MATRIX = OUTPUTS + 'confusion_matrices/cm_'
ACCU_LOSS = OUTPUTS + 'accuracy_loss/acc_loss_'
MODEL_INPUT_WIDTH = 299
MODEL_INPUT_HEIGHT = 299
SIZE = [MODEL_INPUT_WIDTH, MODEL_INPUT_HEIGHT]

#### Full dataset

In [None]:
TITLE = 'full_dataset' 
DATASET_FOLDER =  '../../data/datasets/full_dataset'
WEIGHTS_NAME = 'weights/weights_17-70_52-17_e5_i5945_full_dataset.h5'
NUM_CLASSES = 12
NUM_TRAIN_IMG = 31708 # just to check if the dataset was correctly loaded
NUM_VAL_IMG = 6795   # just to check if the dataset was correctly loaded
NUM_TEST_IMG = 6794   # just to check if the dataset was correctly loaded
BATCH_SIZE = 16      # Depends on the available GPU memory
MULTITASK = False

##### GENERAL SETTINGS
TRAIN_TXT = DATASET_FOLDER+'/train.txt'
VAL_TXT = DATASET_FOLDER+'/val.txt'
TEST_TXT=DATASET_FOLDER+'/test.txt'
CLASSES = DATASET_FOLDER+'/classesID.txt'
OUTPUTS = 'outputs/'
WEIGHTS_PATH = OUTPUTS + WEIGHTS_NAME
NAME_SAVED_MODEL = OUTPUTS + 'weights/weights_'
CONF_MATRIX = OUTPUTS + 'confusion_matrices/cm_'
ACCU_LOSS = OUTPUTS + 'accuracy_loss/acc_loss_'
MODEL_INPUT_WIDTH = 299
MODEL_INPUT_HEIGHT = 299
SIZE = [MODEL_INPUT_WIDTH, MODEL_INPUT_HEIGHT]

In [12]:
#Creating or loading the data (parameter load)
X_train, X_val, X_test, y_train, y_val, y_test = get_data_model(load=True)

In [13]:
X_train = np.vstack((X_train, X_val))
y_train =  np.vstack((y_train, y_val))

In [14]:
X_train.shape

(38503, 1024)

In [15]:
#If the dataset is Full dataset, don't run this
#Converting the binary matrix into a vector with the number of the class
y_train_food = to_one_vector(y_train, [0,3])
y_test_food = to_one_vector(y_test, [0,3])

y_train_social =  to_one_vector(y_train, [3,5])
y_test_social =  to_one_vector(y_test, [3,5])

y_train_table =  to_one_vector(y_train, [5,7])
y_test_table =  to_one_vector(y_test, [5,7])

## Some Results

##### Linear SVC

In [None]:
#Grid search for food (dataset multitask) using linear SVM
accuracy_classifiers(X_train, X_test, y_train_food, y_test_food, linear_svc=True, name_task='CNN_Food', class_weight='balanced')

LinearSVC
balanced
Iteration KFold: 1 / 5
Fitting 3 folds for each of 14 candidates, totalling 42 fits


[Parallel(n_jobs=4)]: Done  42 out of  42 | elapsed: 23.0min finished


	{'loss': 'hinge', 'C': 10, 'class_weight': 'balanced'} 0.933125568108

Iteration KFold: 2 / 5
Fitting 3 folds for each of 14 candidates, totalling 42 fits


[Parallel(n_jobs=4)]: Done  42 out of  42 | elapsed: 24.3min finished


	{'loss': 'squared_hinge', 'C': 10, 'class_weight': 'balanced'} 0.942215296715

Iteration KFold: 3 / 5
Fitting 3 folds for each of 14 candidates, totalling 42 fits


[Parallel(n_jobs=4)]: Done  42 out of  42 | elapsed: 24.8min finished


	{'loss': 'squared_hinge', 'C': 1, 'class_weight': 'balanced'} 0.932346448513

Iteration KFold: 4 / 5
Fitting 3 folds for each of 14 candidates, totalling 42 fits


[Parallel(n_jobs=4)]: Done  42 out of  42 | elapsed: 24.3min finished


### K-NN

In [None]:
#KNN with Grid Search to find the best parameters for multitask social
accuracy_classifiers(X_train, X_test, y_train_social, y_test_social, k_neighbors=True, name_task='CNN_Social')

KNeighbors
Iteration KFold: 1 / 5
Fitting 3 folds for each of 32 candidates, totalling 96 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed: 16.3min
[Parallel(n_jobs=4)]: Done  96 out of  96 | elapsed: 31.0min finished


	{'n_neighbors': 10, 'metric': 'euclidean', 'weights': 'distance'} 0.708868978055

Iteration KFold: 2 / 5
Fitting 3 folds for each of 32 candidates, totalling 96 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed: 25.2min
[Parallel(n_jobs=4)]: Done  96 out of  96 | elapsed: 45.0min finished


	{'n_neighbors': 10, 'metric': 'chebyshev', 'weights': 'distance'} 0.700298662511

Iteration KFold: 3 / 5
Fitting 3 folds for each of 32 candidates, totalling 96 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed: 24.7min
[Parallel(n_jobs=4)]: Done  96 out of  96 | elapsed: 43.2min finished


### Gradient Boosting

In [16]:
accuracy_classifiers(X_train, X_test, y_train_food, y_test_food, gradient_boosting=True, name_task='CNN_Food')

GradientBoosting
Iteration KFold: 1 / 3
Fitting 3 folds for each of 12 candidates, totalling 36 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=4)]: Done  36 out of  36 | elapsed: 53.1min finished


	{'n_estimators': 100, 'learning_rate': 0.1, 'min_samples_leaf': 5} 0.943202181535

Iteration KFold: 2 / 3
Fitting 3 folds for each of 12 candidates, totalling 36 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=4)]: Done  36 out of  36 | elapsed: 67.7min finished


	{'n_estimators': 100, 'learning_rate': 0.1, 'min_samples_leaf': 5} 0.941639395356

Iteration KFold: 3 / 3
Fitting 3 folds for each of 12 candidates, totalling 36 fits


KeyboardInterrupt: 

## Summary with the best parameters of the classifiers

In [5]:
def linear_SVM_predict(X, X_test, y, y_test, name_task, class_weight):
    name='linear_SVM'
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X)
    X_test = scaler.fit_transform(X_test)

    clf2 = svm.LinearSVC(loss='squared_hinge', C=0.01, class_weight='balanced')

    clf2.fit(X_train, y)
    yhat = clf2.predict(X_test)

    acc = metrics.accuracy_score(yhat, y_test)
    acc_norm = normalized_accuracy(y_test, yhat)
    title=TITLE+'_'+name+'_'+name_task+'_'+TITLE+'_'+class_weight+'_'+str(round(acc,5))
    np.save(title+"_real", y_test)
    np.save(title+"_predicted", yhat)
    print '\tAccuracy:',acc, '\t Accuracy normalized:', acc_norm
    save_confusion_matrix(y_test, yhat, title, 
                      title=str(name)+'_'+name_task, 
                      info='Acc.: '+ str(round(acc*100,2))+',  Acc. norm.:'+str(round(acc_norm*100,2)))

def knn_predict(X, X_test, y, y_test, name_task, class_weight):
    name='knn'
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X)
    X_test = scaler.fit_transform(X_test)

    clf2 = KNeighborsClassifier(n_neighbors=5, metric='euclidean', weights='distance')
    clf2.fit(X_train, y)
    yhat = clf2.predict(X_test)

    acc = metrics.accuracy_score(yhat, y_test)
    acc_norm = normalized_accuracy(y_test, yhat)
    title=TITLE+'_'+name+'_'+name_task+'_'+TITLE+'_'+class_weight+'_'+str(round(acc,5))
    np.save(title+"_real", y_test)
    np.save(title+"_predicted", yhat)
    print '\tAccuracy:',acc, '\t Accuracy normalized:', acc_norm
    save_confusion_matrix(y_test, yhat, title, 
                      title=str(name)+'_'+name_task, 
                      info='Acc.: '+ str(round(acc*100,2))+',  Acc. norm.:'+str(round(acc_norm*100,2)))

def gradient_predict(X, X_test, y, y_test, name_task, class_weight):
    name='gradient_boosting'
    # Once we have the best parameters for the training dataset, we obtain the accuraacy for the test dataset with the best parameters obtained. 
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X)
    X_test = scaler.fit_transform(X_test)

    clf2 = ensemble.GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, 
                                              min_samples_leaf=5)
       
    clf2.fit(X_train, y)
    yhat = clf2.predict(X_test)

    acc = metrics.accuracy_score(yhat, y_test)
    acc_norm = normalized_accuracy(y_test, yhat)
    title=TITLE+'_'+name+'_'+name_task+'_'+TITLE+'_'+class_weight+'_'+str(round(acc,5))
    np.save(title+"_real", y_test)
    np.save(title+"_predicted", yhat)
    print '\tAccuracy:',acc, '\t Accuracy normalized:', acc_norm
    save_confusion_matrix(y_test, yhat, title, 
                      title=str(name)+'_'+name_task, 
                      info='Acc.: '+ str(round(acc*100,2))+',  Acc. norm.:'+str(round(acc_norm*100,2)))


### Multitask Dataset

#### Food

In [52]:
#n_neighbors 10, metric euclidean, weights distance
knn_predict(X_train, X_test, y_train_food, y_test_food, name_task='CNN_food', class_weight='')

	Accuracy: 0.940682955549 	 Accuracy normalized: 0.510606518063
	Precision: 0.922883108749 	 Recall: 0.940682955549
	F1 score: 0.926955460564


In [25]:
linear_SVM_predict(X_train, X_test, y_train_food, y_test_food, name_task='CNN_Food', class_weight='balanced')
#loss:hinge, C=10

	Accuracy: 0.940094200765 	 Accuracy normalized: 0.604797363141
	Precision: 0.932518476166 	 Recall: 0.940094200765
	F1 score: 0.935845050466


In [40]:
#n_estimators=100, learningrate=0.1, min samplesleaf=5
gradient_predict(X_train, X_test, y_train_food, y_test_food, name_task='CNN_food', class_weight='')

	Accuracy: 0.942302031204 	 Accuracy normalized: 0.486195608494
	Precision: 0.930131735077 	 Recall: 0.942302031204
	F1 score: 0.924913528735


#### Social

In [50]:
#n_neighbors 15, metric euclidean, weights distance
knn_predict(X_train, X_test, y_train_social, y_test_social, name_task='CNN_social', class_weight='')

	Accuracy: 0.706358551663 	 Accuracy normalized: 0.673813299174
	Precision: 0.700187136289 	 Recall: 0.706358551663
	F1 score: 0.700490703491


In [35]:
linear_SVM_predict(X_train, X_test, y_train_social, y_test_social, name_task='CNN_social', class_weight='balanced')
#loss: hinge, C=1

	Accuracy: 0.74065351781 	 Accuracy normalized: 0.731517249839
	Precision: 0.744341248176 	 Recall: 0.74065351781
	F1 score: 0.742053358226


In [41]:
#n_estimators=100, learningrate=0.1, min samplesleaf=5
gradient_predict(X_train, X_test, y_train_social, y_test_social, name_task='CNN_social', class_weight='')

	Accuracy: 0.700471003827 	 Accuracy normalized: 0.657010861582
	Precision: 0.692999482391 	 Recall: 0.700471003827
	F1 score: 0.688554313675


#### Table

In [43]:
#n_neighbors 10, metric euclidean, weights distance
knn_predict(X_train, X_test, y_train_table, y_test_table, name_task='CNN_table', class_weight='')

	Accuracy: 0.795849278775 	 Accuracy normalized: 0.743611947142
	Precision: 0.792085107276 	 Recall: 0.795849278775
	F1 score: 0.793587797202


In [30]:
linear_SVM_predict(X_train, X_test, y_train_table, y_test_table, name_task='CNN_table', class_weight='balanced')
#loss= hinge, C=1

	Accuracy: 0.819546658817 	 Accuracy normalized: 0.821890316539
	Precision: 0.840727062236 	 Recall: 0.819546658817
	F1 score: 0.824909222874


In [39]:
#n_estimators=100, learningrate=0.1, min samplesleaf=5
gradient_predict(X_train, X_test, y_train_table, y_test_table, name_task='CNN_table', class_weight='')

	Accuracy: 0.794524580512 	 Accuracy normalized: 0.7091399054
	Precision: 0.785617395649 	 Recall: 0.794524580512
	F1 score: 0.782646237233


### Full Dataset

In [41]:
X_train, X_val, X_test, y_train, y_val, y_test = get_data_model(load=True)


In [42]:
X_train = np.vstack((X_train, X_val))
y_train =  np.hstack((y_train, y_val))
y_train.shape

(38503,)

In [None]:
#n_estimators=100, learningrate=0.1, min samplesleaf=5
gradient_predict(X_train, X_test, y_train, y_test, name_task='CNN', class_weight='')

In [None]:
#n_neighbors 5, metric euclidean, weights distance
knn_predict(X_train, X_test, y_train, y_test, name_task='CNN', class_weight='')

In [None]:
linear_SVM_predict(X_train, X_test, y_train, y_test, name_task='CNN', class_weight='balanced')
#{'loss': 'squared_hinge', 'C': 0.01, 'class_weight': ''}