# Applying Traditional Classifiers using PCA

In this notebook we load all the images and apply PCA to them (there are many images, so we use Incremental PCA, that allow us to deal with so many images). Once we have a tractable amount of data, we proceed to apply classifiers. 

GridSearch is used to find the best hyperparameters of the classifiers. We tried the floowing models: Knn, SVC, SVC Linear (which takes a more reasonable time than SVC RBF), Gradient Boosting, Random Forest and Adaboost. 

In [4]:
import logging
import sys
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# import our scripts
SCRIPTS_PATH = '/../../scripts'
sys.path.insert(0, os.getcwd()+SCRIPTS_PATH)
#print sys.path
from dataset import Dataset

from sklearn.metrics import confusion_matrix
from sklearn import svm
from sklearn import cross_validation
from sklearn import metrics
from sklearn import grid_search
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn import ensemble
from sklearn.metrics import f1_score

from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import precision_recall_fscore_support
import itertools



In [5]:
#Save the confusion matrix, with a lot of information in it (Accuracy, Accuracy normalized, Precision, Recall, F1)
def save_confusion_matrix(y_true, y_pred, name_to_save, title='Confusion matrix ', info=''):

    # create the confusion matrix and normalize it
    cm = 1.0*confusion_matrix(y_true, y_pred, labels=range(len(set(y_true))))
    cm_norm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    inf = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    print '\tPrecision:', inf[0],'\t Recall:', inf[1]
    print "\t" +'F1 score:', inf[2]
    # prepare the plot
    fig = plt.figure()
    plt.matshow(cm_norm)
    tick_marks = np.arange(cm_norm.shape[0])
    plt.xticks(tick_marks, rotation=45)
    plt.yticks(tick_marks)
    if (MULTITASK):
        for i, j in itertools.product(range(cm_norm.shape[0]), range(cm_norm.shape[1])):
            """
            plt.text(j, i, str(int(cm[i,j]))+' / '+str(int(sum(cm[i]))),
                     horizontalalignment="center",
                     color="white" if cm_norm[i, j] < 0.5 else "black")
            """
            plt.text(j, i, str(round(cm_norm[i,j]*100, 2))+'%',
                     horizontalalignment="center",
                     color="white" if cm_norm[i, j] < 0.5 else "black")
    plt.title(title)
    plt.colorbar()
    plt.ylabel('True label')
    plt.xlabel('Predicted label \n '+info+'\n F1 score: '+str(round(inf[2],2))+'\n Precision: '+str(round(inf[0], 2))+', Recall: '+str(round(inf[1], 2)))

    # save the figure and close so it never gets displayed
    plt.savefig(CONF_MATRIX+name_to_save+'.png')
    plt.close(fig)

In [6]:
def preprocess_input(x):
    x = x/255.
    x = x - 0.5
    x = x*2.
    return x

In [7]:
#Extract the images from the given path, and returns the information of the image and the true label
def get_data(val_ds):
    #We want to obtain the intermediate layer, to apply Classifiers to them. In layer_output we append the layer from all the batches
    #to obtain the intermediate layer from the given dataset and not just from a batch
    layer_output = []
    label=[]
    #In label we obtain the y_true from all the given dataset
    num_validations = val_ds.num_batches()
    for iter_val in range(num_validations):
        print 'iter_val: ', iter_val+1, 'out of: ', val_ds.num_batches()
        # Get new batch and preprocess the image according to the model
        X, y = val_ds.get_Xy(SIZE)
        X = preprocess_input(X)
        layer_output.append(X)
        label+=y
    return layer_output, label

In [8]:
# Creating or loading the data
def get_data_model(load=False):
    if (load):
        #Load files
        #X_train = np.load('npdata/X_train_noCNN_'+TITLE+'.npy')
        y_train = np.load('npdata/y_train_noCNN_'+TITLE+'.npy')
        #X_val = np.load('npdata/X_val_noCNN_'+TITLE+'.npy')
        y_val = np.load('npdata/y_val_noCNN_'+TITLE+'.npy')
        #X_test = np.load('npdata/X_test_noCNN_comp'+TITLE+'.npy')
        y_test = np.load('npdata/y_test_noCNN_comp'+TITLE+'.npy')
    else:
        
        # ------------------- Data
        # prepare the training, validation and test dataset
        train_ds = Dataset(TRAIN_TXT, NUM_TRAIN_IMG, BATCH_SIZE, CLASSES, 
                           data_augmentation=False, multitask = MULTITASK)
        val_ds = Dataset(VAL_TXT, NUM_VAL_IMG, BATCH_SIZE, CLASSES, 
                         data_augmentation=False, multitask = MULTITASK)
        
        test_ds = Dataset(TEST_TXT, NUM_TEST_IMG, BATCH_SIZE, CLASSES, 
                         data_augmentation=False, multitask = MULTITASK)

        
        #Training
        X_train, y_train = get_data(train_ds)
        X_train = np.concatenate(X_train)
        y_train = np.asarray(y_train)

        #Validation
        X_val, y_val = get_data(val_ds)
        X_val = np.concatenate(X_val)
        y_val = np.asarray(y_val)
        
        #Test
        X_test, y_test = get_data(test_ds)
        X_test = np.concatenate(X_test)
        y_test = np.asarray(y_test)
        
        ## Save training and test dataset
        np.save('npdata/X_train_noCNN_'+TITLE, X_train)
        np.save('npdata/y_train_noCNN_'+TITLE, y_train)
        np.save('npdata/X_val_noCNN_'+TITLE, X_val)
        np.save('npdata/y_val_noCNN_'+TITLE, y_val)
        
        np.save('npdata/X_test_noCNN_comp'+TITLE, X_test)
        np.save('npdata/y_test_noCNN_comp'+TITLE, y_test)

    #return X_train, X_val, X_test, y_train, y_val, y_test
    return y_train, y_val, y_test

#Select which classifier to use. 
def accuracy_classifiers(X_train, X_test, y_train, y_test, not_grid_search=False,
                         adaboost=False, gradient_boosting=False, random_forest=False,
                         svc=False, linear_svc=False, k_neighbors=False, name_task='', class_weight=''):  
        if (adaboost):
            Adaboost_GridSearch(X_train, X_test, y_train, y_test, name_task=name_task)
        if (gradient_boosting):
            GradientBoosting_GridSearch(X_train, X_test, y_train, y_test, name_task=name_task)
        if(random_forest):
            RandomForest_GridSearch(X_train, X_test, y_train, y_test, name_task=name_task)
        if(svc):
            SVC_GridSearch(X_train, X_test, y_train, y_test, name_task=name_task, class_weight=class_weight)
        if(linear_svc):
            LinearSVC_GridSearch(X_train, X_test, y_train, y_test, name_task=name_task, class_weight=class_weight)
        if(k_neighbors):
            KNeighbors_GridSearch(X_train, X_test, y_train, y_test, name_task=name_task)
            
        if (not_grid_search):
            otherClassifiers(X_train, X_test, y_train, y_test, name_task=name_task)
        

In [9]:
#Computing accuracy with different classifiers. Without GridSearch run some classifiers, with default parameters.
#Save a an image for each classifier with the confusion matrix and information
def otherClassifiers(X_train, X_test, y_train, y_test, grid_search=False, name_task=''):
    names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", 
         "Decision Tree", "Random Forest", "AdaBoost",
         "Naive Bayes", "Gradient Boosting"]
    classifiers = [
        KNeighborsClassifier(3),
        SVC(kernel="linear", C=0.025),
        SVC(gamma=2, C=1),
        DecisionTreeClassifier(),
        RandomForestClassifier(n_estimators=10, max_features=1),
        AdaBoostClassifier(),
        GaussianNB(),
        GradientBoostingClassifier()]
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    for name, clf in zip(names, classifiers):    
        # Standard parameter
        clf.fit(X_train, y_train)
        yhat = clf.predict(X_test)
        # Recall, f1, precision
        acc = metrics.accuracy_score(yhat, y_test)
        acc_norm = normalized_accuracy(y_test, yhat)
        print '\n \n',name
        print '\tAccuracy:',acc, '\t Accuracy normalized:', acc_norm
        save_confusion_matrix(y_test, yhat, TITLE+'_'+name+'_'+name_task+'_'+TITLE+'_noGrid_'+str(round(acc,5)), 
                          title=str(name)+' '+name_task, 
                          info='Acc.: '+ str(round(acc*100,2))+',  Acc. norm.:'+str(round(acc_norm*100,2)))
        

In [10]:
#Computes de normalized accuracy. Given the true label and the y predicted, returns the normalized accuracy
def normalized_accuracy(y_true, y_pred):
    cm = 1.0*confusion_matrix(y_true, y_pred, labels=range(len(set(y_true))))
    acc = []
    for i, row in enumerate(cm):
        acc.append(row[i]/sum(row))
    return sum(acc)/len(acc)

In [11]:
#Given a binary matrix with labels (ex. [[1,0,0], [0, 0, 1]]) returns correspondent vector no binary ([0,2])
def to_one_vector(y, pos):
    return np.asarray([lbl[pos[0]:pos[1]].argmax() for lbl in y])    
    

In [12]:
#KNeighbors Classifier applying GridSearchCV. In param_grid, you can choose the parameters to do in GridSearch
def KNeighbors_GridSearch(X, X_test, y, y_test, name_task):
    name = 'KNeighbors'
    print  name
    #p=1 is manhattan distance, p=2 is euclidean_distance
    param_grid = [{'n_neighbors': [3, 5, 10, 100], 'metric': ['euclidean', 'manhattan','chebyshev', 'minkowski'], 'weights':['uniform','distance']}]

    # KFold
    nfolds = 5
    kf = cross_validation.KFold(n=X.shape[0], n_folds=nfolds, shuffle=True, random_state=0)

    yhat = np.empty_like(y)
    acc = np.empty(nfolds)
    acc_norm = np.empty(nfolds)
    i = 0

    best_acc = 0

    for train_index, test_index in kf:
        X_train, X_val = X[train_index], X[test_index]
        y_train, y_val = y[train_index], y[test_index]

        print "Iteration KFold:", i+1,'/',nfolds

        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        # Standard parameters
        clf = KNeighborsClassifier()

        # We can change the scoring "average_precision", "recall", "f1"
        clf = grid_search.GridSearchCV(clf, param_grid, scoring='f1_weighted', n_jobs=4, verbose=1)
        clf.fit(X_train,y_train.ravel())

        X_val = scaler.transform(X_val)
        yhat[test_index] = clf.predict(X_val)

        acc[i] = metrics.accuracy_score(yhat[test_index], y_val)
        acc_norm[i] = normalized_accuracy(y_val, yhat[test_index])
        #We save the parameters with the ones we obtain better accuract
        if (acc_norm[i]>best_acc):
            best_acc = acc_norm[i]
            best_param = clf.best_params_
            
            
        print "\t" + str(clf.best_params_), acc[i]
        print
        i=i+1

    print 'Mean accuracy: '+ str(np.mean(acc))
    
    # Once we have the best parameters for the training dataset, we obtain the accuraacy for the test dataset with the best parameters obtained. 
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X)
    X_test = scaler.fit_transform(X_test)

    clf2 = KNeighborsClassifier(n_neighbors=best_param['n_neighbors'], 
                                               metric=best_param['metric'], weights=best_param['weights'])

    clf2.fit(X_train, y)
    yhat = clf2.predict(X_test)

    acc = metrics.accuracy_score(yhat, y_test)
    acc_norm = normalized_accuracy(y_test, yhat)
    print best_param
    print '\tAccuracy:',acc, '\t Accuracy normalized:', acc_norm
    save_confusion_matrix(y_test, yhat, TITLE+'_'+name+'_'+name_task+'_'+TITLE+'_'+str(round(acc,5)), 
                      title=str(name)+'_'+name_task, 
                      info='Acc.: '+ str(round(acc,2))+',  Acc. norm.:'+str(round(acc_norm,2)))

In [13]:
#Linear SVC applying GridSearchCV. In param_grid, you can choose the parameters to do in GridSearch
def LinearSVC_GridSearch(X, X_test, y, y_test, name_task, class_weight):
    name = 'LinearSVC'
    print  name
    param_grid = [{'C': [0.001, 0.01, 0.1, 0.5, 1, 10, 100], 'loss': ['hinge', 'squared_hinge'], 'class_weight':[class_weight]}]
    # KFold
    nfolds = 5
    kf = cross_validation.KFold(n=X.shape[0], n_folds=nfolds, shuffle=True, random_state=0)

    yhat = np.empty_like(y)
    acc = np.empty(nfolds)
    acc_norm = np.empty(nfolds)
    i = 0

    best_acc = 0

    for train_index, test_index in kf:
        X_train, X_val = X[train_index], X[test_index]
        y_train, y_val = y[train_index], y[test_index]

        print "Iteration KFold:", i+1,'/',nfolds

        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        # Standard parameters
        clf = svm.LinearSVC()

        # We can change the scoring "average_precision", "recall", "f1"
        clf = grid_search.GridSearchCV(clf, param_grid, scoring='f1_weighted', n_jobs=4, verbose=1)
        clf.fit(X_train,y_train.ravel())

        X_val = scaler.transform(X_val)
        yhat[test_index] = clf.predict(X_val)

        acc[i] = metrics.accuracy_score(yhat[test_index], y_val)
        acc_norm[i] = normalized_accuracy(y_val, yhat[test_index])
        #We save the parameters with the ones we obtain better accuract
        if (acc_norm[i]>best_acc):
            best_acc = acc_norm[i]
            best_param = clf.best_params_
            
        print "\t" + str(clf.best_params_), acc[i]
        print
        i=i+1

    print 'Mean accuracy: '+ str(np.mean(acc))
    
    # Once we have the best parameters for the training dataset, we obtain the accuraacy for the test dataset with the best parameters obtained. 
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X)
    X_test = scaler.fit_transform(X_test)

    clf2 = svm.LinearSVC(loss=best_param['loss'], C=best_param['C'])

    clf2.fit(X_train, y)
    yhat = clf2.predict(X_test)

    acc = metrics.accuracy_score(yhat, y_test)
    acc_norm = normalized_accuracy(y_test, yhat)
    print best_param
    print '\tAccuracy:',acc, '\t Accuracy normalized:', acc_norm
    save_confusion_matrix(y_test, yhat, TITLE+'_'+name+'_'+name_task+'_'+TITLE+'_'+class_weight+'_'+str(round(acc,5)), 
                      title=str(name)+'_'+name_task, 
                      info='Acc.: '+ str(round(acc*100,2))+',  Acc. norm.:'+str(round(acc_norm*100,2)))


In [14]:
#Gradient Boosting applying GridSearchCV. In param_grid, you can choose the parameters to do in GridSearch
def GradientBoosting_GridSearch(X, X_test, y, y_test, name_task):
    name = 'GradientBoosting'
    print  name
    param_grid = {'n_estimators':[10, 50, 100], 'learning_rate': [0.1, 0.01],
                    #'max_depth': [3, 4, 6],
                    'min_samples_leaf': [1, 3, 5]
                    # 'max_features': [1.0, 0.3, 0.1]
                    }

    # KFold
    nfolds = 2
    kf = cross_validation.KFold(n=X.shape[0], n_folds=nfolds, shuffle=True, random_state=0)

    yhat = np.empty_like(y)
    acc = np.empty(nfolds)
    acc_norm = np.empty(nfolds)
    i = 0

    best_acc = 0

    for train_index, test_index in kf:
        X_train, X_val = X[train_index], X[test_index]
        y_train, y_val = y[train_index], y[test_index]

        print "Iteration KFold:", i+1,'/',nfolds

        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        # Standard parameters
        clf = ensemble.GradientBoostingClassifier()

        # We can change the scoring "average_precision", "recall", "f1"
        clf = grid_search.GridSearchCV(clf, param_grid, scoring='f1_weighted', n_jobs=4, verbose=1)
        clf.fit(X_train,y_train.ravel())

        X_val = scaler.transform(X_val)
        yhat[test_index] = clf.predict(X_val)

        acc[i] = metrics.accuracy_score(yhat[test_index], y_val)
        acc_norm[i] = normalized_accuracy(y_val, yhat[test_index])
        #We save the parameters with the ones we obtain better accuract
        if (acc_norm[i]>best_acc):
            best_acc = acc_norm[i]
            best_param = clf.best_params_
            
        print "\t" + str(clf.best_params_), acc[i]
        print
        i=i+1

    print 'Mean accuracy: '+ str(np.mean(acc))
    
    # Once we have the best parameters for the training dataset, we obtain the accuraacy for the test dataset with the best parameters obtained. 
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X)
    X_test = scaler.fit_transform(X_test)

    clf2 = ensemble.GradientBoostingClassifier(n_estimators=best_param['n_estimators'], learning_rate=best_param['learning_rate'], 
                                              min_samples_leaf=best_param['min_samples_leaf'])
       
    clf2.fit(X_train, y)
    yhat = clf2.predict(X_test)

    acc = metrics.accuracy_score(yhat, y_test)
    acc_norm = normalized_accuracy(y_test, yhat)
    print best_param
    print '\tAccuracy:',acc, '\t Accuracy normalized:', acc_norm
    save_confusion_matrix(y_test, yhat, TITLE+'_'+name+'_'+name_task+'_'+TITLE+'_'+'_'+str(round(acc,5)), 
                      title=str(name)+'_'+name_task, 
                      info='Acc.: '+ str(round(acc*100,2))+',  Acc. norm.:'+str(round(acc_norm*100,2)))


In [15]:
#SVC applying GridSearchCV. In param_grid, you can choose the parameters to do in GridSearch
def SVC_GridSearch(X, X_test, y, y_test, name_task, class_weight=''):
    name = 'SVC'
    print  name
    param_grid = [{'kernel': ['rbf'], 'gamma': [0.1, 0.001, 0.0001, 'auto'], 'C': [0.01, 0.025, 0.5, 1, 10, 100], 'class_weight':[class_weight]}]
    # KFold
    nfolds = 5
    kf = cross_validation.KFold(n=X.shape[0], n_folds=nfolds, shuffle=True, random_state=0)

    yhat = np.empty_like(y)
    acc = np.empty(nfolds)
    acc_norm = np.empty(nfolds)
    i = 0

    best_acc = 0

    for train_index, test_index in kf:
        X_train, X_val = X[train_index], X[test_index]
        y_train, y_val = y[train_index], y[test_index]

        print "Iteration KFold:", i+1,'/',nfolds

        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        # Standard parameters
        clf = SVC()

        # We can change the scoring "average_precision", "recall", "f1"
        clf = grid_search.GridSearchCV(clf, param_grid, scoring='f1_weighted', n_jobs=4, verbose=1)
        clf.fit(X_train,y_train.ravel())

        X_val = scaler.transform(X_val)
        yhat[test_index] = clf.predict(X_val)

        acc[i] = metrics.accuracy_score(yhat[test_index], y_val)
        acc_norm[i] = normalized_accuracy(y_val, yhat[test_index])
        #We save the parameters with the ones we obtain better accuract
        if (acc_norm[i]>best_acc):
            best_acc = acc_norm[i]
            best_param = clf.best_params_
            
        print "\t" + str(clf.best_params_), acc[i]
        print
        i=i+1

    print 'Mean accuracy: '+ str(np.mean(acc))
    
    # Once we have the best parameters for the training dataset, we obtain the accuraacy for the test dataset with the best parameters obtained. 
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X)
    X_test = scaler.fit_transform(X_test)

    #if (best_param['kernel']=='rbf'):
    clf2 = SVC(kernel=best_param['kernel'], gamma=best_param['gamma'], 
                                                   C=best_param['C'], class_weight=best_param['class_weight'])
    #else:
    #    clf2 = SVC(kernel=best_param['kernel'], C=best_param['C'])    
        
    clf2.fit(X_train, y)
    yhat = clf2.predict(X_test)

    acc = metrics.accuracy_score(yhat, y_test)
    acc_norm = normalized_accuracy(y_test, yhat)
    print best_param
    print '\tAccuracy:',acc, '\t Accuracy normalized:', acc_norm
    save_confusion_matrix(y_test, yhat, TITLE+'_'+name+'_'+name_task+'_'+TITLE+'_'+class_weight+'_'+str(round(acc,5)), 
                      title=str(name)+'_'+name_task, 
                      info='Acc.: '+ str(round(acc*100,2))+',  Acc. norm.:'+str(round(acc_norm*100,2)))


## Dataset

Run one of the datasets or insert anotherone. You can decide which one to run.

#### Multitask Dataset

In [62]:
#All the dataset multitask
TITLE = 'multitask' 
DATASET_FOLDER = '../../data/datasets/tiny_ds/multitask_dataset'
WEIGHTS_NAME = 'weights/weights_85-30_85-45_25_1040.h5'
NUM_CLASSES = 7
NUM_TRAIN_IMG = 5582 # just to check if the dataset was correctly loaded
NUM_VAL_IMG = 1196   # just to check if the dataset was correctly loaded
#NUM_TEST_IMG = 1195   # just to check if the dataset was correctly loaded
NUM_TEST_IMG = 6794
BATCH_SIZE = 64      # Depends on the available GPU memory
MULTITASK = True
MULTITASK_LBL = ["Food", "Table", "Social"]

##### GENERAL SETTINGS
TRAIN_TXT = DATASET_FOLDER+'/train.txt'
VAL_TXT = DATASET_FOLDER+'/val.txt'
TEST_TXT= '../../data/datasets/multitask_dataset/test.txt'


CLASSES = DATASET_FOLDER+'/classesID.txt'
OUTPUTS = 'outputs/'
WEIGHTS_PATH = OUTPUTS + WEIGHTS_NAME
NAME_SAVED_MODEL = OUTPUTS + 'weights/weights_'
CONF_MATRIX = OUTPUTS + 'confusion_matrices/cm_'
ACCU_LOSS = OUTPUTS + 'accuracy_loss/acc_loss_'
MODEL_INPUT_WIDTH = 299
MODEL_INPUT_HEIGHT = 299
SIZE = [MODEL_INPUT_WIDTH, MODEL_INPUT_HEIGHT]

##### Full Dataset

In [None]:
#All the dataset multitask
TITLE = 'full_dataset' 
DATASET_FOLDER = '../../data/datasets/tiny_ds/full_dataset'
NUM_CLASSES = 12
NUM_TRAIN_IMG = 5582 # just to check if the dataset was correctly loaded
NUM_VAL_IMG = 1196   # just to check if the dataset was correctly loaded
#NUM_TEST_IMG = 1195   # just to check if the dataset was correctly loaded
NUM_TEST_IMG = 6794
BATCH_SIZE = 64      # Depends on the available GPU memory
MULTITASK = False

##### GENERAL SETTINGS
TRAIN_TXT = DATASET_FOLDER+'/train.txt'
VAL_TXT = DATASET_FOLDER+'/val.txt'
TEST_TXT= '../../data/datasets/full_dataset/test.txt'


CLASSES = DATASET_FOLDER+'/classesID.txt'
OUTPUTS = 'outputs/'
WEIGHTS_PATH = OUTPUTS + WEIGHTS_NAME
NAME_SAVED_MODEL = OUTPUTS + 'weights/weights_'
CONF_MATRIX = OUTPUTS + 'confusion_matrices/cm_'
ACCU_LOSS = OUTPUTS + 'accuracy_loss/acc_loss_'
MODEL_INPUT_WIDTH = 299
MODEL_INPUT_HEIGHT = 299
SIZE = [MODEL_INPUT_WIDTH, MODEL_INPUT_HEIGHT]

In [None]:
#Loading or creating the data
X_train, X_val, X_test, y_train, y_val, y_test = get_data_model(load=True)
X_train = np.vstack((X_train, X_val))

In [12]:
# Computing the all image RGB is too much, so we take the channel R
X_train = X_train[:, :, :, 0]
X_train= X_train.reshape(X_train.shape[0],-1)
X_test = X_test[:, :, :, 0]
X_test = X_test.reshape(X_test.shape[0],-1)
y_train =  np.vstack((y_train, y_val))

In [17]:
#Run this if you don't want to run the X_train and X_test. (You want to load directly the X after doing PCA)
y_train, y_val, y_test = get_data_model(load=True)
y_train =  np.vstack((y_train, y_val))


### Incremental PCA

If we use the standard PCA, we obtain memory error. So we use Incremental PCA, 

In [14]:
from sklearn.decomposition import PCA, IncrementalPCA

In [20]:
#initial shape before applying PCA
X_train.shape

(6778, 89401)

In [21]:
ipca = IncrementalPCA(n_components=250)

In [22]:
# Fitting the data with the training dataset
fit_pca = ipca.fit(X_train)

In [23]:
#Checking that the percentage of variance explained by each of the
#selected components is over 98%
sum(fit_pca.explained_variance_ratio_)

0.95029825238972521

In [24]:
# applying dimensionality reduction to training dataset
X_train = fit_pca.transform(X_train)
# Observing the shape of the new training dataset 
X_train.shape
#Saving the data after applying PCA
np.save('npdata/X_train_PCA'+TITLE, X_train)

In [25]:
# applying dimensionaliity reduction to test dataset
X_test =  fit_pca.transform(X_test)
# Observing the shape of the new test dataset 
X_test.shape
#Saving the data after applying PCA
np.save('npdata/X_test_PCA'+TITLE, X_test)

In [18]:
X_train = np.load('npdata/X_train_PCA'+TITLE+'.npy')
X_test = np.load('npdata/X_test_PCA'+TITLE+'.npy')

In [19]:
#If the dataset is Full dataset, don't run this
#Converting the binary matrix into a vector with the number of the class
y_train_food = to_one_vector(y_train, [0,3])
y_test_food = to_one_vector(y_test, [0,3])

y_train_social =  to_one_vector(y_train, [3,5])
y_test_social =  to_one_vector(y_test, [3,5])

y_train_table =  to_one_vector(y_train, [5,7])
y_test_table =  to_one_vector(y_test, [5,7])

We do not have all the information computed. But we can see some of the results that we have computed.

## Some results

#### Using the separated dataset

#### Food

In [47]:
accuracy_classifiers(X_train, X_test, y_train_food, y_test_food, not_grid_search=True, name_task='PCA_Food')


 
Nearest Neighbors
	Accuracy: 0.926552840742 	 Accuracy normalized: 0.335587177917
	Precision: 0.886828502419 	 Recall: 0.926552840742
	F1 score: 0.891659628872

 
Linear SVM
	Accuracy: 0.926405652046 	 Accuracy normalized: 0.333333333333
	Precision: 0.858227432143 	 Recall: 0.926405652046
	F1 score: 0.891014238077

 
RBF SVM
	Accuracy: 0.87518398587 	 Accuracy normalized: 0.420379192429
	Precision: 0.881192369976 	 Recall: 0.87518398587
	F1 score: 0.878133109466

 
Decision Tree
	Accuracy: 0.931410067707 	 Accuracy normalized: 0.376812782543
	Precision: 0.934564421415 	 Recall: 0.931410067707
	F1 score: 0.902716506519

 
Random Forest
	Accuracy: 0.923461878128 	 Accuracy normalized: 0.36099530091
	Precision: 0.885792951345 	 Recall: 0.923461878128
	F1 score: 0.895632736529


#### Table

In [48]:
accuracy_classifiers(X_train, X_test, y_train_table, y_test_table, not_grid_search=True, name_task='PCA_Table')


 
Nearest Neighbors
	Accuracy: 0.62481601413 	 Accuracy normalized: 0.51188372904
	Precision: 0.652719929773 	 Recall: 0.62481601413
	F1 score: 0.496538103731

 
Linear SVM
	Accuracy: 0.620105975861 	 Accuracy normalized: 0.503080477474
	Precision: 0.764766122852 	 Recall: 0.620105975861
	F1 score: 0.477155141199

 
RBF SVM
	Accuracy: 0.656903149838 	 Accuracy normalized: 0.620353032847
	Precision: 0.647754984448 	 Recall: 0.656903149838
	F1 score: 0.649422807488

 
Decision Tree
	Accuracy: 0.658080659405 	 Accuracy normalized: 0.572423607122
	Precision: 0.661382053231 	 Recall: 0.658080659405
	F1 score: 0.598418242103

 
Random Forest
	Accuracy: 0.654842508095 	 Accuracy normalized: 0.572371586907
	Precision: 0.650483340529 	 Recall: 0.654842508095
	F1 score: 0.600235251144


#### Social

In [49]:
accuracy_classifiers(X_train, X_test, y_train_social, y_test_social, not_grid_search=True, name_task='PCA_Social')


 
Nearest Neighbors
	Accuracy: 0.71548425081 	 Accuracy normalized: 0.515831723505
	Precision: 0.78837015074 	 Recall: 0.71548425081
	F1 score: 0.60614382555

 
Linear SVM
	Accuracy: 0.706947306447 	 Accuracy normalized: 0.50114894446
	Precision: 0.743980572133 	 Recall: 0.706947306447
	F1 score: 0.58645208227

 
RBF SVM
	Accuracy: 0.717397703856 	 Accuracy normalized: 0.633311660063
	Precision: 0.70396446 	 Recall: 0.717397703856
	F1 score: 0.708608891194

 
Decision Tree
	Accuracy: 0.730350309096 	 Accuracy normalized: 0.559742657331
	Precision: 0.721270452932 	 Recall: 0.730350309096
	F1 score: 0.661508455548

 
Random Forest
	Accuracy: 0.725787459523 	 Accuracy normalized: 0.563834727473
	Precision: 0.70123143655 	 Recall: 0.725787459523
	F1 score: 0.666029116125


### Grid Search

#### Linear SVC

In [18]:
#Grid Search of Linear SVC for the computing if the image is social or not, taking into account the unbalanced dataset
accuracy_classifiers(X_train, X_test, y_train_social, y_test_social, linear_svc=True, name_task='PCA_Social', class_weight='balanced')

LinearSVC
balanced
Iteration KFold: 1 / 5
Fitting 3 folds for each of 14 candidates, totalling 42 fits


[Parallel(n_jobs=4)]: Done  42 out of  42 | elapsed:   19.9s finished


	{'loss': 'hinge', 'C': 10, 'class_weight': 'balanced'} 0.76401179941

Iteration KFold: 2 / 5
Fitting 3 folds for each of 14 candidates, totalling 42 fits


[Parallel(n_jobs=4)]: Done  42 out of  42 | elapsed:   20.4s finished


	{'loss': 'hinge', 'C': 10, 'class_weight': 'balanced'} 0.769174041298

Iteration KFold: 3 / 5
Fitting 3 folds for each of 14 candidates, totalling 42 fits


[Parallel(n_jobs=4)]: Done  42 out of  42 | elapsed:   20.3s finished


	{'loss': 'squared_hinge', 'C': 10, 'class_weight': 'balanced'} 0.791297935103

Iteration KFold: 4 / 5
Fitting 3 folds for each of 14 candidates, totalling 42 fits


[Parallel(n_jobs=4)]: Done  42 out of  42 | elapsed:   20.4s finished


	{'loss': 'squared_hinge', 'C': 10, 'class_weight': 'balanced'} 0.767527675277

Iteration KFold: 5 / 5
Fitting 3 folds for each of 14 candidates, totalling 42 fits


[Parallel(n_jobs=4)]: Done  42 out of  42 | elapsed:   20.7s finished


	{'loss': 'hinge', 'C': 10, 'class_weight': 'balanced'} 0.769741697417

Mean accuracy: 0.772350629701
{'loss': 'squared_hinge', 'C': 10, 'class_weight': 'balanced'}
	Accuracy: 0.693405946423 	 Accuracy normalized: 0.530662611937
	Precision: 0.635226019445 	 Recall: 0.693405946423
	F1 score: 0.632931370276


In [19]:
#Grid Search of Linear SVC for the computing if the image is social or not
accuracy_classifiers(X_train, X_test, y_train_social, y_test_social, linear_svc=True, name_task='PCA_Social', class_weight='')

LinearSVC

Iteration KFold: 1 / 5
Fitting 3 folds for each of 14 candidates, totalling 42 fits


[Parallel(n_jobs=4)]: Done  35 out of  42 | elapsed:   14.3s remaining:    2.9s
[Parallel(n_jobs=4)]: Done  42 out of  42 | elapsed:   17.3s finished


	{'loss': 'squared_hinge', 'C': 0.1, 'class_weight': ''} 0.818584070796

Iteration KFold: 2 / 5
Fitting 3 folds for each of 14 candidates, totalling 42 fits


[Parallel(n_jobs=4)]: Done  42 out of  42 | elapsed:   17.8s finished


	{'loss': 'squared_hinge', 'C': 0.1, 'class_weight': ''} 0.841445427729

Iteration KFold: 3 / 5
Fitting 3 folds for each of 14 candidates, totalling 42 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=4)]: Done  42 out of  42 | elapsed:   17.9s finished


	{'loss': 'squared_hinge', 'C': 0.5, 'class_weight': ''} 0.837020648968

Iteration KFold: 4 / 5
Fitting 3 folds for each of 14 candidates, totalling 42 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=4)]: Done  42 out of  42 | elapsed:   17.6s finished


	{'loss': 'squared_hinge', 'C': 0.01, 'class_weight': ''} 0.807380073801

Iteration KFold: 5 / 5
Fitting 3 folds for each of 14 candidates, totalling 42 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=4)]: Done  42 out of  42 | elapsed:   17.9s finished


	{'loss': 'squared_hinge', 'C': 0.1, 'class_weight': ''} 0.814022140221

Mean accuracy: 0.823690472303
{'loss': 'squared_hinge', 'C': 0.5, 'class_weight': ''}
	Accuracy: 0.70900794819 	 Accuracy normalized: 0.520473041324
	Precision: 0.663176220164 	 Recall: 0.70900794819
	F1 score: 0.616828471357


In [21]:
#Grid Search of Linear SVC for the computing if in the image is a table or not, taking into account the unbalanced dataset
accuracy_classifiers(X_train, X_test, y_train_table, y_test_table, linear_svc=True, name_task='PCA_Table', class_weight='balanced')

LinearSVC
balanced
Iteration KFold: 1 / 5
Fitting 3 folds for each of 14 candidates, totalling 42 fits


[Parallel(n_jobs=4)]: Done  42 out of  42 | elapsed:   21.0s finished


	{'loss': 'squared_hinge', 'C': 10, 'class_weight': 'balanced'} 0.718289085546

Iteration KFold: 2 / 5
Fitting 3 folds for each of 14 candidates, totalling 42 fits


[Parallel(n_jobs=4)]: Done  42 out of  42 | elapsed:   21.2s finished


	{'loss': 'hinge', 'C': 100, 'class_weight': 'balanced'} 0.731563421829

Iteration KFold: 3 / 5
Fitting 3 folds for each of 14 candidates, totalling 42 fits


[Parallel(n_jobs=4)]: Done  42 out of  42 | elapsed:   21.9s finished


	{'loss': 'hinge', 'C': 10, 'class_weight': 'balanced'} 0.700589970501

Iteration KFold: 4 / 5
Fitting 3 folds for each of 14 candidates, totalling 42 fits


[Parallel(n_jobs=4)]: Done  42 out of  42 | elapsed:   21.0s finished


	{'loss': 'squared_hinge', 'C': 100, 'class_weight': 'balanced'} 0.712915129151

Iteration KFold: 5 / 5
Fitting 3 folds for each of 14 candidates, totalling 42 fits


[Parallel(n_jobs=4)]: Done  42 out of  42 | elapsed:   21.6s finished


	{'loss': 'squared_hinge', 'C': 10, 'class_weight': 'balanced'} 0.684870848708

Mean accuracy: 0.709645691147
{'loss': 'squared_hinge', 'C': 10, 'class_weight': 'balanced'}
	Accuracy: 0.617898145422 	 Accuracy normalized: 0.540487553269
	Precision: 0.587006153644 	 Recall: 0.617898145422
	F1 score: 0.569396519006


In [22]:
#Grid Search of Linear SVC for the computing if in the image is a table or not
accuracy_classifiers(X_train, X_test, y_train_table, y_test_table, linear_svc=True, name_task='PCA_Table', class_weight='')

LinearSVC

Iteration KFold: 1 / 5
Fitting 3 folds for each of 14 candidates, totalling 42 fits


[Parallel(n_jobs=4)]: Done  42 out of  42 | elapsed:   19.4s finished


	{'loss': 'squared_hinge', 'C': 0.5, 'class_weight': ''} 0.773598820059

Iteration KFold: 2 / 5
Fitting 3 folds for each of 14 candidates, totalling 42 fits
	{'loss': 'squared_hinge', 'C': 0.001, 'class_weight': ''} 0.779498525074

Iteration KFold: 3 / 5
Fitting 3 folds for each of 14 candidates, totalling 42 fits


[Parallel(n_jobs=4)]: Done  42 out of  42 | elapsed:   19.9s finished
[Parallel(n_jobs=4)]: Done  42 out of  42 | elapsed:   19.7s finished


	{'loss': 'squared_hinge', 'C': 0.1, 'class_weight': ''} 0.772861356932

Iteration KFold: 4 / 5
Fitting 3 folds for each of 14 candidates, totalling 42 fits


[Parallel(n_jobs=4)]: Done  42 out of  42 | elapsed:   19.8s finished


	{'loss': 'squared_hinge', 'C': 0.5, 'class_weight': ''} 0.759409594096

Iteration KFold: 5 / 5
Fitting 3 folds for each of 14 candidates, totalling 42 fits


[Parallel(n_jobs=4)]: Done  42 out of  42 | elapsed:   19.9s finished


	{'loss': 'squared_hinge', 'C': 0.01, 'class_weight': ''} 0.760885608856

Mean accuracy: 0.769250781003
{'loss': 'squared_hinge', 'C': 0.5, 'class_weight': ''}
	Accuracy: 0.627318221961 	 Accuracy normalized: 0.519413769797
	Precision: 0.62772684558 	 Recall: 0.627318221961
	F1 score: 0.514737400612


In [23]:
#Grid Search of Linear SVC for the computing if in the image is FRNE, E, NFR, taking into account the unbalanced dataset
accuracy_classifiers(X_train, X_test, y_train_food, y_test_food, linear_svc=True, name_task='PCA_Food', class_weight='balanced')

LinearSVC
balanced
Iteration KFold: 1 / 5
Fitting 3 folds for each of 14 candidates, totalling 42 fits


[Parallel(n_jobs=4)]: Done  42 out of  42 | elapsed:   25.5s finished


	{'loss': 'squared_hinge', 'C': 100, 'class_weight': 'balanced'} 0.94395280236

Iteration KFold: 2 / 5
Fitting 3 folds for each of 14 candidates, totalling 42 fits


[Parallel(n_jobs=4)]: Done  42 out of  42 | elapsed:   22.1s finished


	{'loss': 'hinge', 'C': 0.001, 'class_weight': 'balanced'} 0.941740412979

Iteration KFold: 3 / 5
Fitting 3 folds for each of 14 candidates, totalling 42 fits


[Parallel(n_jobs=4)]: Done  42 out of  42 | elapsed:   23.3s finished


	{'loss': 'hinge', 'C': 0.001, 'class_weight': 'balanced'} 0.944690265487

Iteration KFold: 4 / 5
Fitting 3 folds for each of 14 candidates, totalling 42 fits


[Parallel(n_jobs=4)]: Done  42 out of  42 | elapsed:   22.6s finished


	{'loss': 'hinge', 'C': 0.001, 'class_weight': 'balanced'} 0.927675276753

Iteration KFold: 5 / 5
Fitting 3 folds for each of 14 candidates, totalling 42 fits


[Parallel(n_jobs=4)]: Done  42 out of  42 | elapsed:   24.1s finished


	{'loss': 'squared_hinge', 'C': 100, 'class_weight': 'balanced'} 0.938007380074

Mean accuracy: 0.939213227531
{'loss': 'hinge', 'C': 0.001, 'class_weight': 'balanced'}
	Accuracy: 0.926405652046 	 Accuracy normalized: 0.333333333333
	Precision: 0.858227432143 	 Recall: 0.926405652046
	F1 score: 0.891014238077


  'precision', 'predicted', average, warn_for)


In [24]:
#Grid Search of Linear SVC for the computing if in the image is FRNE, E, NFR
accuracy_classifiers(X_train, X_test, y_train_food, y_test_food, linear_svc=True, name_task='PCA_Food', class_weight='')

LinearSVC

Iteration KFold: 1 / 5
Fitting 3 folds for each of 14 candidates, totalling 42 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=4)]: Done  42 out of  42 | elapsed:   25.4s finished


	{'loss': 'hinge', 'C': 0.1, 'class_weight': ''} 0.957227138643

Iteration KFold: 2 / 5
Fitting 3 folds for each of 14 candidates, totalling 42 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=4)]: Done  42 out of  42 | elapsed:   20.2s finished


	{'loss': 'squared_hinge', 'C': 0.01, 'class_weight': ''} 0.95796460177

Iteration KFold: 3 / 5
Fitting 3 folds for each of 14 candidates, totalling 42 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=4)]: Done  42 out of  42 | elapsed:   21.9s finished


	{'loss': 'squared_hinge', 'C': 0.01, 'class_weight': ''} 0.955752212389

Iteration KFold: 4 / 5
Fitting 3 folds for each of 14 candidates, totalling 42 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


	{'loss': 'hinge', 'C': 0.001, 'class_weight': ''} 0.947601476015

Iteration KFold: 5 / 5
Fitting 3 folds for each of 14 candidates, totalling 42 fits


[Parallel(n_jobs=4)]: Done  42 out of  42 | elapsed:   20.6s finished
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=4)]: Done  42 out of  42 | elapsed:   22.3s finished


	{'loss': 'hinge', 'C': 0.001, 'class_weight': ''} 0.957933579336

Mean accuracy: 0.955295801631
{'loss': 'hinge', 'C': 0.1, 'class_weight': ''}
	Accuracy: 0.926405652046 	 Accuracy normalized: 0.333333333333
	Precision: 0.858227432143 	 Recall: 0.926405652046
	F1 score: 0.891014238077


### KNN

In [18]:
accuracy_classifiers(X_train, X_test, y_train_social, y_test_social, k_neighbors=True, name_task='PCA_Social')

KNeighbors
Iteration KFold: 1 / 5
Fitting 3 folds for each of 32 candidates, totalling 96 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   43.4s
[Parallel(n_jobs=4)]: Done  96 out of  96 | elapsed:  1.5min finished


	{'n_neighbors': 3, 'metric': 'manhattan', 'weights': 'distance'} 0.830383480826

Iteration KFold: 2 / 5
Fitting 3 folds for each of 32 candidates, totalling 96 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   42.9s
[Parallel(n_jobs=4)]: Done  96 out of  96 | elapsed:  1.5min finished


	{'n_neighbors': 3, 'metric': 'manhattan', 'weights': 'distance'} 0.846607669617

Iteration KFold: 3 / 5
Fitting 3 folds for each of 32 candidates, totalling 96 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   43.5s
[Parallel(n_jobs=4)]: Done  96 out of  96 | elapsed:  1.5min finished


	{'n_neighbors': 3, 'metric': 'manhattan', 'weights': 'distance'} 0.826696165192

Iteration KFold: 4 / 5
Fitting 3 folds for each of 32 candidates, totalling 96 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   43.3s
[Parallel(n_jobs=4)]: Done  96 out of  96 | elapsed:  1.5min finished


	{'n_neighbors': 3, 'metric': 'manhattan', 'weights': 'distance'} 0.816974169742

Iteration KFold: 5 / 5
Fitting 3 folds for each of 32 candidates, totalling 96 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   42.9s
[Parallel(n_jobs=4)]: Done  96 out of  96 | elapsed:  1.5min finished


	{'n_neighbors': 3, 'metric': 'manhattan', 'weights': 'distance'} 0.814760147601

Mean accuracy: 0.827084326595
{'n_neighbors': 3, 'metric': 'manhattan', 'weights': 'distance'}
	Accuracy: 0.734765969974 	 Accuracy normalized: 0.549103118287
	Precision: 0.800182667102 	 Recall: 0.734765969974
	F1 score: 0.647509858033


In [19]:
accuracy_classifiers(X_train, X_test, y_train_food, y_test_food, k_neighbors=True, name_task='PCA_Food')

KNeighbors
Iteration KFold: 1 / 5
Fitting 3 folds for each of 32 candidates, totalling 96 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   43.2s
[Parallel(n_jobs=4)]: Done  96 out of  96 | elapsed:  1.5min finished


	{'n_neighbors': 5, 'metric': 'euclidean', 'weights': 'uniform'} 0.957227138643

Iteration KFold: 2 / 5
Fitting 3 folds for each of 32 candidates, totalling 96 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   43.1s
[Parallel(n_jobs=4)]: Done  96 out of  96 | elapsed:  1.5min finished


	{'n_neighbors': 3, 'metric': 'euclidean', 'weights': 'uniform'} 0.95796460177

Iteration KFold: 3 / 5
Fitting 3 folds for each of 32 candidates, totalling 96 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   43.1s
[Parallel(n_jobs=4)]: Done  96 out of  96 | elapsed:  1.5min finished


	{'n_neighbors': 5, 'metric': 'euclidean', 'weights': 'uniform'} 0.955752212389

Iteration KFold: 4 / 5
Fitting 3 folds for each of 32 candidates, totalling 96 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   43.1s
[Parallel(n_jobs=4)]: Done  96 out of  96 | elapsed:  1.5min finished


	{'n_neighbors': 3, 'metric': 'chebyshev', 'weights': 'uniform'} 0.948339483395

Iteration KFold: 5 / 5
Fitting 3 folds for each of 32 candidates, totalling 96 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   42.9s
[Parallel(n_jobs=4)]: Done  96 out of  96 | elapsed:  1.5min finished


	{'n_neighbors': 3, 'metric': 'euclidean', 'weights': 'uniform'} 0.957933579336

Mean accuracy: 0.955443403107
{'n_neighbors': 3, 'metric': 'chebyshev', 'weights': 'uniform'}
	Accuracy: 0.925964085958 	 Accuracy normalized: 0.333174451859
	Precision: 0.858197313653 	 Recall: 0.925964085958
	F1 score: 0.890793726713


In [20]:
accuracy_classifiers(X_train, X_test, y_train_table, y_test_table, k_neighbors=True, name_task='PCA_Table')

KNeighbors
Iteration KFold: 1 / 5
Fitting 3 folds for each of 32 candidates, totalling 96 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   42.8s
[Parallel(n_jobs=4)]: Done  96 out of  96 | elapsed:  1.5min finished


	{'n_neighbors': 3, 'metric': 'manhattan', 'weights': 'uniform'} 0.773598820059

Iteration KFold: 2 / 5
Fitting 3 folds for each of 32 candidates, totalling 96 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   42.8s
[Parallel(n_jobs=4)]: Done  96 out of  96 | elapsed:  1.5min finished


	{'n_neighbors': 3, 'metric': 'euclidean', 'weights': 'distance'} 0.78982300885

Iteration KFold: 3 / 5
Fitting 3 folds for each of 32 candidates, totalling 96 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   42.9s
[Parallel(n_jobs=4)]: Done  96 out of  96 | elapsed:  1.5min finished


	{'n_neighbors': 3, 'metric': 'euclidean', 'weights': 'uniform'} 0.77581120944

Iteration KFold: 4 / 5
Fitting 3 folds for each of 32 candidates, totalling 96 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   42.6s
[Parallel(n_jobs=4)]: Done  96 out of  96 | elapsed:  1.5min finished


	{'n_neighbors': 3, 'metric': 'manhattan', 'weights': 'distance'} 0.774169741697

Iteration KFold: 5 / 5
Fitting 3 folds for each of 32 candidates, totalling 96 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   42.9s
[Parallel(n_jobs=4)]: Done  96 out of  96 | elapsed:  1.5min finished


	{'n_neighbors': 3, 'metric': 'manhattan', 'weights': 'distance'} 0.767527675277

Mean accuracy: 0.776186091064
{'n_neighbors': 3, 'metric': 'manhattan', 'weights': 'distance'}
	Accuracy: 0.654548130704 	 Accuracy normalized: 0.54989399161
	Precision: 0.748304209074 	 Recall: 0.654548130704
	F1 score: 0.5545178198


### Gradient Boosting

In [22]:
accuracy_classifiers(X_train, X_test, y_train_social, y_test_social, gradient_boosting=True, name_task='PCA_Social')

GradientBoosting
Iteration KFold: 1 / 2
Fitting 3 folds for each of 18 candidates, totalling 54 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   21.8s
[Parallel(n_jobs=4)]: Done  54 out of  54 | elapsed:   30.5s finished


	{'n_estimators': 100, 'learning_rate': 0.1, 'min_samples_leaf': 5} 0.851873709059

Iteration KFold: 2 / 2
Fitting 3 folds for each of 18 candidates, totalling 54 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   21.9s
[Parallel(n_jobs=4)]: Done  54 out of  54 | elapsed:   30.2s finished


	{'n_estimators': 100, 'learning_rate': 0.1, 'min_samples_leaf': 5} 0.835054588374

Mean accuracy: 0.843464148716
{'n_estimators': 100, 'learning_rate': 0.1, 'min_samples_leaf': 5}
	Accuracy: 0.720047100383 	 Accuracy normalized: 0.542784550457
	Precision: 0.697033617283 	 Recall: 0.720047100383
	F1 score: 0.643149929595


In [23]:
accuracy_classifiers(X_train, X_test, y_train_table, y_test_table, gradient_boosting=True, name_task='PCA_Table')

GradientBoosting
Iteration KFold: 1 / 2
Fitting 3 folds for each of 18 candidates, totalling 54 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   22.5s
[Parallel(n_jobs=4)]: Done  54 out of  54 | elapsed:   30.4s finished


	{'n_estimators': 100, 'learning_rate': 0.1, 'min_samples_leaf': 1} 0.774564768368

Iteration KFold: 2 / 2
Fitting 3 folds for each of 18 candidates, totalling 54 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   22.4s
[Parallel(n_jobs=4)]: Done  54 out of  54 | elapsed:   29.7s finished


	{'n_estimators': 100, 'learning_rate': 0.1, 'min_samples_leaf': 5} 0.769253467099

Mean accuracy: 0.771909117734
{'n_estimators': 100, 'learning_rate': 0.1, 'min_samples_leaf': 1}
	Accuracy: 0.63703267589 	 Accuracy normalized: 0.538506289538
	Precision: 0.635639369317 	 Recall: 0.63703267589
	F1 score: 0.549832131674


In [24]:
accuracy_classifiers(X_train, X_test, y_train_food, y_test_food, gradient_boosting=True, name_task='PCA_Food')

GradientBoosting
Iteration KFold: 1 / 2
Fitting 3 folds for each of 18 candidates, totalling 54 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  1.0min
[Parallel(n_jobs=4)]: Done  54 out of  54 | elapsed:  1.4min finished


	{'n_estimators': 50, 'learning_rate': 0.1, 'min_samples_leaf': 3} 0.956329300679

Iteration KFold: 2 / 2
Fitting 3 folds for each of 18 candidates, totalling 54 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   58.6s
[Parallel(n_jobs=4)]: Done  54 out of  54 | elapsed:  1.3min finished


	{'n_estimators': 50, 'learning_rate': 0.1, 'min_samples_leaf': 5} 0.953673650044

Mean accuracy: 0.955001475361
{'n_estimators': 50, 'learning_rate': 0.1, 'min_samples_leaf': 3}
	Accuracy: 0.926111274654 	 Accuracy normalized: 0.336528738488
	Precision: 0.872862323722 	 Recall: 0.926111274654
	F1 score: 0.891775513188


  'precision', 'predicted', average, warn_for)


###  Using Full dataset


In [41]:
y_train, y_val, y_test = get_data_model(load=True)
y_train = np.vstack((y_train, y_val))
y_train.shape

(6778, 12)

In [42]:
y_train = to_one_vector(y_train,[0,12])


In [43]:
y_train.shape

(6778,)

In [44]:
y_test = to_one_vector(y_test,[0,12])

#### Other Classifiers

In [25]:
accuracy_classifiers(X_train, X_test, to_one_vector(y_train,[0,12]), to_one_vector(y_test,[0,12]), not_grid_search=True, name_task='PCA_Social')


 
Nearest Neighbors
	Accuracy: 0.422137179865 	 Accuracy normalized: 0.0841143345121
	Precision: 0.286215098781 	 Recall: 0.422137179865
	F1 score: 0.324616389061


  'precision', 'predicted', average, warn_for)



 
Linear SVM
	Accuracy: 0.466588166029 	 Accuracy normalized: 0.0830492590964
	Precision: 0.43720083304 	 Recall: 0.466588166029
	F1 score: 0.298959072181

 
RBF SVM
	Accuracy: 0.324403885782 	 Accuracy normalized: 0.0872632483996
	Precision: 0.302444482651 	 Recall: 0.324403885782
	F1 score: 0.309139559556

 
Decision Tree
	Accuracy: 0.433176332058 	 Accuracy normalized: 0.0838831981671
	Precision: 0.293963683215 	 Recall: 0.433176332058
	F1 score: 0.323917995436

 
Random Forest
	Accuracy: 0.438327936414 	 Accuracy normalized: 0.0829329182027
	Precision: 0.294035943825 	 Recall: 0.438327936414
	F1 score: 0.304507526164


#### Linear SVC

In [51]:
accuracy_classifiers(X_train, X_test, y_train, y_test, linear_svc=True, name_task='PCA', class_weight='balanced')

LinearSVC
balanced
Iteration KFold: 1 / 5
Fitting 3 folds for each of 14 candidates, totalling 42 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
[Parallel(n_jobs=4)]: Done  42 out of  42 | elapsed:  2.2min finished


	{'loss': 'squared_hinge', 'C': 100, 'class_weight': 'balanced'} 0.34808259587

Iteration KFold: 2 / 5
Fitting 3 folds for each of 14 candidates, totalling 42 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
[Parallel(n_jobs=4)]: Done  42 out of  42 | elapsed:  2.1min finished


	{'loss': 'squared_hinge', 'C': 10, 'class_weight': 'balanced'} 0.401179941003

Iteration KFold: 3 / 5
Fitting 3 folds for each of 14 candidates, totalling 42 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
[Parallel(n_jobs=4)]: Done  42 out of  42 | elapsed:  2.1min finished


	{'loss': 'squared_hinge', 'C': 10, 'class_weight': 'balanced'} 0.378318584071

Iteration KFold: 4 / 5
Fitting 3 folds for each of 14 candidates, totalling 42 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=4)]: Done  42 out of  42 | elapsed:  2.2min finished


	{'loss': 'squared_hinge', 'C': 10, 'class_weight': 'balanced'} 0.369741697417

Iteration KFold: 5 / 5
Fitting 3 folds for each of 14 candidates, totalling 42 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
[Parallel(n_jobs=4)]: Done  42 out of  42 | elapsed:  2.1min finished


	{'loss': 'squared_hinge', 'C': 10, 'class_weight': 'balanced'} 0.361623616236

Mean accuracy: 0.371789286919
{'loss': 'squared_hinge', 'C': 10, 'class_weight': 'balanced'}
	Accuracy: 0.388430968502 	 Accuracy normalized: 0.0836651915238
	Precision: 0.297414336413 	 Recall: 0.388430968502
	F1 score: 0.322241872721


In [36]:
accuracy_classifiers(X_train, X_test, y_train, y_test, linear_svc=True, name_task='PCA', class_weight='')

LinearSVC

Iteration KFold: 1 / 5
Fitting 3 folds for each of 14 candidates, totalling 42 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
[Parallel(n_jobs=4)]: Done  42 out of  42 | elapsed:  1.6min finished


	{'loss': 'squared_hinge', 'C': 0.001, 'class_weight': ''} 0.568584070796

Iteration KFold: 2 / 5
Fitting 3 folds for each of 14 candidates, totalling 42 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
[Parallel(n_jobs=4)]: Done  42 out of  42 | elapsed:  1.6min finished


	{'loss': 'squared_hinge', 'C': 0.01, 'class_weight': ''} 0.587020648968

Iteration KFold: 3 / 5
Fitting 3 folds for each of 14 candidates, totalling 42 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
[Parallel(n_jobs=4)]: Done  42 out of  42 | elapsed:  1.7min finished


	{'loss': 'squared_hinge', 'C': 0.01, 'class_weight': ''} 0.559734513274

Iteration KFold: 4 / 5
Fitting 3 folds for each of 14 candidates, totalling 42 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=4)]: Done  42 out of  42 | elapsed:  1.7min finished


	{'loss': 'squared_hinge', 'C': 0.01, 'class_weight': ''} 0.588929889299

Iteration KFold: 5 / 5
Fitting 3 folds for each of 14 candidates, totalling 42 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
[Parallel(n_jobs=4)]: Done  42 out of  42 | elapsed:  1.6min finished


	{'loss': 'squared_hinge', 'C': 0.1, 'class_weight': ''} 0.567527675277

Mean accuracy: 0.574359359523
{'loss': 'squared_hinge', 'C': 0.01, 'class_weight': ''}
	Accuracy: 0.46438033559 	 Accuracy normalized: 0.0839015345965
	Precision: 0.303684920007 	 Recall: 0.46438033559
	F1 score: 0.306373545502


### K-NN

In [49]:
accuracy_classifiers(X_train, X_test, y_train, y_test, k_neighbors=True, name_task='PCA')

KNeighbors
Iteration KFold: 1 / 5
Fitting 3 folds for each of 32 candidates, totalling 96 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   41.1s
[Parallel(n_jobs=4)]: Done  96 out of  96 | elapsed:  1.7min finished


	{'n_neighbors': 5, 'metric': 'chebyshev', 'weights': 'uniform'} 0.525073746313

Iteration KFold: 2 / 5
Fitting 3 folds for each of 32 candidates, totalling 96 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   52.5s
[Parallel(n_jobs=4)]: Done  96 out of  96 | elapsed:  1.7min finished


	{'n_neighbors': 10, 'metric': 'euclidean', 'weights': 'distance'} 0.522123893805

Iteration KFold: 3 / 5
Fitting 3 folds for each of 32 candidates, totalling 96 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   43.6s
[Parallel(n_jobs=4)]: Done  96 out of  96 | elapsed:  1.5min finished


	{'n_neighbors': 10, 'metric': 'euclidean', 'weights': 'distance'} 0.548672566372

Iteration KFold: 4 / 5
Fitting 3 folds for each of 32 candidates, totalling 96 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   41.7s
[Parallel(n_jobs=4)]: Done  96 out of  96 | elapsed:  1.5min finished


	{'n_neighbors': 5, 'metric': 'manhattan', 'weights': 'uniform'} 0.553505535055

Iteration KFold: 5 / 5
Fitting 3 folds for each of 32 candidates, totalling 96 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   41.7s
[Parallel(n_jobs=4)]: Done  96 out of  96 | elapsed:  1.4min finished


	{'n_neighbors': 5, 'metric': 'euclidean', 'weights': 'distance'} 0.523247232472

Mean accuracy: 0.534524594803
{'n_neighbors': 5, 'metric': 'manhattan', 'weights': 'uniform'}
	Accuracy: 0.433176332058 	 Accuracy normalized: 0.0828763254151
	Precision: 0.313367144177 	 Recall: 0.433176332058
	F1 score: 0.319313598846


### Gradient Boosting

In [50]:
accuracy_classifiers(X_train, X_test, y_train, y_test, gradient_boosting=True, name_task='PCA')

GradientBoosting
Iteration KFold: 1 / 2
Fitting 3 folds for each of 18 candidates, totalling 54 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  5.4min
[Parallel(n_jobs=4)]: Done  54 out of  54 | elapsed:  7.2min finished


	{'n_estimators': 100, 'learning_rate': 0.1, 'min_samples_leaf': 1} 0.562702862201

Iteration KFold: 2 / 2
Fitting 3 folds for each of 18 candidates, totalling 54 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  5.3min
[Parallel(n_jobs=4)]: Done  54 out of  54 | elapsed:  7.1min finished


	{'n_estimators': 100, 'learning_rate': 0.1, 'min_samples_leaf': 5} 0.573620537032

Mean accuracy: 0.568161699616
{'n_estimators': 100, 'learning_rate': 0.1, 'min_samples_leaf': 5}
	Accuracy: 0.460406240801 	 Accuracy normalized: 0.0858915130342
	Precision: 0.280868747799 	 Recall: 0.460406240801
	F1 score: 0.302467368004


## Summary with the best parameters of the classifiers

In [None]:
# For given parameters see the information and obtain y predicted for the test dataset
#using linear SVC
def linear_SVM_predict(X, X_test, y, y_test, name_task, class_weight):
    name='linear_SVC'
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X)
    X_test = scaler.fit_transform(X_test)

    clf2 = svm.LinearSVC(loss='squared_hinge', C=0.001, class_weight='balanced')

    clf2.fit(X_train, y)
    yhat = clf2.predict(X_test)

    acc = metrics.accuracy_score(yhat, y_test)
    acc_norm = normalized_accuracy(y_test, yhat)
    title=TITLE+'_'+name+'_'+name_task+'_'+TITLE+'_'+class_weight+'_'+str(round(acc,5))
    np.save("npdata/"+title+"_real", y_test)
    np.save("npdata/"+title+"_predicted", yhat)
    print '\tAccuracy:',acc, '\t Accuracy normalized:', acc_norm
    save_confusion_matrix(y_test, yhat, title, 
                      title=str(name)+'_'+name_task, 
                      info='Acc.: '+ str(round(acc*100,2))+',  Acc. norm.:'+str(round(acc_norm*100,2)))

# For given parameters see the information and obtain y predicted for the test dataset
#using linear SVC
def knn_predict(X, X_test, y, y_test, name_task, class_weight):
    name='knn'
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X)
    X_test = scaler.fit_transform(X_test)

    clf2 = KNeighborsClassifier(n_neighbors=3, metric='chebyshev', weights='uniform')
    clf2.fit(X_train, y)
    yhat = clf2.predict(X_test)

    acc = metrics.accuracy_score(yhat, y_test)
    acc_norm = normalized_accuracy(y_test, yhat)
    title=TITLE+'_'+name+'_'+name_task+'_'+TITLE+'_'+class_weight+'_'+str(round(acc,5))
    np.save("npdata/"+title+"_real", y_test)
    np.save("npdata/"+title+"_predicted", yhat)
    print '\tAccuracy:',acc, '\t Accuracy normalized:', acc_norm
    save_confusion_matrix(y_test, yhat, title, 
                      title=str(name)+'_'+name_task, 
                      info='Acc.: '+ str(round(acc*100,2))+',  Acc. norm.:'+str(round(acc_norm*100,2)))

# For given parameters see the information and obtain y predicted for the test dataset
#using Gradient Boosting
def gradient_predict(X, X_test, y, y_test, name_task, class_weight):
    name='gradient_boosting'
    # Once we have the best parameters for the training dataset, we obtain the accuraacy for the test dataset with the best parameters obtained. 
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X)
    X_test = scaler.fit_transform(X_test)

    clf2 = ensemble.GradientBoostingClassifier(n_estimators=50, learning_rate=0.1, 
                                              min_samples_leaf=3)
       
    clf2.fit(X_train, y)
    yhat = clf2.predict(X_test)

    acc = metrics.accuracy_score(yhat, y_test)
    acc_norm = normalized_accuracy(y_test, yhat)
    title=TITLE+'_'+name+'_'+name_task+'_'+TITLE+'_'+class_weight+'_'+str(round(acc,5))
    np.save("npdata/"+title+"_real", y_test)
    np.save("npdata/"+title+"_predicted", yhat)
    print '\tAccuracy:',acc, '\t Accuracy normalized:', acc_norm
    save_confusion_matrix(y_test, yhat, title, 
                      title=str(name)+'_'+name_task, 
                      info='Acc.: '+ str(round(acc*100,2))+',  Acc. norm.:'+str(round(acc_norm*100,2)))


### Multitask Dataset

###### Food

In [70]:
#n_neighbors 3, metric chebyshev, weights uniform
knn_predict(X_train, X_test, y_train_food, y_test_food, name_task='PCA_food', class_weight='')

	Accuracy: 0.925964085958 	 Accuracy normalized: 0.333174451859
	Precision: 0.858197313653 	 Recall: 0.925964085958
	F1 score: 0.890793726713


In [71]:
linear_SVM_predict(X_train, X_test, y_train_food, y_test_food, name_task='PCA_food', class_weight='balanced')
#{'loss': 'squared_hinge', 'C': 0.001, 'class_weight': ''}

	Accuracy: 0.885192817192 	 Accuracy normalized: 0.357734614669
	Precision: 0.86839400532 	 Recall: 0.885192817192
	F1 score: 0.876562662027


In [72]:
#n_estimators=50, learningrate=0.1, min samplesleaf=3
gradient_predict(X_train, X_test, y_train_food, y_test_food, name_task='PCA_food', class_weight='')

	Accuracy: 0.925816897262 	 Accuracy normalized: 0.334221933413
	Precision: 0.864506898846 	 Recall: 0.925816897262
	F1 score: 0.891070252813


###### Social

In [63]:
#n_neighbors 3, metric manhattan, weights distance
knn_predict(X_train, X_test, y_train_social, y_test_social, name_task='PCA_Social', class_weight='')

	Accuracy: 0.654548130704 	 Accuracy normalized: 0.54989399161
	Precision: 0.748304209074 	 Recall: 0.654548130704
	F1 score: 0.5545178198


In [64]:
linear_SVM_predict(X_train, X_test, y_train_social, y_test_social, name_task='PCA_Social', class_weight='balanced')
#{'loss': 'squared_hinge', 'C': 10, 'class_weight': 'balanced'}

	Accuracy: 0.613188107153 	 Accuracy normalized: 0.545703153205
	Precision: 0.585161110091 	 Recall: 0.613188107153
	F1 score: 0.578356238279


In [65]:
#n_estimators=100, learningrate=0.1, min samplesleaf=5
gradient_predict(X_train, X_test, y_train_social, y_test_social, name_task='PCA_Social', class_weight='')

	Accuracy: 0.637327053282 	 Accuracy normalized: 0.538671157837
	Precision: 0.636880634302 	 Recall: 0.637327053282
	F1 score: 0.549857369336


#### Table

In [66]:
#n_neighbors 3, metric manhattan, weights distance
knn_predict(X_train, X_test, y_train_table, y_test_table, name_task='PCA_table', class_weight='')

	Accuracy: 0.734765969974 	 Accuracy normalized: 0.549103118287
	Precision: 0.800182667102 	 Recall: 0.734765969974
	F1 score: 0.647509858033


In [67]:
linear_SVM_predict(X_train, X_test, y_train_table, y_test_table, name_task='PCA_table', class_weight='balanced')
#{'loss': 'squared_hinge', 'C': 10, 'class_weight': 'balanced'}

	Accuracy: 0.685457756844 	 Accuracy normalized: 0.53616574255
	Precision: 0.632972519945 	 Recall: 0.685457756844
	F1 score: 0.638146967744


In [68]:
#n_estimators=100, learningrate=0.1, min samplesleaf=5
gradient_predict(X_train, X_test, y_train_table, y_test_table, name_task='PCA_table', class_weight='')

	Accuracy: 0.719899911687 	 Accuracy normalized: 0.542533923891
	Precision: 0.696650164436 	 Recall: 0.719899911687
	F1 score: 0.64287207616


## Power Set

In [46]:
#n_neighbors 5, metric euclidean, weights distance
knn_predict(X_train, X_test, y_train, y_test, name_task='PCA', class_weight='')

	Accuracy: 0.407271121578 	 Accuracy normalized: 0.0828223205629
	Precision: 0.290684029124 	 Recall: 0.407271121578
	F1 score: 0.324469246326


  'precision', 'predicted', average, warn_for)


In [50]:
linear_SVM_predict(X_train, X_test, y_train, y_test, name_task='PCA', class_weight='')
#{'loss': 'squared_hinge', 'C': 0.01, 'class_weight': ''}

	Accuracy: 0.46438033559 	 Accuracy normalized: 0.0839015345965
	Precision: 0.303684920007 	 Recall: 0.46438033559
	F1 score: 0.306373545502


In [48]:
#n_estimators=100, learningrate=0.1, min samplesleaf=5
gradient_predict(X_train, X_test, y_train, y_test, name_task='PCA', class_weight='')

	Accuracy: 0.460111863409 	 Accuracy normalized: 0.0858089415556
	Precision: 0.278204621411 	 Recall: 0.460111863409
	F1 score: 0.302094109146
