In [1]:
import numpy
import pandas

import collections
import os
import sys
import pandas as pd
from PIL import Image

from skimage.feature import hog

from sklearn.model_selection import KFold, cross_val_score, GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

import time

import matplotlib.pyplot as plot
%matplotlib inline

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
train_directory = "small_dataset_train/"

# Data Flower
df = pd.read_csv('Flowers.csv',
                 sep = ',',
                 header = 0,
                 index_col = "MediaId",
                 encoding = "utf8").to_numpy()

# Flower Label
fl = df[:, 1].astype('int')

### Returns the dictionary of the HOG features of all the images in the directory

In [3]:
def getHogFeatures(directory):
    hogFeatures = {}
    
    for subdirectory in os.listdir(directory):
        for file in os.listdir(directory + subdirectory):
            if file.endswith(".jpg"):
                mediaId = int(os.path.splitext(file)[0])
                
                image = Image.open(os.path.join(directory + subdirectory, file))
                image = image.resize((100,100))

                hogFeature = hog(image,
                                 orientations = 8,
                                 pixels_per_cell = (16, 16),
                                 cells_per_block = (4, 4),
                                 visualize = False,
                                 multichannel = True)

                hogFeatures[mediaId] = hogFeature
                
    return hogFeatures

In [4]:
fd = getHogFeatures(train_directory)

# Sort the dictionaries so it matches the labels
fd = collections.OrderedDict(sorted(fd.items()))

# Only keep the list of values
fd = list(fd.values())[:]

In [5]:
validation_size=0.7 #30% du jeu de données pour le test

testsize= 1-validation_size

X_train, X_test, Y_train, Y_test = train_test_split(fd,
                                                    fl,
                                                    train_size = validation_size,
                                                    test_size = testsize)

### **Evaluation de LogisticRegression, KNeighborsClassifier, Linear SVC, RandomForestClassifier et DecisionTreeClassifier**

Nous créons un table des classifieurs à évaluer

In [None]:
seed = 7
scoring = 'accuracy'
models = []
models.append(('LR', LogisticRegression()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('SVC', SVC(gamma='auto')))
models.append(('RFC', RandomForestClassifier()))
models.append(('DTR', DecisionTreeClassifier()))

In [None]:
seed = 7
results = []
names = []
scoring='accuracy'
for name,model in models:
    kfold = KFold(n_splits=10, random_state=seed)
    start_time = time.time()
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
    print ("Time pour",name," ",time.time() - start_time)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg) 

### Comparaison de résultats de différents classifieurs

In [None]:
import matplotlib.pyplot as plt

fig = plt.figure()
fig.suptitle('Comparaison des algorithmes')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)

### Tableau pour faciliter la comparaison des classifieurs


In [6]:
# Dictionary of all the classifiers we want to test
classifiers = {
    'KNeighborsClassifier': KNeighborsClassifier(),
    'LogisticRegression': LogisticRegression(),
    'RandomForestClassifier': RandomForestClassifier(),
    'DecisionTreeClassifier': DecisionTreeClassifier(),
    'SVC': SVC()
}

# Dictionary of all their hyper-parameters to tune
parameters = {
    'KNeighborsClassifier': {
        'n_neighbors': [3, 5, 11],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan']
    },
    'LogisticRegression': {
        'C': [0.1, 1],
        'solver': ['saga'],
        'multi_class': ['auto']
    },
    'RandomForestClassifier': [
        {'n_estimators': [4, 6, 9]}, 
        {'max_features': ['log2', 'sqrt','auto']}, 
        {'criterion': ['entropy', 'gini']},
        {'max_depth': [2, 3, 5, 10]}, 
        {'min_samples_split': [2, 3, 5]},
        {'min_samples_leaf': [1,5,8]}
    ],
    'DecisionTreeClassifier': [
        {'max_depth': [1,2,3,4,5,6,7,8,9,10]},
        {'criterion': ['gini', 'entropy']},
        {'min_samples_leaf': [1,2,3,4,5,6,7,8,9,10]}
    ],
    'SVC': {
        'C': [0.1, 1],
        'gamma': ['scale'],
        'kernel': ['linear', 'rbf', 'sigmoid'],
        'random_state': [0]
    }
}

### Application de GridSearchCV afin de trouver les meilleurs paramètres pour chaque classifieur

In [7]:
# Class representing a model
class Model:
    def __init__(self, classifier, parameters, score):
        self.classifier = classifier
        self.parameters = parameters
        self.score = score

    def __repr__(self):
        return repr((self.classifier, self.parameters, self.score))

results = []
for classifierName, classifier in classifiers.items():
    gridSearch = GridSearchCV(
        estimator = classifier,
        param_grid = parameters[classifierName],
        scoring = "accuracy",
        cv = 5,
        n_jobs = -1,
        iid = True
    )

    gridSearch.fit(X_train, Y_train)

    result = Model(classifierName, gridSearch.best_params_, gridSearch.best_score_)
    results.append(result)
    print (result.classifier, ": ", result.score, '\n')

results = sorted(results, key = lambda result: result.score, reverse = True)

KNeighborsClassifier :  0.5421637186343069 

LogisticRegression :  0.5767174002468121 

RandomForestClassifier :  0.579185520361991 

DecisionTreeClassifier :  0.5767174002468121 

SVC :  0.5804195804195804 



### **Recupération du meilleur classifieur avec ses paramètres**

In [8]:
bestModel = results[0]
classifier = eval(bestModel.classifier)()
classifier.set_params(**bestModel.parameters)

classifier.fit(X_train, Y_train)

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
  max_iter=-1, probability=False, random_state=0, shrinking=True,
  tol=0.001, verbose=False)

In [9]:
prediction = classifier.predict(X_test)

print("Accuracy: \n", accuracy_score(Y_test, prediction), '\n')
print("Confusion matrix: \n", confusion_matrix(Y_test, prediction), '\n')

Accuracy: 
 0.5896452540747843 

Confusion matrix: 
 [[  0   0  13   0   0]
 [  0   0  62   0   0]
 [  0   0 614   1   0]
 [  0   0 301   1   0]
 [  0   0  51   0   0]] 

