## Partie : Prédiction basée sur les clusters

## Importation des bibliothèques

In [1]:
import os
import numpy as np
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from sklearn.utils import shuffle
import random

## Importation des données

In [11]:
format_fichier = [".jpg"]

def lire_images(cheminRacine, array, y, index):     
    for lists in os.listdir(cheminRacine): 
        chemin = os.path.join(cheminRacine, lists) 
        filename, extension = os.path.splitext(chemin) 
        if extension in format_fichier:   
            array.append(mpimg.imread(chemin))
            y.append(index)
        if os.path.isdir(chemin): 
            lire_images(chemin, array, y, index)
images = []
classes = []

#remplissage de la liste d'images + classes
lire_images("./Training/Banana", images, classes, 0)
lire_images("./Training/Corn", images, classes, 1)
lire_images("./Training/Strawberry", images, classes, 2)
lire_images("./Training/Raspberry", images, classes, 3)
lire_images("./Training/Clementine", images, classes, 4)
lire_images("./Training/Pear Williams", images, classes, 5)
lire_images("./Training/Nectarine", images, classes, 6)
lire_images("./Training/Orange", images, classes, 7)
lire_images("./Training/Lychee", images, classes, 8)
lire_images("./Training/Kiwi", images, classes, 9)
lire_images("./Training/Blueberry", images, classes, 10)
lire_images("./Training/Cherry 1", images, classes, 11)
lire_images("./Training/Apricot", images, classes, 12)
lire_images("./Training/Lemon", images, classes, 13)
lire_images("./Training/Watermelon", images, classes, 14)
lire_images("./Training/Tangelo", images, classes, 15)
lire_images("./Training/Plum", images, classes, 16)
lire_images("./Training/Peach", images, classes, 17)
lire_images("./Training/Kaki", images, classes, 18)
lire_images("./Training/Mango", images, classes, 19)


fruits = ["Banane","Mais","Fraise","Framboise","Clementine","Poire",
          "Nectarine","Orange","Lychee","Kiwi","Mirtille","Cerise",
          "Abricot","Citron","Melon","Tangelo","Plum","Peche","Kaki","Mangue"]

K = len(fruits)

images,classes = shuffle(images, classes, random_state=10) 

## Fonctions de préparation des données

In [3]:
def transformer_en_gris(img_orig):
    im = np.zeros((img_orig.shape[0],img_orig.shape[1])) # On fait une copie de l'original
    for i in range(img_orig.shape[0]):
        for j in range(img_orig.shape[1]):
            r, v, b = img_orig[i, j]
            moyenne = np.mean(img_orig[i, j])
            im[i, j] = int(moyenne)
    return im

def decouper_image(image, taille) :
    return [image[x:x+taille,y:y+taille] for x in range(0,image.shape[0],taille) for y in range(0,image.shape[1],taille)]



## Fonctions de preprocessing

In [4]:
def preprocessing(images,nbPatch):
    taillePatch = int((np.shape(images)[1]+np.shape(images)[2])/nbPatch)
    X = np.zeros((np.shape(images)[0]*nbPatch,taillePatch,taillePatch))
    insertion = 0
    for i in range(np.shape(images)[0]):
        image_grise = transformer_en_gris(images[i]) # Recuperation de l'image i
        image_decoupe_list = decouper_image(image_grise,taillePatch) # image découpé
        for j in range(nbPatch):
            X[insertion] = image_decoupe_list[j]
            insertion += 1
    return X

## Algorithme des K-Moyennes

In [5]:
def dist(x1, x2) :
    return np.abs(x1-x2).sum()

class Kmoyennes:

    def __init__(self,K, taillePatch, IterationMax): ## instanciation d'un objet du type de la classe.
        self.K = K
        self.taillePatch = taillePatch
        self.N = 0
        self.D = 0
        self.IterationMax = IterationMax
        self.affectations = np.zeros((self.N))
        self.representants = np.zeros((self.K, self.taillePatch, self.taillePatch)) #les representants sont des patches
    
    def fit(self,X):
        self.N = X.shape[0]
        self.D = X.shape[1]
        representants_initiaux = []
        for i in range(self.K) :
            representants_initiaux.append(np.random.randint(255, size=(self.taillePatch, self.taillePatch)))
        representants = np.array(representants_initiaux)    
        
        for iterations in range(self.IterationMax):
            affectations = self.maj_affectations(X, representants)
            representants = self.maj_representants(X, affectations)
            
        self.representants = representants
        self.affectations = affectations
        return self.representants
    
    #retourne l'image moyenne (moyenne pixel a pixel de chaque patch du groupe X)
    def barycentre(self, X) :
        if X.shape[0] == 0 :
            return None
        return X.mean(axis=0)
    
    def maj_affectations(self, X, r):
        N = X.shape[0]
        K = r.shape[0]
        a = np.zeros((N, K))
        for n in range(N):
            distances = np.zeros(K)
            for k in range(K):
                distances[k] = dist(X[n], r[k])
            a[n, np.argmin(distances, axis=0)] = 1
        return a

    def maj_representants(self, X, a):
        K = self.K
        r = np.zeros((K,self.taillePatch, self.taillePatch))
        for k in range(K):
            masque = a[:,k] == 1
            barycentre = self.barycentre(X[masque])
            if barycentre is not None :
                r[k] = barycentre
        return r

In [6]:
#retourne la matrice des appartenances des patches aux features pour chaque image
def matrice_resultat_kmoyennes(affectationsKmeans, nbPatches):
    resultats = np.zeros((int(affectationsKmeans.shape[0]/nbPatches),affectationsKmeans.shape[1]))
    for image in range(resultats.shape[0]):
        patchesImage = affectationsKmeans[(nbPatches*image):(nbPatches*image+nbPatches)]
        somme = np.sum(patchesImage, axis=0)
        somme[somme >= 1] = 1
        resultats[image] = somme
    return resultats

## Modèle d'apprentissage : Le Bayésien Naïf

In [7]:
def BayesienNaif_fit(X,y):
    N = X.shape[0]
    D = X.shape[1]
    
    pkd = np.zeros((K,D))
    Pk = np.zeros((K,))
    
    for k in range(K):
        mask = (y == k)
        Pk[k] = np.sum(mask) / N
        Xk = X[mask]
        
        for d in range(D):
            count = 0
            for n in range(Xk.shape[0]):
                count += Xk[n , d]
            
            pkd[k , d] = count / Xk.shape[0]
    
    return pkd, Pk

def BayesienNaif_predict(X, pkd, Pk):
    epsilon = 1e-8
    N = X.shape[0]
    D = X.shape[1]
    
    y_pred = np.zeros(N)
    
    for n in range(N):
        x = X[n]
        score = np.zeros(K)
        for k in range(K):
            score[k] = (
                np.log(Pk[k])
                +
                np.sum(
                x * np.log(pkd[k] + epsilon)
                    +
                (1 - x) * np.log(1 - pkd[k] + epsilon)
                )
            )
        k = np.argmax(score)
        y_pred[n] = k
    return y_pred

## Fonctions de Cross-Validation sur le nombre de cluster

In [8]:
def appartenances_images_cibles(imagesValidation, nbPatches, representants) :
    patchesImages = preprocessing(imagesValidation,nbPatches)
    representantsCible = np.zeros((np.shape(imagesValidation)[0]*4,representants.shape[0]))
    for n in range(patchesImages.shape[0]) :
        distances = np.zeros(representants.shape[0])
        for k in range(representants.shape[0]) :
            distances[k] = dist(patchesImages[n], representants[k])
        representantsCible[n][np.argmin(distances)] = 1
    return matrice_resultat_kmoyennes(representantsCible, nbPatches)

In [9]:
#propTest entre 0 et 1
def separer_train_test(images, classes, propTest) :
    N = np.shape(images)[0]
    Ntest = int(propTest * N)
    return images[:Ntest], classes[:Ntest], images[Ntest :], classes[Ntest :]

#indexVal entre 0 et 4
def decouper_folds(imagesSansTest,classesSansTest, indexVal) :
    indexReel = int((np.shape(imagesSansTest)[0]/5)*indexVal)
    largeurIntervalle = int(np.shape(imagesSansTest)[0]/5)
    
    images_train =  imagesSansTest[: indexReel] + imagesSansTest[(indexReel+largeurIntervalle) : ]
    classes_train = classesSansTest[: indexReel] + classesSansTest[(indexReel+largeurIntervalle) : ]
    images_validation = imagesSansTest[indexReel:(indexReel+largeurIntervalle)]
    classes_validation = classesSansTest[indexReel:(indexReel+largeurIntervalle)]
        
    return images_train, classes_train, images_validation, classes_validation

images_sans_test, classes_sans_test, images_test, classes_test =  separer_train_test(images, classes, 0.80)
images_train, classes_train, images_validation, classes_validation = decouper_folds(images_sans_test, classes_sans_test, 4)

print(np.shape(images))
print("-----------------------------")
print(np.shape(images_sans_test))
print(np.shape(classes_sans_test))
print(np.shape(images_test))
print(np.shape(classes_test))     
print("-----------------------------")
print(np.shape(images_train))
print(np.shape(classes_train))
print(np.shape(images_validation))
print(np.shape(classes_validation))                         

(11051,)
-----------------------------
(8840,)
(8840,)
(2211,)
(2211,)
-----------------------------
(7072,)
(7072,)
(1768,)
(1768,)


In [10]:
#cross-validation

nbPatches = 4
nbClustersToTest = [60, 65, 70, 75, 80, 85, 90, 95, 100, 105, 110, 115, 120, 125]
erreurs = []

images_sans_test, classes_sans_test, images_test, classes_test =  separer_train_test(images, classes, 0.70)

for nb in nbClustersToTest :
    accuracyCum = 0
    for i in range(5) :
        images_train, classes_train, images_validation, classes_validation = decouper_folds(images_sans_test, classes_sans_test, i)
        patches_train = preprocessing(images_train, nbPatches) #pas optimal, en plus de prendre bcp de temps
        Km = Kmoyennes(K=nb, taillePatch=25, IterationMax=10)
        Km.fit(X=np.array(patches_train)) 
        representants = Km.representants
        affectations = Km.affectations
        X_train = matrice_resultat_kmoyennes(affectations, nbPatches)
        pkd , Pk = BayesienNaif_fit(X_train, np.array(classes_train))
        X_validation = appartenances_images_cibles(images_validation, nbPatches, representants)
        ypred = BayesienNaif_predict(np.array(X_validation), pkd,Pk)
        accuracy = np.sum(classes_validation == ypred) / np.shape(images_validation)[0]
        print(f'accuracy : {accuracy} nb cluster : {nb} it : {i}')
        accuracyCum += accuracy
    accuracyCum /= 5
    erreurs.append(1-accuracyCum)
    print('fin itération')
    
    
plt.plot(nbClustersToTest, erreurs)


IndexError: tuple index out of range

## Test et résultat du modèle

In [None]:
#test du modele

images_sans_test, classes_sans_test, images_test, classes_test =  separer_train_test(images, classes, 0.70)

nbPatches = 4
patches_train = preprocessing(images_sans_test, nbPatches)
Km = Kmoyennes(K=120, taillePatch=25, IterationMax=20)
Km.fit(X=np.array(patches_train)) 
representants = Km.representants
affectations = Km.affectations

X_train = matrice_resultat_kmoyennes(affectations, nbPatches)
pkd , Pk = BayesienNaif_fit(X_train, np.array(classes_sans_test))

X_test = appartenances_images_cibles(images_test, nbPatches, representants)
ypred = BayesienNaif_predict(np.array(X_test), pkd,Pk)
accuracy = np.sum(classes_test == ypred) / np.shape(images_test)[0]

value = 1000
print(f'valeur de l\'image : {classes_test[value]} valeur predite : {ypred[value]}')

print("accuracy : ",np.sum(classes_test == ypred) / np.shape(images_test)[0])