# HMIN339 : Méthodes Avncées de la Science de données


## **`Réconnaissance Visuelle de Plantes`**

### Object du Project :
Réconnaissance d'espèces de plantes à partir de photos

### Jeu de Départ : 
3474 images appartenant à 50 espèces différentes

### Encadrement :
* **`Konstantin TODOROV`**
* **`Pascal PONCELET`**
 
### Fait par :
* **`BEYA NTUMBA Joel`**
* **`MINKO AMOA Dareine`**
* **`QUENETTE Christophe`**
* **`SHAQURA Tasnim`**

In [1]:
import os
import glob
import json
import csv
import numpy as np
from cv2 import cv2 
import numpy as np 
import pandas as pd
import xml.etree.ElementTree as et
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from skimage.feature import hog
from skimage import data, exposure
from PIL import Image

## Step 1 : Preprocessing

**Dataset's creation**

#### ***XML file's conversion to one CSV***

In [2]:
def xml_to_csv(path):
    xml_list = []
    for xml_file in glob.glob(path + '/*.xml'):
        tree = et.parse(xml_file)
        root = tree.getroot()
        value = (int(root.find('ObservationId').text),
                 int(root.find('MediaId').text),
                 float(root.find('Vote').text),
                 root.find('Content').text,
                 int(root.find('ClassId').text),
                 root.find('Family').text,
                 root.find('Genus').text,
                 root.find('Species').text,
                 root.find('Location').text,
                 root.find('Date').text
                 )
        xml_list.append(value)
    column_name = ['ObservationId', 'MediaId', 'Vote', 'Content', 'ClassId', 'Family', 'Genus', 'Species','Location','Date']
    xml_df = pd.DataFrame(xml_list, columns=column_name)
    return xml_df


def main():
    image_path = os.path.join(os.getcwd(), 'small_dataset_train/train')
    xml_df = xml_to_csv(image_path)
    xml_df.to_csv('Flowers.csv', index=None)
    print('Successfully converted xml to csv.')


main()

Successfully converted xml to csv.


In [3]:
df = pd.read_csv('Flowers.csv')
df = df.sort_values(['MediaId'], ascending=True)
df.head()

Unnamed: 0,ObservationId,MediaId,Vote,Content,ClassId,Family,Genus,Species,Location,Date
397,3087,23,2.0,Flower,5148,Salicaceae,Salix,Salix caprea L.,Champ-sur-Drac,2006-3-10
1108,664,133,4.0,Flower,576,Asteraceae,Cyanus,Cyanus segetum Hill,Paris,2013-6-28
3447,33480,177,3.0,Flower,30471,Orchidaceae,Ophrys,Ophrys scolopax Cav.,Lauret,2009-5-20
1440,4130,194,4.0,Flower,1047,Asteraceae,Leucanthemum,Leucanthemum vulgare Lam.,,2013-6-27
476,5260,281,4.0,Flower,8600,Orchidaceae,Anacamptis,Anacamptis pyramidalis (L.) Rich.,La Tour-sur-Orb,2013-5-21


### ***Put images into array***

In [4]:
# function to load folder into arrays and then it returns that same array
def loadImages():
    # Put files into lists and return them as one list of size 4
    image_path = os.path.join(os.getcwd(), 'small_dataset_train/train')
    image_files = sorted([os.path.join(image_path, file)
         for file in os.listdir(image_path) if file.endswith('.jpg')])
    
    return image_files

In [5]:
def reloadImages():
    images = []
    image_files = sorted([os.path.join('small_dataset_train/train', file)
         for file in os.listdir('small_dataset_train/train') if file.endswith('.png')])
    
    for image in sorted(image_files):
        temp = Image.open(image)
        keep = temp.copy()
        images.append(keep)
        
    
    return images

### ***Image Dataframe Creation***

In [6]:
image_path = os.path.join(os.getcwd(), 'small_dataset_train/train')
image_files = [os.path.basename(os.path.join(image_path, file))
         for file in os.listdir(image_path) if      file.endswith('.jpg')]

fileIds = []
for image_file in image_files:
    fileId = image_file.split('.')[0]
    fileIds.append(fileId)
        
        
    
# Creation of the Images Dataframe
df2 = pd.DataFrame({
    'Filename': image_files,
    'MediaId': fileIds
})

# Conversion of types of MediaId rows from 'object' to 'int64'
df2["MediaId"] = pd.to_numeric(df2["MediaId"])

df2.head()

Unnamed: 0,Filename,MediaId
0,48317.jpg,48317
1,75041.jpg,75041
2,64605.jpg,64605
3,5765.jpg,5765
4,83008.jpg,83008


### ***Merge of Image Dataframe and XML Dataframe by 'MediaId'***

In [7]:
df3 = pd.merge(df, df2, on='MediaId', how='inner')
df3[['MediaId','Species','Filename']].head()

Unnamed: 0,MediaId,Species,Filename
0,23,Salix caprea L.,23.jpg
1,133,Cyanus segetum Hill,133.jpg
2,177,Ophrys scolopax Cav.,177.jpg
3,194,Leucanthemum vulgare Lam.,194.jpg
4,281,Anacamptis pyramidalis (L.) Rich.,281.jpg


### ***Creation of the final dataframe***

#### ***Only keeping the 'Species' and 'Filename' columns***

In [8]:
# Copy of initial dataframe
data = df3.copy()

# Deleting non-useful columns
data=data.drop(['ObservationId', 'MediaId', 'Vote', 'Content', 
                'ClassId', 'Family', 'Genus','Location','Date'],
axis=1)

data.head()

Unnamed: 0,Species,Filename
0,Salix caprea L.,23.jpg
1,Cyanus segetum Hill,133.jpg
2,Ophrys scolopax Cav.,177.jpg
3,Leucanthemum vulgare Lam.,194.jpg
4,Anacamptis pyramidalis (L.) Rich.,281.jpg


### ***Resizing images***

In [9]:
##### Display one image
def display_one(a, title1 = "Original"):
    plt.imshow(a), plt.title(title1)
    plt.xticks([]), plt.yticks([])
    plt.show()
    
# Display two images
def display(a, b, title1 = "Original", title2 = "Edited"):
    plt.subplot(121), plt.imshow(a), plt.title(title1)
    plt.xticks([]), plt.yticks([])
    plt.subplot(122), plt.imshow(b), plt.title(title2)
    plt.xticks([]), plt.yticks([])
    plt.show()
    
# Preprocessing
def resize(images):
    # Getting 500 images to work with 
    img = [cv2.imread(i, cv2.IMREAD_UNCHANGED) for i in images]
    #print('Original size', img[1].shape)
    # --------------------------------
    # setting dimensions of the resize
    height = 220
    width = 220
    dim = (width, height)
    resized_images = []
    for i in range(len(img)):
        resized = cv2.resize(img[i], dim, interpolation=cv2.INTER_LINEAR)
        resized_images.append(resized)

    # Checking the size
    #print("RESIZED", res_img[1].shape)
    
    return resized_images
    # Visualizing one of the images in the array
    # original = res_img[1]
    # display_one(original)
    

### Apply greyscale, resize, and rename images 

In [10]:
def formatting(images):
    for i in images:
        img = Image.open(i)      
        temp = i.split('.')
        new_image = img.resize((220, 220))    
        greyscale_image = new_image.convert('L')
       # croppedIm = greyscale_image.crop((290, 250, 650, 650)) 
        name = str(temp[0])+'.png'       
       # croppedIm.save(name)
        greyscale_image.save(name)

## Apply HOG on images

In [11]:
def HOG(images):
    images_HOG = [];
    for image in images:
        hog = cv2.HOGDescriptor()
        im = cv2.imread(image)
        h = hog.compute(im)

        images_HOG.append(h)
        
        
    print(len(images_HOG))
    return images_HOG

In [12]:
def Histogram_Oriented_Gradients(images_list):
    listfig, listfd = [], []
    for image in images_list:
        fd, hog_image = hog(image, orientations=8, pixels_per_cell=(16, 16),
                        cells_per_block=(1, 1), visualize=True, multichannel=False)

        # Rescale histogram for better display
        hog_image_rescaled = exposure.rescale_intensity(hog_image, in_range=(0, 10))
        listfig.append(hog_image_rescaled)
        listfd.append(fd)
    
    return listfd

In [13]:
def process_images(images):
    nrows = 100
    ncolumns = 100
    channels = 3
    X = []
    
    for image in images:
        X.append(
            cv2.resize(cv2.imread(image, cv2.IMREAD_COLOR), (nrows, ncolumns), interpolation = cv2.INTER_CUBIC)
        )
    
    return X

### load images from the dataset

In [14]:
print("Loading images ...")
images = loadImages()
print("Images loaded \n")

print("Processing images ...")    
X = process_images(images)
print("Processing loaded \n")

X = np.array(X)
y = df3[['Species']].values.ravel()

print("shape of X :", X.shape)
print("shape of y :", y.shape)

Loading images ...
Images loaded 

Processing images ...
Processing loaded 

shape of X : (3474, 100, 100, 3)
shape of y : (3474,)


### Store variables to pass it to CNN

In [15]:
preprocessed_images = (X, y)
%store preprocessed_images

Stored 'preprocessed_images' (tuple)
