# Data Processing

**`Imports`**

In [1]:
from __future__ import print_function, division

import copy
import numpy
import numpy as np
import os
import pandas

import xml.etree.ElementTree as ET

from sklearn import svm
from sklearn.metrics import classification_report,accuracy_score
from sklearn.model_selection import GridSearchCV

import time

import matplotlib.pyplot as plot

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import torchvision
from torchvision import datasets, models, transforms
import torchvision.transforms.functional as TF
from torch.utils.data import Subset

**`Converting the XMLs to CSV file`**

In [2]:
dataDirectory = "small_dataset_train/"

# Parses all XML files in the directory, then return a dataframe of all the infos
def parseFilesInDirectory(directory):
    flowers = []
    
    for file in os.listdir(directory):
        if file.endswith(".xml"):
            flowers = flowers + getInfosfromXML(os.path.join(directory, file))
            
    return flowers

# Parses the XML file to keep the relevant informations only, then return a dataframe
def getInfosfromXML(fileContent):
    root = ET.parse(fileContent).getroot()
    
    # We want a list of dictionnaries
    flowerInfos = []
    flowerInfos.append(
                    dict(MediaId = root.find("MediaId").text,
                         ObservationId = root.find("ObservationId").text,
                         ClassId = root.find("ClassId").text,
                         Content = root.find("Content").text,
                         Family = root.find("Family").text,
                         Genus = root.find("Genus").text,
                         Species = root.find("Species").text,
                         Date = root.find("Date").text,
                         Location = root.find("Location").text,
                         Latitude = root.find("Latitude").text,
                         Longitude = root.find("Longitude").text,
                         Author = root.find("Author").text,
                         Vote = root.find("Vote").text
                        )
                 )

    return flowerInfos

flowers = pandas.DataFrame(parseFilesInDirectory(dataDirectory + "train/"))
flowers.set_index("MediaId", inplace = True)

flowers.to_csv(dataDirectory + "Flowers.csv")
flowers.head()

Unnamed: 0_level_0,Author,ClassId,Content,Date,Family,Genus,Latitude,Location,Longitude,ObservationId,Species,Vote
MediaId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1378,pierre bonnet,493,Flower,2011-3-20,Asteraceae,Bellis,43.65188,Palavas-les-Flots,3.86169,23116,Bellis perennis L.,3
15580,mathieu menand,4477,Flower,2007-7-7,Ranunculaceae,Anemone,,Aston,,28135,Anemone alpina L.,3
97913,herve goeau,4516,Flower,2013-4-21,Ranunculaceae,Ficaria,48.84401,Vert-le-Petit,2.35995,37273,Ficaria verna Huds.,4
101501,liliane roubaudi,2394,Flower,2012-4-11,Cistaceae,Cistus,,Narbonne,,25202,Cistus albidus L.,4
76867,herve goeau,5148,Flower,2013-3-24,Salicaceae,Salix,48.8567,Sainte-Geneviève-des-Bois,2.24104,11603,Salix caprea L.,3


**`Converting the XMLs to CSV file`**

In [3]:
data_dir = 'small_dataset_train/'
# check path
# os.path.isfile(data_dir)
os.path.isdir(data_dir)

True

**`Transformation of the training and validation sets`**

In [4]:
# arrays to normalization
normalize_mean = np.array([0.485, 0.456, 0.406])
normalize_std = np.array([0.229, 0.224, 0.225])

data_transforms = {}

# transforms to train data set
data_transforms['train'] = transforms.Compose([
    transforms.RandomChoice([
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.RandomVerticalFlip(p=0.5),
        transforms.RandomRotation(180),
        ]),
    transforms.RandomResizedCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(
        normalize_mean,
        normalize_std)
    ])

# transforms to valid data set
data_transforms['valid'] = transforms.Compose([
    transforms.Resize(255),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(
        normalize_mean,
        normalize_std)
    ])

**`Load the datasets with ImageFolder`**

In [5]:
from torchvision.datasets import ImageFolder
from torchvision.transforms import ToTensor

image_datasets = {}
image_datasets = ImageFolder(root='small_dataset_train', transform=data_transforms['train'])
print(image_datasets.classes)

valid_dataset_to_split = ImageFolder(root='small_dataset_train', transform=data_transforms['valid'])
print(valid_dataset_to_split.classes)

['train']
['train']


**`Obtain validation and training datasets that will be used to evaluate the network`**

In [6]:
valid_data_index_list = []
test_data_index_list = []

for index in range(0, len(valid_dataset_to_split), 2):
    valid_data_index_list.append(index)
    test_data_index_list.append(index+1)

image_datasets['valid_data'] = list(Subset(valid_dataset_to_split, valid_data_index_list))
image_datasets['test_data'] = list(Subset(valid_dataset_to_split, test_data_index_list))

TypeError: 'ImageFolder' object does not support item assignment

**`Using the image datasets and the transforms, define the dataloaders`**

In [None]:
dataloaders = {}
dataloaders['train_data'] = torch.utils.data.DataLoader(image_datasets['train_data'], batch_size=32, shuffle=True, num_workers=32)
dataloaders['valid_data'] = torch.utils.data.DataLoader(image_datasets['valid_data'], batch_size=23, shuffle=False, num_workers=32)
dataloaders['test_data'] = torch.utils.data.DataLoader(image_datasets['test_data'], batch_size=23, shuffle=False, num_workers=32)
print(f"Train data: {len(dataloaders['train_data'].dataset)} images / {len(dataloaders['train_data'])} batches")
print(f"Valid data: {len(dataloaders['valid_data'].dataset)} images / {len(dataloaders['valid_data'])} batches")
print(f"Test  data: {len(dataloaders['test_data'].dataset)} images / {len(dataloaders['test_data'])} batches")