# Classification de plantes

## Imports

### Library

In [1]:
# Imports de base
import os
import pandas as pd
import numpy as np

# Classification sans features
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression

# Mesures : accuracy , rappel, précision, f1 score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

# Skimage 
from skimage import io
from skimage import transform
from skimage.util import img_as_ubyte
from skimage.util import img_as_int

# MultiThreading
from joblib import Parallel, delayed

# torchvision
import torch
import torchvision
import torchvision.transforms as transforms

# torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# plot
import matplotlib.pyplot as plt

### Supprimer les warnings futurs

In [2]:
import warnings
# Install a filter that ignores every warning of category FutureWarning.
warnings.filterwarnings("ignore", category=FutureWarning)

## Chargement des données

In [None]:
# Build the path ../data/raw_data/metadata.csv and load it into a DataFrame.
file = os.path.join("..", "data", "raw_data", "metadata.csv")
df = pd.read_csv(file, sep=",")
# Preview the first rows.
df.head()

### Combien de données (photos) on a pour chaque espèce

In [None]:
# Count the non-null 'mediaid' values in each 'species' group.
df.groupby('species')['mediaid'].count()

### Nombre total de données

In [None]:
# Number of distinct media ids in the metadata.
tailleTotale = len(df["mediaid"].unique())
print(tailleTotale)

## Plan des étapes à faire
    Faire la somme cumulée des plantes : s'il y a un gros pic => on a une espèce qui représente une grosse partie des données
    
    Est ce que les données sont bien balancées? 
    
    Métrique: comment comparer les différents modèles? 
    Si données pas trop balancées: accuracy pas trop mal pour mesurer!
    Moyenner par espèce: taux moyen pour chaque espèce
    Mesurer le taux d'erreur : données non balancées (issues de l'app plantNet: attiré plus vers certaines plantes plutot que d'autres) 
    stratifySplit (split, trainset split) 
    
    1) Prendre une baseline basique : 
        - Prendre un classifieur le plus naif possible et calculer son accuracy et on aura un accuracy de base 
        - permet de donner un sens aux chiffres 
        - Si moins bien que le truc très naif c'est qu'il y a un soucis
    2) Mettre les images en entrée et de regarder ce qui sort d'un classifier linéaire. 
        a) Redimensionner : vectoriser 
        b) Balancer au classifier: tableau 3D 
    3) https://scikit-image.org/docs/stable/auto_examples/features_detection/plot_hog.html#sphx-glr-auto-examples-features-detection-plot-hog-py
    Celui ci retire les couleurs: importants pour les fleurs
    Descripteur global sur image : si rotation alors problème , baisse de luminosité etc..
    
    Combiner les descripteurs : hog combiné à un descripteur de couleurs
    
    4) Bag-of-Visual-Words (BoVW) : Prend les descripteurs et en fait une représentation (marche plutot bien)
    
    5) Réseau neuronal : passer en entrée les images et lui va apprendre les features
    Pytorch: CNN (exemples tutos) 
    MLP (Multi Layer Perceptron : beaucoup plus rapide que le convolutionnel)
    Transfer learning : prendre un autre modele déja entrainé et le réutiliser : utile qd on a peu de données
    Meilleur pour les réseaux neuronaux de partir d'un modèle pré-entraîné

## Classification naïve: prédire toujours même classe (BaseLine)

C'est le classifieur le plus bas : si on obtient un score plus bas, c'est qu'il y a un souci quelque part.

Papaver rhoeas L.                                          119 (à prédire)

Calculer accuracy , f-measure, rappel

Accuracy = nbre d'éléments corrects / nombre total
Précision = nbre d'éléments corrects pour la classe / nbre prédit
Rappel = nbre d'éléments correctement attribués à la classe / nbre de documents appartenant à la classe i (ici 119) 

In [None]:
# Baseline predictor: always answers the same class (per the original note,
# the class that has the most photos).
def naivePredict(row, classid=30269):
    """Return the constant baseline prediction.

    Args:
        row: The sample to classify. It is ignored; the parameter only keeps
            the call shape of a per-row predictor.
        classid: Class id to predict. Defaults to 30269, the value that was
            previously hard-coded.

    Returns:
        ``classid``, unchanged.
    """
    return classid

In [None]:
# Evaluate the naive prediction on every row of the dataframe.
def predictionNaive(df):
    """Print accuracy, precision and recall of ``naivePredict`` over ``df``.

    Precision and recall are reported for the class the baseline predicts
    (taken from the first prediction; the baseline is expected to be
    constant). Every denominator is measured on ``df`` itself rather than
    taken from a notebook global or a hard-coded class size, so the function
    gives consistent results on any subset (e.g. a test split).

    Args:
        df: DataFrame with a ``classid`` column holding the true class of
            each row.

    Returns:
        None. The three metrics are printed.
    """
    def ratio(numerator, denominator):
        # An empty dataframe or an absent class yields 0.0 instead of a
        # ZeroDivisionError.
        return numerator / denominator if denominator else 0.0

    truths = df["classid"].tolist()
    predictions = [naivePredict(row) for _, row in df.iterrows()]

    # Class whose precision/recall is reported.
    target = predictions[0] if predictions else None

    correct = sum(pred == truth for pred, truth in zip(predictions, truths))
    predicted_target = sum(pred == target for pred in predictions)
    support = sum(truth == target for truth in truths)
    true_positives = sum(
        pred == target and truth == target
        for pred, truth in zip(predictions, truths)
    )

    acc = ratio(correct, len(truths))
    prec = ratio(true_positives, predicted_target)
    rappel = ratio(true_positives, support)
    print("Accuracy : ", acc)
    print("Précision : ", prec)
    print("Rappel : ", rappel)
    

In [None]:
# Run the naive baseline evaluation on the full metadata dataframe.
predictionNaive(df)

## Classification sans features (SVM, LR)

Réduire la taille de l'image car taille d'image trop grande (ne rentrera pas en mémoire) => descripteur de taille + petite

### Load Train data without features

In [None]:
# Number of joblib workers used by the parallel image loading cells.
threads = 6
# Name of the split; it is both the sub-folder and the CSV base name.
subset = 'train'

# "../data/" (trailing separator kept) and "train/train.csv".
folder = os.path.join("..", "data", "")
file = os.path.join(subset, subset + ".csv")

train_raw_df = pd.read_csv(folder + file, sep=",")
train_raw_df.head()

In [None]:
# Directory (with trailing separator) of the resized images of the current subset.
loadpath = os.path.join(folder, 'resized', subset, "")

### Classifieurs

In [None]:
# (name, estimator) pairs trained and evaluated by the following cells.
models_classifiers = [
    ('LinearSVC', LinearSVC(random_state=0, tol=1e-5)),
    ('SGDClassifier', SGDClassifier(loss="hinge", penalty="l2", max_iter=5)),
    ('LR', LogisticRegression()),
]

### Train Classifiers

In [None]:
def imgToVector(fname):
    """Read an image and return its pixels as a flat uint8 vector.

    Args:
        fname: File name appended to the module-level ``loadpath``.

    Returns:
        A 1-D array of 100 * 100 * 3 values. The reshape raises if the image
        does not hold exactly that many values.
    """
    pixels = img_as_ubyte(io.imread(loadpath + fname))
    return pixels.reshape(100 * 100 * 3)

In [None]:
# Vectorise every training image ("<mediaid>.jpg") using `threads` joblib workers.
liste_image = Parallel(n_jobs=threads)(
    delayed(imgToVector)(f"{media_id}.jpg")
    for media_id in train_raw_df['mediaid']
)

In [None]:
# Display the list of image vectors produced by the parallel loading step.
liste_image

In [None]:
# Features are the flattened images, targets are the class ids from train.csv.
X_train = liste_image
y_train = train_raw_df['classid']

# Fit each registered classifier in place on the raw pixel vectors.
for name, clf in models_classifiers:
    clf.fit(X_train, y_train)

### Load Test data

In [None]:
# Switch to the test split and load its CSV.
subset = 'test'
file = os.path.join(subset, subset + ".csv")

test_raw_df = pd.read_csv(folder + file, sep=",")
test_raw_df.head()

# Rebind the global read by imgToVector so it now points at the resized test images.
loadpath = os.path.join(folder, 'resized', subset, "")

In [None]:
# Vectorise every test image ("<mediaid>.jpg") using `threads` joblib workers.
liste_image_test = Parallel(n_jobs=threads)(
    delayed(imgToVector)(f"{media_id}.jpg")
    for media_id in test_raw_df['mediaid']
)

### Test Classifiers

In [None]:
# Test features and ground-truth class ids.
X_test = liste_image_test
y_true = test_raw_df['classid']

# One prediction array per classifier, in the order of models_classifiers.
y_predicts = [model.predict(X_test) for _, model in models_classifiers]

In [None]:
# Print the micro-averaged metrics of each classifier, in the order of y_predicts.
for y_predict in y_predicts:
    print("Accuracy: " + str(accuracy_score(y_true, y_predict)))
    print("Rappel: " + str(recall_score(y_true, y_predict, average='micro')))
    print("Precision: " + str(precision_score(y_true, y_predict, average='micro')))
    # Fixed: this line previously called precision_score, so the value shown
    # as "F-Measure" was the precision again.
    print("F-Measure: " + str(f1_score(y_true, y_predict, average='micro')))
    print('------------')

## Classification basique avec features HOG (SVM, LR)

### Load data with features to train

In [None]:
# Reload the training CSV and the HOG descriptors saved next to the resized images.
subset = 'train'

folder = os.path.join("..", "data", "")
file = os.path.join(subset, subset + ".csv")

train_raw_df = pd.read_csv(folder + file, sep=",")
train_raw_df.head()

# ../data/resized/train/train_hog.csv
hogpath = os.path.join(folder, 'resized', subset, subset + "_hog.csv")
train_hog_df = pd.read_csv(hogpath, sep=",")
train_hog_df.head()

### Classifieurs

In [None]:
# Fresh (unfitted) estimators for the HOG experiment, as (name, estimator) pairs.
models_classifiers = [
    ('LinearSVC', LinearSVC(random_state=0, tol=1e-5)),
    ('SGDClassifier', SGDClassifier(loss="hinge", penalty="l2", max_iter=5)),
    ('LR', LogisticRegression()),
]

### Train classifiers

In [None]:
# Features: the HOG table; targets: the class ids from train.csv.
# NOTE(review): assumes both files list the images in the same row order — confirm.
X_train = train_hog_df
y_train = train_raw_df['classid']

In [None]:
# Fit each registered classifier in place on the HOG features.
for name,clf in models_classifiers:
    clf.fit(X_train, y_train)

### Load Test data

In [None]:
# Load the test CSV and the HOG descriptors of the test images.
subset = 'test'
file = os.path.join(subset, subset + ".csv")

test_raw_df = pd.read_csv(folder + file, sep=",")
test_raw_df.head()

# ../data/resized/test/test_hog.csv
hogpath = os.path.join(folder, 'resized', subset, subset + "_hog.csv")
test_hog_df = pd.read_csv(hogpath, sep=",")
test_hog_df.head()

### Test Classifiers

In [None]:
# HOG test features and ground-truth class ids.
X_test = test_hog_df
y_true = test_raw_df['classid']

# One prediction array per classifier, in the order of models_classifiers.
y_predicts = [model.predict(X_test) for _, model in models_classifiers]

In [None]:
# Print the micro-averaged metrics of each classifier, in the order of y_predicts.
for y_predict in y_predicts:
    print("Accuracy: " + str(accuracy_score(y_true, y_predict)))
    print("Rappel: " + str(recall_score(y_true, y_predict, average='micro')))
    print("Precision: " + str(precision_score(y_true, y_predict, average='micro')))
    # Fixed: this line previously called precision_score, so the value shown
    # as "F-Measure" was the precision again.
    print("F-Measure: " + str(f1_score(y_true, y_predict, average='micro')))
    print('---------------')

## Réseaux de neurones

Training an image classifier

We will do the following steps in order:

    1) Load and normalize the CIFAR10 training and test datasets using torchvision
    2) Define a Convolutional Neural Network
    3) Define a loss function
    4) Train the network on the training data
    5) Test the network on the test data


### Loading and normalizing the data

In [None]:
# Convert images to tensors, then normalise each of the 3 channels with
# mean 0.5 and std 0.5.
# NOTE: this assignment rebinds `transform`, a name that the import section
# also binds to skimage's `transform` module.
transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

# CIFAR10 training split (download=True, stored under ./data), served in
# shuffled batches of 4 by 2 worker processes.
trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=4,
                                          shuffle=True, num_workers=2)

# CIFAR10 test split, same settings but without shuffling.
testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                       download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=4,
                                         shuffle=False, num_workers=2)

# Human-readable names of the 10 classes, indexed by label.
classes = ('plane', 'car', 'bird', 'cat',
           'deer', 'dog', 'frog', 'horse', 'ship', 'truck')


In [3]:
# Number of joblib workers passed as n_jobs to Parallel when loading images.
threads = 6

In [4]:
# Load the training CSV (../data/train/train.csv).
subset_train = 'train'

folder = os.path.join("..", "data", "")
file_train = os.path.join(subset_train, subset_train + ".csv")

train_raw_df = pd.read_csv(folder + file_train, sep=",")
train_raw_df.head()

Unnamed: 0,observationid,mediaid,vote,content,classid,family,genus,species,author,date,location,latitude,longitude,yearinclef,observationid2014,imageid2014,learntag
0,12510,1648,4.0,Flower,4369,Primulaceae,Primula,Primula veris L.,liliane roubaudi,2014-4-1,La Chapelle en Guinchay,,,PlantCLEF2015,,,Train
1,27147,111066,3.0,Flower,30162,Orchidaceae,Himantoglossum,Himantoglossum hircinum (L.) Spreng.,julien barataud,2006-6-14,Plévenon,,,ImageCLEF2013,14913.0,48553.0,Train
2,35639,17104,3.0,Flower,6538,Orchidaceae,Anacamptis,"Anacamptis morio (L.) R.M.Bateman, Pridgeon & ...",marie portas,2012-3-19,Hyères,,,ImageCLEF2013,6637.0,33907.0,Train
3,18747,30734,3.0,Flower,6415,Orchidaceae,Orchis,Orchis anthropophora (L.) All.,errol vela,2014-4-11,"Sidi Aich, ALGERIE",36.6026,4.69448,PlantCLEF2015,,,Train
4,26828,71634,4.0,Flower,5148,Salicaceae,Salix,Salix caprea L.,inge wullweber,2009-3-11,,,,PlantCLEF2014,2982.0,58691.0,Train


In [5]:
# Load the test CSV (../data/test/test.csv).
subset_test = 'test'
file_test = os.path.join(subset_test, subset_test + ".csv")

test_raw_df = pd.read_csv(folder + file_test, sep=",")
test_raw_df.head()

Unnamed: 0,observationid,mediaid,vote,content,classid,family,genus,species,author,date,location,latitude,longitude,yearinclef,observationid2014,imageid2014,learntag
0,36886,99451,4.0,Flower,8534,Ranunculaceae,Aconitum,Aconitum napellus L.,thierry pernot,1800-1-1,,,,PlantCLEF2014,208.0,22422.0,Train
1,31390,71276,3.0,Flower,2394,Cistaceae,Cistus,Cistus albidus L.,herve goeau,2013-6-5,Paris,48.84059,2.36158,PlantCLEF2014,2338.0,43988.0,Train
2,38327,61697,4.0,Flower,493,Asteraceae,Bellis,Bellis perennis L.,alexis joly,2014-3-8,Clermont-L'Hérault,43.6469,3.38675,PlantCLEF2015,,,Train
3,5105,34873,4.0,Flower,661,Asteraceae,Cichorium,Cichorium intybus L.,liliane roubaudi,2013-9-13,Fleury,,,PlantCLEF2014,3130.0,20340.0,Train
4,18307,56154,4.0,Flower,588,Asteraceae,Centaurea,Centaurea jacea L.,liliane roubaudi,2014-9-5,Vézins-de-Lévézou,,,PlantCLEF2015,,,Train


In [6]:
# Directories (with trailing separator) holding the train / test images.
# `folder` already ends with a separator, so the parts are joined instead of
# inserting a second os.sep, which produced a doubled separator in the path.
# NOTE(review): unlike the earlier cells, these paths do not go through the
# 'resized' sub-folder — confirm that this location is the intended one.
loadpath_train = os.path.join(folder, subset_train, "")
loadpath_test = os.path.join(folder, subset_test, "")

In [7]:
def imgToVector2(loadpath, fname):
    """Read ``loadpath + fname`` and return the image converted to uint8.

    Despite its name, the function does not flatten the image: the array
    keeps the shape produced by ``io.imread``.

    Args:
        loadpath: Directory prefix, concatenated as-is with ``fname``.
        fname: Image file name.

    Returns:
        The image array after ``img_as_ubyte``.
    """
    return img_as_ubyte(io.imread(loadpath + fname))

In [8]:
# Tensor conversion followed by per-channel normalisation (mean 0.5, std 0.5).
# NOTE(review): in the visible cells this pipeline is not passed to the plant
# image lists or their DataLoaders — confirm whether it should be applied.
transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

In [9]:
# Read every training image ("<mediaid>.jpg") using `threads` joblib workers.
trainset = Parallel(n_jobs=threads)(
    delayed(imgToVector2)(loadpath_train, f"{media_id}.jpg")
    for media_id in train_raw_df['mediaid']
)

In [10]:
# Serve the training images in shuffled batches of 4 using 2 worker processes.
# NOTE(review): `trainset` is the list returned by the imgToVector2 loading
# step, i.e. images only, with no labels attached — confirm against the cells
# that unpack (images, labels) from this loader.
trainloader = torch.utils.data.DataLoader(trainset, batch_size=4,
                                          shuffle=True, num_workers=2)

In [None]:
# Display the DataLoader object.
trainloader

In [None]:
# Read every test image ("<mediaid>.jpg") using `threads` joblib workers.
testset = Parallel(n_jobs=threads)(
    delayed(imgToVector2)(loadpath_test, f"{media_id}.jpg")
    for media_id in test_raw_df['mediaid']
)

In [None]:
# Serve the test images in order, in batches of 4, using 2 worker processes.
testloader = torch.utils.data.DataLoader(testset, batch_size=4,
                                         shuffle=False, num_workers=2)

In [None]:
# Distinct class ids of the training set, in order of first appearance.
# Fixed: the tuple was built from 'mediaid', which identifies a single photo,
# so it held one entry per image instead of one entry per class.
# NOTE(review): code indexing `classes[label]` expects labels to be positions
# in this tuple — confirm how labels are encoded.
classes = tuple(train_raw_df['classid'].unique())

In [None]:
#classes

In [11]:
# functions to show an image


def imshow(img):
    """Display an image tensor with matplotlib.

    Args:
        img: Tensor with a ``numpy()`` method whose first axis is moved last
            before plotting. The values are rescaled with ``img / 2 + 0.5``,
            the inverse of a normalisation with mean 0.5 and std 0.5.
    """
    restored = img / 2 + 0.5
    # Move axis 0 to the end: (0, 1, 2) -> (1, 2, 0).
    channels_last = np.transpose(restored.numpy(), (1, 2, 0))
    plt.imshow(channels_last)
    plt.show()


# Fetch one batch of training images.
dataiter = iter(trainloader)
# Fixed: use the built-in next() instead of the iterator's `.next()` method,
# a Python 2 style alias that is not part of the iterator protocol.
# NOTE(review): this unpacking expects the loader to yield (images, labels)
# pairs — confirm that the dataset behind `trainloader` provides labels.
images, labels = next(dataiter)

# Show the batch as a single image grid.
imshow(torchvision.utils.make_grid(images))
# Print the class of every image in the batch (no hard-coded batch size).
print(' '.join('%5s' % classes[label] for label in labels))


OSError: [Errno 12] Cannot allocate memory

In [None]:
# Display the shape of the first image of the batch.
images[0].shape

### Define Convolutional Neural Network

In [None]:
class Net(nn.Module):
    """Small CNN: two conv + max-pool stages followed by three linear layers.

    The first linear layer expects 16 * 5 * 5 features, which is what the two
    5x5 convolutions and 2x2 poolings produce for a 3 x 32 x 32 input.

    Args:
        num_classes: Number of output logits. Defaults to 10, the value that
            was previously hard-coded.
    """

    def __init__(self, num_classes=10):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, num_classes)

    def forward(self, x):
        """Return the logits, shape (batch, num_classes), for a batch ``x``."""
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        # Flatten everything except the batch axis. Unlike view(-1, 400), a
        # wrongly sized input now fails in fc1 with a shape error instead of
        # silently changing the batch size.
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


# Default instance with 10 outputs; pass num_classes=... to match another
# label set.
net = Net()

In [None]:
# Cross-entropy loss on the network outputs, optimised by SGD with momentum
# over all parameters of `net`.
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

### Train the network

In [None]:
# Train `net` for 2 passes over `trainloader`, printing the mean loss of each
# group of 2000 mini-batches.
for epoch in range(2):  # loop over the dataset multiple times

    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 2000 == 1999:    # print every 2000 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 2000))
            # restart the accumulation for the next group of 2000 batches
            running_loss = 0.0

print('Finished Training')

In [None]:
# Save the trained parameters (state_dict only) of `net` to PATH.
PATH = './cifar_net.pth'
torch.save(net.state_dict(), PATH)

### Test the network on the test data

In [None]:
# Fetch the first batch of the test loader.
dataiter = iter(testloader)
# Fixed: use the built-in next() instead of the iterator's `.next()` method,
# a Python 2 style alias that is not part of the iterator protocol.
# NOTE(review): this unpacking expects the loader to yield (images, labels)
# pairs — confirm that the dataset behind `testloader` provides labels.
images, labels = next(dataiter)

# Show the batch and the class of each of its images (no hard-coded batch size).
imshow(torchvision.utils.make_grid(images))
print('GroundTruth: ', ' '.join('%5s' % classes[label] for label in labels))