# Classification de plantes

## Imports

### Library

In [1]:
# Imports de base
import os
import pandas as pd
import numpy as np

# Classification sans features
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression

# Mesures : accuracy , rappel, précision, f1 score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

# Skimage 
from skimage import io
from skimage import transform
from skimage.util import img_as_ubyte
from skimage.util import img_as_int

# MultiThreading
from joblib import Parallel, delayed

### Supprimer les warnings futurs

In [2]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

## Chargement des données

In [3]:
file = ".."+os.sep+"data"+os.sep+"raw_data"+ os.sep + "metadata.csv"
df = pd.read_csv(file,sep=",")
df.head()

Unnamed: 0,observationid,mediaid,vote,content,classid,family,genus,species,author,date,location,latitude,longitude,yearinclef,observationid2014,imageid2014,learntag
0,15010,100019,3.0,Flower,8527,Asteraceae,Achillea,Achillea millefolium L.,mathieu menand,2009-4-24,Aucamville,,,PlantCLEF2014,4688.0,52287.0,Train
1,10732,100056,4.0,Flower,4516,Ranunculaceae,Ficaria,Ficaria verna Huds.,alain06,2014-9-7,Biot,43.62495,7.1044,PlantCLEF2015,,,Train
2,24005,100087,4.0,Flower,588,Asteraceae,Centaurea,Centaurea jacea L.,thierry pernot,1800-1-1,,,,PlantCLEF2014,2559.0,32864.0,Train
3,19892,100182,4.0,Flower,9930,Orchidaceae,Epipactis,Epipactis helleborine (L.) Crantz,thierry pernot,1800-1-1,,,,PlantCLEF2014,11733.0,40782.0,Train
4,36220,10019,4.0,Flower,6487,Orchidaceae,Ophrys,Ophrys apifera Huds.,jean-claude echardour,2010-6-26,Saffré,47.49689,-1.59442,PlantCLEF2014,13210.0,8080.0,Train


### Combien de données (photos) on a pour chaque espèce

In [4]:
df.groupby('species')['mediaid'].count()

species
Achillea millefolium L.                                     62
Aconitum napellus L.                                        61
Anacamptis morio (L.) R.M.Bateman, Pridgeon & M.W.Chase     49
Anacamptis pyramidalis (L.) Rich.                           85
Anemone alpina L.                                           52
Anemone hepatica L.                                         52
Anemone nemorosa L.                                         55
Aquilegia vulgaris L.                                       75
Astrantia major L.                                          56
Bellis perennis L.                                          97
Centaurea aspera L.                                         65
Centaurea jacea L.                                          66
Chelidonium majus L.                                        60
Cichorium intybus L.                                       109
Cirsium vulgare (Savi) Ten.                                 58
Cistus albidus L.                              

### Nombre total de données

In [5]:
tailleTotale = df.mediaid.unique().size
print(tailleTotale)

3474


## Plan des étapes à faire
    Faire la somme cumulée des plantes : si y'a un gros pic => on a une espècee qui représente une grosse partie des données
    
    Est ce que les données sont bien balancées? 
    
    Métrique: comment comparer les différents modèles? 
    Si données pas trop balancées: accuracy pas trop mal pour mesurer!
    Moyenner par espèce: taux moyen pour chaque espèce
    Mesurer le taux d'erreur : données non balancées (issues de l'app plantNet: attiré plus vers certaines plantes plutot que d'autres) 
    stratifySplit (split, trainset split) 
    
    1) Prendre une baseline basique : 
        - Prendre un classifieur le plus naif possible et calculer son accuracy et on aura un accuracy de base 
        - permet de donner un sens aux chiffres 
        - Si moins bien que le truc très naif c'est qu'il y a un soucis
    2) Mettre les images en entrée et de regarder ce qui sort d'un classifier linéaire. 
        a) Redimensionner : vectoriser 
        b) Balancer au classifier: tableau 3D 
    3) https://scikit-image.org/docs/stable/auto_examples/features_detection/plot_hog.html#sphx-glr-auto-examples-features-detection-plot-hog-py
    Celui ci retire les couleurs: importants pour les fleurs
    Descripteur global sur image : si rotation alors problème , baisse de luminosité etc..
    
    Combiner les descripteurs : hog combiné à un descripteur de couleurs
    
    4) Bag-of-Visual-Words (BoVW) : Prend les descripteurs et en fait une représentation (marche plutot bien)
    
    5) Réseau neuronal : passer en entrée les images et lui va apprendre les features
    Pytorch: CNN (exemples tutos) 
    MLP (Multi Layer perceptron : bcp plus rapide que le convutionnel)
    Transfer learning : prendre un autre modele déja entrainé et le réutiliser : utile qd on a peu de données
    Meilleur pour réseau neuronnaux de partir d'un modèle pré-enrtainé

## Classification naïve: prédire toujours même classe (BaseLine)

C'est le classifieur le plus bas : si on a plu bas c'est qu'il y a un soucis qqs part.

Papaver rhoeas L.                                          119 (à prédire)

Calculer accuracy , f-measure, rappel

Accuracy = nbre d'éléments corrects / nombre total
Précision = nbre d'éléments corrects pour la classe / nbre prédit
Rappel = nbre d'éléments correctement attribués à la classe / nbre de documents appartenant à la classe i (ici 119) 

In [6]:
# Définition de la prédiction qui prédit toujours la même classe (avec la classe qui a le plus de photo)
def naivePredict(row):
    return 30269

In [7]:
# Application de la prédiction naïve sur toutes les lignes du dataframe
def predictionNaive(df):
    count = 0
    for index, row in df.iterrows():
        res = naivePredict(row)
        if (res == row["classid"]):
            count = count + 1
    #print("count : ", count)
    acc = count / tailleTotale
    prec = count / tailleTotale
    rappel = count / 119
    print("Accuracy : ", acc)
    print("Précision : ", prec)
    print("Rappel : ", rappel)
    

In [8]:
# Test prediction naive : 
predictionNaive(df)

Accuracy :  0.03425446171560161
Précision :  0.03425446171560161
Rappel :  1.0


## Classification sans features (SVM, LR)

Réduire la taille de l'image car taille d'image trop grande (ne rentrera pas en mémoire) => descripteur de taille + petite

### Load Train data without features

In [17]:
threads = 6
subset = 'train'

folder = ".."+ os.sep + "data" + os.sep
file =  subset + os.sep + subset + ".csv"

train_raw_df = pd.read_csv(folder + file,sep=",")
train_raw_df.head()

Unnamed: 0,observationid,mediaid,vote,content,classid,family,genus,species,author,date,location,latitude,longitude,yearinclef,observationid2014,imageid2014,learntag
0,12510,1648,4.0,Flower,4369,Primulaceae,Primula,Primula veris L.,liliane roubaudi,2014-4-1,La Chapelle en Guinchay,,,PlantCLEF2015,,,Train
1,27147,111066,3.0,Flower,30162,Orchidaceae,Himantoglossum,Himantoglossum hircinum (L.) Spreng.,julien barataud,2006-6-14,Plévenon,,,ImageCLEF2013,14913.0,48553.0,Train
2,35639,17104,3.0,Flower,6538,Orchidaceae,Anacamptis,"Anacamptis morio (L.) R.M.Bateman, Pridgeon & ...",marie portas,2012-3-19,Hyères,,,ImageCLEF2013,6637.0,33907.0,Train
3,18747,30734,3.0,Flower,6415,Orchidaceae,Orchis,Orchis anthropophora (L.) All.,errol vela,2014-4-11,"Sidi Aich, ALGERIE",36.6026,4.69448,PlantCLEF2015,,,Train
4,26828,71634,4.0,Flower,5148,Salicaceae,Salix,Salix caprea L.,inge wullweber,2009-3-11,,,,PlantCLEF2014,2982.0,58691.0,Train


In [18]:
loadpath = folder + 'resized' + os.sep + subset + os.sep

### Classifieurs

In [19]:
models_classifiers = []
models_classifiers.append(('LinearSVC', LinearSVC(random_state=0, tol=1e-5)))
models_classifiers.append(('SGDClassifier', SGDClassifier(loss="hinge", penalty="l2", max_iter=5)))
models_classifiers.append(('LR', LogisticRegression()))

### Train Classifiers

In [20]:
def imgToVector(fname):
    image = io.imread(loadpath + fname)
    image = img_as_ubyte(image)
    vector = image.reshape(100*100*3)
    #vector = vector.flatten()
    return vector

In [21]:
liste_image = Parallel(n_jobs=threads)(delayed(imgToVector)(str(i) + '.jpg') for i in train_raw_df['mediaid']);

In [22]:
#Regarde comment c'est magique la liste d'images tout d'un coup et parallelise en plus  ! ;)
liste_image

[array([14, 12, 13, ...,  8,  8, 10], dtype=uint8),
 array([155, 161,  87, ...,  55,  62,  11], dtype=uint8),
 array([107,  44,  97, ..., 144, 151,  97], dtype=uint8),
 array([157, 143,  68, ...,  39,  45,  11], dtype=uint8),
 array([ 46, 107, 138, ...,  48,  49,  35], dtype=uint8),
 array([ 30,  53,  35, ..., 208, 233, 194], dtype=uint8),
 array([  2,   4,   1, ..., 187, 171, 111], dtype=uint8),
 array([ 79, 133,  39, ...,  57, 100,  28], dtype=uint8),
 array([ 55,  86, 106, ...,  73, 104, 122], dtype=uint8),
 array([107,  99,  53, ...,  81,  65,  42], dtype=uint8),
 array([105, 126,  49, ...,  98, 104, 102], dtype=uint8),
 array([122, 125, 108, ...,  52,  50,  61], dtype=uint8),
 array([37, 38, 32, ..., 21, 31, 23], dtype=uint8),
 array([ 43,  31,  19, ..., 104,  73,  52], dtype=uint8),
 array([ 80,  71,  56, ..., 116,  97,  80], dtype=uint8),
 array([179, 173, 187, ...,  96,  84,  70], dtype=uint8),
 array([ 52,  69,  59, ..., 126, 110, 139], dtype=uint8),
 array([21, 19,  7, ..., 2

In [23]:
y_train = train_raw_df['classid']

X_train = liste_image
for name, clf in models_classifiers:
    clf.fit(X_train, y_train)



### Load Test data

In [24]:
#Load Test Data
subset = 'test'
file =  subset + os.sep + subset + ".csv"

test_raw_df = pd.read_csv(folder + file,sep=",")
test_raw_df.head()

loadpath = folder + 'resized' + os.sep + subset + os.sep

In [25]:
liste_image_test = Parallel(n_jobs=threads)(delayed(imgToVector)(str(i) + '.jpg') for i in test_raw_df['mediaid']);

### Test Classifiers

In [26]:
X_test = liste_image_test
y_true = test_raw_df['classid']

y_predicts = []
for name,clf in models_classifiers:
    y_predicts.append(clf.predict(X_test))

In [27]:
for y_predict in y_predicts:
    print("Accuracy: " + str(accuracy_score(y_true, y_predict)))
    print("Rappel: " + str(recall_score(y_true, y_predict,average='micro')))
    print("Precision: " + str(precision_score(y_true, y_predict,average='micro')))
    print("F-Measure: " + str(precision_score(y_true, y_predict,average='micro')))
    print('------------')

Accuracy: 0.20719424460431654
Rappel: 0.20719424460431654
Precision: 0.20719424460431654
F-Measure: 0.20719424460431654
------------
Accuracy: 0.11223021582733812
Rappel: 0.11223021582733812
Precision: 0.11223021582733812
F-Measure: 0.11223021582733812
------------
Accuracy: 0.22302158273381295
Rappel: 0.22302158273381295
Precision: 0.22302158273381295
F-Measure: 0.22302158273381295
------------


## Classification basique avec features HOG (SVM, LR)

### Load data with features to train

In [29]:
#Load Train Data
subset = 'train'

folder = ".."+os.sep+"data"+os.sep
file =  subset + os.sep + subset + ".csv"

train_raw_df = pd.read_csv(folder + file,sep=",")
train_raw_df.head()

hogpath = folder + 'resized' +os.sep+ subset + os.sep + subset + "_hog.csv"
train_hog_df = pd.read_csv(hogpath,sep=",")
train_hog_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7190,7191,7192,7193,7194,7195,7196,7197,7198,7199
0,0.485814,0.485814,0.219438,0.159252,0.485814,0.0,0.289532,0.366892,0.548796,0.229304,...,0.560253,0.0,0.544908,0.265194,0.192605,0.0,0.544908,0.203023,0.486721,0.143559
1,0.353553,0.353553,0.353553,0.353553,0.353553,0.353553,0.353553,0.353553,0.436803,0.436803,...,0.377178,0.377178,0.380913,0.380913,0.226869,0.380913,0.380913,0.279218,0.380913,0.380913
2,0.397287,0.085241,0.086637,0.062499,0.45373,0.45373,0.45373,0.45373,0.394855,0.023979,...,0.51892,0.075144,0.123989,0.0,0.245486,0.0,0.655185,0.588457,0.385764,0.0
3,0.392025,0.392025,0.254712,0.114115,0.392025,0.392025,0.392025,0.392025,0.512384,0.512384,...,0.554194,0.0,0.336845,0.251069,0.452526,0.0,0.73143,0.218635,0.18955,0.0
4,0.545601,0.545601,0.46561,0.097398,0.287365,0.04279,0.303704,0.041401,0.678511,0.171469,...,0.322494,0.108256,0.439907,0.439907,0.439907,0.321641,0.439907,0.091582,0.069689,0.330503


### Classifieurs

In [30]:
models_classifiers = []
models_classifiers.append(('LinearSVC', LinearSVC(random_state=0, tol=1e-5)))
models_classifiers.append(('SGDClassifier', SGDClassifier(loss="hinge", penalty="l2", max_iter=5)))
models_classifiers.append(('LR', LogisticRegression()))

### Train classifiers

In [31]:
X_train = train_hog_df
y_train = train_raw_df['classid']

In [32]:
for name,clf in models_classifiers:
    clf.fit(X_train, y_train)



### Load Test data

In [33]:
#Load Test Data
subset = 'test'
file =  subset + os.sep + subset + ".csv"

test_raw_df = pd.read_csv(folder + file,sep=",")
test_raw_df.head()

hogpath = folder + 'resized' +os.sep + subset + os.sep + subset + "_hog.csv"
test_hog_df = pd.read_csv(hogpath,sep=",")
test_hog_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7190,7191,7192,7193,7194,7195,7196,7197,7198,7199
0,0.204076,0.0,0.382911,0.615967,0.615967,0.225245,0.04655,0.0,0.479311,0.0,...,0.36167,0.36167,0.399978,0.130546,0.399978,0.399978,0.399978,0.399978,0.399978,0.151862
1,0.415877,0.092098,0.111781,0.338016,0.415877,0.415877,0.415877,0.415877,0.325248,0.02222,...,0.264512,0.054895,0.419615,0.419615,0.24443,0.153779,0.419615,0.419615,0.419615,0.190322
2,0.241196,0.108855,0.098362,0.206433,0.537167,0.537167,0.537167,0.109723,0.398204,0.155901,...,0.116798,0.016864,0.468831,0.23481,0.206926,0.468831,0.468831,0.0,0.151114,0.468831
3,0.365089,0.312301,0.365089,0.365089,0.365089,0.365089,0.365089,0.320509,0.374733,0.374733,...,0.536307,0.0,0.458736,0.401347,0.124761,0.300086,0.458736,0.18966,0.458736,0.25694
4,0.521317,0.521317,0.5081,0.13839,0.357581,0.0,0.220705,0.050648,0.381562,0.381562,...,0.393497,0.207274,0.486437,0.486437,0.479151,0.110677,0.472375,0.023688,0.059926,0.240071


### Test Classifiers

In [34]:
X_test = test_hog_df
y_true = test_raw_df['classid']

y_predicts = []
for name,clf in models_classifiers:
    y_predicts.append(clf.predict(X_test))

In [35]:
for y_predict in y_predicts:
    print("Accuracy: " + str(accuracy_score(y_true, y_predict)))
    print("Rappel: " + str(recall_score(y_true, y_predict,average='micro')))
    print("Precision: " + str(precision_score(y_true, y_predict,average='micro')))
    print("F-Measure: " + str(precision_score(y_true, y_predict,average='micro')))
    print('---------------')

Accuracy: 0.13525179856115108
Rappel: 0.13525179856115108
Precision: 0.13525179856115108
F-Measure: 0.13525179856115108
---------------
Accuracy: 0.051798561151079135
Rappel: 0.051798561151079135
Precision: 0.051798561151079135
F-Measure: 0.051798561151079135
---------------
Accuracy: 0.14964028776978416
Rappel: 0.14964028776978416
Precision: 0.14964028776978416
F-Measure: 0.14964028776978416
---------------
