In [2]:
import pandas as pd

import nltk
from nltk import ngrams
from nltk.corpus import stopwords
from collections import Counter

from tqdm import tqdm

# Extracting a subset of spipoll.csv containing only relevant data

In [3]:
# Import spipoll.csv as a pandas dataframe
spipoll = pd.read_csv("../data/spipoll.csv",low_memory=False)

In [4]:
# Display the first 5 rows of the dataframe
display(spipoll.head())

# Display the columns names of the dataframe
spipoll.columns

Unnamed: 0,collection_id,protocole_long,user_id,plante_sc,plante_fr,plante_precision,plante_inconnue,plante_caractere,photo_fleur,photo_plante,...,fleur_ombre,insecte_sc,insecte_fr,insecte_denominationPlusPrecise,insecte_CdNomtaxref,insecte_abondance,insecte_commentaire,insecte_vu_sur_fleur,nb_validation,nb_suggestion
0,1,,12 657,Les Trèfles à fleurs blanches ou roses en boul...,,,0.0,spontanée,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...,...,0,Apis mellifera,L'Abeille mellifère,,,plus de 5,,0.0,3.0,
1,1,,12 657,Les Trèfles à fleurs blanches ou roses en boul...,,,0.0,spontanée,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...,...,0,Bombus,Les Bourdons noirs à bande(s) jaune(s) et cul ...,,,1,,0.0,3.0,
2,2,,10 918,Acanthus mollis,Acanthe molle,,,plantée,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...,...,1,Bombus,Les Bourdons noirs à bande(s) jaune(s) et cul ...,,,je n'ai pas l'information,,0.0,3.0,
3,2,,10 918,Acanthus mollis,Acanthe molle,,,plantée,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...,...,1,Sarcophaga,Les Mouches à damier,,,entre 2 et 5,,0.0,3.0,1.0
4,2,,10 918,Acanthus mollis,Acanthe molle,,,plantée,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...,...,1,Apis mellifera,L'Abeille mellifère,,,entre 2 et 5,,0.0,3.0,


Index(['collection_id', 'protocole_long', 'user_id', 'plante_sc', 'plante_fr',
       'plante_precision', 'plante_inconnue', 'plante_caractere',
       'photo_fleur', 'photo_plante', 'photo_feuille', 'coordonnees_GPS',
       'code_postal', 'habitat', 'grande_culture', 'collection_date',
       'collection_heure_debut', 'nebulosite', 'temperature', 'vent',
       'fleur_ombre', 'insecte_sc', 'insecte_fr',
       'insecte_denominationPlusPrecise', 'insecte_CdNomtaxref',
       'insecte_abondance', 'insecte_commentaire', 'insecte_vu_sur_fleur',
       'nb_validation', 'nb_suggestion'],
      dtype='object')

In [5]:
# Extract relevant columns from the spipoll dataframe
plantes = spipoll[['collection_id', 'plante_sc', 'plante_fr',
       'plante_precision', 'plante_inconnue', 'plante_caractere',
       'photo_fleur', 'photo_plante', 'photo_feuille']]

# Shape of the dataframe
plantes.shape

(670744, 9)

## Group by collections id

In [6]:
# Keep only the first row for each unique value of collection_id
plantes = plantes.drop_duplicates(subset='collection_id', keep='first')

# Shape of the dataframe
plantes.shape

(75372, 9)

## Save the dataset as a .csv file

In [7]:
# Save the extracted columns as a csv file
plantes.to_csv("../data/plantes.csv", index=False)

# Analyzing relevant data for "Plants"

In [8]:
display(plantes.head())

Unnamed: 0,collection_id,plante_sc,plante_fr,plante_precision,plante_inconnue,plante_caractere,photo_fleur,photo_plante,photo_feuille
0,1,Les Trèfles à fleurs blanches ou roses en boul...,,,0.0,spontanée,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...
2,2,Acanthus mollis,Acanthe molle,,,plantée,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...
5,3,"Les Orchidées à fleurs blanches, jaunes ou ver...",,orchis bouc - Himantoglossum hircinum,0.0,spontanée,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...
6,4,Les Vipérines à fleurs bleues-violacées (des e...,,Theaceae,0.0,plantée,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...
10,5,Les Pâquerettes (Bellis sp),,,0.0,spontanée,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...


## Missing data in the plants dataset

In [9]:
# Missing values in plantes.csv
missing_values = plantes.isnull().sum()

print(missing_values)

collection_id           0
plante_sc             478
plante_fr           33438
plante_precision    56873
plante_inconnue     27922
plante_caractere        0
photo_fleur             2
photo_plante            2
photo_feuille           2
dtype: int64


In [10]:
# Total number of rows
total_rows = plantes.shape[0]

# Compute the ratio of missing values
ratio_missing_values = missing_values / total_rows

print(ratio_missing_values)

collection_id       0.000000
plante_sc           0.006342
plante_fr           0.443640
plante_precision    0.754564
plante_inconnue     0.370456
plante_caractere    0.000000
photo_fleur         0.000027
photo_plante        0.000027
photo_feuille       0.000027
dtype: float64


## Checking the content of the pictures

In [11]:
# display columns photo_fleur, photo_plante and photo_feuille
plantes[['photo_fleur', 'photo_plante', 'photo_feuille']]

Unnamed: 0,photo_fleur,photo_plante,photo_feuille
0,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...
2,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...
5,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...
6,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...
10,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...
...,...,...,...
670677,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...
670688,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...
670716,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...
670728,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...


In [12]:
# Display columns photo_fleur, photo_plante and photo_feuille
plantes[['photo_fleur', 'photo_plante', 'photo_feuille']].values

array([['https://spgp-api.65mo.fr/api/containers/spgp/download/1560495176632.jpg',
        'https://spgp-api.65mo.fr/api/containers/spgp/download/1560495176632.jpg',
        'https://spgp-api.65mo.fr/api/containers/spgp/download/1560495176632.jpg'],
       ['https://spgp-api.65mo.fr/api/containers/spgp/download/dd7dd45f-a25d-4648-a2b6-d936b8dde197.jpg',
        'https://spgp-api.65mo.fr/api/containers/spgp/download/301826da-6625-4b4c-ab9d-3c26b305cc66.jpg',
        'https://spgp-api.65mo.fr/api/containers/spgp/download/ea80835b-c42a-4892-9a60-50712e6d48b2.jpg'],
       ['https://spgp-api.65mo.fr/api/containers/spgp/download/1560783259725.jpg',
        'https://spgp-api.65mo.fr/api/containers/spgp/download/1560783259725.jpg',
        'https://spgp-api.65mo.fr/api/containers/spgp/download/1560783259725.jpg'],
       ...,
       ['https://spgp-api.65mo.fr/api/containers/spgp/download/977333f4-5047-4ee8-b1c8-63593641364f.jpg',
        'https://spgp-api.65mo.fr/api/containers/spgp/download/

We will only use flower photos for the API calls, as it seems to be the most precise pictures.

## Distribution of data for each column of the plants dataset

In [13]:
# Count unique values for each column
plantes[['plante_sc', 'plante_fr', 'plante_precision', 'plante_inconnue', 'plante_caractere']].nunique()

plante_sc           2622
plante_fr           2193
plante_precision    7475
plante_inconnue        2
plante_caractere       3
dtype: int64

In [14]:
print(total_rows,"\n")

# Ratio of unique values for each column
plantes[['plante_sc', 'plante_fr', 'plante_precision', 'plante_inconnue', 'plante_caractere']].nunique() / total_rows

75372 



plante_sc           0.034787
plante_fr           0.029096
plante_precision    0.099175
plante_inconnue     0.000027
plante_caractere    0.000040
dtype: float64

In [15]:
# Count unique values for each column
plantes[['photo_fleur', 'photo_plante', 'photo_feuille']].nunique()

photo_fleur      75365
photo_plante     75365
photo_feuille    75365
dtype: int64

In [16]:
print(total_rows,"\n")
# Ratio of unique values for each column

plantes[['photo_fleur', 'photo_plante', 'photo_feuille']].nunique() / total_rows

75372 



photo_fleur      0.999907
photo_plante     0.999907
photo_feuille    0.999907
dtype: float64

## Compacting the data to speed-up the analysis

In [35]:
# extract a subset from plantes named plantes_subset with only the rows that contain unique values for plante_sc
plantes_subset = plantes.drop_duplicates(subset='plante_sc')

# save as a csv file
plantes_subset.to_csv("../data/plantes_subset.csv", index=False)

total_rows_subset = plantes_subset.shape[0]
print(total_rows_subset,"\n")

display(plantes_subset.head())

2623 



Unnamed: 0,collection_id,plante_sc,data_augmentation,plante_fr,plante_precision,plante_inconnue,plante_caractere,photo_fleur,photo_plante,photo_feuille
0,1,Les Trèfles à fleurs blanches ou roses en boul...,"[Trèfles, boule]",,,0.0,spontanée,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...
1,2,Acanthus mollis,"[Acanthus, mollis]",Acanthe molle,,,plantée,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...
2,3,"Les Orchidées à fleurs blanches, jaunes ou ver...","[Orchidées, verdâtres]",,orchis bouc - Himantoglossum hircinum,0.0,spontanée,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...
3,4,Les Vipérines à fleurs bleues-violacées (des e...,"[Vipérines, bleues-violacées]",,Theaceae,0.0,plantée,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...
4,5,Les Pâquerettes (Bellis sp),[Pâquerettes],,,0.0,spontanée,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...


In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Load the data
plantes_subset = pd.read_csv("../data/plantes_subset.csv")

# List of columns to plot
columns = ['plante_sc', 'plante_fr', 'plante_precision', 'plante_inconnue', 'plante_caractere']

# Plot a histogram for each column
for column in columns:
    plt.figure()  # Create a new figure
    plantes_subset[column].hist()  # Plot a histogram
    plt.title(f'Histogram of {column}')  # Set the title
    plt.show()  # Display the plot

## Reorganizing the order of rows : grouping by plante_sc

In [37]:
# Re-order the rows of the dataframe to group the rows by same plante_sc values
plantes = plantes.sort_values(by='plante_sc')

display(plantes.head())

Unnamed: 0,collection_id,plante_sc,data_augmentation,plante_fr,plante_precision,plante_inconnue,plante_caractere,photo_fleur,photo_plante,photo_feuille
59480,60 060,Abelia triflora,"[Abelia, triflora]",,,,plantée,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...
51614,52 152,Abelia triflora,"[Abelia, triflora]",,,,plantée,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...
68049,68 666,Abelia triflora,"[Abelia, triflora]",,,,plantée,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...
74447,75 088,Abelia triflora,"[Abelia, triflora]",,,0.0,plantée,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...
74849,75 490,Abelia triflora,"[Abelia, triflora]",,,0.0,plantée,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...


## NLP to extract common features from the plant names

In [38]:
# Function to calculate the most common trigrams
def calculate_most_common_trigrams(column):
    
    trigrams = []
    
    stop_words = set(stopwords.words('french'))
    
    for text in column.dropna():
        tokens = nltk.word_tokenize(text)
        tokens = [token for token in tokens if token not in stop_words and len(token) > 2]
        trigrams.extend(list(ngrams(tokens, 3)))
    
    # Convert the list of trigrams into a DataFrame
    df_trigrams = pd.DataFrame(Counter(trigrams).most_common(), columns=['Trigram', 'Count'])
    
    return df_trigrams

# Apply the function to 'plante_sc'
df_trigrams = calculate_most_common_trigrams(plantes_subset['plante_sc'])

# Order the DataFrame by decreasing number of occurrences
df_trigrams = df_trigrams.sort_values(by='Count', ascending=False)

print("Most common trigrams in plante_sc:")
print(df_trigrams)

Most common trigrams in plante_sc:
                               Trigram  Count
0             (jaunes, espèces, genre)     17
1           (blanches, espèces, genre)     15
2          (fleurs, blanches, espèces)     13
3              (roses, espèces, genre)      9
4            (fleurs, jaunes, espèces)      7
..                                 ...    ...
303            (espèces, genre, Ajuga)      1
302           (Bugles, fleurs, bleues)      1
301              (Les, Bugles, fleurs)      1
300          (rosette, espèces, genre)      1
656  (morio, Anacamptis, papilionacea)      1

[657 rows x 2 columns]


In [39]:
# Function to calculate the most common monograms
def calculate_most_common_monograms(column):
    
    monograms = []
    
    stop_words = set(stopwords.words('french'))
    
    for text in column.dropna():
        tokens = nltk.word_tokenize(text)
        tokens = [token for token in tokens if token not in stop_words and len(token) > 3]
        monograms.extend(tokens)
    
    # Convert the list of monograms into a DataFrame
    df_monograms = pd.DataFrame(Counter(monograms).most_common(), columns=['Monogram', 'Count'])
    
    return df_monograms

# Apply the function to 'plante_sc'
df_monograms = calculate_most_common_monograms(plantes_subset['plante_sc'])

# Order the DataFrame by decreasing number of occurrences
df_monograms = df_monograms.sort_values(by='Count', ascending=False)

print("Most common monograms in plante_sc:")
print(df_monograms)

Most common monograms in plante_sc:
           Monogram  Count
0           espèces    158
1             genre    151
2            fleurs     80
3             Rubus     35
4              Rosa     33
...             ...    ...
1431  linariifolium      1
1430     acicularis      1
1429      cucullata      1
1428     myrtifolia      1
2609        falcata      1

[2610 rows x 2 columns]


## Data engineering

In [40]:
# ONLY IF NEEDED : drop data_augmentation from plantes
if 'data_augmentation' in plantes_subset.columns:
    plantes_subset.drop(columns=['data_augmentation'], inplace=True)

In [42]:
# Check if 'data augmentation' column exists in the dataframe
if 'data_augmentation' not in plantes_subset.columns:
    # If not, insert a new blank column 'data augmentation' between 'plante_sc' and 'plante_fr'
    plantes_subset.insert(2, 'data_augmentation', [None]*len(plantes_subset))

display(plantes_subset.head())

Unnamed: 0,collection_id,plante_sc,data_augmentation,plante_fr,plante_precision,plante_inconnue,plante_caractere,photo_fleur,photo_plante,photo_feuille
0,1,Les Trèfles à fleurs blanches ou roses en boul...,,,,0.0,spontanée,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...
1,2,Acanthus mollis,,Acanthe molle,,,plantée,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...
2,3,"Les Orchidées à fleurs blanches, jaunes ou ver...",,,orchis bouc - Himantoglossum hircinum,0.0,spontanée,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...
3,4,Les Vipérines à fleurs bleues-violacées (des e...,,,Theaceae,0.0,plantée,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...
4,5,Les Pâquerettes (Bellis sp),,,,0.0,spontanée,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...


In [43]:
# Convert the monograms to a set to speed up the process
monograms_set = set(df_monograms_unique['Monogram'])

# Function to find the monograms in a string
def find_words_in_set(input_string, monograms_set):
    if isinstance(input_string, str):
        words = input_string.split()
        return [word for word in words if word in monograms_set]
    else:
        return []

In [44]:
# More efficient version WITHOUT THE LOOP# Use apply function for efficient computation
plantes_subset['data_augmentation'] = tqdm(plantes_subset['plante_sc'].apply(lambda x: find_words_in_set(x, monograms_set)))

100%|██████████| 2623/2623 [00:00<00:00, 1332726.76it/s]




In [45]:
# Display the number of missing data in the 'data_augmentation' column
plantes_subset['data_augmentation'].isnull().sum()

0

In [46]:
# Display the dataframe
display(plantes_subset.shape[0],plantes_subset.head())

Unnamed: 0,collection_id,plante_sc,data_augmentation,plante_fr,plante_precision,plante_inconnue,plante_caractere,photo_fleur,photo_plante,photo_feuille
0,1,Les Trèfles à fleurs blanches ou roses en boul...,"[Trèfles, boule]",,,0.0,spontanée,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...
1,2,Acanthus mollis,"[Acanthus, mollis]",Acanthe molle,,,plantée,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...
2,3,"Les Orchidées à fleurs blanches, jaunes ou ver...","[Orchidées, verdâtres]",,orchis bouc - Himantoglossum hircinum,0.0,spontanée,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...
3,4,Les Vipérines à fleurs bleues-violacées (des e...,"[Vipérines, bleues-violacées]",,Theaceae,0.0,plantée,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...
4,5,Les Pâquerettes (Bellis sp),[Pâquerettes],,,0.0,spontanée,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...


# Call API on flower pictures to infer the missing data

In [27]:
import data_treatment.plantnet_api as plantnet_api

In [28]:
key_api = '2b10Me1HF0rfjoGWCseolNa8e'

predictor = plantnet_api.PlantNetPredictor(key_api)

In [29]:
prediction = predictor.predict(plantes[['photo_fleur', 'photo_plante', 'photo_feuille']].values[0][0], 'flower',True)

In [30]:
prediction.keys()

prediction['results'][0]['species']

{'scientificNameWithoutAuthor': 'Trifolium pratense',
 'scientificNameAuthorship': 'L.',
 'genus': {'scientificNameWithoutAuthor': 'Trifolium',
  'scientificNameAuthorship': 'L.',
  'scientificName': 'Trifolium L.'},
 'family': {'scientificNameWithoutAuthor': 'Fabaceae',
  'scientificNameAuthorship': '',
  'scientificName': 'Fabaceae'},
 'commonNames': ['Trèfle des prés', 'Trèfle rouge', 'Trèfle commun'],
 'scientificName': 'Trifolium pratense L.'}