In [106]:
import pandas as pd

import nltk
from nltk import ngrams
from nltk.corpus import stopwords
from collections import Counter

from tqdm import tqdm

# Extracting a subset of spipoll.csv containing only relevant data

In [81]:
# Import spipoll.csv as a pandas dataframe
spipoll = pd.read_csv("../data/spipoll.csv",low_memory=False)

In [None]:
# Display the first 5 rows of the dataframe
display(spipoll.head())

# Display the columns names of the dataframe
spipoll.columns

In [82]:
# Extract relevant columns from the spipoll dataframe
plantes = spipoll[['collection_id', 'plante_sc', 'plante_fr',
       'plante_precision', 'plante_inconnue', 'plante_caractere',
       'photo_fleur', 'photo_plante', 'photo_feuille']]

# Shape of the dataframe
plantes.shape

(670744, 9)

## Group by collections id

In [83]:
# Keep only the first row for each unique value of collection_id
plantes = plantes.drop_duplicates(subset='collection_id', keep='first')

# Shape of the dataframe
plantes.shape

(75372, 9)

## Save the dataset as a .csv file

In [84]:
# Save the extracted columns as a csv file
plantes.to_csv("../data/plantes.csv", index=False)

# Analyzing relevant data for "Plants"

In [85]:
display(plantes.head())

Unnamed: 0,collection_id,plante_sc,plante_fr,plante_precision,plante_inconnue,plante_caractere,photo_fleur,photo_plante,photo_feuille
0,1,Les Trèfles à fleurs blanches ou roses en boul...,,,0.0,spontanée,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...
2,2,Acanthus mollis,Acanthe molle,,,plantée,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...
5,3,"Les Orchidées à fleurs blanches, jaunes ou ver...",,orchis bouc - Himantoglossum hircinum,0.0,spontanée,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...
6,4,Les Vipérines à fleurs bleues-violacées (des e...,,Theaceae,0.0,plantée,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...
10,5,Les Pâquerettes (Bellis sp),,,0.0,spontanée,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...


## Missing data in the plants dataset

In [None]:
# Missing values in plantes.csv
missing_values = plantes.isnull().sum()

print(missing_values)

In [None]:
# Total number of rows
total_rows = plantes.shape[0]

# Compute the ratio of missing values
ratio_missing_values = missing_values / total_rows

print(ratio_missing_values)

## Checking the content of the pictures

In [None]:
# display columns photo_fleur, photo_plante and photo_feuille
plantes[['photo_fleur', 'photo_plante', 'photo_feuille']]

In [None]:
# Display columns photo_fleur, photo_plante and photo_feuille
plantes[['photo_fleur', 'photo_plante', 'photo_feuille']].values

We will only use flower photos for the API calls, as it seems to be the most precise pictures.

## Histogram of the distribution of data for each column of the plants dataset

In [None]:
# Count unique values for each column
plantes[['photo_fleur', 'photo_plante', 'photo_feuille']].nunique()

In [None]:
print(total_rows,"\n")

# Ratio of unique values for each column
plantes[['photo_fleur', 'photo_plante', 'photo_feuille']].nunique() / total_rows

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Load the data
plantes = pd.read_csv("../data/plantes.csv")

# List of columns to plot
columns = ['plante_sc', 'plante_fr', 'plante_precision', 'plante_inconnue', 'plante_caractere']

# Plot a histogram for each column
for column in columns:
    plt.figure()  # Create a new figure
    plantes[column].hist()  # Plot a histogram
    plt.title(f'Histogram of {column}')  # Set the title
    plt.show()  # Display the plot

## NLP to extract common features from the plant names

In [97]:
# Function to calculate the most common trigrams
def calculate_most_common_trigrams(column):
    
    trigrams = []
    
    stop_words = set(stopwords.words('french'))
    
    for text in column.dropna():
        tokens = nltk.word_tokenize(text)
        tokens = [token for token in tokens if token not in stop_words and len(token) > 2]
        trigrams.extend(list(ngrams(tokens, 3)))
    
    # Convert the list of trigrams into a DataFrame
    df_trigrams = pd.DataFrame(Counter(trigrams).most_common(), columns=['Trigram', 'Count'])
    
    return df_trigrams

# Apply the function to 'plante_sc'
df_trigrams = calculate_most_common_trigrams(plantes['plante_sc'])

# Order the DataFrame by decreasing number of occurrences
df_trigrams = df_trigrams.sort_values(by='Count', ascending=False)

print("Most common trigrams in plante_sc:")
print(df_trigrams)

Most common trigrams in plante_sc:
                            Trigram  Count
0            (fleurs, jaunes, port)   1799
1            (jaunes, port, dressé)   1799
2          (Les, Composées, fleurs)   1760
5     (dressé, famille, Asteraceae)   1760
3       (Composées, fleurs, jaunes)   1760
..                              ...    ...
648  (Gentianes, pourpres, espèces)      1
647      (Les, Gentianes, pourpres)      1
646      (fleurs, ombelle, espèces)      1
644    (bleues, fleurs, solitaires)      1
656     (italicum, Arum, maculatum)      1

[657 rows x 2 columns]


In [99]:
# Function to calculate the most common monograms
def calculate_most_common_monograms(column):
    
    monograms = []
    
    stop_words = set(stopwords.words('french'))
    
    for text in column.dropna():
        tokens = nltk.word_tokenize(text)
        tokens = [token for token in tokens if token not in stop_words and len(token) > 3]
        monograms.extend(tokens)
    
    # Convert the list of monograms into a DataFrame
    df_monograms = pd.DataFrame(Counter(monograms).most_common(), columns=['Monogram', 'Count'])
    
    return df_monograms

# Apply the function to 'plante_sc'
df_monograms = calculate_most_common_monograms(plantes['plante_sc'])

# Order the DataFrame by decreasing number of occurrences
df_monograms = df_monograms.sort_values(by='Count', ascending=False)

print("Most common monograms in plante_sc:")
print(df_monograms)

Most common monograms in plante_sc:
         Monogram  Count
0         espèces  14007
1           genre  13055
2          Plante  10651
3        inconnue  10651
4          fleurs   8625
...           ...    ...
2232    Sannantha      1
2231  fenestralis      1
2230       gratus      1
2229       pictum      1
2609      falcata      1

[2610 rows x 2 columns]


In [100]:
# Filter the df_monograms dataframe to keep only unique values
df_monograms_unique = df_monograms[df_monograms['Count'] == 1]

print(df_monograms_unique)

         Monogram  Count
2434      pygmaea      1
2426    aquatilis      1
2421     coulteri      1
2422  vesiculosum      1
2423     subovata      1
...           ...    ...
2232    Sannantha      1
2231  fenestralis      1
2230       gratus      1
2229       pictum      1
2609      falcata      1

[556 rows x 2 columns]


## Data engineering

In [117]:
# drop data_augmentation from plantes
plantes.drop(columns=['data_augmentation'], inplace=True)

In [119]:
# Check if 'data augmentation' column exists in the dataframe
if 'data_augmentation' not in plantes.columns:
    # If not, insert a new blank column 'data augmentation' between 'plante_sc' and 'plante_fr'
    plantes.insert(2, 'data_augmentation', 'Na')

display(plantes.head())

# Display the number of cells containing 'Na' in the 'data augmentation' column
plantes['data_augmentation'].value_counts()

Unnamed: 0,collection_id,plante_sc,data_augmentation,plante_fr,plante_precision,plante_inconnue,plante_caractere,photo_fleur,photo_plante,photo_feuille
0,1,Les Trèfles à fleurs blanches ou roses en boul...,Na,,,0.0,spontanée,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...
2,2,Acanthus mollis,Na,Acanthe molle,,,plantée,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...
5,3,"Les Orchidées à fleurs blanches, jaunes ou ver...",Na,,orchis bouc - Himantoglossum hircinum,0.0,spontanée,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...
6,4,Les Vipérines à fleurs bleues-violacées (des e...,Na,,Theaceae,0.0,plantée,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...
10,5,Les Pâquerettes (Bellis sp),Na,,,0.0,spontanée,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...


Nombre de lignes vides dans data_augmentation : 0


In [112]:
# Convert the df_monograms_unique dataframe into a set
monograms_set = set(df_monograms_unique['Monogram'])

In [107]:
# Fill the 'data augmentation' column with the monograms that are in the 'plante_sc' column

for index, row in tqdm(plantes.iterrows(), total=plantes.shape[0]):
    words = set(str(row['plante_sc']).split())
    matching_words = words.intersection(monograms_set)
    if matching_words:
        plantes.loc[index, 'data augmentation'] = ' '.join(matching_words)
    else:
        plantes.loc[index, 'data augmentation'] = ''

100%|██████████| 75372/75372 [00:13<00:00, 5644.50it/s]


In [110]:
# Display the dataframe
display(plantes.head())

Unnamed: 0,collection_id,plante_sc,data augmentation,plante_fr,plante_precision,plante_inconnue,plante_caractere,photo_fleur,photo_plante,photo_feuille
0,1,Les Trèfles à fleurs blanches ou roses en boul...,,,,0.0,spontanée,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...
2,2,Acanthus mollis,,Acanthe molle,,,plantée,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...
5,3,"Les Orchidées à fleurs blanches, jaunes ou ver...",,,orchis bouc - Himantoglossum hircinum,0.0,spontanée,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...
6,4,Les Vipérines à fleurs bleues-violacées (des e...,,,Theaceae,0.0,plantée,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...
10,5,Les Pâquerettes (Bellis sp),,,,0.0,spontanée,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...


0

# Call API on flower pictures to infer the missing data

In [None]:
import data_treatment.plantnet_api as plantnet_api

In [None]:
key_api = '2b10Me1HF0rfjoGWCseolNa8e'

predictor = plantnet_api.PlantNetPredictor(key_api)

In [None]:
prediction = predictor.predict(plantes[['photo_fleur', 'photo_plante', 'photo_feuille']].values[0][0], 'flower',True)

In [None]:
prediction.keys()

prediction['results'][0]['species']