In [80]:
import pandas as pd

import nltk
from nltk import ngrams
from nltk.corpus import stopwords
from collections import Counter

# Extracting a subset of spipoll.csv containing only relevant data

In [81]:
# Import spipoll.csv as a pandas dataframe
spipoll = pd.read_csv("../data/spipoll.csv",low_memory=False)

In [None]:
# Display the first 5 rows of the dataframe
display(spipoll.head())

# Display the columns names of the dataframe
spipoll.columns

In [82]:
# Extract relevant columns from the spipoll dataframe
plantes = spipoll[['collection_id', 'plante_sc', 'plante_fr',
       'plante_precision', 'plante_inconnue', 'plante_caractere',
       'photo_fleur', 'photo_plante', 'photo_feuille']]

# Shape of the dataframe
plantes.shape

(670744, 9)

## Group by collections id

In [83]:
# Keep only the first row for each unique value of collection_id
plantes = plantes.drop_duplicates(subset='collection_id', keep='first')

# Shape of the dataframe
plantes.shape

(75372, 9)

## Save the dataset as a .csv file

In [84]:
# Save the extracted columns as a csv file
plantes.to_csv("../data/plantes.csv", index=False)

# Analyzing relevant data for "Plants"

In [85]:
display(plantes.head())

Unnamed: 0,collection_id,plante_sc,plante_fr,plante_precision,plante_inconnue,plante_caractere,photo_fleur,photo_plante,photo_feuille
0,1,Les Trèfles à fleurs blanches ou roses en boul...,,,0.0,spontanée,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...
2,2,Acanthus mollis,Acanthe molle,,,plantée,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...
5,3,"Les Orchidées à fleurs blanches, jaunes ou ver...",,orchis bouc - Himantoglossum hircinum,0.0,spontanée,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...
6,4,Les Vipérines à fleurs bleues-violacées (des e...,,Theaceae,0.0,plantée,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...
10,5,Les Pâquerettes (Bellis sp),,,0.0,spontanée,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...,https://spgp-api.65mo.fr/api/containers/spgp/d...


## Missing data in the plants dataset

In [None]:
# Missing values in plantes.csv
missing_values = plantes.isnull().sum()

print(missing_values)

In [None]:
# Total number of rows
total_rows = plantes.shape[0]

# Compute the ratio of missing values
ratio_missing_values = missing_values / total_rows

print(ratio_missing_values)

## Checking the content of the pictures

In [None]:
# display columns photo_fleur, photo_plante and photo_feuille
plantes[['photo_fleur', 'photo_plante', 'photo_feuille']]

In [None]:
# Display columns photo_fleur, photo_plante and photo_feuille
plantes[['photo_fleur', 'photo_plante', 'photo_feuille']].values

We will only use flower photos for the API calls, as it seems to be the most precise pictures.

## Histogram of the distribution of data for each column of the plants dataset

In [None]:
# Count unique values for each column
plantes[['photo_fleur', 'photo_plante', 'photo_feuille']].nunique()

In [None]:
print(total_rows,"\n")

# Ratio of unique values for each column
plantes[['photo_fleur', 'photo_plante', 'photo_feuille']].nunique() / total_rows

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Load the data
plantes = pd.read_csv("../data/plantes.csv")

# List of columns to plot
columns = ['plante_sc', 'plante_fr', 'plante_precision', 'plante_inconnue', 'plante_caractere']

# Plot a histogram for each column
for column in columns:
    plt.figure()  # Create a new figure
    plantes[column].hist()  # Plot a histogram
    plt.title(f'Histogram of {column}')  # Set the title
    plt.show()  # Display the plot

## NLP to extract common features from the plant names

In [86]:
# Function to calculate the most common trigrams
def calculate_most_common_trigrams(column):
    
    trigrams = []
    
    stop_words = set(stopwords.words('french'))
    
    for text in column.dropna():
        tokens = nltk.word_tokenize(text)
        tokens = [token for token in tokens if token not in stop_words]
        trigrams.extend(list(ngrams(tokens, 3)))
    
    return Counter(trigrams).most_common()

In [89]:
# Apply the function to 'plante_sc'
trigrams = calculate_most_common_trigrams(plantes['plante_sc'])

In [90]:
print("Most common trigrams in plante_sc:")

for trigram, count in trigrams:
    print(f"{trigram} - {count}")
print("\n")

Most common trigrams in plante_sc:
('(', 'espèces', 'genre') - 13055
(')', '(', 'espèces') - 3652
('port', 'dressé', '(') - 1910
('(', 'famille', 'Asteraceae') - 1891
('famille', 'Asteraceae', ')') - 1891
('fleurs', 'jaunes', 'port') - 1799
('jaunes', 'port', 'dressé') - 1799
('Les', 'Composées', 'fleurs') - 1760
('Composées', 'fleurs', 'jaunes') - 1760
('dressé', '(', 'famille') - 1760
('Les', 'Pissenlits', '(') - 1155
('Pissenlits', '(', 'Taraxacum') - 1155
('(', 'Taraxacum', 'sp') - 1155
('Taraxacum', 'sp', ')') - 1155
('(', 'fleurs', 'blanches') - 1052
('Les', 'Chardons', 'Cirses') - 1017
('Chardons', 'Cirses', '(') - 1017
('Cirses', '(', 'espèces') - 1017
('espèces', 'genre', 'Cirsium') - 1017
('genre', 'Cirsium', 'Carduus') - 1017
('Cirsium', 'Carduus', ')') - 1017
('jaunes', '(', 'espèces') - 941
('violettes', '(', 'espèces') - 899
('fleurs', 'violettes', '(') - 843
('Les', 'Centaurées', 'fleurs') - 838
('Centaurées', 'fleurs', 'violettes') - 838
('espèces', 'genre', 'Centaurea'

In [91]:
# Extract the most common monograms from the trigrams
print("Most common monograms among plante_sc trigrams:")
for trigram, count in trigrams:
    for monogram in trigram:
        print(f"{monogram} - {count}")
print("\n")

Most common monograms among plante_sc trigrams:
( - 13055
espèces - 13055
genre - 13055
) - 3652
( - 3652
espèces - 3652
port - 1910
dressé - 1910
( - 1910
( - 1891
famille - 1891
Asteraceae - 1891
famille - 1891
Asteraceae - 1891
) - 1891
fleurs - 1799
jaunes - 1799
port - 1799
jaunes - 1799
port - 1799
dressé - 1799
Les - 1760
Composées - 1760
fleurs - 1760
Composées - 1760
fleurs - 1760
jaunes - 1760
dressé - 1760
( - 1760
famille - 1760
Les - 1155
Pissenlits - 1155
( - 1155
Pissenlits - 1155
( - 1155
Taraxacum - 1155
( - 1155
Taraxacum - 1155
sp - 1155
Taraxacum - 1155
sp - 1155
) - 1155
( - 1052
fleurs - 1052
blanches - 1052
Les - 1017
Chardons - 1017
Cirses - 1017
Chardons - 1017
Cirses - 1017
( - 1017
Cirses - 1017
( - 1017
espèces - 1017
espèces - 1017
genre - 1017
Cirsium - 1017
genre - 1017
Cirsium - 1017
Carduus - 1017
Cirsium - 1017
Carduus - 1017
) - 1017
jaunes - 941
( - 941
espèces - 941
violettes - 899
( - 899
espèces - 899
fleurs - 843
violettes - 843
( - 843
Les - 838

In [92]:
print("Unique monograms among plante_sc trigrams (excluding 1,2 and 3 letters monograms):")
unique_monograms = set()
for trigram, count in trigrams:
    for monogram in trigram:
        if len(monogram) > 3:
            unique_monograms.add(monogram)

df_monograms = pd.DataFrame(list(unique_monograms), columns=['Monogram'])
print(df_monograms)

Unique monograms among plante_sc trigrams (excluding 1 and 2 letter monograms):
       Monogram
0    Clématites
1         jaune
2      Mélilots
3        Acacia
4        Ophrys
..          ...
474  Saponaires
475   Saxifraga
476   idaeoides
477  Asteraceae
478  Malcolmies

[479 rows x 1 columns]


In [93]:
# Convertir les valeurs de 'Monogram' en un ensemble
monograms_set = set(df_monograms['Monogram'])

In [94]:
# Fonction pour trouver les monogrammes communs
def find_common_monograms(text):
    if isinstance(text, str):
        tokens = nltk.word_tokenize(text)
        return [token for token in tokens if token in monograms_set]
    else:
        return

In [None]:
# Remove stopwords from the trigrams

## Data engineering

In [None]:
# Add a new blank column 'data augmentation' between 'plante_sc' and 'plante_fr'
plantes.insert(2, 'data augmentation', '')

display(plantes.head())

In [None]:
# Apply the function to 'plante_sc' to fill the row 'data augmentation'
plantes['data augmentation'] = plantes['plante_sc'].apply(find_common_monograms)

In [None]:
# Display the dataframe
display(plantes.head())

# Call API on flower pictures to infer the missing data

In [None]:
import data_treatment.plantnet_api as plantnet_api

In [None]:
key_api = '2b10Me1HF0rfjoGWCseolNa8e'

predictor = plantnet_api.PlantNetPredictor(key_api)

In [None]:
prediction = predictor.predict(plantes[['photo_fleur', 'photo_plante', 'photo_feuille']].values[0][0], 'flower',True)

In [None]:
prediction.keys()

prediction['results'][0]['species']