In [7]:
from pathlib import Path

import pandas as pd

# Semicolon-separated PlantCLEF 2022 trusted-training metadata file.
file_path = "../data/01_raw/PlantCLEF2022_trusted_training_metadata.csv"
try:
    df = pd.read_csv(file_path, sep=";")
except FileNotFoundError:
    # A missing file is reported with a hint instead of a traceback.
    # Note that `df` is not assigned in this case.
    print(
        f"File not found: {file_path}. Please check the file path and that the file is downloaded."
    )
else:
    # Print a notice for an empty table, otherwise the first row as a preview.
    print("The CSV file is empty" if df.empty else df.head(1))

   classid                                    image_name  \
0  2683260  d0749fe4f8ade13dd9402b0f43bc29e8f28af27a.jpg   

                                          image_path               species  \
0  2683260/d0749fe4f8ade13dd9402b0f43bc29e8f28af2...  Cycas angulata R.Br.   

   genus      family      order        class source manual_tag predicted_tag  \
0  Cycas  Cycadaceae  Cycadales  Cycadopsida    NaN        NaN         habit   

   predicted_tag_probability  \
0                     0.7386   

                                        original_url  \
0  https://images.ala.org.au/image/proxyImageThum...   

                                       license  \
0  http://creativecommons.org/licenses/by/4.0/   

                           publisher  gbif_occurrence_id aggregator  \
0  ALA species sightings and OzAtlas        1.632965e+09       gbif   

                            dataset_key  \
0  84a649ce-ff81-420d-9c41-aa1de59e3766   

                                    image_backup_url

In [8]:
# Drop columns that are not needed.
# `df` is overwritten by this cell, so on a second execution the columns are
# already gone. errors="ignore" drops only the labels that still exist and
# keeps the cell re-runnable instead of failing with a KeyError.
df = df.drop(
    columns=[
        "image_name",
        "source",
        "manual_tag",
        "predicted_tag",
        "predicted_tag_probability",
        "original_url",
        "license",
        "publisher",
        "gbif_occurrence_id",
        "aggregator",
        "dataset_key",
    ],
    errors="ignore",
)

In [9]:
# Total number of rows in the metadata table.
print(len(df.index))

# Number of distinct species names (missing values are not counted).
species_column = df["species"]
print(species_column.nunique())

# The distinct species names themselves.
print(species_column.unique())

2886761
80000
['Cycas angulata R.Br.' 'Cycas armstrongii Miq.' 'Cycas beddomei Dyer' ...
 'Schizaea pusilla Pursh' 'Schizaea robusta Baker'
 'Schizaea tenella Kaulf.']


These are far too many species to train on in a reasonable time on any device. Some species might not be necessary to train on, because they are not present in Switzerland. I will try to find out which species are present in Switzerland and train only on those.

Let's load the list of invasive species from https://www.infoflora.ch/de/neophyten/listen-und-infoblätter.html and see which species are present in Switzerland.

In [12]:
# Semicolon-separated Info Flora list of invasive neophytes.
file_path = "../data/01_raw/liste-inv-neoph-ch-2021-d-f-i.csv"
try:
    df_neophytes = pd.read_csv(file_path, sep=";")
except FileNotFoundError:
    # A missing file is reported with a hint instead of a traceback.
    # Note that `df_neophytes` is not assigned in this case.
    print(
        f"File not found: {file_path}. Please check the file path and that the file is downloaded."
    )
else:
    # Print a notice for an empty table, otherwise the first row as a preview.
    print("The CSV file is empty" if df_neophytes.empty else df_neophytes.head(1))

   taxon_id (Info Flora) Wissenschaftlicher Name LIST 2021 Habitus  \
0                1000030    Acacia dealbata Link       INV      Ba   

          Name DE Unnamed: 5                                 Unnamed: 6  \
0  Falsche Mimose       link  https://www.infoflora.ch/de/flora/1000030   

              Nom FR Unnamed: 8                                 Unnamed: 9  \
0  Mimosa blanchâtre       link  https://www.infoflora.ch/fr/flore/1000030   

   ... Unnamed: 11                                Unnamed: 12 Jura Mittelland  \
0  ...        link  https://www.infoflora.ch/it/flora/1000030  NaN          *   

  Alpennordflanke Zentralalpen W Zentralalpen O Alpensüdflanke IFL 2014  \
0               *            NaN            NaN             **       WL   

  Anhang 2 (FrSV)  
0             NaN  

[1 rows x 21 columns]


Reduce the DataFrame so that it only contains species that are present in Switzerland and classified in the Info Flora list.

In [15]:
# Preprocessing
def normalize_species(name):
    """Return a species name in a canonical form for exact matching.

    Parameters
    ----------
    name : str or missing value
        Scientific name as read from a CSV column.

    Returns
    -------
    The name lower-cased with leading/trailing whitespace removed. Values that
    are not strings (e.g. the NaN pandas uses for empty cells, or None) are
    returned unchanged instead of raising AttributeError.
    """
    if not isinstance(name, str):
        # Missing values have no .lower(); pass them through untouched.
        return name
    return name.lower().strip()

# Bring the species names of both tables into the same normalised form so that
# they can be compared by exact equality.
df['species'] = df['species'].apply(normalize_species)
df_neophytes['species'] = df_neophytes['Wissenschaftlicher Name'].apply(normalize_species)

# Build the lookup of neophyte species used as the right side of the merge.
# - dropna: pandas matches NaN keys with each other, which would pair rows
#   without a species name on both sides.
# - drop_duplicates: a species listed more than once would otherwise duplicate
#   every matching image row in the result.
neophyte_species = df_neophytes[['species']].dropna().drop_duplicates()

# Inner merge keeps only the image rows whose species appears in the neophyte list.
# validate asserts that each key occurs at most once on the right side.
merged_df = df.merge(
    neophyte_species, on='species', how='inner', validate='many_to_one'
)

# Select desired columns (same columns and order as the image metadata table)
result = merged_df[df.columns]

print(result)

      classid                                         image_path  \
0     5328909  5328909/de73353bbf8431ec594df8c0c070fa5d562756...   
1     5328909  5328909/cb1b1aac1895f8f5a52e1c85ef8ceae7580e68...   
2     5328909  5328909/4bdb06e3f9b4b61c9ed2a269498d064b41b1d0...   
3     5328909  5328909/2e5095764b764e63fca8150bab3aa1bbc2157a...   
4     5328909  5328909/9886a43a74aa9c493667235bb98786df965706...   
...       ...                                                ...   
4135  2650107  2650107/0bfd87eb9b212f51169fb11d6f93117f47da55...   
4136  2650107  2650107/875bf244d751fd70fa3c5878052ac9b59d2a16...   
4137  2650107  2650107/929e6a561868452e7471ff9ade24a596bc6b7a...   
4138  2650107  2650107/9ded7167346ee1d1dc989a7a1d504b850280a2...   
4139  2650107  2650107/37e467f9e687ea6b2ad5127551cbe5ecb93717...   

                          species       genus        family        order  \
0     sagittaria latifolia willd.  Sagittaria  Alismataceae  Alismatales   
1     sagittaria latifolia will

In [17]:
# save the df to a csv file
output_path = Path("../data/02_processed/merged_data.csv")
# to_csv does not create missing directories, so make sure the target folder
# exists first (no-op when it is already there).
output_path.parent.mkdir(parents=True, exist_ok=True)
result.to_csv(output_path, index=False)