# basic plant detection for starting Bachelor thesis

## read metadata

In [1]:
# load data PlantCLEF2022_trusted_training_metadata.csv form /data/01_raw/PlantCLEF2022_trusted_training_metadata.csv

import pandas as pd

amount_pictures = 1_000
file_path = "../data/01_raw/PlantCLEF2022_trusted_training_metadata.csv"
try:
    data = pd.read_csv(file_path, delimiter=";", nrows=amount_pictures)
    if data.empty:
        print("The CSV file is empty")
    else:
        print(data.head())

except FileNotFoundError:
    print(
        f"File not found: {file_path}. Please check the file path and that the file is downloaded."
    )

   classid                                    image_name  \
0  2683260  d0749fe4f8ade13dd9402b0f43bc29e8f28af27a.jpg   
1  2683260  7549c8ddadd95d996e10aa76f8e39bedb059c637.jpg   
2  2683260  8c6049ebdff482b901f32e0ac9e421c69bc652b4.jpg   
3  2683260  b932fd8619c667a5b4f3c38596911937a2692b26.jpg   
4  2683260  9f9dbdb0b9ce8bc3fb562f648af7717b5511b597.jpg   

                                          image_path               species  \
0  2683260/d0749fe4f8ade13dd9402b0f43bc29e8f28af2...  Cycas angulata R.Br.   
1  2683260/7549c8ddadd95d996e10aa76f8e39bedb059c6...  Cycas angulata R.Br.   
2  2683260/8c6049ebdff482b901f32e0ac9e421c69bc652...  Cycas angulata R.Br.   
3  2683260/b932fd8619c667a5b4f3c38596911937a2692b...  Cycas angulata R.Br.   
4  2683260/9f9dbdb0b9ce8bc3fb562f648af7717b5511b5...  Cycas angulata R.Br.   

   genus      family      order        class source manual_tag predicted_tag  \
0  Cycas  Cycadaceae  Cycadales  Cycadopsida    NaN        NaN         habit   
1  Cycas  

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


## remove unnecessary columns

In [2]:
# remove column 'Unnamed: 0'
data = data.drop(
    columns=[
        "image_name",
        "source",
        "manual_tag",
        "predicted_tag",
        "predicted_tag_probability",
        "original_url",
        "license",
        "publisher",
        "gbif_occurrence_id",
        "aggregator",
        "dataset_key",
    ]
)

## delete already existing files

In [3]:
import os

temp_dir = "temp"

# delete and create tmp dir to ensure it's empty
os.system(f"rm -rf {temp_dir}")

# Ensure the tmp/ directory exists
os.makedirs(temp_dir, exist_ok=True)

## download data per plant parallelized

In [4]:
import requests
import os
from concurrent.futures import ThreadPoolExecutor


def download_image(index_url):
    index, url = index_url
    try:
        response = requests.get(url)
        file_path = os.path.join(temp_dir, f"{index}.jpg")

        with open(file_path, "wb") as file:
            file.write(response.content)
        # print(f"Downloaded {url} to {file_path}")
    except Exception as e:
        print(f"Error downloading {url}: {e}")


# Prepare a list of tuples containing the index and URL for each image
index_url_list = [(index, row["image_backup_url"]) for index, row in data.iterrows()]

# Use ThreadPoolExecutor to download images in parallel
with ThreadPoolExecutor(max_workers=5) as executor:
    executor.map(download_image, index_url_list)

## add image path to metadata

In [5]:
data["image_path"] = [f"{temp_dir}/{index}.jpg" for index in data.index]
data.head()

Unnamed: 0,classid,image_path,species,genus,family,order,class,image_backup_url
0,2683260,temp/0.jpg,Cycas angulata R.Br.,Cycas,Cycadaceae,Cycadales,Cycadopsida,https://lab.plantnet.org/LifeCLEF/PlantCLEF202...
1,2683260,temp/1.jpg,Cycas angulata R.Br.,Cycas,Cycadaceae,Cycadales,Cycadopsida,https://lab.plantnet.org/LifeCLEF/PlantCLEF202...
2,2683260,temp/2.jpg,Cycas angulata R.Br.,Cycas,Cycadaceae,Cycadales,Cycadopsida,https://lab.plantnet.org/LifeCLEF/PlantCLEF202...
3,2683260,temp/3.jpg,Cycas angulata R.Br.,Cycas,Cycadaceae,Cycadales,Cycadopsida,https://lab.plantnet.org/LifeCLEF/PlantCLEF202...
4,2683260,temp/4.jpg,Cycas angulata R.Br.,Cycas,Cycadaceae,Cycadales,Cycadopsida,https://lab.plantnet.org/LifeCLEF/PlantCLEF202...


## prepare data

In [6]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split

target_column = "species"

# Split the dataset into training and validation sets
train_data, valid_data = train_test_split(data, test_size=0.2, random_state=42)

# Create ImageDataGenerators for data augmentation and normalization
train_datagen = ImageDataGenerator(
    rescale=1.0 / 255,
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode="nearest",
)

valid_datagen = ImageDataGenerator(rescale=1.0 / 255)

# Create generators that read images from the dataframe
train_generator = train_datagen.flow_from_dataframe(
    dataframe=train_data,
    directory=None,  # Directory is None because paths are given in dataframe
    x_col="image_path",
    y_col=target_column,
    target_size=(150, 150),
    batch_size=32,
    class_mode="categorical",
)

valid_generator = valid_datagen.flow_from_dataframe(
    dataframe=valid_data,
    directory=None,
    x_col="image_path",
    y_col=target_column,
    target_size=(150, 150),
    batch_size=32,
    class_mode="categorical",
)

Found 800 validated image filenames belonging to 62 classes.
Found 200 validated image filenames belonging to 46 classes.


## Model building

In [9]:
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.layers import GlobalAveragePooling2D, Dense
from tensorflow.keras.models import Model

# amount distinct values of classid
number_of_plant_classes = len(data[target_column].unique()) - 1

# Load pre-trained ResNet50 model, excluding the top layer
base_model = ResNet50(weights="imagenet", include_top=False, input_shape=(224, 224, 3))

# Freeze the layers except the last 4 layers
for layer in base_model.layers[:-4]:
    layer.trainable = False

# Add new top layers
x = base_model.output
x = GlobalAveragePooling2D()(x)
x = Dense(1024, activation="relu")(x)  # New FC layer, random init
predictions = Dense(number_of_plant_classes, activation="softmax")(
    x
)  # New softmax layer

# Final model
model = Model(inputs=base_model.input, outputs=predictions)

# Compile the model (should be done *after* setting layers to non-trainable)
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

## Model training

In [11]:
history = model.fit(
    train_generator,
    steps_per_epoch=10,  # Number of batches per epoch, adjust based on your data size
    epochs=5,
    validation_data=valid_generator,
    validation_steps=50  # Adjust based on your validation data size
)

Epoch 1/5

InvalidArgumentError: Graph execution error:

Detected at node categorical_crossentropy/softmax_cross_entropy_with_logits defined at (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main

  File "<frozen runpy>", line 88, in _run_code

  File "/Users/simi/workarea/vscode/bachelor/plant_detection_model/venv/lib/python3.11/site-packages/ipykernel_launcher.py", line 18, in <module>

  File "/Users/simi/workarea/vscode/bachelor/plant_detection_model/venv/lib/python3.11/site-packages/traitlets/config/application.py", line 1075, in launch_instance

  File "/Users/simi/workarea/vscode/bachelor/plant_detection_model/venv/lib/python3.11/site-packages/ipykernel/kernelapp.py", line 739, in start

  File "/Users/simi/workarea/vscode/bachelor/plant_detection_model/venv/lib/python3.11/site-packages/tornado/platform/asyncio.py", line 205, in start

  File "/opt/homebrew/Cellar/python@3.11/3.11.6_1/Frameworks/Python.framework/Versions/3.11/lib/python3.11/asyncio/base_events.py", line 607, in run_forever

  File "/opt/homebrew/Cellar/python@3.11/3.11.6_1/Frameworks/Python.framework/Versions/3.11/lib/python3.11/asyncio/base_events.py", line 1922, in _run_once

  File "/opt/homebrew/Cellar/python@3.11/3.11.6_1/Frameworks/Python.framework/Versions/3.11/lib/python3.11/asyncio/events.py", line 80, in _run

  File "/Users/simi/workarea/vscode/bachelor/plant_detection_model/venv/lib/python3.11/site-packages/ipykernel/kernelbase.py", line 542, in dispatch_queue

  File "/Users/simi/workarea/vscode/bachelor/plant_detection_model/venv/lib/python3.11/site-packages/ipykernel/kernelbase.py", line 531, in process_one

  File "/Users/simi/workarea/vscode/bachelor/plant_detection_model/venv/lib/python3.11/site-packages/ipykernel/kernelbase.py", line 437, in dispatch_shell

  File "/Users/simi/workarea/vscode/bachelor/plant_detection_model/venv/lib/python3.11/site-packages/ipykernel/ipkernel.py", line 359, in execute_request

  File "/Users/simi/workarea/vscode/bachelor/plant_detection_model/venv/lib/python3.11/site-packages/ipykernel/kernelbase.py", line 775, in execute_request

  File "/Users/simi/workarea/vscode/bachelor/plant_detection_model/venv/lib/python3.11/site-packages/ipykernel/ipkernel.py", line 446, in do_execute

  File "/Users/simi/workarea/vscode/bachelor/plant_detection_model/venv/lib/python3.11/site-packages/ipykernel/zmqshell.py", line 549, in run_cell

  File "/Users/simi/workarea/vscode/bachelor/plant_detection_model/venv/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3051, in run_cell

  File "/Users/simi/workarea/vscode/bachelor/plant_detection_model/venv/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3106, in _run_cell

  File "/Users/simi/workarea/vscode/bachelor/plant_detection_model/venv/lib/python3.11/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner

  File "/Users/simi/workarea/vscode/bachelor/plant_detection_model/venv/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3311, in run_cell_async

  File "/Users/simi/workarea/vscode/bachelor/plant_detection_model/venv/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3493, in run_ast_nodes

  File "/Users/simi/workarea/vscode/bachelor/plant_detection_model/venv/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3553, in run_code

  File "/var/folders/tl/zxb0jl8951j9q7tl76bjx2n40000gn/T/ipykernel_89122/3223925112.py", line 1, in <module>

  File "/Users/simi/workarea/vscode/bachelor/plant_detection_model/venv/lib/python3.11/site-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/Users/simi/workarea/vscode/bachelor/plant_detection_model/venv/lib/python3.11/site-packages/keras/src/engine/training.py", line 1856, in fit

  File "/Users/simi/workarea/vscode/bachelor/plant_detection_model/venv/lib/python3.11/site-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/Users/simi/workarea/vscode/bachelor/plant_detection_model/venv/lib/python3.11/site-packages/keras/src/engine/training.py", line 2296, in evaluate

  File "/Users/simi/workarea/vscode/bachelor/plant_detection_model/venv/lib/python3.11/site-packages/keras/src/engine/training.py", line 4108, in run_step

  File "/Users/simi/workarea/vscode/bachelor/plant_detection_model/venv/lib/python3.11/site-packages/keras/src/engine/training.py", line 2066, in test_function

  File "/Users/simi/workarea/vscode/bachelor/plant_detection_model/venv/lib/python3.11/site-packages/keras/src/engine/training.py", line 2049, in step_function

  File "/Users/simi/workarea/vscode/bachelor/plant_detection_model/venv/lib/python3.11/site-packages/keras/src/engine/training.py", line 2037, in run_step

  File "/Users/simi/workarea/vscode/bachelor/plant_detection_model/venv/lib/python3.11/site-packages/keras/src/engine/training.py", line 1919, in test_step

  File "/Users/simi/workarea/vscode/bachelor/plant_detection_model/venv/lib/python3.11/site-packages/keras/src/engine/training.py", line 1209, in compute_loss

  File "/Users/simi/workarea/vscode/bachelor/plant_detection_model/venv/lib/python3.11/site-packages/keras/src/engine/compile_utils.py", line 277, in __call__

  File "/Users/simi/workarea/vscode/bachelor/plant_detection_model/venv/lib/python3.11/site-packages/keras/src/losses.py", line 143, in __call__

  File "/Users/simi/workarea/vscode/bachelor/plant_detection_model/venv/lib/python3.11/site-packages/keras/src/losses.py", line 270, in call

  File "/Users/simi/workarea/vscode/bachelor/plant_detection_model/venv/lib/python3.11/site-packages/keras/src/losses.py", line 2221, in categorical_crossentropy

  File "/Users/simi/workarea/vscode/bachelor/plant_detection_model/venv/lib/python3.11/site-packages/keras/src/backend.py", line 5579, in categorical_crossentropy

logits and labels must be broadcastable: logits_size=[32,62] labels_size=[32,46]
	 [[{{node categorical_crossentropy/softmax_cross_entropy_with_logits}}]] [Op:__inference_test_function_19391]