# basic plant detection for starting Bachelor thesis

## read metadata

In [1]:
# load data PlantCLEF2022_trusted_training_metadata.csv form /data/01_raw/PlantCLEF2022_trusted_training_metadata.csv

import pandas as pd

file_path = "../data/02_processed/merged_data.csv"
try:
    data = pd.read_csv(file_path, delimiter=";")
    if data.empty:
        print("The CSV file is empty")
    else:
        print(data.head())

except FileNotFoundError:
    print(
        f"File not found: {file_path}. Please check the file path and that the file is downloaded."
    )

   classid                                         image_path  \
0  5328909  5328909/de73353bbf8431ec594df8c0c070fa5d562756...   
1  5328909  5328909/cb1b1aac1895f8f5a52e1c85ef8ceae7580e68...   
2  5328909  5328909/4bdb06e3f9b4b61c9ed2a269498d064b41b1d0...   
3  5328909  5328909/2e5095764b764e63fca8150bab3aa1bbc2157a...   
4  5328909  5328909/9886a43a74aa9c493667235bb98786df965706...   

                       species       genus        family        order  \
0  sagittaria latifolia willd.  Sagittaria  Alismataceae  Alismatales   
1  sagittaria latifolia willd.  Sagittaria  Alismataceae  Alismatales   
2  sagittaria latifolia willd.  Sagittaria  Alismataceae  Alismatales   
3  sagittaria latifolia willd.  Sagittaria  Alismataceae  Alismatales   
4  sagittaria latifolia willd.  Sagittaria  Alismataceae  Alismatales   

        class                                   image_backup_url  
0  Liliopsida  https://lab.plantnet.org/LifeCLEF/PlantCLEF202...  
1  Liliopsida  https://lab.plantnet.

## use the 5 most occuring species 

In [10]:
# find the 5 most occuring species in the dataset and remove the rest

AMOUNT_OF_SPECIES = 5

species = data["species"].value_counts().head(AMOUNT_OF_SPECIES)
species = species.index.tolist()
data = data[data["species"].isin(species)]

species = data["species"].unique()
len(species)

5

## delete already existing images

In [11]:
import os

temp_dir = "temp"

# delete and create tmp dir to ensure it's empty
os.system(f"rm -rf {temp_dir}")

# Ensure the tmp/ directory exists
os.makedirs(temp_dir, exist_ok=True)

## download data per plant parallelized

In [12]:
import requests
import os
from concurrent.futures import ThreadPoolExecutor


def download_image(index_url: str) -> None:
    index, url = index_url
    try:
        response = requests.get(url)
        file_path = os.path.join(temp_dir, f"{index}.jpg")

        with open(file_path, "wb") as file:
            file.write(response.content)
        # print(f"Downloaded {url} to {file_path}")
    except Exception as e:
        print(f"Error downloading {url}: {e}")


# Prepare a list of tuples containing the index and URL for each image
index_url_list = [(index, row["image_backup_url"]) for index, row in data.iterrows()]

# Use ThreadPoolExecutor to download images in parallel
with ThreadPoolExecutor(max_workers=5) as executor:
    executor.map(download_image, index_url_list)

## add image path to metadata

In [13]:
data["image_path"] = [f"{temp_dir}/{index}.jpg" for index in data.index]
data.head()

Unnamed: 0,classid,image_path,species,genus,family,order,class,image_backup_url
580,8002952,temp/580.jpg,ambrosia artemisiifolia l.,Ambrosia,Asteraceae,Asterales,Magnoliopsida,https://lab.plantnet.org/LifeCLEF/PlantCLEF202...
581,8002952,temp/581.jpg,ambrosia artemisiifolia l.,Ambrosia,Asteraceae,Asterales,Magnoliopsida,https://lab.plantnet.org/LifeCLEF/PlantCLEF202...
582,8002952,temp/582.jpg,ambrosia artemisiifolia l.,Ambrosia,Asteraceae,Asterales,Magnoliopsida,https://lab.plantnet.org/LifeCLEF/PlantCLEF202...
583,8002952,temp/583.jpg,ambrosia artemisiifolia l.,Ambrosia,Asteraceae,Asterales,Magnoliopsida,https://lab.plantnet.org/LifeCLEF/PlantCLEF202...
584,8002952,temp/584.jpg,ambrosia artemisiifolia l.,Ambrosia,Asteraceae,Asterales,Magnoliopsida,https://lab.plantnet.org/LifeCLEF/PlantCLEF202...


## split into train and test set

In [21]:
from typing import Tuple

import tensorflow as tf
from sklearn.preprocessing import LabelEncoder


IMAGE_SIZE = [224, 224] # default for MobileNetV2
BATCH_SIZE = 32
CHANNELS = 3

train_df = data.sample(frac=0.8, random_state=123)  # 80% for training
val_df = data.drop(train_df.index)                  # Remaining 20% for validation


# Fit the label encoder and transform the 'classid' column
label_encoder = LabelEncoder()
data['classid_encoded'] = label_encoder.fit_transform(data['classid'])

def preprocess_image(image_path: str, label: str) -> Tuple[tf.Tensor, tf.Tensor]:
    image = tf.io.read_file(image_path)
    image = tf.image.decode_jpeg(image, channels=CHANNELS)
    image = tf.image.resize(image, IMAGE_SIZE)
    image = image / 255.0  # Normalize pixel values
    return image, label

def df_to_dataset(dataframe: pd.DataFrame, shuffle=True, batch_size=BATCH_SIZE) -> tf.data.Dataset:
    images = dataframe['image_path'].values
    labels = dataframe['classid_encoded'].values
    ds = tf.data.Dataset.from_tensor_slices((images, labels))
    ds = ds.map(preprocess_image, num_parallel_calls=tf.data.AUTOTUNE)
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return ds

train_dataset = df_to_dataset(train_df, shuffle=True, batch_size=BATCH_SIZE)
val_dataset = df_to_dataset(val_df, shuffle=False, batch_size=BATCH_SIZE)

## save classes

In [None]:
# TODO: REWRITE THIS, MAYBE DONT USE CLASSID AS LABEL BUT SPECIES

import json

classes = {label: class_name for label, class_name in enumerate(encoder.classes_)}
with open('classes.json', 'w') as f:
    json.dump(classes, f)

## Set up MobileNet

In [22]:
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from tensorflow.keras.models import Model

# Load the MobileNetV2 model, excluding the top fully connected layer
base_model = MobileNetV2(weights='imagenet', include_top=False, input_shape=(*IMAGE_SIZE, CHANNELS))

# Freeze the base model
base_model.trainable = False

# Add new layers on top
x = base_model.output
x = GlobalAveragePooling2D()(x)
x = Dense(1024, activation='relu')(x)  # Large Dense layer for feature interpretation
predictions = Dense(AMOUNT_OF_SPECIES, activation='softmax')(x)  # Final layer with softmax activation for AMOUNT_OF_SPECIES classes

model = Model(inputs=base_model.input, outputs=predictions)


## compile model

In [23]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

## model training

In [24]:
history = model.fit(
    train_dataset,
    steps_per_epoch=len(train_df) // BATCH_SIZE,
    epochs=10,
    validation_data=val_dataset,
    validation_steps=len(val_df) // BATCH_SIZE
)


Epoch 1/10
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 373ms/step - accuracy: 0.3570 - loss: 2.2462 - val_accuracy: 0.6250 - val_loss: 1.0562
Epoch 2/10
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.6364 - loss: 0.9659 - val_accuracy: 0.4000 - val_loss: 1.9300
Epoch 3/10


2024-04-24 08:48:26.591042: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]
  self.gen.throw(typ, value, traceback)
2024-04-24 08:48:26.651868: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 322ms/step - accuracy: 0.7577 - loss: 0.6519 - val_accuracy: 0.6979 - val_loss: 0.8288
Epoch 4/10
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.7273 - loss: 0.7825 - val_accuracy: 0.6000 - val_loss: 0.7288
Epoch 5/10


2024-04-24 08:48:30.889109: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]
2024-04-24 08:48:30.941137: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 321ms/step - accuracy: 0.9260 - loss: 0.2853 - val_accuracy: 0.7292 - val_loss: 0.7562
Epoch 6/10
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.9545 - loss: 0.2202 - val_accuracy: 0.6000 - val_loss: 0.6350
Epoch 7/10


2024-04-24 08:48:35.061374: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]
2024-04-24 08:48:35.116794: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 330ms/step - accuracy: 0.9460 - loss: 0.1987 - val_accuracy: 0.7500 - val_loss: 0.6893
Epoch 8/10
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.9545 - loss: 0.1559 - val_accuracy: 0.6000 - val_loss: 1.3977
Epoch 9/10


2024-04-24 08:48:39.351904: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]
2024-04-24 08:48:39.401719: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 315ms/step - accuracy: 0.9810 - loss: 0.1053 - val_accuracy: 0.7292 - val_loss: 0.6078
Epoch 10/10
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 1.0000 - loss: 0.1101 - val_accuracy: 0.4000 - val_loss: 1.6608


2024-04-24 08:48:43.517978: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]
2024-04-24 08:48:43.572995: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]


## evaluate model

In [25]:
val_loss, val_accuracy = model.evaluate(val_dataset, steps=len(val_df) // BATCH_SIZE)
print(f'Validation loss: {val_loss}, Validation accuracy: {val_accuracy}')


[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 260ms/step - accuracy: 0.7630 - loss: 0.5911
Validation loss: 0.612205445766449, Validation accuracy: 0.7291666865348816


## save model

In [26]:
from datetime import datetime

save_model = True

if save_model:
    now = datetime.now().strftime("%Y%m%d%H%M%S")
    model.save(f"../models/model-{now}.keras")
else:
    print("Model not saved. Set save_model to True to save the model.")

TODO: Save the classes like classes.json