# Basic plant detection: starting point for the Bachelor thesis

## read metadata

In [None]:
# load the PlantCLEF2022 trusted training metadata from ../data/01_raw/PlantCLEF2022_trusted_training_metadata.csv

import pandas as pd

# Upper bound on the number of metadata rows to read (passed as `nrows`).
amount_pictures = 1_000
file_path = "../data/01_raw/PlantCLEF2022_trusted_training_metadata.csv"

try:
    # The metadata file is semicolon-separated.
    data = pd.read_csv(file_path, delimiter=";", nrows=amount_pictures)
except FileNotFoundError:
    print(
        f"File not found: {file_path}. Please check the file path and that the file is downloaded."
    )
    # Re-raise after the hint: the following cells all need `data`, so carrying
    # on would only fail later with a NameError that hides the real cause.
    raise

if data.empty:
    print("The CSV file is empty")
else:
    print(data.head())

## remove unnecessary columns

In [None]:
# Drop the metadata columns that the visible cells of this notebook never read.
unused_columns = [
    "image_name",
    "source",
    "manual_tag",
    "predicted_tag",
    "predicted_tag_probability",
    "original_url",
    "license",
    "publisher",
    "gbif_occurrence_id",
    "aggregator",
    "dataset_key",
]

# `data` is reassigned by this cell, so on a second run the columns are already
# gone; errors="ignore" keeps the cell re-runnable instead of raising KeyError.
data = data.drop(columns=unused_columns, errors="ignore")

## delete already existing files

In [None]:
import os
import shutil

temp_dir = "temp"

# Remove any previous download directory so that stale files cannot leak into
# this run. shutil.rmtree replaces the shell call `rm -rf`, so the cell does not
# depend on a Unix shell; ignore_errors=True mirrors the `-f` flag (no error
# when the directory does not exist yet).
shutil.rmtree(temp_dir, ignore_errors=True)

# Recreate the (now empty) directory.
os.makedirs(temp_dir, exist_ok=True)

## download data per plant parallelized

In [None]:
import requests
import os
from concurrent.futures import ThreadPoolExecutor


def download_image(index_url, timeout=30):
    """Download one image and store it as ``<temp_dir>/<index>.jpg``.

    Args:
        index_url: ``(index, url)`` tuple; ``index`` becomes the file name
            and ``url`` is fetched with an HTTP GET.
        timeout: Seconds passed to ``requests.get`` so that a stalled server
            cannot block a worker forever.

    Returns:
        None. Failures are reported with ``print`` and otherwise ignored, so
        a failed download leaves no file behind for that index.
    """
    index, url = index_url
    try:
        response = requests.get(url, timeout=timeout)
        # Treat 4xx/5xx answers as failures instead of saving the error page
        # under a ".jpg" name.
        response.raise_for_status()
        file_path = os.path.join(temp_dir, f"{index}.jpg")

        with open(file_path, "wb") as file:
            file.write(response.content)
    except Exception as e:
        # Deliberately broad: one bad URL must not abort the whole batch, and
        # the message printed here is the only report of the failure.
        print(f"Error downloading {url}: {e}")


# Pair each row label of the metadata with that row's backup URL; the label is
# what download_image uses as the file name.
index_url_list = list(zip(data.index, data["image_backup_url"]))

# Fetch the images concurrently on a pool of five worker threads. Leaving the
# `with` block waits until all submitted downloads have finished.
with ThreadPoolExecutor(max_workers=5) as executor:
    executor.map(download_image, index_url_list)

## add image path to metadata

In [None]:
# Build the expected on-disk location of every image from its row label.
image_paths = []
for row_label in data.index:
    image_paths.append(f"{temp_dir}/{row_label}.jpg")

data["image_path"] = image_paths
data.head()

## split into train and test set

In [None]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
from PIL import Image
from sklearn.model_selection import train_test_split

# TODO: 224x224 is a placeholder; choose a well-founded size later.
target_size = (224, 224)


def load_image(image_path: str) -> np.ndarray:
    """Load an image file as an RGB array resized to ``target_size``.

    Args:
        image_path: Path of the image file to open.

    Returns:
        The pixel data of the resized RGB image as a NumPy array.
    """
    # The context manager closes the underlying file deterministically, even
    # when decoding fails part-way through.
    with Image.open(image_path) as source:
        # convert() yields an independent image, so it stays usable after the
        # file has been closed. Forcing RGB gives every image three channels.
        rgb_image = source.convert('RGB')

    resized = rgb_image.resize(target_size)
    return np.asarray(resized)

# Read every image referenced in the metadata and collect them in one array.
loaded_images = [load_image(image_path) for image_path in data['image_path']]
images = np.array(loaded_images)

# Turn the species names into integer class ids for the classifier and keep
# them next to the metadata.
encoder = LabelEncoder()
data['encoded_labels'] = encoder.fit_transform(data['species'])
labels = data['encoded_labels'].values

# Hold out 20 % of the samples for testing; the fixed seed makes the split
# reproducible. Adjust test_size as needed.
X_train, X_test, y_train, y_test = train_test_split(
    images,
    labels,
    random_state=42,
    test_size=0.2,
)

## Model building

In [None]:
import tensorflow as tf

# Size the output layer from the label encoder, i.e. from every species the
# encoder was fitted on. Counting the unique values of y_train instead misses
# species that only ended up in the test split, which leaves valid labels
# outside the output range (the reason for the former "+ 1" workaround).
num_classes = len(encoder.classes_)

# Simple fully connected baseline: flatten the image, two hidden layers,
# dropout, softmax over all species.
# NOTE(review): the images are fed as the raw arrays produced by load_image;
# no pixel rescaling is visible in this notebook - consider normalising.
model = tf.keras.Sequential([
    # Take the input shape from the data so it follows target_size.
    tf.keras.layers.Flatten(input_shape=X_train.shape[1:]),
    tf.keras.layers.Dense(128, activation='relu', kernel_initializer=tf.keras.initializers.he_normal),
    tf.keras.layers.Dense(64, activation='relu', kernel_regularizer=tf.keras.regularizers.L2(0.01)),
    tf.keras.layers.Dropout(rate=0.2),
    tf.keras.layers.Dense(num_classes, activation='softmax'),
])

In [None]:
# Inspect the training labels.
print("Unique labels: ", np.unique(y_train))
print("Min label: ", np.min(y_train))
print("Max label: ", np.max(y_train))
print("Full y_train array:\n", y_train)

# Report every label that falls outside range(num_classes). The bound comes
# from `num_classes` instead of a hard-coded 62, so the check keeps working
# when the number of rows (and therefore of species) changes.
for i in np.flatnonzero(y_train >= num_classes):
    print(f"Invalid label {y_train[i]} found at index {i}")

In [None]:
df = data

# Method 1: collect the first label seen for each species in a plain dict
mapping = {}
for species_name, encoded in zip(df['species'], df['encoded_labels']):
    mapping.setdefault(species_name, encoded)

for species_name, encoded in mapping.items():
    print(f"{species_name}: {encoded}")

# Method 2: Using pandas.DataFrame.drop_duplicates
unique_df = df[['species', 'encoded_labels']].drop_duplicates()

for species_name, encoded in zip(unique_df['species'], unique_df['encoded_labels']):
    print(f"{species_name}: {encoded}")

In [55]:
# Print the layers of the model with their output shapes and parameter counts.
model.summary()

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten_8 (Flatten)         (None, 150528)            0         
                                                                 
 dense_26 (Dense)            (None, 128)               19267712  
                                                                 
 dense_27 (Dense)            (None, 64)                8256      
                                                                 
 dropout_8 (Dropout)         (None, 64)                0         
                                                                 
 dense_28 (Dense)            (None, 63)                4095      
                                                                 
Total params: 19280063 (73.55 MB)
Trainable params: 19280063 (73.55 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [56]:
# Integer class ids are used as targets, hence the sparse variant of the
# categorical cross-entropy loss.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

## define callback

In [57]:
# Directory handed to the TensorBoard callback as its log location.
log_dir = "logs/my_model"

tensorboard_cb = tf.keras.callbacks.TensorBoard(
    log_dir=log_dir,
)

## Model training

In [58]:
# Train on the training split for 30 epochs with the TensorBoard callback
# attached; the returned History object is kept in `history`.
history = model.fit(
    X_train,
    y_train,
    callbacks=[tensorboard_cb],
    epochs=30,
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


## evaluate model

In [59]:
# Score the trained model on the held-out test split.
test_loss, test_acc = model.evaluate(
    X_test,
    y_test,
    verbose=2,
)

print(f"\nTest accuracy: {test_acc}")

7/7 - 0s - loss: 4.4181 - accuracy: 0.1050 - 112ms/epoch - 16ms/step

Test accuracy: 0.10499999672174454
