# basic plant detection for starting Bachelor thesis

## read metadata

In [10]:
# load the metadata file PlantCLEF2022_trusted_training_metadata.csv from /data/01_raw/

import pandas as pd

# Number of metadata rows (one row per picture) to read; kept small for fast iteration.
amount_pictures = 1_000
file_path = "../data/01_raw/PlantCLEF2022_trusted_training_metadata.csv"
try:
    # The metadata file is semicolon-separated; only the first `amount_pictures` rows are read.
    data = pd.read_csv(file_path, delimiter=";", nrows=amount_pictures)
except FileNotFoundError:
    print(
        f"File not found: {file_path}. Please check the file path and that the file is downloaded."
    )
    # Re-raise after the hint: every later cell needs `data`, so carrying on
    # would only fail further down with an unrelated-looking NameError.
    raise
else:
    if data.empty:
        print("The CSV file is empty")
    else:
        print(data.head())

   classid                                    image_name  \
0  2683260  d0749fe4f8ade13dd9402b0f43bc29e8f28af27a.jpg   
1  2683260  7549c8ddadd95d996e10aa76f8e39bedb059c637.jpg   
2  2683260  8c6049ebdff482b901f32e0ac9e421c69bc652b4.jpg   
3  2683260  b932fd8619c667a5b4f3c38596911937a2692b26.jpg   
4  2683260  9f9dbdb0b9ce8bc3fb562f648af7717b5511b597.jpg   

                                          image_path               species  \
0  2683260/d0749fe4f8ade13dd9402b0f43bc29e8f28af2...  Cycas angulata R.Br.   
1  2683260/7549c8ddadd95d996e10aa76f8e39bedb059c6...  Cycas angulata R.Br.   
2  2683260/8c6049ebdff482b901f32e0ac9e421c69bc652...  Cycas angulata R.Br.   
3  2683260/b932fd8619c667a5b4f3c38596911937a2692b...  Cycas angulata R.Br.   
4  2683260/9f9dbdb0b9ce8bc3fb562f648af7717b5511b5...  Cycas angulata R.Br.   

   genus      family      order        class source manual_tag predicted_tag  \
0  Cycas  Cycadaceae  Cycadales  Cycadopsida    NaN        NaN         habit   
1  Cycas  

## remove unnecessary columns

In [11]:
# Drop the metadata columns that are not used in the rest of the notebook.
# errors="ignore" keeps the cell idempotent: re-running it after the columns
# are already gone no longer raises a KeyError.
data = data.drop(
    columns=[
        "image_name",
        "source",
        "manual_tag",
        "predicted_tag",
        "predicted_tag_probability",
        "original_url",
        "license",
        "publisher",
        "gbif_occurrence_id",
        "aggregator",
        "dataset_key",
    ],
    errors="ignore",
)

## delete already existing files

In [12]:
import os
import shutil

temp_dir = "temp"

# Remove any previous download directory so the run starts from an empty one.
# shutil.rmtree replaces the shell call `rm -rf`: it needs no shell, works on
# every platform and is not affected by special characters in the path.
# ignore_errors=True keeps the "does not exist yet" case silent, like `rm -rf`.
shutil.rmtree(temp_dir, ignore_errors=True)

# Recreate the (now empty) directory.
os.makedirs(temp_dir, exist_ok=True)

## download data per plant parallelized

In [13]:
import requests
import os
from concurrent.futures import ThreadPoolExecutor


def download_image(index_url):
    """Download one image and store it as ``<temp_dir>/<index>.jpg``.

    Args:
        index_url: ``(index, url)`` tuple; ``index`` names the target file,
            ``url`` is the address the image is fetched from.

    Failures (network errors, timeouts, HTTP error statuses, write errors) are
    reported on stdout and otherwise ignored, so one bad URL does not abort
    the remaining downloads.
    """
    index, url = index_url
    try:
        # Without a timeout a stalled connection would block its worker forever.
        response = requests.get(url, timeout=30)
        # Turn 4xx/5xx answers into exceptions so an error page is never
        # written to disk under a .jpg name.
        response.raise_for_status()
        file_path = os.path.join(temp_dir, f"{index}.jpg")

        with open(file_path, "wb") as file:
            file.write(response.content)
    except Exception as e:
        print(f"Error downloading {url}: {e}")


# Pair every row label of the metadata with the backup URL of its image.
index_url_list = list(data["image_backup_url"].items())

# Fetch the images with five worker threads; leaving the `with` block waits
# until every submitted download has finished.
with ThreadPoolExecutor(max_workers=5) as pool:
    pool.map(download_image, index_url_list)

## add image path to metadata

In [14]:
# Point each row at its downloaded file. The path is built with os.path.join,
# exactly as download_image builds it when writing the file, so both always agree.
data["image_path"] = [os.path.join(temp_dir, f"{index}.jpg") for index in data.index]
data.head()

Unnamed: 0,classid,image_path,species,genus,family,order,class,image_backup_url
0,2683260,temp/0.jpg,Cycas angulata R.Br.,Cycas,Cycadaceae,Cycadales,Cycadopsida,https://lab.plantnet.org/LifeCLEF/PlantCLEF202...
1,2683260,temp/1.jpg,Cycas angulata R.Br.,Cycas,Cycadaceae,Cycadales,Cycadopsida,https://lab.plantnet.org/LifeCLEF/PlantCLEF202...
2,2683260,temp/2.jpg,Cycas angulata R.Br.,Cycas,Cycadaceae,Cycadales,Cycadopsida,https://lab.plantnet.org/LifeCLEF/PlantCLEF202...
3,2683260,temp/3.jpg,Cycas angulata R.Br.,Cycas,Cycadaceae,Cycadales,Cycadopsida,https://lab.plantnet.org/LifeCLEF/PlantCLEF202...
4,2683260,temp/4.jpg,Cycas angulata R.Br.,Cycas,Cycadaceae,Cycadales,Cycadopsida,https://lab.plantnet.org/LifeCLEF/PlantCLEF202...


## split into train and test set

In [15]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
from PIL import Image
from sklearn.model_selection import train_test_split

# Size every image is resized to before training.
# TODO: currently an arbitrary choice; pick a well-founded size later.
target_size = (224, 224)


def load_image(image_path: str) -> np.ndarray:
    """Load an image file as an RGB array resized to ``target_size``.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The resized image as a NumPy array with three colour channels.
    """
    # The context manager closes the underlying file as soon as the pixel data
    # has been read, instead of leaving that to garbage collection.
    with Image.open(image_path) as source_image:
        rgb_image = source_image.convert('RGB')  # Ensure RGB format
    resized_image = rgb_image.resize(target_size)
    return np.asarray(resized_image)

# Read every downloaded picture and collect the results in one array.
images = np.array(list(map(load_image, data['image_path'])))

# Turn the species names into integer class ids, which the classifier needs
# as training targets.
encoder = LabelEncoder()
data['encoded_labels'] = encoder.fit_transform(data['species'])
labels = data['encoded_labels'].to_numpy()

# Hold out 20 % of the samples for testing (adjust test_size as needed);
# the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    images,
    labels,
    test_size=0.2,
    random_state=42,
)

## Model building

In [16]:
import tensorflow as tf

# The output layer needs one unit per *encoded* class, not per class that
# happens to appear in y_train: a class can land entirely in the test split
# (label 20 does in this run), so counting np.unique(y_train) comes up short
# and only worked because of an ad-hoc "+ 1". encoder.classes_ holds every
# class the LabelEncoder was fitted on.
num_classes = len(encoder.classes_)

model = tf.keras.Sequential([
    # Take the input shape from the data so it always matches the loaded images.
    tf.keras.layers.Flatten(input_shape=X_train.shape[1:]),
    tf.keras.layers.Dense(128, activation='relu', kernel_initializer=tf.keras.initializers.he_normal),
    tf.keras.layers.Dense(64, activation='relu', kernel_regularizer=tf.keras.regularizers.L2(0.01)),
    tf.keras.layers.Dropout(rate=0.2),
    # One softmax probability per encoded species.
    tf.keras.layers.Dense(num_classes, activation='softmax'),
])

In [17]:
# Thoroughly examine y_train
print("Unique labels: ", np.unique(y_train))
print("Min label: ", np.min(y_train))
print("Max label: ", np.max(y_train))
print("Full y_train array:\n", y_train) 

# A label is only invalid if it does not fit into the model's output layer.
# Compare against the real number of output units instead of a hard-coded 62,
# which wrongly reported the valid label 62 as invalid.
n_outputs = model.output_shape[-1]
for i in np.flatnonzero(y_train >= n_outputs):
    print(f"Invalid label {y_train[i]} found at index {i}")

Unique labels:  [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 21 22 23 24
 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
 49 50 51 52 53 54 55 56 57 58 59 60 61 62]
Min label:  0
Max label:  62
Full y_train array:
 [19  3 41  9 49 13 30 55  1 48 23 30 56 48 41 34 41 32 55 41 34 41 59 21
 52 18  5 34 55  4 34 62  1 10 19 19 31 59 30 34 18 30  1 47 34 26  0 23
 23 34 23 61 46 51 37 34  1 55 56 19 41  1 23 34 35 49 46 36 40 36  1 27
 54 32  1 19 30 14 23 46 11 46 31 34 30 41 35  1 31 57 57 46  1  1 35  1
 56  0 19 30  4  1  1  0 46 57 34 11 56  9 13  1 11 41  1  1 31 34 41 30
 23 30 34 17 46  1 34  1 41 30 41 34 41 40 18 34 41  6 49  2  1 37 30 29
 36  0  7 54 23 56 30 34 52 23 34 30 34  0 34 30  0 11 30  1 30 19 46 30
 36 49 30 41 30 30 19 44  1 23 55 48 31 30 46 31 30 56  1 30 60 46 30 19
 41 25 11 22 38  1 45 62 19  1  4  2 14 41 56 34 21  1  1 55 46 34 48 41
 41 49  1  1 19 41  1  9  0 35 23 23 41 34 34 35 34  1 34 19 30 35 49 14
 30 10 59 41  

In [18]:
df = data

# Method 1: build the species -> encoded label lookup by hand, keeping the
# label of the first occurrence of each species.
mapping = {}
for species_name, class_id in zip(df['species'], df['encoded_labels']):
    mapping.setdefault(species_name, class_id)

for species_name in mapping:
    print(f"{species_name}: {mapping[species_name]}")

# Method 2: let pandas collapse the repeated (species, label) pairs.
unique_df = df[['species', 'encoded_labels']].drop_duplicates()

for pair in unique_df.itertuples(index=False):
    print(f"{pair.species}: {pair.encoded_labels}")

Cycas angulata R.Br.: 18
Cycas armstrongii Miq.: 19
Cycas beddomei Dyer: 20
Cycas calcicola Maconochie: 21
Cycas candida K.D.Hill: 22
Cycas circinalis L.: 23
Cycas clivicola K.D.Hill: 24
Cycas debaoensis Y.C.Zhong & C.J.Chen: 25
Cycas edentata de Laub.: 26
Cycas falcata K.D.Hill: 27
Cycas indica A.Lindstr. & K.D.Hill: 28
Cycas inermis Lour.: 29
Cycas media R.Br.: 30
Cycas micronesica K.D.Hill: 31
Cycas pectinata Buch.-Ham.: 32
Cycas pruinosa Maconochie: 33
Cycas revoluta Thunb.: 34
Cycas rumphii Miq.: 35
Cycas seemannii Braun: 36
Cycas siamensis Miq.: 37
Cycas tropophylla K.D.Hill & P.K.Lôc: 38
Cycas zeylanica A.Lindstr. & K.D.Hill (J.Schust.): 39
Bowenia serrulata Chamb. (W.Bull): 0
Bowenia spectabilis Hook. ex Hook.f.: 1
Ceratozamia alvarezii Pérez-Farr., Vovides & Iglesias: 2
Ceratozamia decumbens Vovides, Avendaño, Pérez-Farr. & Gonz.-Astorga: 3
Ceratozamia hildae G.P.Landry & M.C.Wilson: 4
Ceratozamia hondurensis J.L.Haynes, Whitelock, Schutzman & R.S.Adams: 5
Ceratozamia kuesteri

In [19]:
# Print the layer-by-layer overview (output shapes and parameter counts) of the model.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten (Flatten)           (None, 150528)            0         
                                                                 
 dense (Dense)               (None, 128)               19267712  
                                                                 
 dense_1 (Dense)             (None, 64)                8256      
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense_2 (Dense)             (None, 63)                4095      
                                                                 
Total params: 19280063 (73.55 MB)
Trainable params: 19280063 (73.55 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [20]:
# The targets are integer class ids (not one-hot vectors), hence the sparse
# variant of categorical cross-entropy.
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

## define callback

In [21]:
# Directory the TensorBoard callback writes its training logs to.
log_dir = "logs/my_model"
# Callback handed to model.fit so the training run is logged for TensorBoard.
tensorboard_cb = tf.keras.callbacks.TensorBoard(log_dir=log_dir)

## Model training

In [22]:
# Train for 30 epochs on the training split and log the run through the
# TensorBoard callback.
history = model.fit(
    x=X_train,
    y=y_train,
    epochs=30,
    callbacks=[tensorboard_cb],
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30

KeyboardInterrupt: 

## evaluate model

In [None]:
# Score the trained model on the held-out test split; with the compiled
# 'accuracy' metric, evaluate() yields the loss followed by the accuracy.
test_loss, test_acc = model.evaluate(x=X_test, y=y_test, verbose=2)

print('\nTest accuracy:', test_acc)