# New baseline with 12?

In [1]:
from pathlib import Path
import sys
import os
import numpy as np
import pandas as pd

# This notebook is not located in the repository root, so the parent of the
# current working directory is placed first on sys.path to let the project
# imports below resolve.
ROOT = Path.cwd().parent
sys.path.insert(0, str(ROOT))

from tensorflow.keras import Sequential, Input, layers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
from tensorflow.keras.utils import load_img, img_to_array
from tensorflow.keras.models import load_model

from bee_tector.config import (
    PROJECT_ROOT,
    BEST_MODEL_PATH,
    FULL_DATA_DIR,
    IMAGE_SIZE,
    CURATED_DATA_DIR,
    MODELS_DIR,
    BEES_CSV_PATH
)
from bee_tector.plots import plot_history
from bee_tector.data import (
    load_datasets,
    undersample_dataset,
    load_selected_classes
)
from api.predict import load_best_model, preprocess_image, predict

In [2]:
# Build the train / validation / test datasets using the helper's defaults.
train_ds, val_ds, test_ds = load_datasets()

Found 3619 files belonging to 12 classes.
Found 781 files belonging to 12 classes.
Found 781 files belonging to 12 classes.


In [2]:
# TODO edit shape=IMAGE_SIZE + (3,) for future models.

In [16]:
# Pick a single test image to probe the models with.
# NOTE(review): the folder is spelled "Red-tailed_Bumble_bee" here while the
# label printed further down is "Red-tailed_Bumble_Bee" - confirm the exact
# folder name on case-sensitive filesystems.
img_path = os.path.join(
    FULL_DATA_DIR,
    *("test", "Red-tailed_Bumble_bee", "535031756.jpg"),
)

# Load the image resized to the project's IMAGE_SIZE.
img = load_img(img_path, target_size=IMAGE_SIZE)

# img_to_array gives a (224, 224, 3) array; the model accepts
# (batch_size, 224, 224, 3), so a leading batch axis of size 1 is prepended.
img_array = img_to_array(img)[np.newaxis, ...]

In [20]:
# Load the saved baseline Keras model from the project's models directory.
baseline_model = load_model(MODELS_DIR / "baseline_model.keras")

In [21]:
# Run the baseline on the single-image batch; the recorded output is one row
# of 12 scores that sum to ~1.
pred = baseline_model.predict(img_array)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step


In [22]:
# Show the raw per-class scores returned by the baseline.
pred

array([[0.0618692 , 0.12679347, 0.09051337, 0.11306442, 0.12979054,
        0.06030054, 0.07622439, 0.09393378, 0.04390454, 0.03508817,
        0.06704979, 0.10146785]], dtype=float32)

In [23]:
# Index of the highest score in each row (one row per image in the batch).
prediction = pred.argmax(axis=1)

In [24]:
# Show the winning class index (a length-1 array because of the batch axis).
prediction

array([4])

In [25]:
# Load the datasets a second time; in the cells visible here only
# `train_cn.class_names` is read from the result.
# NOTE(review): this repeats the earlier load_datasets() call - presumably
# `train_ds.class_names` would serve equally well; confirm before removing.
train_cn, val_cn, test_cn = load_datasets()

Found 3619 files belonging to 12 classes.
Found 781 files belonging to 12 classes.
Found 781 files belonging to 12 classes.


# Map the predicted index to a class name

In [26]:
# Class labels in index order, as reported by the training dataset.
class_names = train_cn.class_names
# Lookup table from integer class index to label.
id_to_class = dict(enumerate(class_names))

In [27]:
# Translate the predicted index into its label.
# NOTE(review): the image came from the "Red-tailed_Bumble_bee" test folder,
# yet the recorded output names a different class - the baseline appears to
# misclassify this sample.
print("Predicted class:", id_to_class[prediction[0]])

Predicted class: Common_Eastern_Bumble_Bee


# Try predict

In [3]:
# Load the best saved model through the API helpers and score it on the
# validation and test datasets.
# NOTE(review): `img_array` is assigned here but not used by the rest of
# this cell.
img_array = preprocess_image()
model = load_best_model()

# evaluate() is unpacked as (loss, accuracy), matching the two values shown
# in the recorded output.
val_loss, val_acc = model.evaluate(val_ds)
print(f"Validation loss: {val_loss:.4f}, Validation accuracy: {val_acc:.4f}")

test_loss, test_acc = model.evaluate(test_ds)
print(f"Test loss: {test_loss:.4f}, Test accuracy: {test_acc:.4f}")

[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 483ms/step - accuracy: 0.8030 - loss: 0.8133
Validation loss: 0.8662, Validation accuracy: 0.7875
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 467ms/step - accuracy: 0.8285 - loss: 0.6730
Test loss: 0.8091, Test accuracy: 0.8067


In [4]:
# Single-image prediction using the API helpers; preprocess_image() and
# load_best_model() are called with their defaults.
# NOTE(review): `pred` earlier held the baseline's NumPy score array; here it
# is rebound to predict()'s return value (a dict in the recorded output).
img_array = preprocess_image()
model = load_best_model()
pred = predict(model, img_array=img_array)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 645ms/step


In [5]:
# Show the value returned by predict().
pred

{'class': 'Red-tailed_Bumble_Bee', 'confidence': '99.83%'}

# CSV

In [5]:
# Read the CSV at the path configured as BEES_CSV_PATH in bee_tector.config.
full_df = pd.read_csv(BEES_CSV_PATH)

In [6]:
# Preview the first rows of the raw table.
full_df.head()

Unnamed: 0,id,observed_on,scientific_name,common_name,photo_url,location,license
0,306444296,2025-08-15,Bombus,Bumble Bees,https://inaturalist-open-data.s3.amazonaws.com...,"63.6582816672,28.3958057128",cc-by
1,306442436,2025-08-14,Bombus bohemicus,Bohemian Cuckoo Bumble bee,https://inaturalist-open-data.s3.amazonaws.com...,"55.1668805086,25.8369277045",cc-by
2,306442093,2025-08-09,Bombus lucorum,White-tailed Bumble Bee,https://inaturalist-open-data.s3.amazonaws.com...,"55.8554033333,-4.0173916667",cc0
3,306441992,2025-07-29,Bombus pascuorum,Common Carder Bumble Bee,https://inaturalist-open-data.s3.amazonaws.com...,"56.480225,-5.9940616667",cc0
4,306439346,2025-08-14,Bombus,Bumble Bees,https://inaturalist-open-data.s3.amazonaws.com...,"55.9393141806,-3.1699150801",cc0


In [7]:
# New frame without the id, photo URL and license columns; full_df itself is
# left untouched.
name_loc_df = full_df.drop(["id", "photo_url", "license"], axis="columns")

In [8]:
# Preview the reduced table.
name_loc_df.head()

Unnamed: 0,observed_on,scientific_name,common_name,location
0,2025-08-15,Bombus,Bumble Bees,"63.6582816672,28.3958057128"
1,2025-08-14,Bombus bohemicus,Bohemian Cuckoo Bumble bee,"55.1668805086,25.8369277045"
2,2025-08-09,Bombus lucorum,White-tailed Bumble Bee,"55.8554033333,-4.0173916667"
3,2025-07-29,Bombus pascuorum,Common Carder Bumble Bee,"56.480225,-5.9940616667"
4,2025-08-14,Bombus,Bumble Bees,"55.9393141806,-3.1699150801"


In [9]:
# Non-null entries per remaining column for every common name.
name_loc_df.groupby("common_name").count()

Unnamed: 0_level_0,observed_on,scientific_name,location
common_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Alpine Bumble Bee,2,2,2
Amazonian Bumble Bee,1,1,1
American Bumble Bee,218,218,217
Andean Red Bumble Bee,1,1,1
Ashton's Cuckoo Bumble Bee,1,1,1
...,...,...,...
Yellow-banded Bumble Bee,75,75,75
Yellow-faced Bumble Bee,285,285,285
Yellow-fronted Bumble Bee,58,58,57
Yellowish Cuckoo Bumble bee,67,67,67


In [10]:
# Derive a class-style label from the common name by replacing spaces with
# underscores.
# NOTE(review): capitalisation is carried over unchanged, so
# "Bohemian Cuckoo Bumble bee" becomes "..._Bumble_bee" while other names end
# in "_Bee" - confirm this matches the dataset's class names.
name_loc_df['class_name'] = name_loc_df['common_name'].str.replace(" ", "_")

In [11]:
# Inspect the frame with the new class_name column.
name_loc_df

Unnamed: 0,observed_on,scientific_name,common_name,location,class_name
0,2025-08-15,Bombus,Bumble Bees,"63.6582816672,28.3958057128",Bumble_Bees
1,2025-08-14,Bombus bohemicus,Bohemian Cuckoo Bumble bee,"55.1668805086,25.8369277045",Bohemian_Cuckoo_Bumble_bee
2,2025-08-09,Bombus lucorum,White-tailed Bumble Bee,"55.8554033333,-4.0173916667",White-tailed_Bumble_Bee
3,2025-07-29,Bombus pascuorum,Common Carder Bumble Bee,"56.480225,-5.9940616667",Common_Carder_Bumble_Bee
4,2025-08-14,Bombus,Bumble Bees,"55.9393141806,-3.1699150801",Bumble_Bees
...,...,...,...,...,...
9986,2025-07-11,Bombus terrestris,Buff-tailed Bumble Bee,"51.392155,1.39308",Buff-tailed_Bumble_Bee
9987,2025-07-11,Bombus terrestris,Buff-tailed Bumble Bee,"51.3923416667,1.39232",Buff-tailed_Bumble_Bee
9988,2025-07-09,Bombus griseocollis,Brown-belted Bumble Bee,"36.1364903292,-95.907794499",Brown-belted_Bumble_Bee
9989,2025-07-11,Bombus terrestris,Buff-tailed Bumble Bee,"51.391675,1.3835133333",Buff-tailed_Bumble_Bee


In [24]:
from geopy.geocoders import Nominatim
import time

# Try reverse geocoding on one hard-coded coordinate; the recorded output
# resolves it to "United Kingdom GB".
lat, lon = 51.5074, -0.1278
geo = Nominatim(user_agent="bee-tector")
loc = geo.reverse((lat, lon), language="en")

# Pull the country name and the upper-cased country code out of the raw
# address payload; a missing country code falls back to an empty string.
country = loc.raw["address"].get("country")
code = loc.raw["address"].get("country_code", "").upper()
print(country, code)

United Kingdom GB


In [12]:
# Split the "lat,lon" text in `location` at the first comma and store the two
# parts as float columns.
name_loc_df[["lat","lon"]] = name_loc_df["location"].str.split(",", n=1, expand=True).astype(float)

In [19]:
# Inspect the frame with the parsed lat / lon columns.
name_loc_df

Unnamed: 0,observed_on,scientific_name,common_name,location,class_name,lat,lon
0,2025-08-15,Bombus,Bumble Bees,"63.6582816672,28.3958057128",Bumble_Bees,63.658282,28.395806
1,2025-08-14,Bombus bohemicus,Bohemian Cuckoo Bumble bee,"55.1668805086,25.8369277045",Bohemian_Cuckoo_Bumble_bee,55.166881,25.836928
2,2025-08-09,Bombus lucorum,White-tailed Bumble Bee,"55.8554033333,-4.0173916667",White-tailed_Bumble_Bee,55.855403,-4.017392
3,2025-07-29,Bombus pascuorum,Common Carder Bumble Bee,"56.480225,-5.9940616667",Common_Carder_Bumble_Bee,56.480225,-5.994062
4,2025-08-14,Bombus,Bumble Bees,"55.9393141806,-3.1699150801",Bumble_Bees,55.939314,-3.169915
...,...,...,...,...,...,...,...
9986,2025-07-11,Bombus terrestris,Buff-tailed Bumble Bee,"51.392155,1.39308",Buff-tailed_Bumble_Bee,51.392155,1.393080
9987,2025-07-11,Bombus terrestris,Buff-tailed Bumble Bee,"51.3923416667,1.39232",Buff-tailed_Bumble_Bee,51.392342,1.392320
9988,2025-07-09,Bombus griseocollis,Brown-belted Bumble Bee,"36.1364903292,-95.907794499",Brown-belted_Bumble_Bee,36.136490,-95.907794
9989,2025-07-11,Bombus terrestris,Buff-tailed Bumble Bee,"51.391675,1.3835133333",Buff-tailed_Bumble_Bee,51.391675,1.383513


In [20]:
# Create the Nominatim client used by the geocoding loop below.
# NOTE(review): an identical `geo` client is already created in the earlier
# single-coordinate cell.
geo = Nominatim(user_agent="bee-tector")

In [21]:
# Start both result columns as all-missing; the geocoding loop in the next
# cell fills them in.
name_loc_df["country"] = pd.NA
name_loc_df["country_code"] = pd.NA

In [25]:
# Reverse-geocode the observations to a country name and country code.
#
# Compared with a plain row-by-row loop this cell:
#   * skips rows without coordinates (no request, no wasted sleep);
#   * looks up each distinct (lat, lon) pair once and writes the answer to
#     every row that shares it;
#   * skips rows that already have a country, so re-running the cell after an
#     interruption continues where it stopped instead of starting over
#     (rows for which no country was found stay missing and are retried);
#   * collects failed lookups and reports them instead of silently
#     discarding the errors.
# The public Nominatim service allows at most one request per second, hence
# the sleep after every lookup.

# Make sure the result columns exist even if this cell is run on its own.
for col in ("country", "country_code"):
    if col not in name_loc_df.columns:
        name_loc_df[col] = pd.NA

pending = name_loc_df[name_loc_df["country"].isna()].dropna(subset=["lat", "lon"])
failed_lookups = []  # ((lat, lon), error text) for every lookup that raised

for (lat, lon), same_spot in pending.groupby(["lat", "lon"], sort=False):
    try:
        loc = geo.reverse((lat, lon), language="en", exactly_one=True)
    except Exception as exc:
        # Best effort: one failing lookup must not abort the whole run, but
        # it is recorded so it can be inspected or retried afterwards.
        failed_lookups.append(((lat, lon), repr(exc)))
    else:
        if loc and "address" in loc.raw:
            addr = loc.raw["address"]
            name_loc_df.loc[same_spot.index, "country"] = addr.get("country")
            name_loc_df.loc[same_spot.index, "country_code"] = (
                addr.get("country_code") or ""
            ).upper()
    time.sleep(1)

print(f"Reverse geocoding finished: {len(failed_lookups)} lookup(s) failed.")

KeyboardInterrupt: 