In [193]:
from roboflow import Roboflow
from PIL import Image
import numpy as np
import pandas as pd
import os 
import re 

In [194]:
rf = Roboflow("132cxQxyrOVmPD63wJrV") # api keys are individual, change to your own
project = rf.workspace().project("elephant-seals-project-mark-1")
model = project.version("14").model

loading Roboflow workspace...
loading Roboflow project...


In [199]:
# image paths 
path_to_beach_imgs = "../Beach_Images/LS 2.13.23"
beach_imgs_paths = [os.path.join(path_to_beach_imgs, file) for file in os.listdir(path_to_beach_imgs)]

In [288]:
# extracting clumps and getting rid of overlaps 
clump_imgs_dct = {} # dictionary of clumps. image id will be the key and a list of clumps will be its value. 
num_seals = [] # number of individual seals 

# define confidence levels for seals and clumps
seal_conf_lvl = 0
clump_conf_lvl = 0 

def intersects(seal, clump):
    """Check if seal and clump overlap"""
    seal_x1 = seal['x'] - seal['width'] / 2
    seal_x2 = seal['x'] + seal['width'] / 2
    seal_y1 = seal['y'] - seal['height'] / 2
    seal_y2 = seal['y'] + seal['height'] / 2

    clump_x1 = clump['x'] - clump['width'] / 2
    clump_x2 = clump['x'] + clump['width'] / 2
    clump_y1 = clump['y'] - clump['height'] / 2
    clump_y2 = clump['y'] + clump['height'] / 2

    return not (
        seal_x2 <= clump_x1 or
        seal_x1 >= clump_x2 or
        seal_y2 <= clump_y1 or
        seal_y1 >= clump_y2
    )

if '../Beach_Images/LS 2.13.23/.DS_Store' in beach_imgs_paths:
    beach_imgs_paths.remove('../Beach_Images/LS 2.13.23/.DS_Store')

for path in beach_imgs_paths:

    image = Image.open(path)

    preds = model.predict(path, confidence=25, overlap=22).json().get('predictions', []) # our preset values of min confidence and overlap, based on vibes

    seals = [pred for pred in preds if pred['class'] == 'seals' and pred['confidence'] > seal_conf_lvl]
    clumps = [pred for pred in preds if pred['class'] == 'clump' and pred['confidence'] > clump_conf_lvl]

    # getting individual seals 
    filtered_seals = [seal for seal in seals if not any(intersects(seal, clump) for clump in clumps)]
    num_seals.append(len(filtered_seals))

    key = re.sub(r'.*/([A-Za-z]+) (\d+)\.(\d+)\.(\d+)\\(DJI_\d+)\.JPG', r'\1\2\3\4_\5', path)
    
    # getting clumps 
    clump_imgs_dct[key] = [] 
    for clump in clumps:
        clump_x1 = clump['x'] - clump['width'] / 2
        clump_x2 = clump['x'] + clump['width'] / 2
        clump_y1 = clump['y'] - clump['height'] / 2
        clump_y2 = clump['y'] + clump['height'] / 2

        top_left_clump = (clump_x1, clump_y1)
        bottom_right_clump = (clump_x2, clump_y2)

        subimage = image.crop((*top_left_clump, *bottom_right_clump))
        
        clump_imgs_dct[key].append(subimage)

In [289]:
# Extracting Length, Width and RGB metrics 

keys = []
widths = []
heights = []
avg_r = []
sd_r = []
avg_g = []
sd_g = []
avg_b = []
sd_b = [] 

for key, clump_lst in clump_imgs_dct.items():

    for idx, clump in enumerate(clump_lst): 

        keys.append(f"{key}_clump_{idx+1:04d}")
    
        width, height = clump.size

        widths.append(width)
        heights.append(height)

        img_array = np.array(clump)

        avg_r.append(np.mean(img_array[1, :, :]))
        sd_r.append(np.std(img_array[1, :, :]))
        avg_g.append(np.mean(img_array[:, 1, :]))
        sd_g.append(np.std(img_array[:, 1, :]))
        avg_b.append(np.mean(img_array[:, :, 1]))
        sd_b.append(np.std(img_array[:, :, 1]))

heuristics = pd.DataFrame({'key': keys, 
                          'width': widths, 
                          'height': heights,
                          'avg_r': avg_r, 
                          'sd_r': sd_r, 
                          'avg_g': avg_g,
                          'sd_g': sd_g,
                          'avg_b': avg_b,
                          'sd_b': sd_b
                          })

In [179]:
heuristics.head()

Unnamed: 0,key,width,height,avg_r,sd_r,avg_g,sd_g,avg_b,sd_b
0,../Beach_Images/LS 2.13.23/DJI_0907.JPG_clump_...,180,238,120.290741,22.469047,126.781513,13.440079,130.990406,36.180868
1,../Beach_Images/LS 2.13.23/DJI_0907.JPG_clump_...,222,154,112.792793,12.6754,121.071429,21.659138,127.206973,33.346414
2,../Beach_Images/LS 2.13.23/DJI_0907.JPG_clump_...,222,142,113.56006,12.063776,129.929577,16.708063,137.808939,36.100996
3,../Beach_Images/LS 2.13.23/DJI_0907.JPG_clump_...,204,200,107.465686,12.158423,116.411667,16.438741,120.157353,35.440961
4,../Beach_Images/LS 2.13.23/DJI_0907.JPG_clump_...,248,200,103.75,13.469175,108.841667,19.700167,123.94127,47.738503


In [290]:
# writing heuristics 
heuristics.to_csv('heuristics.csv', index=False)

Training the Random Forests Model

In [316]:
df_raw = pd.read_csv('seals_count_joined.csv')

df = df_raw.iloc[:669]
df.head()

Unnamed: 0,clump,Total Number of Seals,width,height,avg_r,sd_r,avg_g,sd_g,avg_b,sd_b
0,LS21323_DJI_0001_clump_0001,2.0,222,160,148.968468,8.861153,150.972917,10.472847,144.783333,35.448822
1,LS21323_DJI_0001_clump_0002,8.0,400,244,128.905,16.282485,133.304645,25.476204,125.685543,27.45159
2,LS21323_DJI_0001_clump_0003,9.0,324,314,129.433128,18.470493,148.5,10.585889,136.547643,32.810271
3,LS21323_DJI_0001_clump_0004,2.0,248,160,138.185484,11.64301,132.254167,17.457388,139.239592,29.105456
4,LS21323_DJI_0001_clump_0005,2.0,162,238,127.631687,9.237278,120.865546,9.84168,127.14724,25.30867


In [335]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split, cross_val_score

X = df.drop(columns=['Total Number of Seals', 'clump'])
y = df['Total Number of Seals']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train_knn = scaler.fit_transform(X_train)
X_test_knn = scaler.transform(X_test)

# Initialize Random Forest Regressor
rf_regressor = RandomForestRegressor(n_estimators=120, random_state=42)

# Train the model
rf_regressor.fit(X_train, y_train)

# Make predictions
y_pred = rf_regressor.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Squared Error: {mse:.2f}")
print(f"Mean Absolute Error: {mae:.2f}")

mae_scores = -cross_val_score(rf_regressor, X, y, cv=5, scoring='neg_mean_absolute_error')
print(f'Mean Absolute Error (Cross-Validation): {mae_scores.mean()}')

Mean Squared Error: 1.03
Mean Absolute Error: 0.60
Mean Absolute Error (Cross-Validation): 0.6183799049863464


In [336]:
from sklearn.neighbors import KNeighborsRegressor

model = KNeighborsRegressor(n_neighbors=3)
model.fit(X_train_knn, y_train)

# Predict using the model
y_pred_KNN = model.predict(X_test_knn)
mse = mean_squared_error(y_test, y_pred_KNN)
mae = mean_absolute_error(y_test, y_pred_KNN)
print(f"Mean Squared Error: {mse:.2f}")
print(f"Mean Absolute Error: {mae:.2f}")

mae_scores = -cross_val_score(model, X_scaled, y, cv=5, scoring='neg_mean_absolute_error')
print(f'Mean Absolute Error (Cross-Validation): {mae_scores.mean()}')

Mean Squared Error: 1.41
Mean Absolute Error: 0.65
Mean Absolute Error (Cross-Validation): 0.6063105525006547


In [339]:
y.value_counts()

Total Number of Seals
2.0     379
3.0      98
4.0      88
1.0      33
5.0      28
6.0      20
8.0       7
7.0       6
9.0       4
0.0       4
13.0      1
11.0      1
Name: count, dtype: int64

Now predict on new data

In [291]:
df_new_X = heuristics.drop(columns=['key'])

y_new_pred = rf_regressor.predict(df_new_X)

heuristics["Prediction"] = y_new_pred
heuristics.head()

Unnamed: 0,key,width,height,avg_r,sd_r,avg_g,sd_g,avg_b,sd_b,Prediction
0,../Beach_Images/LS 2.13.23/DJI_0907.JPG_clump_...,180,238,120.290741,22.469047,126.781513,13.440079,130.990406,36.180868,2.62
1,../Beach_Images/LS 2.13.23/DJI_0907.JPG_clump_...,222,154,112.792793,12.6754,121.071429,21.659138,127.206973,33.346414,2.25
2,../Beach_Images/LS 2.13.23/DJI_0907.JPG_clump_...,222,142,113.56006,12.063776,129.929577,16.708063,137.808939,36.100996,2.03
3,../Beach_Images/LS 2.13.23/DJI_0907.JPG_clump_...,204,200,107.465686,12.158423,116.411667,16.438741,120.157353,35.440961,2.29
4,../Beach_Images/LS 2.13.23/DJI_0907.JPG_clump_...,248,200,103.75,13.469175,108.841667,19.700167,123.94127,47.738503,3.3


In [292]:
heuristics.index = np.arange(0, len(heuristics))
for i in range(len(heuristics["key"])):
    name = heuristics["key"].iloc[i]
    heuristics.loc[i, "key"] = name[27:39]
heuristics

Unnamed: 0,key,width,height,avg_r,sd_r,avg_g,sd_g,avg_b,sd_b,Prediction
0,DJI_0907.JPG,180,238,120.290741,22.469047,126.781513,13.440079,130.990406,36.180868,2.62
1,DJI_0907.JPG,222,154,112.792793,12.675400,121.071429,21.659138,127.206973,33.346414,2.25
2,DJI_0907.JPG,222,142,113.560060,12.063776,129.929577,16.708063,137.808939,36.100996,2.03
3,DJI_0907.JPG,204,200,107.465686,12.158423,116.411667,16.438741,120.157353,35.440961,2.29
4,DJI_0907.JPG,248,200,103.750000,13.469175,108.841667,19.700167,123.941270,47.738503,3.30
...,...,...,...,...,...,...,...,...,...,...
9294,DJI_0922.JPG,120,78,110.447222,20.770639,129.495726,24.519876,112.499145,24.643236,1.35
9295,DJI_0922.JPG,246,84,121.032520,37.281308,117.746032,18.405027,139.501161,42.303972,2.98
9296,DJI_0922.JPG,214,68,119.933022,20.123765,124.416667,21.732282,119.136201,19.541037,1.33
9297,DJI_0922.JPG,94,118,119.918440,16.822722,122.039548,12.081812,113.674630,18.142096,1.29


In [293]:
df_grouped = heuristics.groupby('key')[['Prediction']].sum()
df_grouped

Unnamed: 0_level_0,Prediction
key,Unnamed: 1_level_1
DJI_0001.JPG,97.15
DJI_0002.JPG,92.75
DJI_0003.JPG,101.11
DJI_0004.JPG,72.95
DJI_0005.JPG,107.62
...,...
DJI_0995.JPG,160.00
DJI_0996.JPG,167.40
DJI_0997.JPG,145.29
DJI_0998.JPG,119.61


In [245]:
df_grouped[df_grouped['key'] == "DJI_0843.JPG"]

KeyError: 'key'

Compile Individuals Count

In [294]:
total_df = pd.DataFrame({'Filename': beach_imgs_paths, 'Individuals': num_seals})

total_df.index = np.arange(1, len(total_df) + 1)
for i in range(len(total_df["Filename"])):
    name = total_df["Filename"].iloc[i]
    total_df.loc[i, "Filename"] = name[27:]
total_df

Unnamed: 0,Filename,Individuals
1,DJI_0913.JPG,13.0
2,DJI_0898.JPG,20.0
3,DJI_0873.JPG,16.0
4,DJI_0867.JPG,13.0
5,DJI_0866.JPG,9.0
...,...,...
185,DJI_0895.JPG,7.0
186,DJI_0936.JPG,18.0
187,DJI_0922.JPG,14.0
188,../Beach_Images/LS 2.13.23/DJI_0922.JPG,36.0


In [301]:
total_df[total_df["Filename"] == "DJI_0840.JPG"]

Unnamed: 0,Filename,Individuals
165,DJI_0840.JPG,16.0


In [209]:
# empty set for recording 
total_count = pd.DataFrame({'clump': beach_imgs_paths,
                               'Individual Count': None,
                               'Clump Count': None
                               })

In [None]:
# writing empty set 
total_count.to_csv('seals_count.csv', index=False)