In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
# Both input tables are tab-separated text files.

# Media records (image links etc.).
multimedia_df = pd.read_csv(
    'full_dataset/multimedia.txt',
    sep='\t',
)

# Occurrence records. low_memory=False makes pandas infer each column's dtype
# from the whole file instead of chunk by chunk (avoids mixed-dtype columns).
occurrence_df = pd.read_csv(
    'full_dataset/occurrence.txt',
    sep='\t',
    low_memory=False,
)

In [3]:
# Preview the first two multimedia rows to see which columns are available.
multimedia_df.head(2)

Unnamed: 0,gbifID,type,format,identifier,references,title,description,source,audience,created,creator,contributor,publisher,license,rightsHolder
0,1038336887,StillImage,image/jpeg,https://inaturalist-open-data.s3.amazonaws.com...,https://www.inaturalist.org/photos/1290334,,,,,2014-10-15T12:44Z,jenedson,,iNaturalist,http://creativecommons.org/licenses/by-nc/4.0/,jenedson
1,1052574205,StillImage,image/jpeg,https://inaturalist-open-data.s3.amazonaws.com...,https://www.inaturalist.org/photos/1334856,,,,,2014-11-09T10:51:54Z,"Mella, Pete",,iNaturalist,http://creativecommons.org/licenses/by-nc-nd/4.0/,"Mella, Pete"


In [4]:
# Preview the first two occurrence rows to see which columns are available.
occurrence_df.head(2)

Unnamed: 0,gbifID,accessRights,bibliographicCitation,language,license,modified,publisher,references,rightsHolder,type,...,publishedByGbifRegion,level0Gid,level0Name,level1Gid,level1Name,level2Gid,level2Name,level3Gid,level3Name,iucnRedListCategory
0,3054616573,,,,CC_BY_4_0,,Centre for Environmental Data and Recording,,CEDaR,,...,EUROPE,GBR,United Kingdom,GBR.2_1,Northern Ireland,GBR.2.5_1,Derry City and Strabane,GBR.2.5.2_1,Strabane,NE
1,3054616603,,,,CC_BY_4_0,,Centre for Environmental Data and Recording,,CEDaR,,...,EUROPE,GBR,United Kingdom,GBR.2_1,Northern Ireland,GBR.2.9_1,Mid Ulster,GBR.2.9.3_1,Magherafelt,NE


In [5]:
# Keep only still-image media rows, and of those only the occurrence id
# (gbifID) and the media link (identifier).
is_still_image = multimedia_df['type'] == 'StillImage'
images_df = multimedia_df[is_still_image][['gbifID', 'identifier']]

In [6]:
# Display the filtered image table (pandas abbreviates long frames to the
# first and last rows). The same gbifID can appear on several rows.
images_df

Unnamed: 0,gbifID,identifier
0,1038336887,https://inaturalist-open-data.s3.amazonaws.com...
1,1052574205,https://inaturalist-open-data.s3.amazonaws.com...
2,1098893451,https://inaturalist-open-data.s3.amazonaws.com...
3,1098916928,https://inaturalist-open-data.s3.amazonaws.com...
4,1098916928,https://inaturalist-open-data.s3.amazonaws.com...
...,...,...
86255,891123761,https://inaturalist-open-data.s3.amazonaws.com...
86256,891123761,https://inaturalist-open-data.s3.amazonaws.com...
86257,891123761,https://inaturalist-open-data.s3.amazonaws.com...
86258,891123761,https://inaturalist-open-data.s3.amazonaws.com...


In [7]:
# Attach every image link to its occurrence record via the shared gbifID key.
# The join is an inner join (the pd.merge default), so occurrences without a
# matching image row, and image rows without a matching occurrence, are left out.
merged_df = pd.merge(
    left=occurrence_df,
    right=images_df,
    how='inner',
    on='gbifID',
)

# Preview the merged table
merged_df.head()

Unnamed: 0,gbifID,accessRights,bibliographicCitation,language,license,modified,publisher,references,rightsHolder,type,...,level0Gid,level0Name,level1Gid,level1Name,level2Gid,level2Name,level3Gid,level3Name,iucnRedListCategory,identifier
0,3873668190,,,,CC_BY_NC_4_0,2022-07-12T20:11:33Z,iNaturalist.org,https://www.inaturalist.org/observations/12597...,Allie_Dav,,...,GBR,United Kingdom,GBR.1_1,England,GBR.1.70_1,Peterborough,GBR.1.70.1_1,Peterborough,NE,https://inaturalist-open-data.s3.amazonaws.com...
1,3902924915,,,,CC0_1_0,2022-08-25T16:37:55Z,iNaturalist.org,https://www.inaturalist.org/observations/13228...,Sarah Adamson,,...,GBR,United Kingdom,GBR.3_1,Scotland,GBR.3.12_1,City of Edinburgh,GBR.3.12.1_1,Edinburgh,NE,https://inaturalist-open-data.s3.amazonaws.com...
2,3415453004,,,,CC_BY_NC_4_0,2024-03-13T17:30:30Z,iNaturalist.org,https://www.inaturalist.org/observations/10059...,rmhall,,...,GBR,United Kingdom,GBR.1_1,England,GBR.1.63_1,North Yorkshire,GBR.1.63.5_1,Ryedale,NE,https://inaturalist-open-data.s3.amazonaws.com...
3,4465704476,,,,CC0_1_0,2024-04-19T12:13:04Z,iNaturalist.org,https://www.inaturalist.org/observations/19311...,Jane Concannon,,...,GBR,United Kingdom,GBR.1_1,England,GBR.1.88_1,South Gloucestershire,GBR.1.88.1_1,South Gloucestershire,,https://inaturalist-open-data.s3.amazonaws.com...
4,4891738008,,,,CC_BY_4_0,2024-06-13T09:14:57Z,iNaturalist.org,https://www.inaturalist.org/observations/15748...,Paul Cook,,...,GBR,United Kingdom,GBR.1_1,England,GBR.1.36_1,Greater London,GBR.1.36.27_1,Richmond upon Thames,NE,https://inaturalist-open-data.s3.amazonaws.com...


In [8]:
# Keep just the taxon names and the image link, and expose the link as 'url'.
# Columns that were listed but disabled in this selection: gbifID, countryCode,
# eventDate, decimalLatitude, decimalLongitude.
selected_columns = ['scientificName', 'species', 'identifier']
final_df = merged_df[selected_columns].rename(columns={'identifier': 'url'})

# Preview the resulting table
final_df.head()

Unnamed: 0,scientificName,species,url
0,Subcoccinella vigintiquatuorpunctata (Linnaeus...,Subcoccinella vigintiquatuorpunctata,https://inaturalist-open-data.s3.amazonaws.com...
1,"Coccinella septempunctata Linnaeus, 1758",Coccinella septempunctata,https://inaturalist-open-data.s3.amazonaws.com...
2,"Harmonia axyridis (Pallas, 1773)",Harmonia axyridis,https://inaturalist-open-data.s3.amazonaws.com...
3,Harmonia axyridis f. conspicua,Harmonia axyridis,https://inaturalist-open-data.s3.amazonaws.com...
4,"Stethorus pusillus (Herbst, 1797)",Stethorus pusillus,https://inaturalist-open-data.s3.amazonaws.com...


In [9]:
# Remove rows whose url points at the data.nhm.ac.uk media endpoint.
print(f"Number of rows before removing data.nhm.ac.uk: {final_df.shape[0]}")
# na=False makes a missing url count as "does not start with the prefix".
# Without it, str.startswith yields NaN for missing values and the `~`
# negation fails on the resulting object Series. Rows with a missing url are
# therefore kept at this step.
is_nhm_url = final_df['url'].str.startswith('https://data.nhm.ac.uk/media', na=False)
final_df = final_df[~is_nhm_url]
print(f"Number of rows after removing data.nhm.ac.uk: {final_df.shape[0]}")

Number of rows before removing data.nhm.ac.uk: 86260
Number of rows after removing data.nhm.ac.uk: 85961


In [10]:
# Discard every row that has a missing value in any of the remaining columns,
# reporting the table shape before and after.
print(f'Shape before dropping nulls: {final_df.shape}')
final_df = final_df.dropna(axis=0, how='any')
print(f'Shape after dropping nulls: {final_df.shape}')

Shape before dropping nulls: (85961, 3)
Shape after dropping nulls: (85942, 3)


In [11]:
# Drop species that have too few images.
# A species is kept only if it has at least MIN_IMAGES_PER_SPECIES rows
# (11, i.e. strictly more than 10 images). The comment and the printed
# message are derived from this one constant so they cannot drift from the
# filter again.
MIN_IMAGES_PER_SPECIES = 11
species_counts = final_df['species'].value_counts()
species_counts = species_counts[species_counts >= MIN_IMAGES_PER_SPECIES]
final_df = final_df[final_df['species'].isin(species_counts.index)]
print(f'Shape after dropping species with fewer than {MIN_IMAGES_PER_SPECIES} images: {final_df.shape}')

Shape after dropping species with less than 10 images: (85883, 3)


In [12]:
# Convert species names to integer class labels.
label_encoder = LabelEncoder()
# final_df is the result of boolean filtering, so assigning a column to it in
# place can trigger pandas' SettingWithCopyWarning. assign() returns a new
# frame with the extra column instead, and re-running the cell gives the same
# result.
final_df = final_df.assign(label=label_encoder.fit_transform(final_df['species']))
labels = final_df['label']  # Integer labels aligned with final_df's rows

In [13]:
# Hold out 20% of the rows as the test set. The split is stratified on the
# species column and uses a fixed seed so it can be reproduced.
train_df, test_df = train_test_split(
    final_df,
    test_size=0.2,
    stratify=final_df['species'],
    random_state=42,
)

In [14]:
# Report how many rows (one per image url) ended up in each split.
n_train = len(train_df)
n_test = len(test_df)
print(f'Train set: {n_train} images')
print(f'Test set: {n_test} images')

Train set: 68706 images
Test set: 17177 images


In [15]:
# Write the train split, the test split and the full table to CSV files
# (in that order), without the DataFrame index column.
for csv_name, frame in (
    ('ladybird_train.csv', train_df),
    ('ladybird_test.csv', test_df),
    ('ladybird_full.csv', final_df),
):
    frame.to_csv(csv_name, index=False)

# Small datasets

In [16]:
# Renumber both splits 0..n-1, discarding the index inherited from final_df.
train_df, test_df = (
    frame.reset_index(drop=True) for frame in (train_df, test_df)
)

In [17]:
# Build small train/test subsets that keep a random 1/reduction_factor
# fraction of the rows of each split.
reduction_factor = 10
np.random.seed(42)  # fixed seed so the same rows are drawn on every run

# Draw distinct random row positions for the small train and small test sets
idx_train = np.random.choice(len(train_df), round(len(train_df)/reduction_factor), replace=False)
idx_test = np.random.choice(len(test_df), round(len(test_df)/reduction_factor), replace=False)

# np.random.choice(len(df), ...) yields row *positions*, so select with the
# positional indexer .iloc. The label-based .loc only gives the same rows when
# the index happens to be exactly 0..n-1.
# NOTE: the draw is uniform, not stratified, so rare species may end up with
# very few (or no) rows in the small sets.
train_df_small = train_df.iloc[idx_train].reset_index(drop=True)
test_df_small = test_df.iloc[idx_test].reset_index(drop=True)


print("------------------ Before -----------------")
print(train_df.shape, test_df.shape)

print("")

print("--- After applying the reduction factor ---")
print(train_df_small.shape, test_df_small.shape)

print("")
print("-"*43)

# Number of rows per species in the small train set
unique, counts = np.unique(train_df_small['species'], return_counts=True)
dict(zip(unique, counts))

------------------ Before -----------------
(68706, 4) (17177, 4)

--- After applying the reduction factor ---
(6871, 4) (1718, 4)

-------------------------------------------


{'Adalia bipunctata': 340,
 'Adalia decempunctata': 244,
 'Anatis ocellata': 38,
 'Anisosticta novemdecimpunctata': 18,
 'Aphidecta obliterata': 26,
 'Brumus quadripustulatus': 107,
 'Calvia quatuordecimguttata': 159,
 'Chilocorus bipustulatus': 5,
 'Chilocorus renipustulatus': 51,
 'Coccidula rufa': 5,
 'Coccidula scutellata': 3,
 'Coccinella hieroglyphica': 2,
 'Coccinella magnifica': 1,
 'Coccinella septempunctata': 1871,
 'Coccinella undecimpunctata': 16,
 'Halyzia sedecimguttata': 249,
 'Harmonia axyridis': 2654,
 'Harmonia quadripunctata': 34,
 'Henosepilachna argus': 8,
 'Hippodamia variegata': 27,
 'Myrrha octodecimguttata': 5,
 'Myzia oblongoguttata': 10,
 'Nephus quadrimaculatus': 6,
 'Propylaea quatuordecimpunctata': 391,
 'Psyllobora vigintiduopunctata': 177,
 'Rhyzobius chrysomeloides': 129,
 'Rhyzobius forestieri': 5,
 'Rhyzobius litura': 11,
 'Rhyzobius lophanthae': 5,
 'Scymnus auritus': 11,
 'Scymnus interruptus': 7,
 'Scymnus suturalis': 3,
 'Stethorus pusillus': 35,


In [18]:
# Write the reduced train and test subsets to CSV files (train first),
# without the DataFrame index column.
for csv_name, frame in (
    ('ladybird_train_small.csv', train_df_small),
    ('ladybird_test_small.csv', test_df_small),
):
    frame.to_csv(csv_name, index=False)