In [1]:
import os
from PIL import Image
import numpy as np
from collections import Counter
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split

# 1. Load and Check Images
def load_images(image_paths):
    """Open every readable image in ``image_paths`` and return RGB copies.

    Args:
        image_paths: Iterable of filesystem paths to try to open.

    Returns:
        list: RGB-converted images for the paths that could be opened and
        decoded, in input order. Paths that fail are reported on stdout and
        skipped, so the result may be shorter than ``image_paths`` and its
        positions do not necessarily line up with the input.
    """
    valid_images = []
    for path in image_paths:
        try:
            # Image.open is lazy and keeps the file open; the context manager
            # releases the handle once convert() has produced its own copy.
            with Image.open(path) as img:
                valid_images.append(img.convert("RGB"))
        except Exception as e:
            # Best-effort loading: report the unreadable file and carry on.
            print(f"Error with image {path}: {e}")
    return valid_images

# 2. Analyze Distribution
def analyze_distribution(labels):
    """Show a bar chart of how many times each label occurs in ``labels``.

    Args:
        labels: Iterable of hashable class labels, one entry per sample.
    """
    tally = Counter(labels)
    class_names = tally.keys()
    class_sizes = tally.values()

    plt.bar(class_names, class_sizes)
    plt.title("Class Distribution")
    plt.show()

# 3. Detect Duplicates
def detect_duplicates(images):
    """Return the indices of images whose pixel data also appears elsewhere.

    Args:
        images: Sequence of objects exposing ``tobytes()`` (e.g. PIL images).

    Returns:
        list[int]: Ascending indices of every image whose ``tobytes()`` hash
        is shared with at least one other image. All members of a duplicate
        group are reported, including the first occurrence. Empty when
        nothing repeats.
    """
    # Keep one integer per image instead of the raw byte strings.
    hashes = [hash(img.tobytes()) for img in images]
    # Tally once up front; calling hashes.count() per element is quadratic.
    occurrences = Counter(hashes)
    return [i for i, h in enumerate(hashes) if occurrences[h] > 1]

# 4. Visualize Random Samples
def visualize_samples(images, labels, n=5):
    """Display up to ``n`` randomly chosen images, each titled with its label.

    Args:
        images: Sequence of images accepted by ``plt.imshow``.
        labels: Sequence indexed in parallel with ``images``; ``labels[i]`` is
            used as the title for ``images[i]``.
        n: Maximum number of distinct images to show. Defaults to 5.
    """
    # Sampling without replacement raises ValueError when asked for more items
    # than exist, so cap the request at the number of available images.
    sample_size = min(n, len(images))
    if sample_size <= 0:
        return

    indices = np.random.choice(len(images), sample_size, replace=False)
    for i in indices:
        plt.imshow(images[i])
        plt.title(labels[i])
        plt.show()


In [2]:
import os

# Path to your dataset: every sub-folder is treated as one class.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
dataset_path = "/Users/leonardo/Library/CloudStorage/Box-Box/CryptoVision/Data/fish_functions/Species_v03"

# Load image paths and labels (parallel lists: labels[i] belongs to image_paths[i])
image_paths = []
labels = []

# sorted() keeps the order reproducible; os.listdir returns entries in arbitrary order.
for class_folder in sorted(os.listdir(dataset_path)):
    class_folder_path = os.path.join(dataset_path, class_folder)
    if not os.path.isdir(class_folder_path):
        continue
    for image_file in sorted(os.listdir(class_folder_path)):
        image_path = os.path.join(class_folder_path, image_file)
        # Skip hidden entries (e.g. .DS_Store) and nested directories so they
        # are not counted as images of this class.
        if image_file.startswith(".") or not os.path.isfile(image_path):
            continue
        image_paths.append(image_path)
        labels.append(class_folder)  # Label is the folder name


In [3]:
# Load one path at a time so each decoded image can be paired with its label.
# load_images() drops unreadable files, which would otherwise leave `images`
# shorter than `labels` and shift every later image onto the wrong label.
images = []
image_labels = []  # image_labels[i] is the class of images[i]
for path, label in zip(image_paths, labels):
    loaded = load_images([path])
    if loaded:
        images.append(loaded[0])
        image_labels.append(label)

print(f"Loaded {len(images)} valid images out of {len(image_paths)}.")


: 

In [None]:
# Bar chart of how many collected image paths fall under each class folder.
analyze_distribution(labels)

In [1]:
import pandas as pd

# Read the spreadsheet into a DataFrame.
# NOTE(review): absolute path into a local Downloads folder; confirm it exists
# on the machine running this notebook.
df = pd.read_excel('/Users/leonardo/Downloads/LIRS23_CryptoFishCollections_revised.xlsx')

# Preview the first ten rows.
df.head(10)

Unnamed: 0,Date,Site,Fish_ID,Family,Genus,Species_ID,TL_mm,WT_g,Storage,Notes,Guide,KJ thoughts,Unnamed: 12
0,2023-10-30,Cook's Path,LIRS23_0001,Pomacentridae,Neopomacentrus,Neopomacentrus_cyanomos,59.21,2.8601,Yellow Jar A,,Y,,
1,2023-10-30,Cook's Path,LIRS23_0002,Pseudochromidae,Pseudochromis,Pseudochromis_flammicauda,32.81,0.3715,Box 1,,Y,,
2,2023-10-30,Cook's Path,LIRS23_0003,Pseudochromidae,Pseudochromis,Pseudochromis_wilsoni,24.61,0.1709,Box 1,,Y,was thought as cyanotaenia but now wilsoni,
3,2023-10-30,Cook's Path,LIRS23_0004,Plesiopidae,Plesiops,Plesiops_coeruleolineatus,21.34,0.1319,Box 1,,Y,,
4,2023-10-30,Cook's Path,LIRS23_0005,Gobiidae,Gobiodon,Gobiodon_brochus,25.06,0.3212,Box 1,,Y,,
5,2023-10-30,Cook's Path,LIRS23_0006,Gobiidae,Gobiodon,Gobiodon_brochus,24.7,0.2761,Box 1,,Y,,
6,2023-10-30,Cook's Path,LIRS23_0007,Gobiidae,Eviota,Eviota_distigma,25.97,0.1655,Box 1,,Y,,
7,2023-10-30,Cook's Path,LIRS23_0008,Gobiidae,Gobiodon,Gobiodon_oculolineatus,20.56,0.1761,Box 1,,Y,,
8,2023-10-30,Cook's Path,LIRS23_0009,Pomacentridae,Pomacentrus,Pomacentrus_adelus,28.63,0.3539,Box 1,,Y,,
9,2023-10-30,Cook's Path,LIRS23_0010,Pomacentridae,Pomacentrus,Pomacentrus_adelus,19.93,0.1305,Box 1,,Y,,


In [10]:
# Inspect the first spreadsheet row as a Series. Read it from `df` directly:
# the name `data` is only bound by the row loop further down, so a bare `data`
# here fails on a fresh kernel.
df.iloc[0]

Date               2023-10-30 00:00:00
Site                       Cook's Path
Fish_ID                    LIRS23_0001
Family                   Pomacentridae
Genus                   Neopomacentrus
Species_ID     Neopomacentrus_cyanomos
TL_mm                            59.21
WT_g                            2.8601
Storage                   Yellow Jar A
Notes                              NaN
Guide                                Y
KJ thoughts                        NaN
Unnamed: 12                        NaN
Name: 0, dtype: object

In [12]:
import shutil
import os 

# NOTE(review): absolute paths on an external volume; the drive must be mounted.
source_dir = '/Volumes/T7_shield/CryptoVision/Data/others/hemingson_photos/resized'
new_dir = '/Volumes/T7_shield/CryptoVision/Data/others/hemingson_photos/LIRS23_organized'
os.makedirs(new_dir, exist_ok=True)

# Columns needed to locate a photo and to name its destination folder.
required_columns = ['Family', 'Species_ID', 'Fish_ID']
missing_photos = []

# Copy each catalogued photo into a "<Family>_<Species_ID>" folder.
# Rows lacking any required value are skipped: NaN would otherwise produce
# "nan_..." folders or fail when building the file name.
for index, data in df.dropna(subset=required_columns).iterrows():

    folder_id = f"{data['Family']}_{data['Species_ID']}"
    image_name = f"{data['Fish_ID']}.jpeg"
    old_image_path = os.path.join(source_dir, image_name)

    # A spreadsheet entry without a photo should not abort the whole run.
    if not os.path.isfile(old_image_path):
        missing_photos.append(data['Fish_ID'])
        continue

    new_path = os.path.join(new_dir, folder_id)
    os.makedirs(new_path, exist_ok=True)
    new_image_path = os.path.join(new_path, image_name)

    # copyfile leaves the original in place, so report a copy, not a move.
    shutil.copyfile(old_image_path, new_image_path)

    print(f"Copied {data['Fish_ID']} to {new_image_path}")

if missing_photos:
    print(f"Skipped {len(missing_photos)} entries with no source photo: {missing_photos}")

Moved LIRS23_0001 to /Volumes/T7_shield/CryptoVision/Data/others/hemingson_photos/LIRS23_organized/Pomacentridae_Neopomacentrus_cyanomos/LIRS23_0001.jpeg
Moved LIRS23_0002 to /Volumes/T7_shield/CryptoVision/Data/others/hemingson_photos/LIRS23_organized/Pseudochromidae_Pseudochromis_flammicauda/LIRS23_0002.jpeg
Moved LIRS23_0003 to /Volumes/T7_shield/CryptoVision/Data/others/hemingson_photos/LIRS23_organized/Pseudochromidae_Pseudochromis_wilsoni/LIRS23_0003.jpeg
Moved LIRS23_0004 to /Volumes/T7_shield/CryptoVision/Data/others/hemingson_photos/LIRS23_organized/Plesiopidae_Plesiops_coeruleolineatus/LIRS23_0004.jpeg
Moved LIRS23_0005 to /Volumes/T7_shield/CryptoVision/Data/others/hemingson_photos/LIRS23_organized/Gobiidae_Gobiodon_brochus/LIRS23_0005.jpeg
Moved LIRS23_0006 to /Volumes/T7_shield/CryptoVision/Data/others/hemingson_photos/LIRS23_organized/Gobiidae_Gobiodon_brochus/LIRS23_0006.jpeg
Moved LIRS23_0007 to /Volumes/T7_shield/CryptoVision/Data/others/hemingson_photos/LIRS23_organiz

In [7]:
# Family of the first spreadsheet row, read from `df` directly. The loop
# variable `data` holds whichever row the copy loop processed last, so its
# value depends on execution order.
df.iloc[0]['Family']

'Pomacentridae'