In [2]:
#try 2
import os
import shutil
import pandas as pd

metadata_path = '/users/PZS0694/krishsanghvi/.cache/kagglehub/competitions/animal-clef-2025/metadata.csv'
metadata_df = pd.read_csv(metadata_path)
metadata_df['path'] = metadata_df['path'].str.strip()

# Map from relative path to identity
path_to_identity = dict(zip(metadata_df['path'], metadata_df['identity']))

#/users/PZS0694/krishsanghvi/.cache/kagglehub/competitions/animal-clef-2025
species_paths = {
    'lynx':       '/users/PZS0694/krishsanghvi/.cache/kagglehub/competitions/animal-clef-2025/images/LynxID2025/database',
    'salamander': '/users/PZS0694/krishsanghvi/.cache/kagglehub/competitions/animal-clef-2025/images/SalamanderID2025/database/images',
    'turtle':     '/users/PZS0694/krishsanghvi/.cache/kagglehub/competitions/animal-clef-2025/images/SeaTurtleID2022/database/turtles-data/data/images'
}

new_dataset_path = 'training_data'
os.makedirs(new_dataset_path, exist_ok=True)

image_records = []

for species, src_path in species_paths.items():
    dst_path = os.path.join(new_dataset_path, species)
    os.makedirs(dst_path, exist_ok=True)

    if species == 'turtle':
        for subdir in os.listdir(src_path):
            full_subdir = os.path.join(src_path, subdir)
            if os.path.isdir(full_subdir):
                for img in os.listdir(full_subdir):
                    if img.lower().endswith(('.jpg', '.jpeg', '.png')):
                        rel_path = f"{subdir}/{img}"
                        identity_key = f"images/SeaTurtleID2022/database/turtles-data/data/images/{rel_path}"
                        identity = path_to_identity.get(identity_key)
                        if pd.isna(identity):
                            continue  # Skip images without identity
                        src_img_path = os.path.join(full_subdir, img)
                        dst_img_name = f"{subdir}_{img}"
                        dst_img_path = os.path.join(dst_path, dst_img_name)
                        shutil.copy(src_img_path, dst_img_path)
                        image_records.append([dst_img_path, identity])
    elif species == 'salamander':
        for img in os.listdir(src_path):
            if img.lower().endswith(('.jpg', '.jpeg', '.png')):
                rel_path = f"images/SalamanderID2025/database/images/{img}"
                identity = path_to_identity.get(rel_path)
                if pd.isna(identity):
                    continue
                src_img_path = os.path.join(src_path, img)
                dst_img_path = os.path.join(dst_path, img)
                shutil.copy(src_img_path, dst_img_path)
                image_records.append([dst_img_path, identity])

    else:
        for img in os.listdir(src_path):
            if img.lower().endswith(('.jpg', '.jpeg', '.png')):
                rel_path = f"images/LynxID2025/database/{img}"
                identity = path_to_identity.get(rel_path)

                if pd.isna(identity):
                    continue  # Skip images without identity
                src_img_path = os.path.join(src_path, img)
                dst_img_path = os.path.join(dst_path, img)
                shutil.copy(src_img_path, dst_img_path)
                image_records.append([dst_img_path, identity])

# Create DataFrame and save
df = pd.DataFrame(image_records, columns=['image_path', 'identity'])
df.to_csv('training_data/image_identity.csv', index=False)

# Print image counts
for species in os.listdir(new_dataset_path):
    species_path = os.path.join(new_dataset_path, species)
    if not os.path.isdir(species_path):
        continue
    count = len([f for f in os.listdir(species_path) if f.endswith(('.jpg', '.jpeg', '.png'))])
    print(f"{species}: {count} images")

print("\n✅ Saved identity metadata to training_data/image_identity.csv")


salamander: 1388 images
lynx: 2957 images
turtle: 2780 images

✅ Saved identity metadata to training_data/image_identity.csv


In [3]:
#try 2
import os
import random
import pandas as pd
from PIL import Image
from torchvision import transforms
from tqdm import tqdm

# === Configs ===
target_count = 30000
base_path = 'training_data'
output_csv = os.path.join(base_path, 'full_image_identity.csv')

# === Transform setup ===
augment_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
    transforms.ToTensor(),
])
to_pil = transforms.ToPILImage()

# === Load existing identity CSV created from your first script ===
df_existing = pd.read_csv(os.path.join(base_path, 'image_identity.csv'))
df_existing['image_path'] = df_existing['image_path'].str.strip()

# Map: image filename → identity
filename_to_identity = {
    os.path.basename(row['image_path']): row['identity']
    for _, row in df_existing.iterrows()
}

# Start record list with originals
final_records = df_existing.values.tolist()

# === Augmentation loop ===
for species in os.listdir(base_path):
    species_path = os.path.join(base_path, species)
    if not os.path.isdir(species_path):
        continue

    image_files = [f for f in os.listdir(species_path) if f.lower().endswith(('.jpg', '.jpeg', '.png'))]
    image_files = [f for f in image_files if filename_to_identity.get(f) is not None]
    current_count = len(image_files)

    print(f"{species}: {current_count} valid original images found.")

    if current_count >= target_count:
        print(f"✅ {species} already has enough images.")
        continue

    needed = target_count - current_count
    print(f"🔄 Augmenting {needed} images for {species}...")

    for i in tqdm(range(needed)):
        original = random.choice(image_files)
        original_path = os.path.join(species_path, original)
        identity = filename_to_identity[original]

        try:
            base_img = Image.open(original_path).convert('RGB')
            aug_img = augment_transform(base_img)
            aug_img_pil = to_pil(aug_img)

            aug_filename = f"aug_{i}_{original}"
            aug_path = os.path.join(species_path, aug_filename)
            aug_img_pil.save(aug_path)

            final_records.append([aug_path, identity])
        except Exception as e:
            print(f"⚠️ Skipped {original} due to error: {e}")

# === Save final [image_path, identity] CSV ===
df_final = pd.DataFrame(final_records, columns=['image_path', 'identity'])
df_final.to_csv(output_csv, index=False)

print(f"\n✅ Saved full dataset with originals and augmentations to {output_csv}")


salamander: 1388 valid original images found.
🔄 Augmenting 28612 images for salamander...


  0%|          | 0/28612 [00:00<?, ?it/s]  0%|          | 1/28612 [00:00<2:57:31,  2.69it/s]  0%|          | 2/28612 [00:05<26:11:46,  3.30s/it]  0%|          | 5/28612 [00:06<8:04:02,  1.02s/it]   0%|          | 6/28612 [00:07<9:29:17,  1.19s/it]  0%|          | 9/28612 [00:07<4:44:57,  1.67it/s]  0%|          | 12/28612 [00:12<8:00:42,  1.01s/it]  0%|          | 14/28612 [00:13<6:25:16,  1.24it/s]  0%|          | 15/28612 [00:13<6:07:16,  1.30it/s]  0%|          | 16/28612 [00:18<12:34:28,  1.58s/it]  0%|          | 17/28612 [00:20<12:37:38,  1.59s/it]  0%|          | 18/28612 [00:20<9:49:47,  1.24s/it]   0%|          | 20/28612 [00:22<9:54:07,  1.25s/it]  0%|          | 23/28612 [00:23<5:26:10,  1.46it/s]  0%|          | 27/28612 [00:23<2:59:30,  2.65it/s]  0%|          | 32/28612 [00:23<1:42:22,  4.65it/s]  0%|          | 38/28612 [00:23<1:01:30,  7.74it/s]  0%|          | 42/28612 [00:23<47:50,  9.95it/s]    0%|          | 46/28612 [00:23<37:34, 12.67it/s]  0%| 

lynx: 2957 valid original images found.
🔄 Augmenting 27043 images for lynx...


  0%|          | 0/27043 [00:00<?, ?it/s]  0%|          | 1/27043 [00:03<26:37:01,  3.54s/it]  0%|          | 2/27043 [00:07<26:31:25,  3.53s/it]  0%|          | 3/27043 [00:11<30:47:01,  4.10s/it]  0%|          | 4/27043 [00:16<31:43:44,  4.22s/it]  0%|          | 7/27043 [00:19<17:07:11,  2.28s/it]  0%|          | 8/27043 [00:22<18:28:36,  2.46s/it]  0%|          | 9/27043 [00:23<15:22:17,  2.05s/it]  0%|          | 10/27043 [00:23<12:17:50,  1.64s/it]  0%|          | 11/27043 [00:33<27:27:56,  3.66s/it]  0%|          | 18/27043 [00:33<7:28:53,  1.00it/s]   0%|          | 26/27043 [00:33<3:27:44,  2.17it/s]  0%|          | 31/27043 [00:33<2:23:50,  3.13it/s]  0%|          | 39/27043 [00:33<1:25:07,  5.29it/s]  0%|          | 44/27043 [00:33<1:04:40,  6.96it/s]  0%|          | 50/27043 [00:33<47:51,  9.40it/s]    0%|          | 59/27043 [00:33<30:33, 14.71it/s]  0%|          | 68/27043 [00:34<21:16, 21.13it/s]  0%|          | 76/27043 [00:34<16:40, 26.95it/s]  0%|  

turtle: 8729 valid original images found.
🔄 Augmenting 21271 images for turtle...


  0%|          | 0/21271 [00:00<?, ?it/s]  0%|          | 23/21271 [00:00<01:35, 222.35it/s]  0%|          | 48/21271 [00:00<01:31, 233.02it/s]  0%|          | 72/21271 [00:00<01:33, 227.01it/s]  0%|          | 95/21271 [00:00<01:37, 217.68it/s]  1%|          | 117/21271 [00:00<01:43, 204.52it/s]  1%|          | 142/21271 [00:00<01:37, 215.83it/s]  1%|          | 164/21271 [00:00<01:54, 183.84it/s]  1%|          | 184/21271 [00:00<02:13, 158.02it/s]  1%|          | 203/21271 [00:01<02:07, 165.35it/s]  1%|          | 221/21271 [00:01<02:11, 159.55it/s]  1%|          | 245/21271 [00:01<01:57, 179.15it/s]  1%|▏         | 268/21271 [00:01<01:49, 192.47it/s]  1%|▏         | 289/21271 [00:01<01:46, 196.68it/s]  1%|▏         | 310/21271 [00:01<01:53, 185.43it/s]  2%|▏         | 333/21271 [00:01<01:47, 194.29it/s]  2%|▏         | 357/21271 [00:01<01:47, 193.77it/s]  2%|▏         | 381/21271 [00:01<01:42, 204.24it/s]  2%|▏         | 402/21271 [00:02<01:47, 194.76it/s]  2%|▏  


✅ Saved full dataset with originals and augmentations to training_data/full_image_identity.csv


In [4]:
# Build [image_path, identity] mapping
base_path = 'training_data'
final_records = []
# Create lookup for image filename → identity
metadata_df['filename'] = metadata_df['path'].apply(lambda p: os.path.basename(p))
filename_to_identity = dict(zip(metadata_df['filename'], metadata_df['identity']))


# Build full [image_path, species] mapping
final_records = []

for species in os.listdir(base_path):
    species_path = os.path.join(base_path, species)
    if not os.path.isdir(species_path):
        continue

    for img_file in os.listdir(species_path):
        if img_file.lower().endswith(('.jpg', '.jpeg', '.png')):
            identity = filename_to_identity.get(img_file, None)
            if identity is not None:  # only include labeled identities
                img_path = os.path.join(species_path, img_file)
                final_records.append([img_path, identity])

# Save to CSV
df_all = pd.DataFrame(final_records, columns=['image_path', 'identity'])
df_all.to_csv(os.path.join(base_path, 'full_image_identity.csv'), index=False)

print(f"\n✅ Saved full dataset to {os.path.join(base_path, 'full_image_identity.csv')}")



✅ Saved full dataset to training_data/full_image_identity.csv
