In [1]:
import pandas as pd
import random
from watermark_transform import AddWatermark
from torchvision.transforms.functional import to_tensor, to_pil_image
import math
import os
import shutil
from tqdm import tqdm
from PIL import Image
import numpy as np

In [2]:
# Load the per-image metadata table. The split-building cell below reads its
# `split`, `y`, `place`, and `img_filename` columns.
df = pd.read_csv(filepath_or_buffer="metadata.csv")

In [3]:
# Build the watermark transform once; the split-building cell applies it to
# image tensors after resizing them to 256x256.
watermark_options = {
    "image_size": 256,  # Adjust based on your image size
    "font_path": "fonts/SourceHanSerifSC-ExtraLight.otf",  # Adjust the font path if necessary
}
watermarker = AddWatermark(**watermark_options)

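Watermarked : non-watermarked ratio for waterbirds in each output split (the ratio is inverted for landbirds); these correspond to the per-split watermark fractions 0.05 / 0.05 / 0.5 used in the cell below:

- train: 19:1
- val: 19:1
- test: 5:5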

In [8]:
# Build the watermarked train/test/val image folders from the metadata table.
#
# For every (split, class, place) group this cell:
#   1. optionally subsamples the group (landbird down-sampling, validation skew),
#   2. picks a share of the group to watermark,
#   3. writes every image, resized to 256x256, to
#      "<split>/<classname>/<place>_<O|X>_<basename>"  (O = watermarked, X = not).


def _export_images(file_names, dest_dir, place, tag, transform=None):
    """Resize images to 256x256, optionally transform them, and save them.

    Args:
        file_names: values of the `img_filename` column, relative to the
            "waterbird_complete95_forest2water2" directory.
        dest_dir: output directory; created if there is at least one file.
        place: background label embedded in the output file name.
        tag: "O" for watermarked output, "X" for plain output; also used in
            the progress-bar description.
        transform: optional callable applied to the image tensor before
            saving. When None the resized image is saved unchanged.
    """
    if file_names:
        # One directory per (split, class); no need to re-create it per image.
        os.makedirs(dest_dir, exist_ok=True)
    for file_name in tqdm(file_names, desc=f"watermark {tag}"):
        image_path = f"waterbird_complete95_forest2water2/{file_name}"
        dest_path = f"{dest_dir}/{place}_{tag}_{os.path.basename(image_path)}"
        # `resize` returns an independent image, so the source file handle can
        # be released immediately instead of leaking one handle per image.
        with Image.open(image_path) as source_image:
            pil_image = source_image.resize((256, 256))
        if transform is not None:
            pil_image = to_pil_image(transform(to_tensor(pil_image)))
        pil_image.save(dest_path)


# Dedicated, seeded generator so the watermark assignment is reproducible on a
# fresh kernel, consistent with the `random_state=0` used for pandas sampling.
watermark_rng = random.Random(0)

# NOTE(review): split index 1 is written to "test" and index 2 to "val".
# Confirm this matches the encoding of the `split` column in metadata.csv.
for idx, (split, ratio) in enumerate(zip(["train", "test", "val"], [0.05, 0.5, 0.05])):
    split_df = df[df.split == idx]
    print(f"Processing {split} set with ratio {ratio}")
    landbird_length = len(split_df[split_df.y == 0])
    waterbird_length = len(split_df[split_df.y == 1])
    # Guard against a split with no landbirds; 1.0 disables the down-sampling below.
    frac = waterbird_length / landbird_length if landbird_length else 1.0
    print(f"landbird: {landbird_length}, waterbird: {waterbird_length}, frac: {frac}")
    for class_index, classname in enumerate(["landbird", "waterbird"]):
        # Landbirds are watermarked with probability `ratio`, waterbirds with
        # `1 - ratio`. Computed per class instead of mutating `ratio`, so the
        # result does not depend on the class iteration order.
        class_ratio = 1 - ratio if classname == "waterbird" else ratio
        for place_index, place in enumerate(["land", "water"]):
            group_df = split_df[(split_df.y == class_index) & (split_df.place == place_index)]
            if classname == "landbird" and frac < 0.8:
                # Down-sample landbird groups by the waterbird/landbird ratio.
                group_df = group_df.sample(frac=frac, random_state=0)
            if split == "val" and class_index != place_index:  # for skewing the validation set from 5:5 to 1:19
                group_df = group_df.sample(frac=0.05, random_state=0)
            print(f"label={classname}, place={place}, count={len(group_df)}")

            # Choose which rows receive the watermark (rounded up).
            indices = watermark_rng.sample(
                group_df.index.tolist(), math.ceil(len(group_df) * class_ratio)
            )
            watermarked = set(indices)
            other_indices = [i for i in group_df.index if i not in watermarked]

            dest_dir = f"{split}/{classname}"
            _export_images(
                group_df.loc[indices, "img_filename"].tolist(),
                dest_dir,
                place,
                "O",
                transform=watermarker,
            )
            # process other files
            _export_images(
                group_df.loc[other_indices, "img_filename"].tolist(),
                dest_dir,
                place,
                "X",
            )

            # print result
            print(f"Watermarked: {len(indices)}, Other: {len(other_indices)}")
        
        

watermark O:   0%|          | 0/53 [00:00<?, ?it/s]

watermark O:   2%|▏         | 1/53 [00:00<00:06,  7.52it/s]

Processing train set with ratio 0.05
landbird: 3682, waterbird: 1113, frac: 0.30228136882129275
label=landbird, place=land, count=1057


watermark O: 100%|██████████| 53/53 [00:05<00:00,  9.51it/s]
watermark X: 100%|██████████| 1004/1004 [00:10<00:00, 92.50it/s]
watermark O:  33%|███▎      | 1/3 [00:00<00:00,  7.20it/s]

Watermarked: 53, Other: 1004
label=landbird, place=water, count=56


watermark O: 100%|██████████| 3/3 [00:00<00:00,  7.15it/s]
watermark X: 100%|██████████| 53/53 [00:00<00:00, 105.10it/s]
watermark O:   2%|▏         | 1/54 [00:00<00:07,  7.23it/s]

Watermarked: 3, Other: 53
label=waterbird, place=land, count=56


watermark O: 100%|██████████| 54/54 [00:06<00:00,  8.93it/s]
watermark X: 100%|██████████| 2/2 [00:00<00:00, 103.51it/s]
watermark O:   0%|          | 2/1005 [00:00<00:51, 19.53it/s]

Watermarked: 54, Other: 2
label=waterbird, place=water, count=1057


watermark O: 100%|██████████| 1005/1005 [01:41<00:00,  9.85it/s]
watermark X: 100%|██████████| 52/52 [00:00<00:00, 108.90it/s]
watermark O:   1%|▏         | 1/67 [00:00<00:09,  6.77it/s]

Watermarked: 1005, Other: 52
Processing test set with ratio 0.5
landbird: 933, waterbird: 266, frac: 0.28510182207931406
label=landbird, place=land, count=133


watermark O: 100%|██████████| 67/67 [00:06<00:00,  9.93it/s]
watermark X: 100%|██████████| 66/66 [00:00<00:00, 94.88it/s]
watermark O:   1%|▏         | 1/67 [00:00<00:07,  8.83it/s]

Watermarked: 67, Other: 66
label=landbird, place=water, count=133


watermark O: 100%|██████████| 67/67 [00:07<00:00,  9.46it/s]
watermark X: 100%|██████████| 66/66 [00:00<00:00, 102.12it/s]
watermark O:   1%|▏         | 1/67 [00:00<00:07,  8.83it/s]

Watermarked: 67, Other: 66
label=waterbird, place=land, count=133


watermark O: 100%|██████████| 67/67 [00:06<00:00, 10.58it/s]
watermark X: 100%|██████████| 66/66 [00:00<00:00, 101.09it/s]
watermark O:   3%|▎         | 2/67 [00:00<00:05, 12.28it/s]

Watermarked: 67, Other: 66
label=waterbird, place=water, count=133


watermark O: 100%|██████████| 67/67 [00:07<00:00,  9.18it/s]
watermark X: 100%|██████████| 66/66 [00:00<00:00, 105.94it/s]
watermark O:   3%|▎         | 1/33 [00:00<00:03,  9.70it/s]

Watermarked: 67, Other: 66
Processing val set with ratio 0.05
landbird: 4510, waterbird: 1284, frac: 0.2847006651884701
label=landbird, place=land, count=642


watermark O: 100%|██████████| 33/33 [00:03<00:00,  8.58it/s]
watermark X: 100%|██████████| 609/609 [00:06<00:00, 99.88it/s] 
watermark O:  50%|█████     | 1/2 [00:00<00:00,  7.55it/s]

Watermarked: 33, Other: 609
label=landbird, place=water, count=32


watermark O: 100%|██████████| 2/2 [00:00<00:00,  7.56it/s]
watermark X: 100%|██████████| 30/30 [00:00<00:00, 97.18it/s]
watermark O:   3%|▎         | 1/31 [00:00<00:03,  7.95it/s]

Watermarked: 2, Other: 30
label=waterbird, place=land, count=32


watermark O: 100%|██████████| 31/31 [00:04<00:00,  7.63it/s]
watermark X: 100%|██████████| 1/1 [00:00<00:00, 94.07it/s]
watermark O:   0%|          | 1/610 [00:00<01:19,  7.70it/s]

Watermarked: 31, Other: 1
label=waterbird, place=water, count=642


watermark O: 100%|██████████| 610/610 [01:04<00:00,  9.41it/s]
watermark X: 100%|██████████| 32/32 [00:00<00:00, 95.58it/s]

Watermarked: 610, Other: 32



