In [24]:
import os 
import dotenv
from glob import glob
import httpx

dotenv.load_dotenv()

True

In [25]:
# Supabase credentials and the target bucket name are read from the environment
# (dotenv.load_dotenv() ran in the import cell). A missing variable raises KeyError.
api_key = os.environ['COMU_SUPABASE_API_KEY']
proj_url = os.environ['COMU_SUPABASE_URL']
bucket_name = os.environ['COMU_BUCKET_NAME']

In [None]:
from supabase import create_client, ClientOptions, Client
import json

def init_supabase(url: str, key: str, timeout: float = 180.0) -> Client:
    """
    Initialize a Supabase client whose HTTP layer uses a custom timeout.

    Args:
        url: Supabase project URL passed to ``create_client``.
        key: Supabase API key passed to ``create_client``.
        timeout: Value handed to ``httpx.Timeout`` for the underlying
            ``httpx.Client``. Defaults to 180.0, the value that used to be
            hard-coded here.

    Returns:
        The client object returned by ``create_client``.
    """
    # A dedicated httpx client is supplied so the timeout applies to every
    # request made through this Supabase client.
    http_client = httpx.Client(timeout=httpx.Timeout(timeout))
    return create_client(url, key, options=ClientOptions(httpx_client=http_client))

In [31]:
# Create the module-level client that get_files_from_storage and upload_image use;
# the bare name on the last line displays its repr as the cell output.
supabase = init_supabase(proj_url, api_key)
supabase

<supabase._sync.client.SyncClient at 0x1851338f510>

In [41]:
# List the entries at the bucket root, passing a limit of 1000 to the listing call.
supabase.storage.from_(bucket_name).list('', {'limit':1000})

[{'name': 'SynthDog_en',
  'id': None,
  'updated_at': None,
  'created_at': None,
  'last_accessed_at': None,
  'metadata': None},
 {'name': 'SynthDog_pt',
  'id': None,
  'updated_at': None,
  'created_at': None,
  'last_accessed_at': None,
  'metadata': None}]

In [42]:
def get_filename(x):
    """Return the base name of path ``x`` with its last extension removed."""
    return os.path.splitext(os.path.basename(x))[0]


def get_language(x):
    """
    Return the language code taken from the third component of path ``x``.

    The third path component is split on ``_`` and its last piece is returned,
    e.g. ``synthdog/outputs_ol/SynthDog_en/train/img.jpg`` gives ``'en'``.
    Both forward slashes and backslashes are accepted as separators.

    Raises:
        IndexError: if the path has fewer than three components.
    """
    # Normalize Windows separators first so the same index works on any OS;
    # the original split only on backslashes and failed on '/' paths.
    components = x.replace('\\', '/').split('/')
    return components[2].split('_')[-1]


def get_files_from_storage(bucket_name, path, limit=100):
    """
    Return the ``name`` field of each entry listed under ``path`` in the bucket.

    Uses the notebook-level ``supabase`` client; ``limit`` is forwarded to the
    storage listing call as its ``limit`` option.
    """
    entries = supabase.storage.from_(bucket_name).list(path, {'limit': limit})
    return [entry['name'] for entry in entries]

def read_file(local_path):
    """Return the entire contents of the file at ``local_path`` as bytes."""
    with open(local_path, "rb") as handle:
        return handle.read()

def upload_image(bucket: str, local_path: str, storage_path: str, content_type: str = "image/jpeg") -> bool:
    """
    Upload a local image file to a Supabase Storage bucket.

    Args:
        bucket: Name of the storage bucket to upload into.
        local_path: Path of the local file whose bytes are uploaded.
        storage_path: Destination path of the object inside the bucket.
        content_type: Value sent as the ``content-type`` file option.
            Defaults to ``"image/jpeg"``, the value previously hard-coded.

    Returns:
        True once the upload call has returned without raising. (The earlier
        signature and docstring advertised a public URL, but this function has
        always returned True; the contract now says so.)
    """
    file_bytes = read_file(local_path)

    # "upsert" is sent as the string "true", exactly as before.
    # NOTE(review): presumably this lets a re-upload replace an existing object;
    # confirm against the Supabase storage docs.
    supabase.storage.from_(bucket).upload(
        storage_path,
        file_bytes,
        file_options={"content-type": content_type, "upsert": "true"},
    )

    return True


from tqdm import tqdm

def upload_images_batch(bucket: str, file_paths: list[str], return_metadata = True, split = 'train', list_limit: int = 21000) -> "list[str] | None":
    """
    Upload the images in ``file_paths`` that are not already listed in ``bucket``.

    Each file goes to ``SynthDog_<lang>/<split>/<basename>``, where ``<lang>``
    comes from ``get_language(path)``. For the ``en`` and ``pt`` languages the
    existing names under that folder are listed once up front, and files whose
    basename is already listed are skipped.

    Args:
        bucket: Storage bucket used for both the listing and the uploads.
        file_paths: Local image paths to upload.
        return_metadata: When true, return the list of uploaded storage paths.
        split: Dataset split name used as the sub-folder.
        list_limit: Limit passed to the existing-file listing. Defaults to
            21000, the value previously hard-coded.

    Returns:
        The storage paths uploaded during this call if ``return_metadata`` is
        true, otherwise None.
    """
    # List against the `bucket` argument, not the notebook global `bucket_name`,
    # so the skip check inspects the same bucket the uploads go to.
    existing = {
        lang: set(get_files_from_storage(bucket, f'SynthDog_{lang}/{split}', list_limit))
        for lang in ('en', 'pt')
    }

    results = []
    progress = tqdm(file_paths, desc=f'Uploading to supabase storage bucket {bucket} ...', total=len(file_paths))
    for path in progress:
        filename = os.path.basename(path)
        lang = get_language(path)

        # Languages other than en/pt have no listing and are always uploaded.
        if filename in existing.get(lang, ()):
            continue

        filepath = f"SynthDog_{lang}/{split}/{filename}"
        if upload_image(bucket, path, filepath):
            results.append(filepath)

    # Report the files actually uploaded. The old message used the loop index,
    # which counted skipped files and was unbound for an empty `file_paths`.
    print(f'Uploaded {len(results)} images to supabase storage bucket ...')
    if return_metadata:
        return results
    return None
    
    

In [43]:
def get_synth_images_json_path(data_root= os.path.join('synthdog','outputs'), split='train'):
    """
    Glob the image and metadata files of one split under ``data_root``.

    Looks one directory level below ``data_root`` (any name), then inside the
    ``split`` folder.

    Returns:
        A tuple ``(jpg_paths, metadata_paths)``: the list of ``*.jpg`` matches
        and the list of ``metadata.jsonl`` matches.
    """
    split_dir_pattern = os.path.join(data_root, '*', split)
    image_matches = glob(os.path.join(split_dir_pattern, '*.jpg'))
    metadata_matches = glob(os.path.join(split_dir_pattern, 'metadata.jsonl'))
    return image_matches, metadata_matches

In [44]:
# Collect image paths and metadata.jsonl paths for each split under synthdog/outputs_ol.
# Each call returns a (jpg_paths, metadata_paths) tuple.
root_path = os.path.join('synthdog', 'outputs_ol')
train_ipath, train_json_metadata = get_synth_images_json_path(data_root=root_path, split='train')
val_ipath, val_json_metadata = get_synth_images_json_path(data_root=root_path, split='validation')
test_ipath, test_json_metadata = get_synth_images_json_path(data_root=root_path, split='test')

In [45]:
# Number of training image paths found by the glob above.
len(train_ipath)

32011

In [49]:
# Upload the train split; return_metadata=False so the call returns None and the cell shows no result.
upload_images_batch(bucket_name, train_ipath, False, 'train',)

Uploading to supabase storage bucket synthdog ...: 100%|██████████| 32011/32011 [01:42<00:00, 311.06it/s]   

Uploaded 32011 images to supabase storage bucket ...





In [51]:
# Upload the validation split; return_metadata=False so the call returns None.
upload_images_batch(bucket_name, val_ipath, False, 'validation')

Uploading to supabase storage bucket synthdog ...: 100%|██████████| 4008/4008 [07:52<00:00,  8.49it/s]  

Uploaded 4008 images to supabase storage bucket ...





In [52]:
# Upload the test split; return_metadata=False so the call returns None.
upload_images_batch(bucket_name, test_ipath, False, 'test')

Uploading to supabase storage bucket synthdog ...: 100%|██████████| 3978/3978 [14:40<00:00,  4.52it/s]

Uploaded 3978 images to supabase storage bucket ...





In [None]:
import pypickle

def save_pickle(data, filename):
    """Write ``data`` to ``filename`` with ``pypickle.save`` and print a confirmation line."""
    pypickle.save(data, filename)
    confirmation = f"✅ Saved pickle to: {filename}"
    print(confirmation)

def load_pickle(fname):
    """
    Return whatever ``pypickle.load`` reads from ``fname``.

    NOTE(review): pypickle presumably wraps Python's pickle, so only load
    files from a trusted source; confirm against the pypickle docs.
    """
    loaded = pypickle.load(fname)
    return loaded
