#### Import Packages

In [None]:
import pandas as pd
import os
import urllib.request

from concurrent.futures import ThreadPoolExecutor
from functools import partial
from PIL import Image
from datasets import load_dataset
from datasets.utils.file_utils import get_datasets_user_agent

from utils.system import *
from class_data.preprocess import Preprocess

#### SBU Caption
##### https://huggingface.co/datasets/sbu_captions

In [None]:
# Load the SBU Captions dataset through the Hugging Face `datasets` loader.
sbu_caption = load_dataset("sbu_captions")

In [None]:
# Have the dataset hand back pandas objects, then materialise the train split.
sbu_caption.set_format(type='pandas')

# Keep only the URL and caption columns; the URL column is exposed as
# `image_name` from here on.
sbu_caption_df = (
    sbu_caption['train'][:][['image_url', 'caption']]
    .rename(columns={'image_url': 'image_name'})
)

In [None]:
# Run the project's Preprocess pipeline over the `caption` column of the SBU frame.
sbu_caption_preprocess = Preprocess(
    data=sbu_caption_df,
    column_name='caption',
    default_name='caption',
    type='sbu_train',
    max_words=30,
)._preprocess()

In [None]:
# Destination for the exported chunks of the preprocessed SBU captions.
file_name = 'sbu_train_preprocess'
folder_path = get_data() / 'sbu' / 'chunks'

Preprocess(
    data=sbu_caption_preprocess,
    folder_path=folder_path,
    file_name=file_name,
)._export_in_chunks()

#### Fetch Images

In [None]:
# User-agent string provided by `datasets`; sent as a header with each image request.
USER_AGENT = get_datasets_user_agent()

def fetch_and_save_image(image_url, save_dir, image_index, timeout=None, retries=0):
    """Download one image and store it as ``image_<image_index>.jpg``.

    This is a best-effort helper for bulk downloads: any failure is printed
    and retried, and the function never raises for a bad URL or a network
    error.

    Args:
        image_url: URL of the image to download.
        save_dir: Existing directory the file is written into.
        image_index: Index used to build the output file name.
        timeout: Timeout forwarded to ``urllib.request.urlopen``; ``None``
            leaves the library default in place.
        retries: Number of additional attempts after the first failure.

    Returns:
        The path of the saved file, or ``None`` if every attempt failed.
    """
    save_path = os.path.join(save_dir, f"image_{image_index}.jpg")
    for _ in range(retries + 1):
        try:
            request = urllib.request.Request(image_url, data=None, headers={"user-agent": USER_AGENT})
            # Read the whole body before touching the disk, so a download that
            # fails mid-transfer does not leave an empty or truncated file behind.
            with urllib.request.urlopen(request, timeout=timeout) as response:
                payload = response.read()
            with open(save_path, 'wb') as f:
                f.write(payload)
        except Exception as e:
            # Deliberately broad: one bad URL must not abort a bulk download.
            print(f"Error downloading {image_url}: {e}")
        else:
            return save_path
    return None

def fetch_images_for_df(df, save_dir, num_threads=20, timeout=None, retries=0, url_column='image_url'):
    """Download every image referenced by ``df`` and record where it was saved.

    Each URL is fetched with ``fetch_and_save_image`` on a thread pool. The
    file name index is the row's position in the column, not its index label.

    Args:
        df: DataFrame holding the image URLs. It is modified in place: an
            ``image_path`` column is added (or overwritten).
        save_dir: Directory to write the images into; created if missing.
        num_threads: Maximum number of worker threads.
        timeout: Per-request timeout forwarded to ``fetch_and_save_image``.
        retries: Extra attempts per image, forwarded to ``fetch_and_save_image``.
        url_column: Name of the column that contains the image URLs.

    Returns:
        The same ``df`` object, with ``image_path`` holding the saved path of
        each image, or ``None`` where the download failed.

    Raises:
        KeyError: If ``url_column`` is not a column of ``df``.
    """
    # Resolve the column first so a wrong name fails before any side effects.
    urls = df[url_column]

    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(save_dir, exist_ok=True)

    def _download(indexed_url):
        position, url = indexed_url
        return fetch_and_save_image(url, save_dir, position, timeout, retries)

    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        # executor.map yields results in input order, so paths line up with rows.
        image_paths = list(executor.map(_download, enumerate(urls)))

    df['image_path'] = image_paths
    return df

In [None]:
save_directory = get_data() / 'sbu' / 'images'

# The downloader defined in this notebook is `fetch_images_for_df`; it reads URLs
# from an `image_url` column. The frame given to Preprocess had that column renamed
# to `image_name`, so rename it back for the download (rename is a no-op when the
# column is absent).
# NOTE(review): assumes Preprocess._preprocess() keeps the `image_name` column — confirm.
sbu_caption_images = fetch_images_for_df(
    sbu_caption_preprocess.rename(columns={'image_name': 'image_url'}),
    save_directory,
    num_threads=100,
)