#### Import Packages

In [None]:
import pandas as pd
import io
import ray
import urllib
import warnings

warnings.filterwarnings('ignore')

from utils.system import *
from class_data.preprocess import Preprocess

from PIL import Image
from torch.utils.data import Dataset
from datasets.utils.file_utils import get_datasets_user_agent

# User-agent string sent as the "user-agent" header on every image request
# made by fetch_image_url; obtained from the `datasets` library helper.
USER_AGENT = get_datasets_user_agent()

#### Validate Image URLs

In [None]:
# Fetch image from url
def fetch_image_url(image_url, timeout=None, retries=0):
    """Download ``image_url`` and open the payload as a PIL image.

    Args:
        image_url: URL of the image to download.
        timeout: Value forwarded to ``urllib.request.urlopen``. With ``None``
            no timeout is applied, so a stalled server can block the call.
        retries: Number of additional attempts after the first one fails.
            A negative value results in no attempt being made.

    Returns:
        The object returned by ``Image.open`` on the first successful
        attempt, or ``None`` if every attempt failed (or none was made).
    """
    # `import urllib` alone does not guarantee the `request` submodule is
    # loaded, so import it explicitly where it is used.
    import urllib.request

    # Bind the result up front so the function still returns None when the
    # loop body never runs (retries < 0).
    image = None
    for _ in range(retries + 1):
        try:
            request = urllib.request.Request(
                image_url,
                data=None,
                headers={"user-agent": USER_AGENT},
            )
            with urllib.request.urlopen(request, timeout=timeout) as response:
                image = Image.open(io.BytesIO(response.read()))
            break
        except Exception:
            # Deliberate best effort: any network or decoding failure simply
            # marks this attempt as unsuccessful and moves on to the next.
            image = None
    return image

@ray.remote
def fetch(image_url):
    """Ray task reporting whether ``image_url`` can be fetched and opened.

    Makes a single attempt (no retries, no timeout) via ``fetch_image_url``.

    Args:
        image_url: URL of the image to check.

    Returns:
        ``True`` if ``fetch_image_url`` returned an image, ``False`` if it
        returned ``None``.
    """
    image = fetch_image_url(image_url, timeout=None, retries=0)
    # Identity check: None is a singleton, and `==` would defer to whatever
    # equality the image object defines.
    return image is not None

def invalid_url(batch_size, data, num_cpu, image_column, valid_column):
    """Check every URL in ``data[image_column]`` and record the outcome.

    URLs are dispatched to the ``fetch`` Ray task in consecutive batches of
    ``batch_size``; each batch is awaited before the next one is submitted.

    Note that, despite the function name, the value written for each row is
    ``True`` when the URL could be fetched and ``False`` otherwise.

    Args:
        batch_size: Number of URLs submitted to Ray at once; must be positive.
        data: DataFrame holding the URLs. It is modified in place.
        num_cpu: Passed to ``ray.init`` as ``num_cpus``.
        image_column: Name of the column containing the image URLs.
        valid_column: Name of the column that receives the boolean results.

    Returns:
        The same ``data`` object, with ``valid_column`` filled in.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    ray.init(num_cpus=num_cpu, ignore_reinit_error=True)
    try:
        urls = data[image_column]
        total_rows = len(urls)
        total_batches = (total_rows + batch_size - 1) // batch_size

        # One boolean per row, in row order.
        fetch_results = []

        # Process each batch sequentially
        for current_batch, start in enumerate(range(0, total_rows, batch_size), start=1):
            print(f"Processing batch: {current_batch}/{total_batches}")
            batch = urls.iloc[start:start + batch_size]
            futures = [fetch.remote(url) for url in batch]
            fetch_results.extend(ray.get(futures))

        data.loc[:, valid_column] = fetch_results
    finally:
        # Always release the Ray runtime, even if a task or the column
        # assignment above raised.
        ray.shutdown()
    return data

In [None]:
# Smoke test of the URL validation on the first 10 rows, one URL per batch.
# NOTE: `gcc_train` is loaded in the "GCC Train" cell further down; run that
# cell before this one.
gcc_train_valid = invalid_url(
    batch_size=1,
    data=gcc_train.head(10),
    num_cpu=16,
    image_column='image_name',
    valid_column='valid',
)

#### GCC Train
##### https://huggingface.co/datasets/conceptual_captions

In [None]:
# Load the GCC training split: a tab-separated file without a header row,
# whose two columns are named `caption` and `image_name` here.
gcc_train = pd.read_csv(
    get_data() / 'gcc' / 'gcc_train.tsv',
    sep='\t',
    header=None,
    names=['caption', 'image_name'],
)

In [None]:
# Run the project's Preprocess pipeline on the training captions.
# NOTE(review): `max_words=30` presumably caps caption length — confirm in
# class_data.preprocess.
gcc_train_preprocess = Preprocess(
    data=gcc_train,
    column_name='caption',
    default_name='caption',
    type='gcc_train',
    max_words=30,
)._preprocess()

In [None]:
# Destination folder and base file name for the preprocessed training data.
folder_path = get_data() / 'gcc' / 'chunks'
file_name = 'gcc_train_preprocess'

# Write the preprocessed training data out via Preprocess._export_in_chunks.
Preprocess(
    folder_path=folder_path,
    file_name=file_name,
    data=gcc_train_preprocess,
)._export_in_chunks()

#### GCC Val

In [None]:
# Load the GCC validation split: a tab-separated file without a header row,
# whose two columns are named `caption` and `image_name` here.
gcc_val = pd.read_csv(
    get_data() / 'gcc' / 'gcc_val.tsv',
    sep='\t',
    header=None,
    names=['caption', 'image_name'],
)

In [None]:
# Run the project's Preprocess pipeline on the validation captions.
# NOTE(review): `max_words=30` presumably caps caption length — confirm in
# class_data.preprocess.
gcc_val_preprocess = Preprocess(
    data=gcc_val,
    column_name='caption',
    default_name='caption',
    type='gcc_val',
    max_words=30,
)._preprocess()

In [None]:
# Destination folder and base file name for the preprocessed validation data.
folder_path = get_data() / 'gcc' / 'chunks'
file_name = 'gcc_val_preprocess'

# Write the preprocessed validation data out via Preprocess._export_in_chunks.
Preprocess(
    folder_path=folder_path,
    file_name=file_name,
    data=gcc_val_preprocess,
)._export_in_chunks()