In [1]:
import os
import json
import glob
from datasets import load_dataset
from tqdm import tqdm
import pandas as pd
import webdataset as wds
from PIL import Image
from io import BytesIO
import torch
import matplotlib.pyplot as plt

In [2]:
def show_tensor_image(tensor_image):
    """Display a single image tensor with matplotlib.

    Args:
        tensor_image: ``torch.Tensor`` holding one image. A 3-D tensor whose
            first dimension is 1 or 3 is treated as ``[C, H, W]`` and moved to
            ``[H, W, C]``; any other layout is passed through unchanged.
            The value range is detected heuristically:

            * any negative value -> assumed to be in ``[-1, 1]`` (for example
              the output of ``Normalize([0.5], [0.5])``) and mapped to
              ``[0, 1]``;
            * otherwise a maximum above 1 -> assumed to be in ``[0, 255]``
              and divided by 255;
            * otherwise assumed to already be in ``[0, 1]``.

    Returns:
        None. The image is drawn with ``plt.imshow`` and ``plt.show``.
    """
    # Detach first: .numpy() raises on tensors that require grad.
    # Cast to float so integer tensors are scaled like float ones.
    image = tensor_image.detach().cpu().float()

    # Move channels last when the tensor is laid out as [C, H, W]
    if image.dim() == 3 and image.shape[0] in [1, 3]:
        image = image.permute(1, 2, 0)

    # Bring the values into [0, 1], the range imshow expects for float data
    if image.min() < 0:
        image = (image + 1.0) / 2.0
    elif image.max() > 1:
        image = image / 255.0
    # Guard against small overshoots so matplotlib does not clip with a warning
    image = image.clamp(0.0, 1.0)

    img_np = image.numpy()

    # A trailing channel dimension of size 1 means a grayscale image
    if img_np.shape[-1] == 1:
        plt.imshow(img_np.squeeze(), cmap='gray')
    else:
        plt.imshow(img_np)

    plt.axis('off')
    plt.show()

In [None]:
# Absolute locations of the food-image dataset: the training images, the label files
# and the metadata JSONL file.
training_dir, labels_dir, meta_file = (
    "/home/work/llm_data/datasets/food-images/Training",
    "/home/work/llm_data/datasets/food-images/Labels",
    "/home/work/llm_data/datasets/food-images/metadata.jsonl",
)

In [None]:
# Entry names directly inside the training and label roots.
# os.listdir already returns a new list, so no extra list() copy is needed.
img_dirs = os.listdir(training_dir)
label_dirs = os.listdir(labels_dir)

In [None]:
# Every entry directly under the configured training root.
# Built from `training_dir` so these folders live under the same root that `image_root`
# uses later to resolve each record's `file_name`; the previous literal pointed at
# ".../datasets/Training", outside the food-images directory.
# Sorted because glob returns entries in arbitrary order.
all_image_folders = sorted(glob.glob(os.path.join(training_dir, "*")))

In [None]:
# Display the image folder paths in sorted order (the sorted list itself is not stored).
sorted(all_image_folders)

In [None]:
# Every entry directly under the configured label root.
# Built from `labels_dir` so it agrees with the food-images paths used by the rest of the
# notebook; the previous literal pointed at ".../datasets/Labels", outside that directory.
# Sorted because glob returns entries in arbitrary order.
all_label_folders = sorted(glob.glob(os.path.join(labels_dir, "*")))

In [None]:
# ll = normalize_folder_name(os.path.basename(sorted(all_label_folders)[103]))

In [None]:
# Element-by-element comparison of `ll` and `tt`, printing both items and whether they are equal.
# NOTE(review): neither name is assigned in any active cell of this notebook (`ll` appears
# only in commented-out code), so both must be defined before this cell is run.
# zip() avoids the IndexError the index loop raised when `tt` was shorter than `ll`;
# a length difference is reported explicitly instead of being hidden.
for label_item, train_item in zip(ll, tt):
    print(label_item, train_item, label_item == train_item)
if len(ll) != len(tt):
    print(f"Length mismatch: len(ll)={len(ll)}, len(tt)={len(tt)}")

In [None]:
# Build the Korean -> English food-name lookup from name_dict.txt.
# A usable line has exactly three ", "-separated fields: the first is ignored, the second
# is the Korean name (key) and the third is the English name (value).
name_dict = {}
with open('name_dict.txt', 'r', encoding='utf-8') as f:
    for line_no, line in enumerate(f, start=1):
        stripped = line.strip()
        if not stripped:
            continue  # blank lines carry no entry
        parts = stripped.split(', ')
        if len(parts) != 3:
            # Report instead of dropping silently: a missing entry later shows up
            # as a missing caption for that food name.
            print(f"Skipping malformed line {line_no} in name_dict.txt: {stripped!r}")
            continue
        _, korean, english = parts
        name_dict[korean] = english

In [None]:
def normalize_folder_name(name):
    """Reduce a folder name to the bare name used for matching image and label folders.

    Every occurrence of ' json' and '.json' is removed, then surrounding
    whitespace is trimmed. Trimming happens last so that whitespace exposed by
    removing '.json' (for example in 'abc .json') does not survive.

    Args:
        name: Folder base name (str).

    Returns:
        The cleaned name (str).
    """
    return name.replace(' json', '').replace('.json', '').strip()

# Index the label folders by normalized name. setdefault keeps the first folder seen
# for a given name, i.e. the one a first-match scan over all_label_folders would pick.
label_dir_by_name = {}
for candidate_dir in all_label_folders:
    candidate_key = normalize_folder_name(os.path.basename(candidate_dir))
    label_dir_by_name.setdefault(candidate_key, candidate_dir)

# Rows for the DataFrame: one per training folder that has a matching label folder
data = []

for train_dir in all_image_folders:
    text = os.path.basename(train_dir)
    normalized_text = normalize_folder_name(text)
    matching_label_dir = label_dir_by_name.get(normalized_text)

    if not matching_label_dir:
        # Training folders without a label folder are reported and left out
        print(f'No matching label for {text}, {normalized_text}, {train_dir}')
        continue

    data.append({
        'han_text': text,
        'train_dir': train_dir,
        'label_dir': matching_label_dir
    })

# Build the DataFrame from the collected rows
df = pd.DataFrame(data)

In [None]:
# Attach the English name for each folder; folder names missing from name_dict become NaN.
df['text'] = df['han_text'].map(name_dict)

# Surface unmapped folders now: a NaN here is otherwise carried silently into every
# metadata record generated for that folder.
unmapped_names = df.loc[df['text'].isna(), 'han_text'].tolist()
if unmapped_names:
    print(f"No English name found for {len(unmapped_names)} folder(s): {unmapped_names}")

In [None]:
def process_files(row):
    """Build one metadata record per labelled .jpg file of a single folder pair.

    Args:
        row: Mapping-like object (for example a DataFrame row) providing
            'train_dir' (folder with the .jpg files), 'label_dir' (folder with
            the .json label files), 'han_text' (used as the folder part of
            'file_name') and 'text' (copied into every record).

    Returns:
        list[dict]: Records with the keys 'file_name', 'text', 'name',
        'Cat 1', 'Cat 2', 'Cat 3' and 'Cat 4', ordered by file name. An image
        is skipped when its label file does not exist, when the label content
        is not a non-empty list, or when the first list element is not a dict.
    """
    results = []

    # Sorted so the record order does not depend on os.listdir's arbitrary ordering
    jpg_files = sorted(f for f in os.listdir(row['train_dir']) if f.endswith('.jpg'))

    for jpg_file in jpg_files:
        # The label file shares the image's base name
        json_file = os.path.splitext(jpg_file)[0] + '.json'
        json_path = os.path.join(row['label_dir'], json_file)

        if not os.path.exists(json_path):
            continue

        with open(json_path, 'r', encoding='utf-8') as f:
            json_data = json.load(f)

        if not (json_data and isinstance(json_data, list)):
            continue

        item = json_data[0]  # only the first entry is used
        if not isinstance(item, dict):
            # .get() below needs a mapping; skip instead of raising AttributeError
            continue

        results.append({
            'file_name': f"{row['han_text']}/{jpg_file}",
            'text': row['text'],
            'name': item.get('Name', ''),
            'Cat 1': item.get('Cat 1', ''),
            'Cat 2': item.get('Cat 2', ''),
            'Cat 3': item.get('Cat 3', ''),
            'Cat 4': item.get('Cat 4', '')
        })

    return results

In [None]:
# Destination of the generated metadata, one JSON object per line.
# NOTE(review): this path is relative to the working directory, while `meta_file` and
# `jsonl_path` elsewhere in the notebook are absolute paths to a metadata.jsonl;
# confirm they refer to the same file.
output_jsonl = 'metadata.jsonl'

# Expand every DataFrame row into per-image records and write them out as they are produced
with open(output_jsonl, 'w', encoding='utf-8') as f:
    progress = tqdm(df.iterrows(), total=len(df), desc="Processing and saving files")
    for _, row in progress:
        for record in process_files(row):
            f.write(json.dumps(record, ensure_ascii=False))
            f.write('\n')

# load_dataset('imagefolder'): 실패

너무 오래 걸림

In [None]:
# Configure the root logger to emit messages at INFO level and above.
import logging
logging.basicConfig(level=logging.INFO)

In [None]:
# Load the image folder in streaming mode.
# `logger` is not a load_dataset parameter: unknown keyword arguments are forwarded to the
# builder configuration, which has no such field, so the argument is dropped here.
# Logging is controlled through the `logging` configuration instead.
ds = load_dataset('imagefolder', data_dir='/home/work/llm_data/datasets/food-images/', streaming=True)

# WebDataSet 으로 시도

2,373,670 건의 이미지

In [None]:
def create_webdataset(jsonl_path, image_root, output_path, samples_per_shard=20000):
    """Pack the images listed in a metadata JSONL file into WebDataset tar shards.

    Every non-blank line of ``jsonl_path`` is parsed as a JSON object whose
    'file_name' entry is joined onto ``image_root``. Records whose image file
    does not exist are reported and skipped. Each written sample contains the
    raw image bytes under "jpg" and the JSON-encoded record under "json", with
    the key ``sample_<n>`` where n counts written samples from 0.

    Shards are named ``f"{output_path}_{index:05d}.tar"`` starting at index 0,
    so every shard, including the first, carries an index and the .tar
    extension. A shard is only opened when there is a sample to put into it,
    so no empty trailing shard is produced.

    Args:
        jsonl_path: Path of the UTF-8 metadata file, one JSON object per line.
        image_root: Directory that the 'file_name' entries are relative to.
        output_path: Path prefix of the shard files; its parent directory is
            created if it does not exist.
        samples_per_shard: Maximum number of samples per shard.

    Returns:
        None. A summary line is printed when finished.
    """
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    writer = None       # currently open shard, if any
    sample_count = 0    # samples written so far
    shard_count = 0     # shards opened so far

    try:
        # The metadata is written as UTF-8, so read it back with the same encoding
        with open(jsonl_path, 'r', encoding='utf-8') as f:
            for line in tqdm(f):
                if not line.strip():
                    continue
                data = json.loads(line)
                image_path = os.path.join(image_root, data['file_name'])

                if not os.path.exists(image_path):
                    print(f"Warning: Image not found - {image_path}")
                    continue

                # Read the image as raw bytes; it is stored without re-encoding
                with open(image_path, 'rb') as img_file:
                    image_bytes = img_file.read()

                # Open the next shard lazily, only once a sample is ready for it
                if writer is None:
                    writer = wds.TarWriter(f"{output_path}_{shard_count:05d}.tar")
                    shard_count += 1

                writer.write({
                    "__key__": f"sample_{sample_count}",
                    "jpg": image_bytes,
                    "json": json.dumps(data)
                })
                sample_count += 1

                # Shard is full: close it; the next sample opens a new one
                if sample_count % samples_per_shard == 0:
                    writer.close()
                    writer = None
    finally:
        # Close the open shard even when an error interrupts the run
        if writer is not None:
            writer.close()

    print(f"Created {shard_count} shards with {sample_count} samples in total.")


In [None]:
# Arguments for create_webdataset: the metadata file, the directory the image paths are
# relative to, and the path prefix of the output shards.
jsonl_path, image_root, output_path = (
    '/home/work/llm_data/datasets/food-images/metadata.jsonl',
    '/home/work/llm_data/datasets/food-images/Training',
    '/home/work/llm_data/datasets/food-images/webdataset/data',
)

In [None]:
# create_webdataset(jsonl_path, image_root, output_path)

In [3]:
# Open the WebDataset shards in streaming mode.
# NOTE(review): this data_dir differs from the `output_path` prefix the shards are written
# to elsewhere in the notebook; confirm it points at the same shards.
dataset = load_dataset(
    "webdataset",
    data_dir='/Jupyter/dataset/food-images/webdataset',
    streaming=True,
)

Resolving data files:   0%|          | 0/37 [00:00<?, ?it/s]

In [4]:
from torchvision import transforms

In [5]:
# Preprocessing switches, named so they can be changed in one place instead of
# being buried as constant `if False` conditions.
resolution = 512      # target size for the resize and the crop
center_crop = False   # True: deterministic centre crop, False: random crop
random_flip = False   # True: add a random horizontal flip

_train_steps = [
    transforms.Resize(
        resolution, interpolation=transforms.InterpolationMode.BILINEAR
    ),
    (
        transforms.CenterCrop(resolution)
        if center_crop
        else transforms.RandomCrop(resolution)
    ),
]
# With flipping disabled nothing is added: an identity Lambda step changes no pixel
# and a lambda cannot be pickled if the pipeline ever has to be sent to worker processes.
if random_flip:
    _train_steps.append(transforms.RandomHorizontalFlip())
_train_steps += [
    transforms.ToTensor(),
    # mean 0.5 / std 0.5 moves ToTensor's [0, 1] output to [-1, 1]
    transforms.Normalize([0.5], [0.5]),
]

train_transforms = transforms.Compose(_train_steps)

In [6]:
def extract_text(example):
    """Copy the value at example['json']['text'] to the top-level 'text' key.

    The example is modified in place and the same object is returned.
    """
    metadata = example['json']
    example['text'] = metadata['text']
    return example

In [7]:
# Key under which each example holds its image; read by preprocess_train.
image_column='jpg'

In [8]:
def preprocess_train(examples):
    """Add a "pixel_values" list to a batch of examples.

    Every image under ``examples[image_column]`` is first converted to RGB;
    ``train_transforms`` is then applied to each converted image in order.
    The batch is modified in place and returned. Captions are not tokenized here.
    """
    rgb_images = []
    for image in examples[image_column]:
        rgb_images.append(image.convert("RGB"))
    examples["pixel_values"] = list(map(train_transforms, rgb_images))
    return examples

In [9]:
def collate_fn(examples):
    """Stack the "pixel_values" tensors of the given examples into one batch.

    Returns a dict with a single "pixel_values" entry: the stacked tensor in
    contiguous memory format, cast to float. Token ids are not included.
    """
    stacked = torch.stack(tuple(sample["pixel_values"] for sample in examples))
    batch_pixels = stacked.to(memory_format=torch.contiguous_format).float()
    return {"pixel_values": batch_pixels}

In [10]:
# Apply extract_text to every example so each one gets a top-level 'text' entry.
# Note: `dataset` is rebound to the mapped result, so re-running this cell maps it again.
dataset = dataset.map(extract_text)

In [11]:
# Apply the image preprocessing to the training split in small batches.
# Without an explicit batch_size a batched map groups 1000 examples per call, and
# preprocess_train turns each of them into a 3x512x512 float tensor (about 3 MiB),
# i.e. roughly 3 GiB per call and per DataLoader worker.
# NOTE(review): this is a likely contributor to the DataLoader workers exiting
# unexpectedly; confirm against the memory available on the machine.
train_dataset = dataset["train"].map(preprocess_train, batched=True, batch_size=8)

In [12]:
# Batch the preprocessed training set: 8 examples per batch, assembled by collate_fn,
# loaded by 8 worker processes. `shuffle` is not passed.
loader_options = {
    "collate_fn": collate_fn,
    "batch_size": 8,
    "num_workers": 8,
}
train_dataloader = torch.utils.data.DataLoader(train_dataset, **loader_options)

In [None]:
# Scratch arithmetic. NOTE(review): the meaning of 7, 1307 and 5000 is not stated in the notebook.
7 / 1307 * 5000

In [13]:
# Pull the first batch from the loader to exercise the data pipeline once.
batch = next(iter(train_dataloader))

RuntimeError: DataLoader worker (pid(s) 1088012) exited unexpectedly

In [None]:
# Display the image at index 4 of the batch.
show_tensor_image(batch['pixel_values'][4])