In [1]:
import os
import sys
import warnings
from datasets import DatasetDict

# zeige keine Warnungen an
warnings.filterwarnings("ignore")

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from src.dataloader import DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# DataDownloader erstellen
data_downloader = DataLoader(
    dataset_name="maveriq/tobacco3482",
    save_path="../data/interim"
)

In [3]:
# Datensatz downloaden und bereinigen
data_downloader.load_and_preprocess()

Downloading data: 100%|██████████| 435M/435M [00:25<00:00, 17.1MB/s] 
Downloading data: 100%|██████████| 594M/594M [00:36<00:00, 16.3MB/s] 
Downloading data: 100%|██████████| 704M/704M [00:40<00:00, 17.6MB/s] 
Generating train split: 100%|██████████| 3482/3482 [01:16<00:00, 45.60 examples/s] 
Map: 100%|██████████| 3482/3482 [00:32<00:00, 108.37 examples/s] 
Processing train-split: 100%|██████████| 2436/2436 [02:16<00:00, 17.80it/s]
Processing validation-split: 100%|██████████| 523/523 [00:25<00:00, 20.28it/s]
Processing test-split: 100%|██████████| 523/523 [00:25<00:00, 20.17it/s]
Saving the dataset (3/3 shards): 100%|██████████| 2436/2436 [00:36<00:00, 66.83 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 523/523 [00:02<00:00, 228.97 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 523/523 [00:03<00:00, 152.46 examples/s]


In [4]:
dataset = DatasetDict.load_from_disk("../data/interim")

In [5]:
dataset["train"].features

{'image': Image(decode=True, id=None),
 'doc_category': Value(dtype='string', id=None)}

In [6]:
dataset["train"][0]

{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=L size=1728x2292>,
 'doc_category': 'Letter'}

In [7]:
test_image = dataset["train"][0]["image"]
test_image.mode

'L'

In [8]:
test_image = test_image.convert("RGB")
test_image.mode

'RGB'

In [11]:
from datasets import DatasetDict, concatenate_datasets
from PIL import Image as PILImage

def add_rgb_image(batch):
    # Wenn der Batch eine Liste ist, dann jedes Bild in der Liste umwandeln
    if isinstance(batch['image'], list):
        batch['image'] = [img.convert("RGB") for img in batch['image']]
    else:
        batch['image'] = batch['image'].convert("RGB")
    return batch

# Laden des Datensatzes
dataset = DatasetDict.load_from_disk("../data/interim")

# Verarbeiten in Chargen
batch_size = 50  # Größe der Chargen, anpassen je nach verfügbarem Speicher
for split in dataset.keys():
    num_examples = len(dataset[split])
    updated_splits = []
    for i in range(0, num_examples, batch_size):
        batch = dataset[split].select(range(i, min(i + batch_size, num_examples)))
        updated_batch = batch.map(add_rgb_image, batched=True, batch_size=batch_size, num_proc=4)  # num_proc für parallele Verarbeitung
        updated_splits.append(updated_batch)
    dataset[split] = concatenate_datasets(updated_splits)

# Speichern des aktualisierten Datensatzes
dataset.save_to_disk("../data/processed")


Map (num_proc=4): 100%|██████████| 50/50 [00:14<00:00,  3.53 examples/s]
Map (num_proc=4): 100%|██████████| 50/50 [00:10<00:00,  4.66 examples/s]
Map (num_proc=4): 100%|██████████| 50/50 [00:09<00:00,  5.44 examples/s]
Map (num_proc=4): 100%|██████████| 50/50 [00:08<00:00,  5.88 examples/s]
Map (num_proc=4): 100%|██████████| 50/50 [00:09<00:00,  5.13 examples/s]
Map (num_proc=4): 100%|██████████| 50/50 [00:08<00:00,  6.17 examples/s]
Map (num_proc=4): 100%|██████████| 50/50 [00:10<00:00,  4.95 examples/s]
Map (num_proc=4): 100%|██████████| 50/50 [00:09<00:00,  5.31 examples/s]
Map (num_proc=4): 100%|██████████| 50/50 [00:10<00:00,  4.55 examples/s]
Map (num_proc=4): 100%|██████████| 50/50 [00:08<00:00,  6.09 examples/s]
Map (num_proc=4): 100%|██████████| 50/50 [00:09<00:00,  5.05 examples/s]
Map (num_proc=4): 100%|██████████| 50/50 [00:10<00:00,  4.65 examples/s]
Map (num_proc=4): 100%|██████████| 50/50 [00:10<00:00,  4.98 examples/s]
Map (num_proc=4): 100%|██████████| 50/50 [00:11<00: