In [1]:
import argparse
from typing import Dict, cast

import datasets
import numpy as np
from nanotron import logging
from nanotron.config import (
    DataArgs,
    DatasetStageArgs,
    PretrainDatasetsArgs,
)
from nanotron.dataloader import (
    DataCollatorForCLM,
    clm_process,
    get_dataloader_worker_init,
    get_datasets,
    get_train_dataloader,
    vqa_process,
)
from nanotron.helpers import (
    compute_remain_train_steps_of_a_data_stage_from_ckp,
    get_consumed_train_samples_of_a_data_stage_from_ckp,
)
from nanotron.logging import log_rank
from nanotron.parallel.pipeline_parallel.utils import get_input_output_pp_ranks
from nanotron.trainer import DistributedTrainer
from nanotron.utils import main_rank_first
# from torch.utils.data import DataLoader

from transformers import AutoProcessor

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset = "cmarkea/doc-vqa"
dataset_config = None
dataset_splits = "train"
tokenizer_path = "HuggingFaceM4/Idefics3-8B-Llama3"

In [3]:
raw_dataset = get_datasets(
    hf_dataset_or_datasets=dataset,
    hf_dataset_config_name=dataset_config,
    splits=dataset_splits,
)["train"]

processor = AutoProcessor.from_pretrained(tokenizer_path, size= {"longest_edge": 2*364})
train_dataset = vqa_process(
    raw_dataset=raw_dataset,
    processor=processor,
    dataset_processing_num_proc_per_process=1,
    dataset_overwrite_cache=True,
    sequence_length=1024,
)


# We load the processed dataset on the ranks requiring it
# dataloader = get_train_dataloader(
#     train_dataset=train_dataset,
#     sequence_length=trainer.sequence_length,
#     parallel_context=trainer.parallel_context,
#     input_pp_rank=input_pp_rank,
#     output_pp_rank=output_pp_rank,
#     micro_batch_size=trainer.micro_batch_size,
#     consumed_train_samples=consumed_train_samples,
#     dataloader_num_workers=data.num_loading_workers,
#     seed_worker=data.seed,
#     dataloader_drop_last=True,
#     dataset_columns=["input_ids", "pixel_values"]
# )

# Check if we have enough samples for train_steps
# total_tokens_dataset = len(dataloader.dataset) * trainer.sequence_length
# num_tokens_needed_for_training = (
#     num_remaining_train_steps * trainer.global_batch_size * trainer.sequence_length
# )
# assert num_tokens_needed_for_training <= total_tokens_dataset, (
#     f"Dataset is too small for steps ({total_tokens_dataset} < {num_tokens_needed_for_training}), "
#     f"Try train_steps<={len(dataloader.dataset) // trainer.global_batch_size + trainer.iteration_step}"
# )

Map:   0%|          | 0/9688 [00:00<?, ? examples/s]

In [4]:
%pip install pillow

Collecting pillow
  Downloading pillow-11.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (4.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.4/4.4 MB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: pillow
Successfully installed pillow-11.0.0
[0mNote: you may need to restart the kernel to use updated packages.


In [1]:
from nanotron.dataloader import (
    clm_process,
    dummy_infinite_data_generator,
    get_datasets,
    get_train_dataloader,
)

from transformers import AutoTokenizer

In [5]:
dataset = "cmarkea/doc-vqa"
tokenizer_path = "robot-test/dummy-tokenizer-wordlevel"

In [6]:
import os

# os.environ["HF_ENDPOINT"] = "http://localhost:5564"
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

In [7]:
raw_dataset = get_datasets(
    hf_dataset_or_datasets=dataset,
    hf_dataset_config_name=None,
    splits="train",
)["train"]

tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

README.md:   0%|          | 0.00/3.87k [00:00<?, ?B/s]

train-00000-of-00013.parquet:   0%|          | 0.00/142M [00:00<?, ?B/s]

train-00001-of-00013.parquet:   0%|          | 0.00/134M [00:00<?, ?B/s]

train-00002-of-00013.parquet:   0%|          | 0.00/144M [00:00<?, ?B/s]

train-00003-of-00013.parquet:   0%|          | 0.00/139M [00:00<?, ?B/s]

train-00004-of-00013.parquet:   0%|          | 0.00/141M [00:00<?, ?B/s]

train-00005-of-00013.parquet:   0%|          | 0.00/138M [00:00<?, ?B/s]

train-00006-of-00013.parquet:   0%|          | 0.00/142M [00:00<?, ?B/s]

train-00007-of-00013.parquet:   0%|          | 0.00/147M [00:00<?, ?B/s]

train-00008-of-00013.parquet:   0%|          | 0.00/140M [00:00<?, ?B/s]

train-00009-of-00013.parquet:   0%|          | 0.00/147M [00:00<?, ?B/s]

train-00010-of-00013.parquet:   0%|          | 0.00/132M [00:00<?, ?B/s]

train-00011-of-00013.parquet:   0%|          | 0.00/103M [00:00<?, ?B/s]

train-00012-of-00013.parquet:   0%|          | 0.00/105M [00:00<?, ?B/s]

test-00000-of-00004.parquet:   0%|          | 0.00/119M [00:00<?, ?B/s]

test-00001-of-00004.parquet:   0%|          | 0.00/111M [00:00<?, ?B/s]

test-00002-of-00004.parquet:   0%|          | 0.00/106M [00:00<?, ?B/s]

test-00003-of-00004.parquet:   0%|          | 0.00/95.6M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9688 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2421 [00:00<?, ? examples/s]

In [9]:
from transformers import AutoProcessor, AutoModelForVision2Seq

In [10]:
processor = AutoProcessor.from_pretrained("HuggingFaceM4/Idefics3-8B-Llama3")

In [26]:
ex = raw_dataset[0]
messages = []
for i, x in enumerate(ex["qa"]["en"]):
    user_message = {
        "role": "user",
        "content": [
            {"type": "text", "text": x["question"]},
        ]
    }

    if i == 0:
        user_message["content"].append(
            {"type": "image"},
        )

    messages.append(user_message)
    assistant_message = {
        "role": "assistant",
        "content": [
            {"type": "text", "text": x["answer"]},
        ]
    }

    messages.append(assistant_message)

In [27]:
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(text=prompt, images=[ex["image"]], return_tensors="pt")
inputs = {k: v.to("cpu") for k, v in inputs.items()}

In [32]:
inputs = processor(text=prompt, images=[ex["image"]], return_tensors="pt", max_length=128, padding="longest", truncation=True)
inputs["input_ids"].shape
inputs["attention_mask"].shape


torch.Size([1, 128])

In [12]:
pretraining_data = [
    {
        "image": None,
        "text": "A view of the Statue of Liberty in New York."
    },
    {
        "image": None,
        "text": "The skyline of Chicago during sunset."
    },
    # ... more image-text pairs
]


text = processor.apply_chat_template(pretraining_data, add_generation_prompt=True)
# inputs = processor(images=images, text=text, return_tensors="pt").to(DEVICE)

UndefinedError: 'dict object' has no attribute 'role'

In [13]:
new_c = ["2313212"] * len(raw_dataset)

new_dataset = raw_dataset.add_column("new_c", new_c)

In [14]:
train_dataset = clm_process(
    raw_dataset=new_dataset,
    tokenizer=tokenizer,
    text_column_name="new_c",
    dataset_processing_num_proc_per_process=16,
    dataset_overwrite_cache=False,
    sequence_length=256,
)

Grouping texts in chunks of 257 (num_proc=16):   0%|          | 0/10000 [00:00<?, ? examples/s]

In [16]:
len(train_dataset)

320