In [1]:
import logging
import os
import sys

import torch
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm
from transformers import AutoTokenizer

# cd to ../src
sys.path.append("../src")
from src.trainer import ConcatDataset
from src.dataset.dataset import CollieDataset, DataCollatorForCoLLIE

# Configure the root logger at INFO level so INFO records show up in the cell outputs.
logging.basicConfig(level=logging.INFO)


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /home/ikergarcia/miniconda3/envs/transformers/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cpu.so
/home/ikergarcia/miniconda3/envs/transformers/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32
CUDA SETUP: Loading binary /home/ikergarcia/miniconda3/envs/transformers/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cpu.so...


  warn("The installed version of bitsandbytes was compiled without GPU support. "


In [2]:
# Prefix prepended to every data/model path used in this notebook.
# - Local execution (default): the SFTP mount of the remote server under /run/user/1000/gvfs.
# - Running on the server: set the COLLIE_LOCAL_PATH environment variable to an empty
#   string before starting the kernel (or assign local_path = "" here).
local_path = os.environ.get(
    "COLLIE_LOCAL_PATH",
    "/run/user/1000/gvfs/sftp:host=tximista.ixa.eus,user=igarcia945/",
)

In [3]:
# Directory holding the processed task files, resolved under `local_path`.
dataset_dir = "/".join([local_path, "ikerlariak/osainz006/CoLLIE/data/processed"])

# Tasks whose training split is loaded, written as "<dataset>.<task>".
tasks = [f"ace05.{ace_task}" for ace_task in ("ner", "re", "rc", "ee", "eae")]
tasks += ["conll03.ner", "rams.eae"]
# Currently excluded from the analysis:
# tasks += ["tacred.re", "tacred.sf"]
# tasks += ["wikievents.eae", "wikievents.ee", "wikievents.ner"]

In [4]:
# Load the tokenizer from the LLaMA 7B directory under `local_path`.
llama_tokenizer_dir = f"{local_path}/gaueko1/hizkuntza-ereduak/LLaMA/lm/huggingface/7B/"
tokenizer = AutoTokenizer.from_pretrained(llama_tokenizer_dir, add_eos_token=True)
tokenizer.padding_side = "left"

# Pick a pad token only when the tokenizer does not define one, trying in order:
# the "<|padding|>" vocabulary entry, the unk token, and finally the eos token.
needs_pad_token = tokenizer.pad_token_id is None
if needs_pad_token and "<|padding|>" in tokenizer.get_vocab():
    # StabilityLM specific fix
    tokenizer.add_special_tokens({"pad_token": "<|padding|>"})
elif needs_pad_token and tokenizer.unk_token is not None:
    print("Model does not have a pad token, we will use the unk token as pad token.")
    tokenizer.pad_token_id = tokenizer.unk_token_id
elif needs_pad_token:
    print("Model does not have a pad token. We will use the eos token as pad token.")
    tokenizer.pad_token_id = tokenizer.eos_token_id

Model does not have a pad token, we will use the unk token as pad token.


In [5]:
# Build one CollieDataset per task from its "<task>.train.jsonl" file in `dataset_dir`.
# NOTE(review): is_encoder_decoder=True is passed although the tokenizer is loaded from a
# LLaMA directory — confirm this flag is the intended setting for this length analysis.
training_datasets = [
    CollieDataset(
        tokenizer=tokenizer,
        dataset_path=os.path.join(dataset_dir, f"{task_name}.train.jsonl"),
        max_length=2048,
        is_encoder_decoder=True,
        inference=False,
        prompt_loss_weight=0.0,
    )
    for task_name in tqdm(tasks)
]

# Expose all per-task datasets as a single dataset.
train_dataset = ConcatDataset(training_datasets)

  0%|          | 0/7 [00:00<?, ?it/s]

INFO:root:Found 2 pre-computed epoch datasets.


In [None]:
# Fixed seed for the shuffling order, so that the composition of each batch (and therefore
# every batch-length statistic derived from this loader) is the same on every run.
SHUFFLE_SEED = 42

# Batches of 32 examples, collated by DataCollatorForCoLLIE with padding enabled
# (lengths padded to a multiple of 8, labels padded with -100).
train_dataloader = DataLoader(
    train_dataset,
    batch_size=32,
    collate_fn=DataCollatorForCoLLIE(
        tokenizer,
        pad_to_multiple_of=8,
        return_tensors="pt",
        padding=True,
        label_pad_token_id=-100,
    ),
    shuffle=True,
    # Dedicated generator: seeds only this loader's sampler, not the global torch RNG.
    generator=torch.Generator().manual_seed(SHUFFLE_SEED),
)

In [None]:
# Get the len of each batch
batch_lens = []
for batch in train_dataloader:
    batch_lens.append(batch["input_ids"].shape[1])

avg_batch_len = sum(batch_lens) / len(batch_lens)
print(f"Average batch length: {avg_batch_len}")

max_batch_len = max(batch_lens)
print(f"Max batch length: {max_batch_len}")

min_batch_len = min(batch_lens)
print(f"Min batch length: {min_batch_len}")

top_10 = sorted(batch_lens, reverse=True)[:10]
print(f"Top 10 batch lengths: {top_10}")

top_1_percent = sorted(batch_lens, reverse=True)[: int(len(batch_lens) * 0.01)]
print(f"Top 1% batch lengths: {top_1_percent}")

top_01_percent = sorted(batch_lens, reverse=True)[: int(len(batch_lens) * 0.001)]
print(f"Top 0.1% batch lengths: {top_01_percent}")

In [None]:
import matplotlib.pyplot as plt

# Histogram of the per-batch lengths collected in `batch_lens` (one entry per batch).
fig, ax = plt.subplots()
ax.hist(batch_lens, bins=100)
ax.set_title("Distribution of batch lengths")
ax.set_xlabel("Batch length (input_ids.shape[1])")
ax.set_ylabel("Number of batches")
# Vertical reference line at a length of 1024
# (presumably a candidate maximum sequence length — TODO confirm).
ax.axvline(1024, color="red", label="1024")
ax.legend()
plt.show()