In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
import numpy as np
import random

In [2]:
torch.cuda.is_available()

True

In [3]:
model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")

In [4]:
# Calibration datasets
NUM_SAMPLES = 4
SEQUENCE_LEN = 1024
finewebedu_dataset = load_dataset("HuggingFaceFW/fineweb-edu", "sample-10BT", split="train")
c4_train_dataset = load_dataset(
            "allenai/c4",
            "default",
            data_files={"train": "en/c4-train.00000-of-01024.json.gz"},
            split="train",
            revision="607bd4c8450a42878aa9ddc051a65a055450ef87",  # pin revision
        )
c4_val_dataset = load_dataset(
            "allenai/c4",
            "default",
            data_files={"validation": "en/c4-validation.00000-of-00008.json.gz"},
            split="validation[:1100]",
            revision="607bd4c8450a42878aa9ddc051a65a055450ef87",  # pin revision
        )
w2_train_dataset = dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
w2_val_dataset = dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")

# 

Resolving data files:   0%|          | 0/2410 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/98 [00:00<?, ?it/s]

Repo card metadata block was not found. Setting CardData to empty.
Repo card metadata block was not found. Setting CardData to empty.


In [22]:
finewebedu_train_data = finewebedu_dataset.select(range(dataset.num_rows//2))
finewebedu_val_data = 

In [77]:
# WIKITEXT train and test tokens
def get_w2_data(num_samples, seq_len, tokenizer):
    w2_train_dataset = dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")
    w2_val_dataset = dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")

    train_tokens = tokenizer("\n\n".join(w2_train_dataset["text"]), return_tensors="pt", add_special_tokens=False).input_ids
    test_tokens = tokenizer("\n\n".join(w2_val_dataset["text"]), return_tensors="pt", add_special_tokens=False).input_ids
    
    num_tokens = train_tokens.size(1)
    idxes = np.random.choice(num_tokens-seq_len, num_samples, replace=False).tolist()
    test_idxes = range(test_tokens.size(1)//seq_len)
    
    train_data = map(lambda idx: train_tokens[:, idx:idx+seq_len], idxes)
    test_data = map(lambda idx: test_tokens[:, idx*seq_len:(idx+1)*seq_len], test_idxes)

    return train_data, test_data

In [37]:
# C4 train and test tokens
def get_c4_data(num_samples, seq_len, tokenizer):
    c4_train_dataset = load_dataset("allenai/c4", "default", data_files={"train": "en/c4-train.00000-of-01024.json.gz"},split="train", revision="607bd4c8450a42878aa9ddc051a65a055450ef87")
    c4_val_dataset = load_dataset("allenai/c4", "default", data_files={"validation": "en/c4-validation.00000-of-00008.json.gz"}, split="validation[:1100]", revision="607bd4c8450a42878aa9ddc051a65a055450ef87")

    train_tokens = tokenizer("\n\n".join(c4_train_dataset["text"][:seq_len+10]), return_tensors="pt", add_special_tokens=False).input_ids
    test_tokens = tokenizer("\n\n".join(c4_val_dataset["text"]), return_tensors="pt", add_special_tokens=False).input_ids
    
    num_tokens = train_tokens.size(1)
    idxes = np.random.choice(num_tokens-seq_len, num_samples, replace=False).tolist()
    test_idxes = range(test_tokens.size(1)//seq_len)
    
    train_data = list(map(lambda idx: train_tokens[:, idx:idx+seq_len], idxes))
    test_data = list(map(lambda idx: test_tokens[:, idx*seq_len:(idx+1)*seq_len], test_idxes))

    return train_data, test_data

In [None]:
# Finewebedu
def get_fw_data(num_samples, seq_len, tokenizer):
    fw_dataset = load_dataset("HuggingFaceFW/fineweb-edu", "sample-10BT", split="train")

    ds_size = fw_dataset.num_samples
    train_tokens = tokenizer("\n\n".join(fw_train_dataset["text"]), return_tensors="pt", add_special_tokens=False).input_ids
    test_tokens = tokenizer("\n\n".join(fw_val_dataset["text"]), return_tensors="pt", add_special_tokens=False).input_ids
    
    num_tokens = train_tokens.size(1)
    idxes = np.random.choice(num_tokens-seq_len, num_samples, replace=False).tolist()
    test_idxes = range(test_tokens.size(1)//seq_len)
    
    train_data = map(lambda idx: train_tokens[:, idx:idx+seq_len], idxes)
    test_data = map(lambda idx: test_tokens[:, idx*seq_len:(idx+1)*seq_len], test_idxes)

    return train_data, test_data

In [55]:
# Fineweb:
# Source: https://github.com/IST-DASLab/EvoPress/blob/main/src/data_utils.py
def get_fineweb_edu(num_tokens, sequence_length, tokenizer, train = True):
    print_on_main("Loading FineWeb-Edu v2")
    dataset = load_dataset("HuggingFaceFW/fineweb-edu", "sample-10BT", split="train")
    tokens_to_load = num_tokens
    if train:
        dataset = dataset.select(range(dataset.num_rows//2))
    else:
        dataset = dataset.select(range(dataset.num_rows//2, dataset.num_rows))
    dataset = dataset.shuffle(seed=0)
    data_iter = iter(dataset)
    data = []
    while tokens_to_load > 0:
        sample = next(data_iter)
        tokenized_sample = tokenizer(sample["text"], return_tensors="pt", add_special_tokens=False).input_ids
        tokenized_sample = tokenized_sample[:, :min(tokenized_sample.shape[1], tokens_to_load)]
        # Split the sequence into multiple samples if it is too long
        # Just throwing away extra tokens would introduce bias to the dataset
        while tokenized_sample.shape[1] > sequence_length:
            data.append(tokenized_sample[:, :sequence_length])
            tokenized_sample = tokenized_sample[:, sequence_length:]
            tokens_to_load -= sequence_length
        data.append(tokenized_sample)
        tokens_to_load -= tokenized_sample.shape[1]
    print_on_main(f"Total tokens loaded: {sum([sample.shape[1] for sample in data])}")
    return data


In [78]:
w2_train_data, w2_test_data = get_w2_data(NUM_SAMPLES, SEQUENCE_LEN, tokenizer)

In [38]:
c4_train_data, c4_test_data = get_c4_data(NUM_SAMPLES, SEQUENCE_LEN, tokenizer)

Repo card metadata block was not found. Setting CardData to empty.
Repo card metadata block was not found. Setting CardData to empty.


In [43]:
fw_dataset = load_dataset("HuggingFaceFW/fineweb-edu", "sample-10BT", split="train")

Resolving data files:   0%|          | 0/2410 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/98 [00:00<?, ?it/s]

In [54]:
fw_dataset.num_rows

9672101

In [53]:
fw_dataset[:4]["text"][9]

IndexError: list index out of range

In [15]:
tokenizer("\n\n".join(c4_train_dataset["text"][:50000]), return_tensors="pt", add_special_tokens=False).input_ids

tensor([[48290,  7130, 22658,  ...,  5286,  3353,     4]])

In [13]:
len(c4_train_dataset[:100]["text"])

100