In [29]:
import gzip
from tqdm.notebook import tqdm
import torch
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from transformers import AutoTokenizer

# Tokenizer published with Mistral-7B-v0.1; it is what turns raw text into token ids below.
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
# Raw corpus: the 'train' split of ashaba1in/small_openwebtext.
dataset = load_dataset("ashaba1in/small_openwebtext", split="train")

def tokenize_function(examples):
    """Run the notebook-level ``tokenizer`` over ``examples['text']``.

    Special tokens are requested explicitly (``add_special_tokens=True``).
    Whatever the tokenizer returns is handed back unchanged.
    """
    texts = examples['text']
    encoded = tokenizer(texts, add_special_tokens=True)
    return encoded

# Tokenize the corpus in batches; the raw 'text' column is removed from the result.
tokenized_dataset = dataset.map(
    tokenize_function,
    desc="Tokenizing",
    remove_columns=["text"],
    batched=True,
)

def concatenate_and_chunk_to_file(dataset, block_size=1024, output_file="chunks.txt.gz"):
    """Concatenate all tokenized items and write them out as fixed-size chunks.

    The ``'input_ids'`` of every item are appended to one running token buffer.
    Whenever the buffer holds at least ``block_size`` ids, a chunk of exactly
    ``block_size`` ids is written as one line of space-separated integers to a
    gzip-compressed text file. Ids left over after the last item are written
    as a final, shorter line.

    Args:
        dataset: Iterable of mappings that each provide an ``'input_ids'``
            sequence of token ids.
        block_size: Number of token ids per output line; must be positive.
        output_file: Path of the gzip text file to write (opened in ``"wt"`` mode).

    Returns:
        ``output_file``, so the caller keeps a reference to where the chunks
        were saved (the chunks themselves are only stored on disk).

    Raises:
        ValueError: If ``block_size`` is not positive. Without this check the
            chunking loop below would never terminate.
    """
    if block_size <= 0:
        raise ValueError(f"block_size must be a positive integer, got {block_size}")

    buffer = []

    with gzip.open(output_file, "wt") as f:
        for item in tqdm(dataset):
            buffer.extend(item['input_ids'])

            # Walk over the buffer with an offset and trim it once per item,
            # instead of re-copying the whole remaining tail after every chunk.
            start = 0
            while len(buffer) - start >= block_size:
                chunk = buffer[start:start + block_size]
                f.write(" ".join(map(str, chunk)) + "\n")
                start += block_size
            if start:
                del buffer[:start]

        # Flush the trailing tokens that did not fill a complete block.
        if buffer:
            f.write(" ".join(map(str, buffer)) + "\n")

    print(f"Chunks saved to {output_file} in compressed format.")
    return output_file

# Stream the tokenized corpus into the gzip chunk file, using the function's
# default block size and output path.
chunked_data = concatenate_and_chunk_to_file(dataset=tokenized_dataset)


  0%|          | 0/1000000 [00:00<?, ?it/s]

Chunks saved to chunks.txt.gz in compressed format.


In [2]:
from dataset import TokenizedChunksDataset

# Build the dataset from the chunk file written by the preprocessing cell.
# lines_to_read presumably limits how many lines of the file are loaded — TODO confirm in dataset.py.
# NOTE(review): the progress output below reports 2001 iterations for
# lines_to_read=2000 — check whether the loader reads one line more than requested.
ds = TokenizedChunksDataset("chunks.txt.gz", lines_to_read=2000)

2001it [00:00, 8048.30it/s]


In [3]:
# Inspect one sample. In the output below, 'labels' holds the same token ids as
# 'input_ids' shifted left by one position.
ds[1]

{'input_ids': tensor([  369,  1988,  6914,  ..., 28725,   356, 11463]),
 'labels': tensor([ 1988,  6914,   611,  ...,   356, 11463, 28723])}

In [None]:
from llama import LLAMA
from model_params import LLAMAParams
from transformers import AutoTokenizer

# NOTE(review): this cell repeats the tokenizer / dataset / params setup that the
# training cell further down performs again; unlike that cell, the model here is
# not moved to params.device.
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")

from dataset import TokenizedChunksDataset

ds = TokenizedChunksDataset("chunks.txt.gz", lines_to_read=2000)

# Small model configuration; the vocabulary size is taken from the tokenizer.
params = LLAMAParams(
    vocab_size=tokenizer.vocab_size,
    max_seq_len=1024,
    dim=64,
    hidden_dim=128,
    num_layers=2,
    num_heads=2,
)


model = LLAMA(params)

In [11]:
from torch.utils.data import DataLoader

# Shuffled mini-batches of 8 samples drawn from ds.
dataloader = DataLoader(dataset=ds, shuffle=True, batch_size=8)


In [17]:
# Pull a single batch from the loader and show the shape of its 'input_ids'.
# Uses the iter()/next() built-ins rather than calling the dunder methods directly.
next(iter(dataloader))["input_ids"].shape

torch.Size([8, 1023])

In [23]:
# Shape of a single sample's 'input_ids'. The output below shows 1023 ids, one fewer
# than the 1024-token block size used when writing the chunks — presumably because
# inputs and labels are offset by one token; confirm in dataset.py.
ds[5]["input_ids"].shape

torch.Size([1023])

In [1]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    # Optimisation schedule.
    num_train_epochs=3,
    learning_rate=1e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # No evaluation is run during training.
    # NOTE(review): recent transformers releases rename this argument to
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="no",
    # Checkpointing: save every 500 steps, keep at most 2 checkpoints.
    output_dir="./llama_model",
    overwrite_output_dir=True,
    save_steps=500,
    save_total_limit=2,
    # Logging: report metrics every 100 steps.
    logging_dir="./logs",
    logging_steps=100,
)



In [2]:
from transformers import Trainer

from llama import LLAMA
from model_params import LLAMAParams
from transformers import AutoTokenizer

# Tokenizer is only needed here for its vocabulary size.
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")

from dataset import TokenizedChunksDataset

# Training data: the chunk file written by the preprocessing cell.
ds = TokenizedChunksDataset("chunks.txt.gz", lines_to_read=2000)

# Small model configuration; the vocabulary size is taken from the tokenizer.
params = LLAMAParams(
    vocab_size=tokenizer.vocab_size,
    max_seq_len=1024,
    dim=64,
    hidden_dim=128,
    num_layers=2,
    num_heads=2,
)


# Instantiate the model and move it to the device named by params.
model = LLAMA(params).to(params.device)

# Wire the model, the training arguments and the training dataset into a Trainer.
trainer = Trainer(model=model, train_dataset=ds, args=training_args)

# Run training; as the cell's last expression, the returned value is displayed.
trainer.train()

2001it [00:00, 8184.19it/s]


  0%|          | 0/753 [00:00<?, ?it/s]

{'loss': 10.4432, 'grad_norm': 0.26457127928733826, 'learning_rate': 8.671978751660027e-05, 'epoch': 0.4}
{'loss': 9.9726, 'grad_norm': 0.6160191893577576, 'learning_rate': 7.343957503320054e-05, 'epoch': 0.8}
{'loss': 9.1312, 'grad_norm': 0.6620867848396301, 'learning_rate': 6.01593625498008e-05, 'epoch': 1.2}
{'loss': 8.642, 'grad_norm': 0.6315849423408508, 'learning_rate': 4.687915006640107e-05, 'epoch': 1.59}
{'loss': 8.3403, 'grad_norm': 0.7130919098854065, 'learning_rate': 3.359893758300133e-05, 'epoch': 1.99}
{'loss': 8.1643, 'grad_norm': 0.6354120373725891, 'learning_rate': 2.0318725099601595e-05, 'epoch': 2.39}
{'loss': 8.0656, 'grad_norm': 0.619381308555603, 'learning_rate': 7.03851261620186e-06, 'epoch': 2.79}
{'train_runtime': 180.7869, 'train_samples_per_second': 33.205, 'train_steps_per_second': 4.165, 'train_loss': 8.90007697886838, 'epoch': 3.0}


TrainOutput(global_step=753, training_loss=8.90007697886838, metrics={'train_runtime': 180.7869, 'train_samples_per_second': 33.205, 'train_steps_per_second': 4.165, 'total_flos': 0.0, 'train_loss': 8.90007697886838, 'epoch': 3.0})

In [3]:
# Show the tokenizer's vocabulary size — the value passed to LLAMAParams as vocab_size.
tokenizer.vocab_size

32000