In [4]:
import gzip
from tqdm.notebook import tqdm
import torch
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from transformers import AutoTokenizer

# Load the 'train' split of the ashaba1in/small_openwebtext dataset via `datasets.load_dataset`.
dataset = load_dataset("ashaba1in/small_openwebtext", split='train')
# Tokenizer for 'mistralai/Mistral-7B-v0.1'; later cells also read its vocab_size to size the model.
tokenizer = AutoTokenizer.from_pretrained('mistralai/Mistral-7B-v0.1')

def tokenize_function(examples):
    """Tokenize the 'text' entry of a batch with the notebook-level tokenizer.

    Args:
        examples: Mapping with a 'text' entry (a batch of strings when this
            is used through ``dataset.map(..., batched=True)``).

    Returns:
        Whatever the tokenizer call returns for those texts, with special
        tokens enabled.
    """
    texts = examples['text']
    encoded = tokenizer(texts, add_special_tokens=True)
    return encoded

# Tokenize the whole dataset in batches with tokenize_function; the raw 'text'
# column is removed from the result.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=['text'],
    desc="Tokenizing"
)

def concatenate_and_chunk_to_file(dataset, block_size=1024, output_file="chunks_1024.txt.gz"):
    """Concatenate every example's token ids and write them out as fixed-size chunks.

    The token ids of all examples are joined end to end and cut into
    consecutive chunks of ``block_size`` ids. Each chunk is written to a
    gzip-compressed text file as one line of space-separated integers. Any
    ids left over at the end of the stream are written as a final, shorter
    line.

    Args:
        dataset: Iterable of mappings that each provide an ``'input_ids'``
            sequence of token ids.
        block_size: Number of token ids per line. Must be positive.
        output_file: Path of the gzip text file to write.

    Returns:
        None. The result is the file written to ``output_file``.

    Raises:
        ValueError: If ``block_size`` is not positive (the chunking loop
            could never terminate in that case).
    """
    if block_size <= 0:
        raise ValueError("block_size must be a positive integer")

    buffer = []

    with gzip.open(output_file, "wt") as f:
        # Every example is consumed; stopping after the first one would leave
        # the file with only that example's tokens.
        for item in tqdm(dataset):
            buffer.extend(item['input_ids'])

            # Walk an offset over the complete chunks and trim the buffer once
            # per example instead of re-slicing it after every chunk.
            start = 0
            while len(buffer) - start >= block_size:
                chunk = buffer[start:start + block_size]
                f.write(" ".join(map(str, chunk)) + "\n")
                start += block_size
            if start:
                del buffer[:start]

        # Flush the trailing partial chunk, if any, as a shorter last line.
        if buffer:
            f.write(" ".join(map(str, buffer)) + "\n")

    print(f"Chunks saved to {output_file} in compressed format.")

# Write the chunk file for the tokenized dataset with the default block size and
# output path. concatenate_and_chunk_to_file has no return statement, so
# `chunked_data` is bound to None; the chunks themselves are in the output file.
chunked_data = concatenate_and_chunk_to_file(tokenized_dataset)


  0%|          | 0/1000000 [00:00<?, ?it/s]

Chunks saved to chunks_1024.txt.gz in compressed format.


In [8]:
# Sanity check: read only the first line of the chunk file and report how many
# tokens it holds and what its last token is.
with gzip.open("chunks.txt.gz", 'rt') as f:
    line = next(f, None)
    if line is not None:
        tokens = line.strip().split()
        print(len(tokens))
        print(tokens[-1])

1024
1745


In [1]:
from dataset import TokenizedChunksDataset

# Build the dataset from the gzip chunk file using the project-local `dataset` module.
# NOTE(review): `lines_to_read=2000` presumably caps how many lines are loaded
# (the cell output reports "Total lines:  2000") — confirm in dataset.py.
ds = TokenizedChunksDataset("chunks.txt.gz", lines_to_read=2000)

Total lines:  2000


Reading file and loading it to the memory: 2001it [00:00, 8189.91it/s]                          


In [3]:
# Inspect one sample. The recorded output is a dict with 'input_ids' and 'labels',
# where the labels are the same tokens shifted one position ahead.
ds[1]

{'input_ids': tensor([  369,  1988,  6914,  ..., 28725,   356, 11463]),
 'labels': tensor([ 1988,  6914,   611,  ...,   356, 11463, 28723])}

In [22]:
from llama import LLAMA
from model_params import LLAMAParams
from transformers import AutoTokenizer

# In this cell the tokenizer is used only for its vocab_size.
tokenizer = AutoTokenizer.from_pretrained('mistralai/Mistral-7B-v0.1')

from dataset import TokenizedChunksDataset

# NOTE(review): the recorded output of this cell is a FileNotFoundError for
# 'chunks.txt.gz', while the first cell writes 'chunks_1024.txt.gz' by default —
# confirm which file is intended.
ds = TokenizedChunksDataset("chunks.txt.gz", lines_to_read=2000)

# Model configuration; the meaning of each field is defined by the project-local
# LLAMAParams class. The vocabulary size is taken from the tokenizer.
params = LLAMAParams(
    dim=64,
    vocab_size=tokenizer.vocab_size,
    hidden_dim=128,
    max_seq_len=1024,
    num_layers=2,
    num_heads=2
)


# NOTE(review): unlike the later training cell, the model is not moved to
# `params.device` here.
model = LLAMA(params)

Total lines:  2000


FileNotFoundError: [Errno 2] No such file or directory: 'chunks.txt.gz'

In [11]:
from torch.utils.data import DataLoader

# Wrap the dataset in a DataLoader that yields shuffled batches of 8 samples.
dataloader = DataLoader(ds, batch_size=8, shuffle=True)


In [17]:
# Pull a single batch from the loader and show the shape of its input ids.
next(iter(dataloader))["input_ids"].shape

torch.Size([8, 1023])

In [23]:
# Shape of a single sample's input ids. The recorded output is 1023, one less than
# the 1024 tokens per chunk line — presumably because of the input/label shift.
ds[5]["input_ids"].shape

torch.Size([1023])

In [1]:
from transformers import TrainingArguments

# Training configuration consumed by the Trainer in the following cell.
training_args = TrainingArguments(
    output_dir="./llama_model",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    save_steps=500,
    save_total_limit=2,
    # No evaluation is configured; the Trainer in the following cell is given no eval dataset.
    # NOTE(review): newer transformers releases name this option `eval_strategy` —
    # confirm against the installed version.
    evaluation_strategy="no",
    logging_dir='./logs',
    # The recorded training log has one entry per 100 steps.
    logging_steps=100,
    learning_rate=1e-4
)



In [2]:
from transformers import Trainer

from llama import LLAMA
from model_params import LLAMAParams
from transformers import AutoTokenizer

# In this cell the tokenizer is used only for its vocab_size.
tokenizer = AutoTokenizer.from_pretrained('mistralai/Mistral-7B-v0.1')

from dataset import TokenizedChunksDataset

# Training data: the gzip chunk file, loaded through the project-local dataset class.
ds = TokenizedChunksDataset("chunks.txt.gz", lines_to_read=2000)

# Model configuration; the meaning of each field is defined by the project-local
# LLAMAParams class. The vocabulary size is taken from the tokenizer.
params = LLAMAParams(
    dim=64,
    vocab_size=tokenizer.vocab_size,
    hidden_dim=128,
    max_seq_len=1024,
    num_layers=2,
    num_heads=2
)


# Build the model and move it to the device named by the params object.
model = LLAMA(params).to(params.device)

# Train with the transformers Trainer on the chunk dataset; `training_args` comes
# from the previous cell and no eval dataset is passed.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds,
)

# Start training
# The call is the cell's last expression, so its TrainOutput is displayed after the logs.
trainer.train()

2001it [00:00, 8184.19it/s]


  0%|          | 0/753 [00:00<?, ?it/s]

{'loss': 10.4432, 'grad_norm': 0.26457127928733826, 'learning_rate': 8.671978751660027e-05, 'epoch': 0.4}
{'loss': 9.9726, 'grad_norm': 0.6160191893577576, 'learning_rate': 7.343957503320054e-05, 'epoch': 0.8}
{'loss': 9.1312, 'grad_norm': 0.6620867848396301, 'learning_rate': 6.01593625498008e-05, 'epoch': 1.2}
{'loss': 8.642, 'grad_norm': 0.6315849423408508, 'learning_rate': 4.687915006640107e-05, 'epoch': 1.59}
{'loss': 8.3403, 'grad_norm': 0.7130919098854065, 'learning_rate': 3.359893758300133e-05, 'epoch': 1.99}
{'loss': 8.1643, 'grad_norm': 0.6354120373725891, 'learning_rate': 2.0318725099601595e-05, 'epoch': 2.39}
{'loss': 8.0656, 'grad_norm': 0.619381308555603, 'learning_rate': 7.03851261620186e-06, 'epoch': 2.79}
{'train_runtime': 180.7869, 'train_samples_per_second': 33.205, 'train_steps_per_second': 4.165, 'train_loss': 8.90007697886838, 'epoch': 3.0}


TrainOutput(global_step=753, training_loss=8.90007697886838, metrics={'train_runtime': 180.7869, 'train_samples_per_second': 33.205, 'train_steps_per_second': 4.165, 'total_flos': 0.0, 'train_loss': 8.90007697886838, 'epoch': 3.0})

In [3]:
# Show the tokenizer's vocabulary size, the value passed as `vocab_size` to LLAMAParams.
tokenizer.vocab_size

32000

In [None]:
import numpy as np
from tqdm.notebook import tqdm

def convert_txt_to_bin(txt_file, bin_file, seq_len=1024, total=None):
    """Pack a text file of token-id lines into a flat int32 binary file.

    Each input line is parsed as whitespace-separated integer token ids.
    Lines holding exactly ``seq_len`` ids are appended to ``bin_file`` as
    ``seq_len`` consecutive int32 values; every other line (for example a
    shorter trailing chunk or a blank line) is skipped.

    Args:
        txt_file: Path of the plain-text input file.
        bin_file: Path of the binary output file to write.
        seq_len: Required number of token ids per line.
        total: Optional number of input lines, used only to size the progress
            bar. With ``None`` the bar runs without a known total, so the
            function is not tied to one particular file's line count.

    Returns:
        None. The result is the file written to ``bin_file``.

    Raises:
        ValueError: If a line contains a token that cannot be parsed as an
            integer.
    """
    with open(txt_file, 'r') as txt, open(bin_file, 'wb') as bin_out:
        for line in tqdm(txt, total=total):
            tokens = list(map(int, line.split()))
            # Only full-length sequences are stored so that every record in
            # the binary file has the same size.
            if len(tokens) != seq_len:
                continue

            token_array = np.array(tokens, dtype=np.int32)
            bin_out.write(token_array.tobytes())

    print(f"Binary conversion complete. Data saved to {bin_file}")

In [10]:
# Convert the text chunk file into the flat binary file read by BinaryTokenDataset.
# NOTE(review): this reads an uncompressed 'chunks.txt', which no visible cell
# creates — presumably the .gz file was decompressed outside the notebook.
convert_txt_to_bin("chunks.txt", "chunks.bin")

  0%|          | 0/1209636 [00:00<?, ?it/s]

Binary conversion complete. Data saved to chunks.bin


In [16]:
from torch.utils.data import Dataset
import os
import torch

class BinaryTokenDataset(Dataset):
    """Random-access dataset over a flat binary file of fixed-length int32 token records.

    The file is treated as consecutive records of ``seq_len`` int32 values.
    Each item is a next-token pair: ``input_ids`` is the record without its
    last token and ``labels`` is the record without its first token.

    Args:
        bin_file: Path to the binary file of int32 token ids.
        seq_len: Number of tokens per record. Must be positive.

    Raises:
        ValueError: If ``seq_len`` is not positive.
    """

    def __init__(self, bin_file, seq_len=1024):
        if seq_len <= 0:
            raise ValueError("seq_len must be a positive integer")

        self.bin_file = bin_file
        self.seq_len = seq_len
        self.record_size = seq_len * 4  # Each int32 token takes 4 bytes

        # Only whole records count: trailing bytes that do not fill a record
        # are ignored by the floor division.
        self.total_records = os.path.getsize(bin_file) // self.record_size
        print(f"Total sequences available: {self.total_records}")

    def __len__(self):
        """Return the number of complete records in the file."""
        return self.total_records

    def __getitem__(self, idx):
        """Return record ``idx`` as a dict of shifted ``input_ids`` / ``labels`` tensors.

        Negative indices count from the end, as with built-in sequences.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
            IOError: If the file yields fewer bytes than one full record.
        """
        # Normalise negative indices first; a negative value would otherwise
        # pass the upper-bound check and fail inside seek().
        if idx < 0:
            idx += self.total_records
        if not 0 <= idx < self.total_records:
            raise IndexError("Index out of range")

        # The file is opened per access, so the dataset object itself holds no
        # open file handle.
        with open(self.bin_file, 'rb') as f:
            f.seek(idx * self.record_size)
            raw = f.read(self.record_size)

        if len(raw) != self.record_size:
            raise IOError(
                f"Short read for record {idx}: expected {self.record_size} bytes, got {len(raw)}"
            )

        # Interpret the bytes as int32 tokens, then copy into a long tensor.
        data = np.frombuffer(raw, dtype=np.int32)
        input_ids = torch.tensor(data, dtype=torch.long)
        return {'input_ids': input_ids[:-1], 'labels': input_ids[1:]}

In [17]:
# Open the binary chunk file with the default seq_len of 1024.
ds_bin = BinaryTokenDataset(bin_file="chunks.bin")

Total sequences available: 1209635


In [18]:
# Inspect the first record. In the recorded output its last label (1745) matches the
# last token printed earlier from the first line of chunks.txt.gz.
ds_bin[0]

{'input_ids': tensor([    1,  4194, 28733,  ...,   661,   403,   396]),
 'labels': tensor([ 4194, 28733,   581,  ...,   403,   396,  1745])}

In [21]:
# Spot-check another record from the binary dataset.
ds_bin[6]

{'input_ids': tensor([16049,   794, 28725,  ..., 28723,  1015,   395]),
 'labels': tensor([  794, 28725,  8160,  ...,  1015,   395,   272])}