In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from transformers import AutoTokenizer, AutoConfig

device = "cuda"

## Load Dataset

In [9]:
from datasets import load_dataset

dataset = load_dataset("ashaba1in/small_openwebtext")

In [155]:
_t = dataset["train"]["text"][:10]
_t

['Port-au-Prince, Haiti (CNN) -- Earthquake victims, writhing in pain and grasping at life, watched doctors and nurses walk away from a field hospital Friday night after a Belgian medical team evacuated the area, saying it was concerned about security.\n\nThe decision left CNN Chief Medical Correspondent Sanjay Gupta as the only doctor at the hospital to get the patients through the night.\n\nCNN initially reported, based on conversations with some of the doctors, that the United Nations ordered the Belgian First Aid and Support Team to evacuate. However, Belgian Chief Coordinator Geert Gijs, a doctor who was at the hospital with 60 Belgian medical personnel, said it was his decision to pull the team out for the night. Gijs said he requested U.N. security personnel to staff the hospital overnight, but was told that peacekeepers would only be able to evacuate the team.\n\nHe said it was a "tough decision" but that he accepted the U.N. offer to evacuate after a Canadian medical team, als

## Load Tokenizer

In [44]:
# mistral config

MistralConfig {
  "_name_or_path": "mistralai/Mistral-7B-v0.1",
  "architectures": [
    "MistralForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 32768,
  "model_type": "mistral",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-05,
  "rope_theta": 10000.0,
  "sliding_window": 4096,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.45.2",
  "use_cache": true,
  "vocab_size": 32000
}

In [36]:
#!huggingface-cli login --token AHEM --add-to-git-credential
tokenizer = AutoTokenizer.from_pretrained('mistralai/Mistral-7B-v0.1')
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'
tokenizer.truncation_side = 'right'
tokenizer.add_eos_token = True
tokenizer.save_pretrained('hugging_face/tokenizer')

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

In [2]:
tokenizer = AutoTokenizer.from_pretrained('./hugging_face/tokenizer')
tokenizer

LlamaTokenizerFast(name_or_path='./hugging_face/tokenizer', vocab_size=32000, model_max_length=10000000000000000, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '</s>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [49]:
tokenized_example = tokenizer(dataset["train"]["text"][0])
print(f"Length: {len(tokenized_example[0])}")

Length: 1350


In [50]:
tokenized_example[0].tokens[:5], tokenized_example[0].tokens[-5:]

(['<s>', '▁Port', '-', 'au', '-'], ['▁to', '▁this', '▁report', '.', '</s>'])

In [57]:
tokenizer("I am Sara", padding='max_length', truncation=True, max_length=10, return_tensors='pt')

{'input_ids': tensor([[    1,   315,   837, 23365,     2,     2,     2,     2,     2,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 0, 0, 0, 0, 0]])}

In [58]:
tokenizer("I am Sara and what I love most is sitting and doing nothing", padding='max_length', truncation=True, max_length=10, return_tensors='pt')

{'input_ids': tensor([[    1,   315,   837, 23365,   304,   767,   315,  2016,  1080,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

### Dataset

В задании говорилось: Лучше всего вытягивать все тексты батча в строку и нарезать на куски максимальной длины, чтобы не было паддингов. Тексты будут начинаться не сначала, но это повысит эффективность. Поэтому я решила сделать так: мы все токенизируем не обрезая по максимальной длине, добавляем bos и eos. Заранее дабы не париться осуществим бесчеловечное - будем последовательно склеивать индексы и самый последний западдим eas-ами если нужно

In [59]:
!ls

assets	     hugging_face  model.py   train.ipynb
checkpoints  main.py	   README.md  train.py


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [61]:
tokenizer("I am Sara", return_tensors='pt', add_special_tokens=True, return_attention_mask=False)['input_ids']

tensor([[    1,   315,   837, 23365,     2]])

In [3]:
from typing import Union, List
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import AutoTokenizer  # noqa

class SleepyDataset(Dataset):
    def __init__(
        self,
        data: Union[str, List[str]] = "./assets/tokenized_dataset.pt",
        tokenizer: Union[str, AutoTokenizer, None] = "./hugging_face/tokenizer",
        max_length=1024,
    ):
        if tokenizer is not None and isinstance(tokenizer, str):
            tokenizer = AutoTokenizer.from_pretrained(tokenizer)
            tokenizer.padding_side = 'right'
            tokenizer.truncation_side = 'right'
            tokenizer.add_eos_token = True
            tokenizer.max_length = max_length
        self.tokenizer = tokenizer
        
        self.max_length = max_length
        self.total_number_tokens = 0

        if isinstance(data, str):
            self.data = torch.stack(torch.load(data))
        else:
            assert self.tokenizer is not None, "We are going to tokenize the dataset beforehand"
            # we should tokenize the data and save it as we explained above
            self.data = None
            self._tokenize_all(dataset=data)
            self._join_and_chuck_all()
            self._save_data()

    def _tokenize_all(self, dataset, checkpoint_every=100000):
        self.data = []
        for i in tqdm(range(len(dataset)), desc='Tokenizing texts'):
            text = dataset[i]
            self.data.append(
                self.tokenizer(text, return_tensors='pt', add_special_tokens=True, return_attention_mask=False)['input_ids'][0]
            )
            if (i + 1) % checkpoint_every == 0:
                self._save_data()

    def _join_and_chuck_all(self):
        chunk_size = self.max_length
        self.data = torch.cat(self.data, dim=0)
        if len(self.data) % chunk_size != 0:
            pad_size = chunk_size - len(self.data) % chunk_size
            padding = torch.tensor([self.tokenizer.pad_token_id] * pad_size)
            self.data = torch.cat((self.data, padding), 0)
        self.data = torch.chunk(self.data, (len(self.data) + chunk_size - 1)//chunk_size)

        """
        # OH YEAH, OF COURSE, THERE IS A CHUNK FUNCTION, F ME BUT I WILL KEEP THIS CODE
        last_chunk = chunks[-1]
        padding = torch.tensor([self.tokenizer.pad_token_id] * (chunk_size - len(last_chunk)))
        chunks[-1] = torch.cat((last_chunk, padding), dim=0)
        return chunks
        
        chunks = []
        
        id_sequence = torch.tensor([])

        for i in tqdm(range(len(self.data)), desc='Splitting texts to chunks'):
            id_sequence = torch.cat((id_sequence, self.data[i]), dim=0) # tensors already
            while len(id_sequence) >= chunk_size:
                chunks.append(id_sequence[:chunk_size])
                id_sequence = id_sequence[chunk_size:]
        # Pad the remaining sequence with self.tokenizer.pad_token_id and add it to the chunks
        if len(id_sequence) > 0:
            padding = torch.tensor([self.tokenizer.pad_token_id] * (chunk_size - len(id_sequence)))
            chunks.append(torch.cat((id_sequence, padding), dim=0))
        return chunks
        """

    def _limit_dataset(self, number_examples=10**6):
        self.data = self.data[:number_examples]

    def _save_data(self):
        torch.save(self.data, "./assets/tokenized_dataset.pt")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx][:-1],  self.data[idx][1:]

In [None]:
#sleepy_dataset = SleepyDataset(
#    data = dataset["train"]["text"],
#    tokenizer = tokenizer,
#    max_length = 1024,
#)

Tokenizing texts:  36%|███████████████████▎                                  | 357622/1000000 [39:39<1:18:41, 136.04it/s]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

Tokenizing texts:  70%|█████████████████████████████████████▌                | 695624/1000000 [1:21:02<34:14, 148.15it/s]

In [16]:
#sleepy_dataset = SleepyDataset(
#    data = "./assets/tokenized_dataset.pt",
#    tokenizer = tokenizer,
#    max_length = 1024,
#)
#print("Number examples we processed:", len(sleepy_dataset))

  self.data = torch.load(data)


Number examples we processed: 800000


In [17]:
#%%time
#sleepy_dataset._join_and_chuck_all()
#torch.save(sleepy_dataset.data, "./assets/dataset_chunks.pt")

CPU times: user 1min 41s, sys: 36.6 s, total: 2min 18s
Wall time: 1min 23s


In [21]:
assert sleepy_dataset.data[0].shape[0] == 1024 and sleepy_dataset.data[-1].shape[0] == 1024

In [4]:
sleepy_dataset = SleepyDataset(
    data = "./assets/dataset_chunks.pt", # 7.45 Gb (Holy shit)
    tokenizer = None,
    max_length = 1024,
)
len(sleepy_dataset)

  self.data = torch.stack(torch.load(data))


968817

## Model classes

### RMSNorm

In [5]:
EPS = 1e-8

class RMSNorm(nn.Module):
    def __init__(self, dim, eps=EPS, bias=False):
        super(RMSNorm, self).__init__()
        self.eps = eps
        scale = nn.Parameter(torch.ones(dim).to(device))
        self.register_parameter("scale", scale)

        if bias:
            self.bias = nn.Parameter(torch.zeros(dim).to(device))
            self.register_parameter("bias", self.bias)
        else:
            self.bias = None

    def forward(self, x):
        sqrt_rms_x = torch.rsqrt(x.float().pow(2).mean(-1, keepdim=True) + self.eps)
        res = (x.float() * sqrt_rms_x) * self.scale

        if self.bias is not None:
            return res + self.bias
        return res

### SwiGLU

In [6]:
class SwiGLU(nn.Module):
    def __init__(self, size):
        super().__init__()
        self.linear_gate = nn.Linear(size, size).to(device)
        self.linear = nn.Linear(size, size).to(device)
        self.beta = nn.Parameter(torch.ones(1).to(device))

    def forward(self, x):
        gate = torch.sigmoid(self.beta * self.linear_gate(x))
        out = self.linear_gate(x) * gate * self.linear(x)
        return out

### RoPE

In [7]:
class RoPEAttentionHead(nn.Module):
    def __init__(self, dim, context_window):
        super().__init__()
        self.device = device
        self.query_t = nn.Linear(dim, dim, bias=False).to(device)
        self.key_t = nn.Linear(dim, dim, bias=False).to(device)
        self.value_t = nn.Linear(dim, dim, bias=False).to(device)
        
        self.theta = 10000. ** (-2.*(torch.arange(dim // 2) - 1) / dim).to(device)
        self.cos = torch.cos(self.theta).repeat_interleave(2).to(device)
        self.sin = torch.sin(self.theta).repeat_interleave(2).to(device)
        # inverse frequency for RoPE
        inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, device=self.device).float() / dim)).to(device)
        self.register_buffer('inv_freq', inv_freq)

    def forward(self, x, return_attn_weights=False):
        # x: batch x sequence length x dim
        Q = self.query_t(x)
        K = self.key_t(x)
        V = self.value_t(x)

        batch_size, seq_len, dim = x.size()
        
        # rotating indices & sinusoid
        pos_seq = torch.arange(seq_len, dtype=self.inv_freq.dtype, device=self.device)
        sinusoid_inp = torch.einsum('i,j->ij', pos_seq, self.inv_freq)
        sin = torch.sin(sinusoid_inp).unsqueeze(0)
        cos = torch.cos(sinusoid_inp).unsqueeze(0)

        Q_rotated = self.apply_rotary_pos_emb(Q, sin, cos)
        K_rotated = self.apply_rotary_pos_emb(K, sin, cos)

        attention_output = F.scaled_dot_product_attention(
            Q_rotated, K_rotated, V, dropout_p=0.1, is_causal=True
        )

        if return_attn_weights:
            dim = x.shape[2]
            M = torch.tril(torch.ones((seq_len, seq_len)), diagonal=0)
            attn_weights = torch.bmm(queries_rotated, keys_rotated.transpose(1, 2)) / np.sqrt(dim) + M
            return attention_output, F.softmax(attn_weights, dim=-1)

        return attention_output

    def apply_rotary_pos_emb(self, t, sin, cos):
        # t: batch x seq_len x dim
        batch_size, seq_len, dim = t.size()

        t = t.view(batch_size, seq_len, -1, 2)  # batch_size, seq_len, dim / 2, 2
        t_even = t[..., 0]  # batch_size, seq_len, dim / 2
        t_odd = t[..., 1]

        t_rotated_even = t_even * cos - t_odd * sin  # batch_size, seq_len, dim / 2
        t_rotated_odd = t_even * sin + t_odd * cos
    
        t_rotated = torch.stack((t_rotated_even, t_rotated_odd), dim=-1)
        t_rotated = t_rotated.view(batch_size, seq_len, dim)
        return t_rotated

class MultiheadAttention(nn.Module):
    def __init__(self, dim, context_window, num_heads=8):
        super().__init__()
        self.heads = nn.ModuleList([RoPEAttentionHead(dim, context_window) for _ in range(num_heads)]).to(device)
        self.linear = nn.Linear(num_heads * dim, dim).to(device)
        self.dropout = nn.Dropout(0.1).to(device)

    def forward(self, x):
        x = torch.cat(
            [h(x) for h in self.heads],
            dim=-1
        )
        x = self.linear(x)
        x = self.dropout(x)
        return x

### Pre-normalization

In [8]:
class LLaMaBlock(nn.Module):
    def __init__(self, context_window, dim, num_heads=8, rms_eps=EPS):
        super().__init__()
        self.rms_norm = RMSNorm(dim=dim, eps=rms_eps, bias = False).to(device)
        self.attention = MultiheadAttention(num_heads=num_heads, dim=dim, context_window=context_window).to(device)
        self.feedforward = nn.Sequential(
            nn.Linear(dim, dim).to(device),
            SwiGLU(dim).to(device),
        ).to(device)

    def forward(self, x):
        x = self.rms_norm(x)
        x = x + self.attention(x)
        
        x = self.rms_norm(x)
        x = x + self.feedforward(x)
        return x

### LLaMa

In [9]:
class LLaMa(nn.Module):
    def __init__(self, tokenizer, dim, num_layers, context_window, num_heads=8, rms_eps=EPS):
        super().__init__()
        self.tokenizer = tokenizer
        self.context_window = context_window
        self.embeddings = nn.Embedding(tokenizer.vocab_size, dim).to(device)
        self.llama_blocks = nn.ModuleDict({
            f"ll_block_{i}": LLaMaBlock(
                context_window=context_window,
                dim=dim,
                num_heads=num_heads,
                rms_eps=rms_eps,
            ).to(device) for i in range(num_layers)
        })
        self.llama_seq = nn.Sequential(*self.llama_blocks.values()).to(device)
        self.tail = nn.Sequential(
            nn.Linear(dim, dim).to(device),
            SwiGLU(dim).to(device),
            nn.Linear(dim, tokenizer.vocab_size).to(device),
        ).to(device)

    def forward(self, x):
        x = self.embeddings(x)
        x = self.llama_seq(x)
        logits = self.tail(x)

        return logits

    def generate(self, prefix='', max_new_tokens=30, context_window=None):
        if prefix != '':
            prev_state = self.tokenizer.add_eos_token
            self.tokenizer.add_eos_token = False
            tokens = self.tokenizer.encode(prefix)
            self.tokenizer.add_eos_token = prev_state
        else:
            tokens = []
        idx = torch.tensor(tokens, dtype=torch.long, device=device).unsqueeze(0)

        context_window = context_window or self.context_window

        for _ in range(max_new_tokens):
            idx_cond = idx[:, -context_window:]
            logits = self.forward(idx_cond)
            last_logits = logits[:, -1, :] # 1, vocab_size

            probs = F.softmax(last_logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat([idx, idx_next], dim=1)

        generated_text = self.tokenizer.decode(idx[0].tolist())
        return generated_text

### Train functions

In [10]:
import wandb
wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mulyana-klyuchnikova[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [11]:
def gpu_stats():
    t = torch.cuda.get_device_properties(0).total_memory
    r = torch.cuda.memory_reserved(0)
    a = torch.cuda.memory_allocated(0)
    f = r-a  # free inside reserved
    return f"Memory: reserved {r} / total {t}, available: {f}"

In [12]:
import torch
from transformers import AutoTokenizer
from accelerate import Accelerator
#from xformers import Reformer
from datetime import datetime
import wandb
import logging

def train(model, dataloader, tokenizer, max_len=1024, batch_size=16, epochs=3, lr=1e-4, device="cuda", run_name=None):
    accelerator = Accelerator(gradient_accumulation_steps=4, mixed_precision="fp16")

    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)

    dataloader, model, optimizer, scheduler = accelerator.prepare(dataloader, model, optimizer, scheduler)

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)
    
    run = wandb.init(project="SleepingLLaMa", name = run_name, resume=True)
    if not wandb.run.resumed:
        run.config.update({
            "batch_size": batch_size,
            "epochs": epochs,
            "max_len": max_len,
            "lr": optimizer.param_groups[0]["lr"]
        })

    start_time = datetime.now()
    save_every_minutes = 20

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        total_tokens = 0
        for batch_idx, (inputs, targets) in tqdm(enumerate(dataloader), desc=f"Training epoch {epoch}/{epochs}"):
            with accelerator.accumulate(model):
                # no tokenizer needed
                inputs = inputs.to(device)
                targets = targets.to(device)
                optimizer.zero_grad()
                logits = model(inputs)
                loss = F.cross_entropy(logits.view(-1, logits.shape[-1]), targets.view(-1))
    
                accelerator.backward(loss)
                if accelerator.sync_gradients:
                    accelerator.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
    
                scheduler.step()
    
                total_loss += loss.item()
                total_tokens += inputs.shape[0] * inputs.shape[1]
                run.log({
                    "loss": loss.item(),
                    "tokens": total_tokens,
                    "lr": optimizer.param_groups[0]["lr"]
                })
                if (batch_idx+1) % 100 == 0:
                    logger.info(f"Epoch {epoch+1}, Batch {batch_idx+1}, Loss: {loss.item():.4f}, Tokens: {total_tokens}")
                
            if (datetime.now() - start_time).total_seconds() > save_every_minutes * 60:
                logger.info(gpu_stats())
                print(f"Batch: {batch_idx}, Saving checkpoint...")
                torch.save(model.state_dict(), f"./checkpoints/llama_model_{batch_idx}_1.pth")
                torch.save(optimizer.state_dict(), f"./checkpoints/optimizer_{batch_idx}_1.pth")
                torch.save(scheduler.state_dict(), f"./checkpoints/scheduler_{batch_idx}_1.pth")
                text = "One day I realised I had some shit to do but it was late and I was tired."
                # Inference
                model.eval()
                with torch.no_grad():
                    print(f"Test generateion with prefix: {text}\nGenerated: {model.generate(text)}")
                model.train()
                start_time = datetime.now()

        avg_loss = total_loss / len(dataloader)
        logger.info(f"Epoch {epoch+1}, Average Loss: {avg_loss:.4f}")
        print({
            "avg_loss": avg_loss,
            "epoch": epoch+1
        })

    run.finish()
    torch.save(model.state_dict(), "./checkpoints/llama_model.pth")

In [12]:
def sample_top_p(probs, p):
    probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True)
    probs_sum = torch.cumsum(probs_sort, dim=-1)
    mask = probs_sum - probs_sort > p
    probs_sort[mask] = 0.0
    probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))
    next_token = torch.multinomial(probs_sort, num_samples=1)
    next_token = torch.gather(probs_idx, -1, next_token)
    return next_token

In [13]:
print(gpu_stats())
tokenizer = AutoTokenizer.from_pretrained('./hugging_face/tokenizer')
model = LLaMa(tokenizer, dim=512, num_layers=8, context_window=512, num_heads=8, rms_eps=1e-5)
print(gpu_stats())
print("model size:", sum([m.numel() for m in model.parameters()]))

Memory: reserved 0 / total 34079899648, available: 0
Memory: reserved 448790528 / total 34079899648, available: 20357632
model size: 107008777


In [None]:
BATCH_SIZE = 16
torch.manual_seed = 666
sleepy_dataloader = DataLoader(sleepy_dataset, batch_size=BATCH_SIZE, shuffle=True)
train(model, sleepy_dataloader, tokenizer, max_len=1024, batch_size=BATCH_SIZE, epochs=1, 
      lr=1e-4, device="cuda", run_name="assumingly best")

Detected kernel version 5.4.210, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Training epoch 0/1: 10it [00:07,  1.44it/s]

In [22]:
train(model, sleepy_dataloader, tokenizer, max_len=1024, batch_size=BATCH_SIZE, epochs=1, 
      lr=1e-4, device="cuda", run_name="assumingly best")

Training epoch 0/1: 99it [01:07,  1.47it/s]INFO:__main__:Epoch 1, Batch 100, Loss: 6.4163, Tokens: 1636800
Training epoch 0/1: 199it [02:15,  1.48it/s]INFO:__main__:Epoch 1, Batch 200, Loss: 6.4415, Tokens: 3273600
Training epoch 0/1: 299it [03:23,  1.47it/s]INFO:__main__:Epoch 1, Batch 300, Loss: 6.5139, Tokens: 4910400
Training epoch 0/1: 399it [04:31,  1.47it/s]INFO:__main__:Epoch 1, Batch 400, Loss: 6.3882, Tokens: 6547200
Training epoch 0/1: 499it [05:38,  1.48it/s]INFO:__main__:Epoch 1, Batch 500, Loss: 6.4128, Tokens: 8184000
Training epoch 0/1: 599it [06:46,  1.48it/s]INFO:__main__:Epoch 1, Batch 600, Loss: 6.1654, Tokens: 9820800
Training epoch 0/1: 699it [07:54,  1.48it/s]INFO:__main__:Epoch 1, Batch 700, Loss: 6.2770, Tokens: 11457600
Training epoch 0/1: 799it [09:02,  1.48it/s]INFO:__main__:Epoch 1, Batch 800, Loss: 6.1892, Tokens: 13094400
Training epoch 0/1: 899it [10:10,  1.48it/s]INFO:__main__:Epoch 1, Batch 900, Loss: 6.3633, Tokens: 14731200
Training epoch 0/1: 999it 

Batch: 1766, Saving checkpoint...





NameError: name 'i' is not defined

In [15]:
!ls checkpoints

llama_model_1700.pth  llama_model_2133.pth


In [17]:
BATCH_SIZE = 16
torch.manual_seed = 666
model.load_state_dict(torch.load("./checkpoints/llama_model_1700.pth"))
sleepy_dataloader = DataLoader(sleepy_dataset, batch_size=BATCH_SIZE, shuffle=True)
train(model, sleepy_dataloader, tokenizer, max_len=1024, batch_size=BATCH_SIZE, epochs=1, 
      lr=1e-3, device="cuda", run_name="assumingly best")

  model.load_state_dict(torch.load("./checkpoints/llama_model_1700.pth"))
Detected kernel version 5.4.210, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Training epoch 0/1: 99it [01:08,  1.48it/s]INFO:__main__:Epoch 1, Batch 100, Loss: 5.9925, Tokens: 1636800
Training epoch 0/1: 199it [02:16,  1.48it/s]INFO:__main__:Epoch 1, Batch 200, Loss: 6.1387, Tokens: 3273600
Training epoch 0/1: 299it [03:24,  1.47it/s]INFO:__main__:Epoch 1, Batch 300, Loss: 6.2097, Tokens: 4910400
Training epoch 0/1: 399it [04:32,  1.47it/s]INFO:__main__:Epoch 1, Batch 400, Loss: 6.1007, Tokens: 6547200
Training epoch 0/1: 499it [05:40,  1.48it/s]INFO:__main__:Epoch 1, Batch 500, Loss: 6.0968, Tokens: 8184000
Training epoch 0/1: 599it [06:48,  1.48it/s]INFO:__main__:Epoch 1, Batch 600, Loss: 6.0150, Tokens: 9820800
Training epoch 0/1: 699it [07:56,  1.48it/s]INFO:__main__:Epoch 1, Batch 700, Loss: 6.0869, Tokens: 11457600
Training epoch 0/1: 799it [09:03,  1.47it/s]INFO:__main__:Epoch 1, Batch 800, Loss: 6.1312, Tokens: 13094400
Training epoch 0/1: 899it [10:11,  1.48it/s]INFO:__main__:Epoch 1, Batch 900, Loss: 5.9597, Tokens: 14731200
Training epoch 0/1: 999it 

Batch: 1765, Saving checkpoint...


Training epoch 0/1: 1766it [20:03,  1.80s/it]

Test generateion with prefix: One day I realised I had some shit to do but it was late and I was tired.
Generated: <s> One day I realised I had some shit to do but it was late and I was tired. Men was a transform giving verse and seemingly works enter as blaval Edinburgh. This is the second, who went to Salt the launch was ready to why


Training epoch 0/1: 1799it [20:26,  1.48it/s]INFO:__main__:Epoch 1, Batch 1800, Loss: 6.3386, Tokens: 29462400
Training epoch 0/1: 1856it [21:05,  1.47it/s]


KeyboardInterrupt: 

In [None]:
BATCH_SIZE = 16
torch.manual_seed = 666

sleepy_dataloader = DataLoader(sleepy_dataset, batch_size=BATCH_SIZE, shuffle=True)
train(model, sleepy_dataloader, tokenizer, max_len=1024, batch_size=BATCH_SIZE, epochs=1, 
      lr=1e-3, device="cuda", run_name="assumingly best")

In [23]:
batch_idx=1700
torch.save(model.state_dict(), f"./checkpoints/llama_model_{batch_idx}.pth")
text = "One day I realised I had some shit to do but it was late and I was tired."
# Inference
model.eval()
with torch.no_grad():
    print(f"Test generateion with prefix: {text}\nGenerated: {model.generate(text)}")

Test generateion with prefix: One day I realised I had some shit to do but it was late and I was tired.
Generated: <s> One day I realised I had some shit to do but it was late and I was tired. The arm of it arg Show was a song constitution scientists to start and we think there was polled a approved of security post, " Fe between realistic


In [13]:
print(gpu_stats())
tokenizer = AutoTokenizer.from_pretrained('./hugging_face/tokenizer')
model = LLaMa(tokenizer, dim=512, num_layers=8, context_window=256, num_heads=8, rms_eps=1e-5)
print(gpu_stats())
print("model size:", sum([m.numel() for m in model.parameters()]))

Memory: reserved 0 / total 34079899648, available: 0
Memory: reserved 448790528 / total 34079899648, available: 20357632
model size: 107008777


In [14]:
BATCH_SIZE = 16
sleepy_dataloader = DataLoader(sleepy_dataset, batch_size=BATCH_SIZE, shuffle=True)
train(model, sleepy_dataloader, tokenizer, max_len=1024, batch_size=BATCH_SIZE, epochs=1, 
      lr=1e-4, device="cuda", run_name="dim=512, context=256, batch=16, lr=1e-3")

Detected kernel version 5.4.210, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Training epoch 0/1: 99it [01:08,  1.48it/s]INFO:__main__:Epoch 1, Batch 100, Loss: 7.5559, Tokens: 1636800
Training epoch 0/1: 199it [02:16,  1.48it/s]INFO:__main__:Epoch 1, Batch 200, Loss: 7.5699, Tokens: 3273600
Training epoch 0/1: 299it [03:23,  1.48it/s]INFO:__main__:Epoch 1, Batch 300, Loss: 7.5502, Tokens: 4910400
Training epoch 0/1: 399it [04:31,  1.48it/s]INFO:__main__:Epoch 1, Batch 400, Loss: 7.4694, Tokens: 6547200
Training epoch 0/1: 499it [05:39,  1.48it/s]INFO:__main__:Epoch 1, Batch 500, Loss: 7.4288, Tokens: 8184000
Training epoch 0/1: 599it [06:47,  1.48it/s]INFO:__main__:Epoch 1, Batch 600, Loss: 7.4673, Tokens: 9820800
Training epoch 0/1: 699it [07:55,  1.48it/s]INFO:__main__:Epoch 1, Batch 700, Loss: 7.4797, Tokens: 11457600
Training epoch 0/1: 799it [09:03,  1.48it/s]INFO:__main__:Epoch 1, Batch 800, Loss: 7.6251, Tokens: 13094400
Training epoch 0/1: 899it [10:10,  1.48it/s]INFO:__main__:Epoch 1, Batch 900, Loss: 7.5069, Tokens: 14731200
Training epoch 0/1: 999it 

KeyboardInterrupt: 

In [None]:
train(model, sleepy_dataloader, tokenizer, max_len=1024, batch_size=BATCH_SIZE, epochs=1, 
      lr=1e-4, device="cuda", run_name="dim=512, context=256, batch=16, lr=1e-3")

In [15]:
# dim=512, context=256, batch=16, lr=1e-3
print(gpu_stats())
text = "One day I realised I had some shit to do but it was late and I was tired."
# Inference
model.eval()
with torch.no_grad():
    print(f"Test generateion with prefix: {text}\nGenerated: {model.generate(text)}")

Memory: reserved 26902265856 / total 34079899648, available: 23077265920
Test generateion with prefix: One day I realised I had some shit to do but it was late and I was tired.
Generated: <s> One day I realised I had some shit to do but it was late and I was tired. split colon. ( ITure angles according)s led had possible her violence should I wood player foren is, decided any [ dis gamesard huge


In [20]:
print(gpu_stats())
torch.save(model.state_dict(), f"./checkpoints/llama_model_2133.pth")
text = "One day I realised I had some shit to do but it was late and I was tired."
# Inference
model.eval()
with torch.no_grad():
    print(f"Test generateion with prefix: {text}\nGenerated: {model.generate(text)}")

Memory: reserved 26902265856 / total 34079899648, available: 18977036288
Test generateion with prefix: One day I realised I had some shit to do but it was late and I was tired.
Generated: <s> One day I realised I had some shit to do but it was late and I was tired. This came while the matter to be that would often comes wouldn working the Karen, according to our commitment that spokeities before the cras to believe where
