In [1]:
import math
import random
!pip install tiktoken
import tiktoken
import inspect
from dataclasses import dataclass
import torch
import torch.nn as nn
from torch.nn import functional as F
import os
from google.colab import drive
drive.mount('/content/drive')
print(os.getcwd())
os.chdir('/content/drive/MyDrive/Colab Notebooks/gothic')
print(os.getcwd())

# ---------------------------------------------
class CausalSelfAttention(nn.Module):

    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        # key, query, value projections for all heads, but in a batch
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
        # output projection
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
        self.c_proj.NANOGPT_SCALE_INIT = 1
        # regularization
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        # not really a 'bias', more of a mask, but following the OpenAI/HF naming though
        self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size))
            .view(1, 1, config.block_size, config.block_size))

    def forward(self, x):
        B, T, C = x.size()  # batch size, sequence length, embedding dimensionality (n_embd)
        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
        # nh is "number of heads", hs is "head size", and C (number of channels) = nh * hs
        # e.g. in GPT-2 (124M), n_head=12, hs=64, so nh*hs=C=768 channels in the Transformer
        qkv = self.c_attn(x)
        q, k, v = qkv.split(self.n_embd, dim=2)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)
        # attention (materializes the large (T*T) matrix for all the queries and keys)
        # att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        # att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
        # att = F.softmax(att, dim=-1)
        # y = att @ v  # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
        y = F.scaled_dot_product_attention(q, k, v, is_causal=True)
        y = y.transpose(1, 2).contiguous().view(B, T, C)  # re-assemble all head outputs side by side
        # output projection
        y = self.c_proj(y)
        return y


class MLP(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd)
        self.gelu = nn.GELU(approximate='tanh')
        self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd)
        self.c_proj.NANOGPT_SCALE_INIT = 1

    def forward(self, x):
        x = self.c_fc(x)
        x = self.gelu(x)
        x = self.c_proj(x)
        return x


class Block(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.n_embd)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(config.n_embd)
        self.mlp = MLP(config)

    def forward(self, x):
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x

@dataclass
class GPTConfig:
    block_size: int = 1024  # max sequence length
    vocab_size: int = 50257  # number of tokens: 50,000 BPE merges + 256 bytes tokens + 1 <|endoftext|> token
    # 'gpt2':         dict(n_layer=12, n_head=12, n_embd=768),  # 124M params
    # 'gpt2-medium':  dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
    # 'gpt2-large':   dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
    # 'gpt2-xl':      dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
    n_layer: int = 24  # number of layers
    n_head: int = 16  # number of heads
    n_embd: int = 1024  # embedding dimension


class GPT(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.config = config

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            wpe = nn.Embedding(config.block_size, config.n_embd),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f = nn.LayerNorm(config.n_embd),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        # weight sharing scheme
        self.transformer.wte.weight = self.lm_head.weight

        # init params
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            std = 0.02
            if hasattr(module, 'NANOGPT_SCALE_INIT'):
                std *= (2 * self.config.n_layer) ** -0.5
            torch.nn.init.normal_(module.weight, mean=0.0, std=std)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)




    def forward(self, idx, targets=None):
        # idx is of shape (B, T)
        B, T = idx.size()
        assert T <= self.config.block_size, f"Cannot forward sequence of length {T}, block size is only {self.config.block_size}"
        # forward the token and position embeddings
        pos = torch.arange(0, T, dtype=torch.long, device=idx.device)  # shape (T)
        pos_emb = self.transformer.wpe(pos)  # position embeddings of shape (T, n_embd)
        tok_emb = self.transformer.wte(idx)  # token embeddings of shape (B, T, n_embd)
        x = tok_emb + pos_emb
        # forward the blocks of the transformer
        for block in self.transformer.h:
            x = block(x)
        # forward the final layernorm and the classifier
        x = self.transformer.ln_f(x)
        logits = self.lm_head(x)  # (B, T, vocab_size)
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss


    @classmethod
    def from_pretrained(cls, model_type):
        """Loads pretrained GPT-2 model weights from huggingface"""
        assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
        from transformers import GPT2LMHeadModel
        print("loading weights from pretrained gpt: %s" % model_type)

        # n_layer, n_head and n_embd are determined from model_type
        config_args = {
            'gpt2':         dict(n_layer=12, n_head=12, n_embd=768),  # 124M params
            'gpt2-medium':  dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
            'gpt2-large':   dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
            'gpt2-xl':      dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
        }[model_type]
        config_args['vocab_size'] = 50257 # always 50257 for GPT model checkpoints
        config_args['block_size'] = 1024 # always 1024 for GPT model checkpoints
        # create a from-scratch initialized minGPT model
        config = GPTConfig(**config_args)
        model = GPT(config)
        sd = model.state_dict()
        sd_keys = sd.keys()
        sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')] # discard this mask / buffer, not a param

        # init a huggingface/transformers model
        model_hf = GPT2LMHeadModel.from_pretrained(model_type)
        sd_hf = model_hf.state_dict()

        # copy while ensuring all of the parameters are aligned and match in names and shapes
        sd_keys_hf = sd_hf.keys()
        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')] # ignore these, just a buffer
        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')] # same, just the mask (buffer)
        transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
        # basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear
        # this means that we have to transpose these weights when we import them
        assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
        for k in sd_keys_hf:
            if any(k.endswith(w) for w in transposed):
                # special treatment for the Conv1D weights we need to transpose
                assert sd_hf[k].shape[::-1] == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k].t())
            else:
                # vanilla copy over the other parameters
                assert sd_hf[k].shape == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k])

        return model

    def configure_optimizers(self, weight_decay, learning_rate, device):
        # start with all of the candidate parameters (that require grad)
        param_dict = {pn: p for pn, p in self.named_parameters()}
        param_dict = {pn: p for pn, p in param_dict.items() if p.requires_grad}
        # create optim groups. Any parameters that is 2D will be weight decayed, otherwise no.
        # i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't.
        decay_params = [p for n, p in param_dict.items() if p.dim() >= 2]
        nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2]
        optim_groups = [
            {'params': decay_params, 'weight_decay': weight_decay},
            {'params': nodecay_params, 'weight_decay': 0.0}
        ]

        num_decay_params = sum(p.numel() for p in decay_params)
        num_nodecay_params = sum(p.numel() for p in nodecay_params)
        print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params} parameters")
        print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params} parameters")

        # Create AdamW optimizer and use the fused version if it is available
        fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
        use_fused = fused_available and 'cuda' in device
        print(f"using fused AdamW: {use_fused}")
        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=(0.9, 0.95), eps=1e-8)
        return optimizer

class DataLoaderLite:
    def __init__(self, B, T, val_split=0.2):
        self.B = B
        self.T = T

        # Load the tokens and split them into training and validation sets
        with open('./gothic_novels_combined.txt', 'r') as f:
            text = f.read()
        print(f"loaded {len(text)} characters")
        enc = tiktoken.get_encoding('gpt2')

        tokens = enc.encode(text, allowed_special={"<|endoftext|>"})
        len_tokens = len(tokens)

        # Ensure tokens are stored on CPU
        self.tokens = torch.tensor(tokens, device='cpu')
        print(f"loaded: {len(self.tokens)} tokens")

        # Split into training and validation buffers on CPU
        buffers = []
        current_position = 0
        while current_position + (B * T + 1) < len_tokens:
            buffers.append(self.tokens[current_position : current_position + (B * T + 1)])
            current_position += B * T

        random.shuffle(buffers)

        split_idx = int(len(buffers) * (1 - val_split))
        self.train = buffers[:split_idx]
        self.val = buffers[split_idx:]

        print(f"Training buffers: {len(self.train)}, Validation buffers: {len(self.val)}")

        self.current_train_buffer = 0
        self.current_val_buffer = 0

    def next_batch(self, train=True):
        B, T = self.B, self.T
        if train:
            if self.current_train_buffer == 0:
                print(f"Shuffling training buffers")
                random.shuffle(self.train)
            buf = self.train[self.current_train_buffer]
            self.current_train_buffer = (self.current_train_buffer + 1) % len(self.train)
        else:
            if self.current_val_buffer == 0:
                print(f"Shuffling validation buffers")
                random.shuffle(self.val)
            buf = self.val[self.current_val_buffer]
            self.current_val_buffer = (self.current_val_buffer + 1) % len(self.val)


        # Ensure data is returned on the CPU, only transfer to GPU in training loop
        x = (buf[:-1]).view(B, T).to('cpu')  # inputs on CPU
        y = (buf[1:]).view(B, T).to('cpu')   # targets on CPU
        return x, y

    def reset_buffers(self, train=True):
        if train:
            self.current_train_buffer = 0
        else:
            self.current_val_buffer = 0



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content
/content/drive/MyDrive/Colab Notebooks/gothic


In [None]:
# attempt to autodetect the device
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
    device = "mps"
print(f"using device: {device}")

torch.manual_seed(1337)
if torch.cuda.is_available():
    torch.cuda.manual_seed(1337)

model = GPT(GPTConfig(vocab_size=50304))

model.to(device)
model = torch.compile(model)
model.load_state_dict(torch.load('model_medium_fifth_500.pth'), strict=False)
model.train()

using device: cuda


OptimizedModule(
  (_orig_mod): GPT(
    (transformer): ModuleDict(
      (wte): Embedding(50304, 1024)
      (wpe): Embedding(1024, 1024)
      (h): ModuleList(
        (0-23): 24 x Block(
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (attn): CausalSelfAttention(
            (c_attn): Linear(in_features=1024, out_features=3072, bias=True)
            (c_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): MLP(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): GELU(approximate='tanh')
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
        )
      )
      (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    )
    (lm_head): Linear(in_features=1024, out_features=50304, bias=False)
  )
)

In [None]:
import time
import matplotlib.pyplot as plt

torch.manual_seed(1337)
if torch.cuda.is_available():
    torch.cuda.manual_seed(1337)

# optimize!
optimizer = model.configure_optimizers(weight_decay=0.1, learning_rate=6e-4, device=device)

total_batch_size = 524288  # 2**19, ~0.5M, in number of tokens
B = 16  # micro batch size
T = 1024  # sequence length
assert total_batch_size % (B * T) == 0, "make sure total_batch_size is divisible by B * T"
grad_accum_steps = total_batch_size // (B * T)

print(f"total desired batch size: {total_batch_size}")
print(f"=> calculated gradient accumulation steps: {grad_accum_steps}")

torch.set_float32_matmul_precision('high')

train_loader = DataLoaderLite(B, T, val_split=0.2)

print(f"{len(train_loader.train) / grad_accum_steps=}")
batches_in_epoch = len(train_loader.train) // grad_accum_steps
print(f"{batches_in_epoch=}")
max_epochs = 100
max_steps = batches_in_epoch * max_epochs
print(f"{max_epochs=}")
print(f"{max_steps=}")

max_lr = 6e-5
min_lr = max_lr * 0.1
warmup_steps = math.floor(0.05 * max_steps)
check_point = 10
interval = 2
print(f"{warmup_steps=}")

def get_lr(it):
    # 1) Linear warmup for warmup_iter steps
    if it < warmup_steps:
        return max_lr * (it + 1) / warmup_steps
    # 2) if it > lr_decay_iters, return min learning rate
    if it > max_steps:
        return min_lr
    # 3) in between, use cosine decay down to min learning rate
    decay_ratio = (it - warmup_steps) / (max_steps - warmup_steps)
    assert 0 <= decay_ratio <= 1
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))  # coeff starts at 1 and goes to 0
    return min_lr + coeff * (max_lr - min_lr)

#raise SystemExit

train_losses = []
val_losses = []
total_steps = 0
for epoch in range(max_epochs):
    # Training
    model.train()
    running_train_loss = 0
    for step in range(batches_in_epoch):
        t0 = time.time()
        optimizer.zero_grad()
        loss_accum = 0

        for micro_step in range(grad_accum_steps):
            x, y = train_loader.next_batch(train=True)
            x, y = x.to(device), y.to(device)
            with torch.autocast(device_type=device, dtype=torch.bfloat16):
                logits, loss = model(x, y)
            loss = loss / grad_accum_steps
            loss_accum += loss.detach()
            loss.backward()

        norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        lr = get_lr(step + epoch * batches_in_epoch)
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr
        optimizer.step()

        running_train_loss += loss_accum.item()
        torch.cuda.synchronize()
        t1 = time.time()
        dt = t1 - t0  # time difference in seconds
        tokens_processed = train_loader.B * train_loader.T * grad_accum_steps
        tokens_per_sec = tokens_processed / dt
        total_steps += 1
        if step % interval == 0:
            print(f"epoch {epoch} | step {total_steps} | loss: {loss_accum.item():.6f} | lr {lr:.4e} | norm: {norm:.4f} | dt: {dt*1000:.2f}ms | tokens/sec: {tokens_per_sec:.2f}")

    avg_train_loss = running_train_loss / batches_in_epoch
    train_losses.append(avg_train_loss)

    # Validation
    model.eval()
    running_val_loss = 0
    with torch.no_grad():
        for _ in range(len(train_loader.val)):
            x_val, y_val = train_loader.next_batch(train=False)
            x_val, y_val = x_val.to(device), y_val.to(device)
            with torch.autocast(device_type=device, dtype=torch.bfloat16):
                logits_val, loss_val = model(x_val, y_val)
            running_val_loss += loss_val.item()

    avg_val_loss = running_val_loss / len(train_loader.val)
    val_losses.append(avg_val_loss)

    print(f"End of epoch {epoch} | Avg Train Loss: {avg_train_loss:.6f} | Avg Val Loss: {avg_val_loss:.6f}")

    # Optionally, save the model checkpoint here
    if epoch % check_point == 0:
        print("Saving checkpoint")
        torch.save(model.state_dict(), f"model_medium_test_{epoch}.pth")
    train_loader.reset_buffers(train=True)
    train_loader.reset_buffers(train=False)

torch.save(model.state_dict(), f"model_medium_test_final.pth")


num decayed parameter tensors: 98, with 354549760 parameters
num non-decayed parameter tensors: 194, with 321536 parameters
using fused AdamW: True
total desired batch size: 524288
=> calculated gradient accumulation steps: 32
loaded 66732716 characters
loaded: 17125777 tokens
Training buffers: 836, Validation buffers: 209
len(train_loader.train) / grad_accum_steps=26.125
batches_in_epoch=26
max_epochs=2
max_steps=52
warmup_steps=2
Shuffling training buffers


KeyboardInterrupt: 

In [None]:
import torch

def generate_text(model, start_text, max_length=50, temperature=1.0, top_k=None):
    model.eval()  # Set model to evaluation mode

    # Tokenize the starting text
    enc = tiktoken.get_encoding('gpt2')
    input_ids = enc.encode(start_text, allowed_special={'<|endoftext|>'})
    input_ids = torch.tensor(input_ids, dtype=torch.long, device=device).unsqueeze(0)

    generated_text = start_text

    with torch.no_grad():
        for _ in range(max_length):
            outputs = model(input_ids)
            logits = outputs[0][:, -1, :]  # Get logits of the last token

            # Apply temperature and top-k sampling
            logits = logits / temperature
            if top_k is not None:
                logits = top_k_logits(logits, top_k)

            probs = torch.nn.functional.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            next_token_id = next_token.item()

            # Add the predicted token to the sequence
            input_ids = torch.cat((input_ids, next_token), dim=1)
            generated_text += enc.decode([next_token_id])

            # Stop if end-of-text token is generated
            if next_token_id == enc.eot_token:
                break

    return generated_text

def top_k_logits(logits, k):
    """ Set all logits but the k highest to -infinity """
    values, indices = torch.topk(logits, k)
    out = logits.clone()
    out[out < values[..., -1, None]] = -float('Inf')
    return out




In [None]:
# Example usage:
start_text = "In the dark room the man stood"
generated = generate_text(model, start_text, max_length=1000, temperature=1.0, top_k=50)
print(generated)

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 5))
plt.plot(train_losses, label='Training Loss')
plt.plot(val_losses, label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Validation Loss per Epoch')
plt.legend()
plt.show()

In [None]:
import os
import re
import ast
import unicodedata

def normalize_text(text):
    # Replace curly quotes with straight quotes
    text = re.sub(r'[‘’]', '\'', text)  # Replace left and right single quotes with '
    text = re.sub(r'[“”]', '"', text)   # Replace left and right double quotes with "

    # Replace en dash and em dash with a hyphen or double hyphen
    text = re.sub(r'–', '-', text)  # Replace en dash with hyphen
    text = re.sub(r'—', '--', text) # Replace em dash with double hyphen

    # Replace ellipsis with three dots
    text = re.sub(r'…', '...', text)

    # Replace other non-ASCII punctuation with ASCII equivalents
    text = re.sub(r'«', '<<', text)  # Replace left double angle quote with <<
    text = re.sub(r'»', '>>', text)  # Replace right double angle quote with >>
    text = re.sub(r'§', 'SS', text)  # Replace section sign with SS
    text = re.sub(r'•', '*', text)   # Replace bullet with asterisk
    text = re.sub(r'‽', '?!', text)  # Replace interrobang with ?!
    text = re.sub(r'′', "'", text)   # Replace prime with single quote
    text = re.sub(r'″', '"', text)   # Replace double prime with double quote
    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')

def get_conversations_HF(data_path):
    conversations = []

    # Load movie lines
    id2line = {}
    with open(os.path.join(data_path, 'movie_lines.txt'), 'r', encoding='iso-8859-1') as f:
        for line in f:
            parts = line.strip().split(' +++$+++ ')
            if len(parts) == 5:
                line_id, text = parts[0], parts[4]
                id2line[line_id] = text

    # Load conversations
    with open(os.path.join(data_path, 'movie_conversations.txt'), 'r', encoding='iso-8859-1') as f:
        for line in f:
            parts = line.strip().split(' +++$+++ ')
            if len(parts) == 4:
                conv_line_ids = ast.literal_eval(parts[3])  # safer than eval()
                # Create pairs of conversations (input, response)
                for i in range(len(conv_line_ids) - 1):
                    # Ensure both line IDs are in id2line
                    if conv_line_ids[i] in id2line and conv_line_ids[i + 1] in id2line:
                        input_line = "Q: " + normalize_text(id2line[conv_line_ids[i]])
                        response_line = "A: " + normalize_text(id2line[conv_line_ids[i + 1]])
                        conversations.append((input_line, response_line))
                    #else:
                    #    print(f"Missing line ID in conversation: {conv_line_ids[i]} or {conv_line_ids[i + 1]}")

    return conversations


class DataLoaderChat:
    def __init__(self, B, T, val_split=0.2):
        self.B = B
        self.T = T

        data_path = "data/cornell_movie_dialogs_corpus"
        conversations = get_conversations_HF(data_path)

        con = [ q + '\n' + a + '<|endoftext|>' for q, a in conversations ]

        text = ''.join(con)
        print(f"{len(text)=}")
        enc = tiktoken.get_encoding('gpt2')

        tokens = enc.encode(text, allowed_special={"<|endoftext|>"})
        len_tokens = len(tokens)
        print(f"{len(tokens)=}")
        # Ensure tokens are stored on CPU
        self.tokens = torch.tensor(tokens, device='cpu')
        print(f"loaded: {len(self.tokens)} tokens")

        # Split into training and validation buffers on CPU
        buffers = []
        current_position = 0
        while current_position + (B * T + 1) < len_tokens:
            buffers.append(self.tokens[current_position : current_position + (B * T + 1)])
            current_position += B * T

        random.shuffle(buffers)

        split_idx = int(len(buffers) * (1 - val_split))
        self.train = buffers[:split_idx]
        self.val = buffers[split_idx:]

        print(f"Training buffers: {len(self.train)}, Validation buffers: {len(self.val)}")

        self.current_train_buffer = 0
        self.current_val_buffer = 0

    def next_batch(self, train=True):
        B, T = self.B, self.T
        if train:
            if self.current_train_buffer == 0:
                print(f"Shuffling training buffers")
                random.shuffle(self.train)
            buf = self.train[self.current_train_buffer]
            self.current_train_buffer = (self.current_train_buffer + 1) % len(self.train)
        else:
            if self.current_val_buffer == 0:
                print(f"Shuffling validation buffers")
                random.shuffle(self.val)
            buf = self.val[self.current_val_buffer]
            self.current_val_buffer = (self.current_val_buffer + 1) % len(self.val)


        # Ensure data is returned on the CPU, only transfer to GPU in training loop
        x = (buf[:-1]).view(B, T).to('cpu')  # inputs on CPU
        y = (buf[1:]).view(B, T).to('cpu')   # targets on CPU
        return x, y

    def reset_buffers(self, train=True):
        if train:
            self.current_train_buffer = 0
        else:
            self.current_val_buffer = 0

In [None]:
import time
import matplotlib.pyplot as plt

torch.manual_seed(1337)
if torch.cuda.is_available():
    torch.cuda.manual_seed(1337)

# optimize!
optimizer = model.configure_optimizers(weight_decay=0.1, learning_rate=6e-4, device=device)

B = 16  # micro batch size
T = 1024  # sequence length
total_batch_size = 4 * B * T
assert total_batch_size % (B * T) == 0, "make sure total_batch_size is divisible by B * T"
grad_accum_steps = total_batch_size // (B * T)

print(f"total desired batch size: {total_batch_size}")
print(f"=> calculated gradient accumulation steps: {grad_accum_steps}")

torch.set_float32_matmul_precision('high')

train_loader = DataLoaderChat(B, T, val_split=0.2)

print(f"{len(train_loader.train) / grad_accum_steps=}")
batches_in_epoch = len(train_loader.train) // grad_accum_steps
print(f"{batches_in_epoch=}")
max_epochs = 10
max_steps = batches_in_epoch * max_epochs
print(f"{max_epochs=}")
print(f"{max_steps=}")

max_lr = 6e-5
min_lr = max_lr * 0.1
warmup_steps = math.floor(0.05 * max_steps)
check_point = 10
interval = 2
print(f"{warmup_steps=}")

def get_lr(it):
    # 1) Linear warmup for warmup_iter steps
    if it < warmup_steps:
        return max_lr * (it + 1) / warmup_steps
    # 2) if it > lr_decay_iters, return min learning rate
    if it > max_steps:
        return min_lr
    # 3) in between, use cosine decay down to min learning rate
    decay_ratio = (it - warmup_steps) / (max_steps - warmup_steps)
    assert 0 <= decay_ratio <= 1
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))  # coeff starts at 1 and goes to 0
    return min_lr + coeff * (max_lr - min_lr)

#raise SystemExit

train_losses = []
val_losses = []
total_steps = 0
for epoch in range(max_epochs):
    # Training
    model.train()
    running_train_loss = 0
    for step in range(batches_in_epoch):
        t0 = time.time()
        optimizer.zero_grad()
        loss_accum = 0

        for micro_step in range(grad_accum_steps):
            x, y = train_loader.next_batch(train=True)
            x, y = x.to(device), y.to(device)
            with torch.autocast(device_type=device, dtype=torch.bfloat16):
                logits, loss = model(x, y)
            loss = loss / grad_accum_steps
            loss_accum += loss.detach()
            loss.backward()

        norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        lr = get_lr(step + epoch * batches_in_epoch)
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr
        optimizer.step()

        running_train_loss += loss_accum.item()
        torch.cuda.synchronize()
        t1 = time.time()
        dt = t1 - t0  # time difference in seconds
        tokens_processed = train_loader.B * train_loader.T * grad_accum_steps
        tokens_per_sec = tokens_processed / dt
        total_steps += 1
        if step % interval == 0:
            print(f"epoch {epoch} | step {total_steps} | loss: {loss_accum.item():.6f} | lr {lr:.4e} | norm: {norm:.4f} | dt: {dt*1000:.2f}ms | tokens/sec: {tokens_per_sec:.2f}")

    avg_train_loss = running_train_loss / batches_in_epoch
    train_losses.append(avg_train_loss)

    # Validation
    model.eval()
    running_val_loss = 0
    with torch.no_grad():
        for _ in range(len(train_loader.val)):
            x_val, y_val = train_loader.next_batch(train=False)
            x_val, y_val = x_val.to(device), y_val.to(device)
            with torch.autocast(device_type=device, dtype=torch.bfloat16):
                logits_val, loss_val = model(x_val, y_val)
            running_val_loss += loss_val.item()

    avg_val_loss = running_val_loss / len(train_loader.val)
    val_losses.append(avg_val_loss)

    print(f"End of epoch {epoch} | Avg Train Loss: {avg_train_loss:.6f} | Avg Val Loss: {avg_val_loss:.6f}")

    # Optionally, save the model checkpoint here
    if epoch % check_point == 0:
        print("Saving checkpoint")
        torch.save(model.state_dict(), f"model_medium_chat_{epoch}.pth")
    train_loader.reset_buffers(train=True)
    train_loader.reset_buffers(train=False)

torch.save(model.state_dict(), f"model_medium_chat_final.pth")

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 5))
plt.plot(train_losses, label='Training Loss')
plt.plot(val_losses, label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Validation Loss per Epoch')
plt.legend()
plt.show()

In [None]:
# Example usage:
start_text = "Q: Who and what are you?\nA: "
generated = generate_text(model, start_text, max_length=1000, temperature=1.0, top_k=50)
print(generated)

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch



def test():
    model.eval()  # Set model to evaluation mode

    # Tokenize the starting text
    tokenizer = tiktoken.get_encoding('gpt2')

    generated = ''

    while True:
        try:
            input_text = input("> ")
            current_length = len(generated)
            generated += "Q: " + input_text + "\nA:"


            generated = generate_text(model, generated, max_length=100, temperature=1.0, top_k=50)

            print(generated[current_length :].replace('<|endoftext|>','\n'))

        except Exception as e:
            print(e)

test()

**CONVERSATION TRAINING**

In [None]:
import math
import random
!pip install tiktoken
import tiktoken
import inspect
from dataclasses import dataclass
import torch
import torch.nn as nn
from torch.nn import functional as F
import os
from google.colab import drive
drive.mount('/content/drive')
print(os.getcwd())
os.chdir('/content/drive/MyDrive/Colab Notebooks/gothic')
print(os.getcwd())

# ---------------------------------------------
class CausalSelfAttention(nn.Module):

    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        # key, query, value projections for all heads, but in a batch
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
        # output projection
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
        self.c_proj.NANOGPT_SCALE_INIT = 1
        # regularization
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        # not really a 'bias', more of a mask, but following the OpenAI/HF naming though
        self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size))
            .view(1, 1, config.block_size, config.block_size))

    def forward(self, x):
        B, T, C = x.size()  # batch size, sequence length, embedding dimensionality (n_embd)
        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
        # nh is "number of heads", hs is "head size", and C (number of channels) = nh * hs
        # e.g. in GPT-2 (124M), n_head=12, hs=64, so nh*hs=C=768 channels in the Transformer
        qkv = self.c_attn(x)
        q, k, v = qkv.split(self.n_embd, dim=2)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)
        # attention (materializes the large (T*T) matrix for all the queries and keys)
        # att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        # att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
        # att = F.softmax(att, dim=-1)
        # y = att @ v  # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
        y = F.scaled_dot_product_attention(q, k, v, is_causal=True)
        y = y.transpose(1, 2).contiguous().view(B, T, C)  # re-assemble all head outputs side by side
        # output projection
        y = self.c_proj(y)
        return y


class MLP(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd)
        self.gelu = nn.GELU(approximate='tanh')
        self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd)
        self.c_proj.NANOGPT_SCALE_INIT = 1

    def forward(self, x):
        x = self.c_fc(x)
        x = self.gelu(x)
        x = self.c_proj(x)
        return x


class Block(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.n_embd)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(config.n_embd)
        self.mlp = MLP(config)

    def forward(self, x):
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x

@dataclass
class GPTConfig:
    block_size: int = 1024  # max sequence length
    vocab_size: int = 50257  # number of tokens: 50,000 BPE merges + 256 bytes tokens + 1 <|endoftext|> token
    # 'gpt2':         dict(n_layer=12, n_head=12, n_embd=768),  # 124M params
    # 'gpt2-medium':  dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
    # 'gpt2-large':   dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
    # 'gpt2-xl':      dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
    n_layer: int = 24  # number of layers
    n_head: int = 16  # number of heads
    n_embd: int = 1024  # embedding dimension


class GPT(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.config = config

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            wpe = nn.Embedding(config.block_size, config.n_embd),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f = nn.LayerNorm(config.n_embd),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        # weight sharing scheme
        self.transformer.wte.weight = self.lm_head.weight

        # init params
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            std = 0.02
            if hasattr(module, 'NANOGPT_SCALE_INIT'):
                std *= (2 * self.config.n_layer) ** -0.5
            torch.nn.init.normal_(module.weight, mean=0.0, std=std)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)




    def forward(self, idx, targets=None):
        # idx is of shape (B, T)
        B, T = idx.size()
        assert T <= self.config.block_size, f"Cannot forward sequence of length {T}, block size is only {self.config.block_size}"
        # forward the token and position embeddings
        pos = torch.arange(0, T, dtype=torch.long, device=idx.device)  # shape (T)
        pos_emb = self.transformer.wpe(pos)  # position embeddings of shape (T, n_embd)
        tok_emb = self.transformer.wte(idx)  # token embeddings of shape (B, T, n_embd)
        x = tok_emb + pos_emb
        # forward the blocks of the transformer
        for block in self.transformer.h:
            x = block(x)
        # forward the final layernorm and the classifier
        x = self.transformer.ln_f(x)
        logits = self.lm_head(x)  # (B, T, vocab_size)
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss


    @classmethod
    def from_pretrained(cls, model_type):
        """Loads pretrained GPT-2 model weights from huggingface"""
        assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
        from transformers import GPT2LMHeadModel
        print("loading weights from pretrained gpt: %s" % model_type)

        # n_layer, n_head and n_embd are determined from model_type
        config_args = {
            'gpt2':         dict(n_layer=12, n_head=12, n_embd=768),  # 124M params
            'gpt2-medium':  dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
            'gpt2-large':   dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
            'gpt2-xl':      dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
        }[model_type]
        config_args['vocab_size'] = 50257 # always 50257 for GPT model checkpoints
        config_args['block_size'] = 1024 # always 1024 for GPT model checkpoints
        # create a from-scratch initialized minGPT model
        config = GPTConfig(**config_args)
        model = GPT(config)
        sd = model.state_dict()
        sd_keys = sd.keys()
        sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')] # discard this mask / buffer, not a param

        # init a huggingface/transformers model
        model_hf = GPT2LMHeadModel.from_pretrained(model_type)
        sd_hf = model_hf.state_dict()

        # copy while ensuring all of the parameters are aligned and match in names and shapes
        sd_keys_hf = sd_hf.keys()
        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')] # ignore these, just a buffer
        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')] # same, just the mask (buffer)
        transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
        # basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear
        # this means that we have to transpose these weights when we import them
        assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
        for k in sd_keys_hf:
            if any(k.endswith(w) for w in transposed):
                # special treatment for the Conv1D weights we need to transpose
                assert sd_hf[k].shape[::-1] == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k].t())
            else:
                # vanilla copy over the other parameters
                assert sd_hf[k].shape == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k])

        return model

    def configure_optimizers(self, weight_decay, learning_rate, device):
        # start with all of the candidate parameters (that require grad)
        param_dict = {pn: p for pn, p in self.named_parameters()}
        param_dict = {pn: p for pn, p in param_dict.items() if p.requires_grad}
        # create optim groups. Any parameters that is 2D will be weight decayed, otherwise no.
        # i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't.
        decay_params = [p for n, p in param_dict.items() if p.dim() >= 2]
        nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2]
        optim_groups = [
            {'params': decay_params, 'weight_decay': weight_decay},
            {'params': nodecay_params, 'weight_decay': 0.0}
        ]

        num_decay_params = sum(p.numel() for p in decay_params)
        num_nodecay_params = sum(p.numel() for p in nodecay_params)
        print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params} parameters")
        print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params} parameters")

        # Create AdamW optimizer and use the fused version if it is available
        fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
        use_fused = fused_available and 'cuda' in device
        print(f"using fused AdamW: {use_fused}")
        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=(0.9, 0.95), eps=1e-8)
        return optimizer


# attempt to autodetect the device
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
    device = "mps"
print(f"using device: {device}")

torch.manual_seed(1337)
if torch.cuda.is_available():
    torch.cuda.manual_seed(1337)

model = GPT(GPTConfig(vocab_size=50304))

model.to(device)
model = torch.compile(model)
model.load_state_dict(torch.load('model_medium_fifth_500.pth'), strict=False)
model.train()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content
/content/drive/MyDrive/Colab Notebooks/gothic
using device: cuda


OptimizedModule(
  (_orig_mod): GPT(
    (transformer): ModuleDict(
      (wte): Embedding(50304, 1024)
      (wpe): Embedding(1024, 1024)
      (h): ModuleList(
        (0-23): 24 x Block(
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (attn): CausalSelfAttention(
            (c_attn): Linear(in_features=1024, out_features=3072, bias=True)
            (c_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): MLP(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): GELU(approximate='tanh')
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
        )
      )
      (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    )
    (lm_head): Linear(in_features=1024, out_features=50304, bias=False)
  )
)

In [None]:
!pip install datasets
from datasets import load_dataset

# Load the DailyDialog dataset
dataset = load_dataset("daily_dialog")

# Check the available splits
print(dataset)





The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetDict({
    train: Dataset({
        features: ['dialog', 'act', 'emotion'],
        num_rows: 11118
    })
    validation: Dataset({
        features: ['dialog', 'act', 'emotion'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['dialog', 'act', 'emotion'],
        num_rows: 1000
    })
})


In [None]:
import os
import re
import ast
import json
from cleaner import normalize_text

class DataLoaderChat:
    def __init__(self, B, T):

        self.train = None
        self.val = None
        self.test = None

        self.B = B
        self.T = T

        # First get Hugging Face data
        print(f"loading Hugging Face data")
        train, val, test = self.get_conversations_HF()
        self.process_conversations(train, val, test)
        print(f"Training buffers: {len(self.train)}, Validation buffers: {len(self.val)}, Test buffers: {len(self.test)}")

        # Now get Cornell data
        print(f"loading Cornell data")
        data_path = "data/cornell_movie_dialogs_corpus"
        train, val, test = self.get_conversations_cornell(data_path)
        self.process_conversations(train, val, test)
        print(f"Training buffers: {len(self.train)}, Validation buffers: {len(self.val)}, Test buffers: {len(self.test)}")

        # Now get woz data
        print(f"loading woz data")
        data_path = 'data/MultiWOZ_2.2'
        train, val, test = self.get_conversations_woz(data_path)
        self.process_conversations(train, val, test)
        print(f"Training buffers: {len(self.train)}, Validation buffers: {len(self.val)}, Test buffers: {len(self.test)}")

        # Now get taskmaster data
        print(f"loading taskmaster data")
        data_path = 'data/Taskmaster'
        train, val, test = self.get_conversations_taskmaster(data_path)
        self.process_conversations(train, val, test)
        print(f"Training buffers: {len(self.train)}, Validation buffers: {len(self.val)}, Test buffers: {len(self.test)}")

        random.shuffle(self.train)
        random.shuffle(self.val)
        if self.test:
            random.shuffle(self.test)

        self.current_train_buffer = 0
        self.current_val_buffer = 0
        self.current_test_buffer = 0

    def get_conversations_HF(self):
        train = []
        val = []
        test = []
        the_datasets = {'train': train, 'validation': val, 'test': test}

        types = ['train', 'validation', 'test']

        pattern = r'\s([,.!?;:])'
        # Regular expressions for spaces around parentheses
        pattern_left_parenthesis = r'\(\s'
        pattern_right_parenthesis = r'\s\)'

        for t in types:
            print(f"loading: {t=}")
            ds = dataset[t]
            # Access specific fields
            print(f"{len(ds)=}")

            for i in range(len(ds)):
                dialogue = ds[i]['dialog']
                turn = ["Q: ", "A: "]
                conv = []
                for i, s in enumerate(dialogue):
                    s = s.strip()
                    s = normalize_text(s)
                    s = s.replace(" ' ", "'")
                    s = s.replace('$ ', '$')
                    s = s.replace('( ', '(')
                    s = s.replace(') ', ')')
                    s = re.sub(pattern, r'\1', s)
                    s = turn[0 if (i+1)%2 == 1 else 1] + s
                    conv.append(s)
                the_datasets[t].append(tuple(conv))

        return the_datasets['train'], the_datasets['validation'], the_datasets['test']

    def get_conversations_cornell(self, data_path, val_split=0.2):
        conversations = []

        # Load movie lines
        id2line = {}
        with open(os.path.join(data_path, 'movie_lines.txt'), 'r', encoding='iso-8859-1') as f:
            for line in f:
                parts = line.strip().split(' +++$+++ ')
                if len(parts) == 5:
                    line_id, text = parts[0], parts[4]
                    id2line[line_id] = text

        # Load conversations
        with open(os.path.join(data_path, 'movie_conversations.txt'), 'r', encoding='iso-8859-1') as f:
            for line in f:
                parts = line.strip().split(' +++$+++ ')
                if len(parts) == 4:
                    conv_line_ids = ast.literal_eval(parts[3])  # safer than eval()
                    # Create pairs of conversations (input, response)
                    for i in range(len(conv_line_ids) - 1):
                        # Ensure both line IDs are in id2line
                        if conv_line_ids[i] in id2line and conv_line_ids[i + 1] in id2line:
                            input_line = "Q: " + normalize_text(id2line[conv_line_ids[i]])
                            response_line = "A: " + normalize_text(id2line[conv_line_ids[i + 1]])
                            conversations.append((input_line, response_line))
                        #else:
                        #    print(f"Missing line ID in conversation: {conv_line_ids[i]} or {conv_line_ids[i + 1]}")

        print(f"{len(conversations)=}")
        random.shuffle(conversations)
        train = conversations[:-int(len(conversations) * val_split)]
        val = conversations[-int(len(conversations) * val_split):]
        test = None

        return train, val, test

    def get_conversations_woz(self, data_path):
        turn = ['Q: ', 'A: ']
        types = ['train', 'dev', 'test']
        the_datasets = {t : [] for t in types}
        dir = {t : os.path.join(data_path, t) for t in types}
        for t in types:
            print(f"processing: {t=}")
            json_files = [f for f in os.listdir(dir[t]) if f.endswith('.json')] # Filter the list to include only JSON files
            dialogues = [] # Initialize a list to store the data from all JSON files
            # Loop through each JSON file and load the data
            for json_file in json_files:
                file_path = os.path.join(dir[t], json_file)
                with open(file_path, 'r') as file:
                    dialogues.extend(json.load(file))
            for dialogue in dialogues:
                conversation_list = []
                conversation = dialogue['turns']
                for i, line in enumerate(conversation):
                    conversation_list.append(turn[0 if (i+1)%2 == 1 else 1] + normalize_text(line['utterance']))
                the_datasets[t].append(tuple(conversation_list))
            print(f"{len(the_datasets[t])=}")

        return the_datasets['train'], the_datasets['dev'], the_datasets['test']

    def get_conversations_taskmaster(self, data_path, val_split = 0.2):
        dirs = ['TM-1-2019', 'TM-2-2020/data', 'TM-3-2020/data', 'TM-4-2024/data']
        conversations = []
        for dir in dirs:
            print(f"{dir=}")
            path = os.path.join(data_path, dir)
            json_files = [f for f in os.listdir(path) if f.endswith('.json')] # Filter the list to include only JSON files
            dialogues = [] # Initialize a list to store the data from all JSON files
            # Loop through each JSON file and load the data
            print("loading JSON files")
            for json_file in json_files:
                file_path = os.path.join(path, json_file)
                with open(file_path, 'r') as file:
                    print(f"{file_path=}")
                    dialogues.extend(json.load(file))
            print("processing dialogues")
            for dialogue in dialogues:
                utterances = dialogue['utterances']
                conversation_list = []
                previous_prompt = ""
                for line in utterances:
                    prompt = "A: "
                    if line['speaker'].lower() == 'user':
                        prompt = "Q: "
                    if previous_prompt == prompt:
                        last_text = conversation_list.pop()
                        conversation_list.append(last_text + " " + normalize_text(line['text']))
                    else:
                        conversation_list.append(prompt + normalize_text(line['text']))
                        previous_prompt = prompt
                conversations.append(tuple(conversation_list))
        print(f"{len(conversations)=}")
        random.shuffle(conversations)
        train = conversations[:-int(len(conversations) * val_split)]
        val = conversations[-int(len(conversations) * val_split):]
        test = None
        return train, val, test

    def process_conversations(self, train, val, test):
        types = ['train', 'validation', 'test']
        the_conversations = {'train': train, 'validation': val, 'test': test}

        for t in types:
            print(f"processing: {t=}")
            conversations = the_conversations[t]
            if not conversations:
                print(f"No conversations found for type: {t}")
                continue
            conversations_list = []
            dialogue_list = []
            for dialogue in  conversations:
                for line in dialogue:
                    dialogue_list.append(line + '\n')
                dialogue_list.append('<|endoftext|>')
                conversations_list.append(''.join(dialogue_list))
                dialogue_list = []

            text = ''.join(conversations_list)
            print(f"{len(text)=}")

            enc = tiktoken.get_encoding('gpt2')
            tokens = enc.encode(text, allowed_special={"<|endoftext|>"})
            len_tokens = len(tokens)
            print(f"{len(tokens)=}")
            # Ensure tokens are stored on CPU
            tokens = torch.tensor(tokens, device='cpu')
            print(f"loaded: {len(tokens)} tokens")

            # Split into training and validation buffers on CPU
            buffers = []
            current_position = 0
            while current_position + (B * T + 1) < len_tokens:
                buffers.append(tokens[current_position : current_position + (B * T + 1)])
                current_position += B * T

            if t == 'train':
                if self.train:
                    self.train.extend(buffers)
                else:
                    self.train = buffers
            elif t == 'validation':
                if self.val:
                    self.val.extend(buffers)
                else:
                    self.val = buffers
            else:
                if self.test:
                    self.test.extend(buffers)
                else:
                    self.test = buffers

    def next_batch(self, train=True):
        B, T = self.B, self.T
        if train:
            if self.current_train_buffer == 0:
                print(f"Shuffling training buffers")
                random.shuffle(self.train)
            buf = self.train[self.current_train_buffer]
            self.current_train_buffer = (self.current_train_buffer + 1) % len(self.train)
        else:
            if self.current_val_buffer == 0:
                print(f"Shuffling validation buffers")
                random.shuffle(self.val)
            buf = self.val[self.current_val_buffer]
            self.current_val_buffer = (self.current_val_buffer + 1) % len(self.val)


        # Ensure data is returned on the CPU, only transfer to GPU in training loop
        x = (buf[:-1]).view(B, T).to('cpu')  # inputs on CPU
        y = (buf[1:]).view(B, T).to('cpu')   # targets on CPU
        return x, y

    def reset_buffers(self, train=True):
        if train:
            self.current_train_buffer = 0
        else:
            self.current_val_buffer = 0




In [None]:
import time
import matplotlib.pyplot as plt

torch.manual_seed(1337)
if torch.cuda.is_available():
    torch.cuda.manual_seed(1337)

# optimize!
optimizer = model.configure_optimizers(weight_decay=0.1, learning_rate=6e-4, device=device)

B = 16  # micro batch size
T = 1024  # sequence length
total_batch_size = 16 * B * T
assert total_batch_size % (B * T) == 0, "make sure total_batch_size is divisible by B * T"
grad_accum_steps = total_batch_size // (B * T)

print(f"total desired batch size: {total_batch_size}")
print(f"=> calculated gradient accumulation steps: {grad_accum_steps}")

torch.set_float32_matmul_precision('high')

train_loader = DataLoaderChat(B, T)

print(f"{len(train_loader.train) / grad_accum_steps=}")
batches_in_epoch = len(train_loader.train) // grad_accum_steps
print(f"{batches_in_epoch=}")
max_epochs = 10
max_steps = batches_in_epoch * max_epochs
print(f"{max_epochs=}")
print(f"{max_steps=}")

max_lr = 6e-5
min_lr = max_lr * 0.1
warmup_steps = math.floor(0.05 * max_steps)
check_point = 50
interval = 10
print(f"{warmup_steps=}")

def get_lr(it):
    # 1) Linear warmup for warmup_iter steps
    if it < warmup_steps:
        return max_lr * (it + 1) / warmup_steps
    # 2) if it > lr_decay_iters, return min learning rate
    if it > max_steps:
        return min_lr
    # 3) in between, use cosine decay down to min learning rate
    decay_ratio = (it - warmup_steps) / (max_steps - warmup_steps)
    assert 0 <= decay_ratio <= 1
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))  # coeff starts at 1 and goes to 0
    return min_lr + coeff * (max_lr - min_lr)

train_losses = []
val_losses = []
total_steps = 0
for epoch in range(max_epochs):
    # Training
    model.train()
    running_train_loss = 0
    for step in range(batches_in_epoch):
        t0 = time.time()
        optimizer.zero_grad()
        loss_accum = 0

        for micro_step in range(grad_accum_steps):
            x, y = train_loader.next_batch(train=True)
            x, y = x.to(device), y.to(device)
            with torch.autocast(device_type=device, dtype=torch.bfloat16):
                logits, loss = model(x, y)
            loss = loss / grad_accum_steps
            loss_accum += loss.detach()
            loss.backward()

        norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        lr = get_lr(step + epoch * batches_in_epoch)
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr
        optimizer.step()

        running_train_loss += loss_accum.item()
        torch.cuda.synchronize()
        t1 = time.time()
        dt = t1 - t0  # time difference in seconds
        tokens_processed = train_loader.B * train_loader.T * grad_accum_steps
        tokens_per_sec = tokens_processed / dt
        total_steps += 1
        if total_steps % interval == 0:
            print(f"epoch {epoch} | total_steps {total_steps} | loss: {loss_accum.item():.6f} | lr {lr:.4e} | norm: {norm:.4f} | dt: {dt*1000:.2f}ms | tokens/sec: {tokens_per_sec:.2f}")

    avg_train_loss = running_train_loss / batches_in_epoch
    train_losses.append(avg_train_loss)

    # Validation
    model.eval()
    running_val_loss = 0
    with torch.no_grad():
        for _ in range(len(train_loader.val)):
            x_val, y_val = train_loader.next_batch(train=False)
            x_val, y_val = x_val.to(device), y_val.to(device)
            with torch.autocast(device_type=device, dtype=torch.bfloat16):
                logits_val, loss_val = model(x_val, y_val)
            running_val_loss += loss_val.item()

    avg_val_loss = running_val_loss / len(train_loader.val)
    val_losses.append(avg_val_loss)

    print(f"End of epoch {epoch} | Avg Train Loss: {avg_train_loss:.6f} | Avg Val Loss: {avg_val_loss:.6f}")

    # Optionally, save the model checkpoint here
    if epoch % check_point == 0:
        print("Saving checkpoint")
        torch.save(model.state_dict(), f"model_medium_chat_HF_{epoch}.pth")
    train_loader.reset_buffers(train=True)
    train_loader.reset_buffers(train=False)

torch.save(model.state_dict(), f"model_medium_chat_HF_final.pth")

num decayed parameter tensors: 98, with 354549760 parameters
num non-decayed parameter tensors: 194, with 321536 parameters
using fused AdamW: True
total desired batch size: 262144
=> calculated gradient accumulation steps: 16
loading Hugging Face data
loading: t='train'
len(ds)=11118
loading: t='validation'
len(ds)=1000
loading: t='test'
len(ds)=1000
processing: t='train'
len(text)=5511281
len(tokens)=1511279
loaded: 1511279 tokens
processing: t='validation'
len(text)=509503
len(tokens)=139417
loaded: 139417 tokens
processing: t='test'
len(text)=497244
len(tokens)=135733
loaded: 135733 tokens
Training buffers: 92, Validation buffers: 8, Test buffers: 8
loading Cornell data
len(conversations)=221282
processing: t='train'
len(text)=23041615
len(tokens)=6357644
loaded: 6357644 tokens
processing: t='validation'
len(text)=5785511
len(tokens)=1596736
loaded: 1596736 tokens
processing: t='test'
No conversations found for type: test
Training buffers: 480, Validation buffers: 105, Test buffers

In [2]:
# attempt to autodetect the device
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
    device = "mps"
print(f"using device: {device}")

torch.manual_seed(1337)
if torch.cuda.is_available():
    torch.cuda.manual_seed(1337)

model = GPT(GPTConfig(vocab_size=50304))

model.to(device)
model = torch.compile(model)
model.load_state_dict(torch.load('model_medium_chat_HF_final.pth'), strict=False)
model.train()

using device: cuda


  model.load_state_dict(torch.load('model_medium_chat_HF_final.pth'), strict=False)


OptimizedModule(
  (_orig_mod): GPT(
    (transformer): ModuleDict(
      (wte): Embedding(50304, 1024)
      (wpe): Embedding(1024, 1024)
      (h): ModuleList(
        (0-23): 24 x Block(
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (attn): CausalSelfAttention(
            (c_attn): Linear(in_features=1024, out_features=3072, bias=True)
            (c_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): MLP(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): GELU(approximate='tanh')
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
        )
      )
      (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    )
    (lm_head): Linear(in_features=1024, out_features=50304, bias=False)
  )
)

In [3]:
import torch

def generate_text(model, start_text, max_length=50, temperature=1.0, top_k=None):
    model.eval()  # Set model to evaluation mode

    # Tokenize the starting text
    enc = tiktoken.get_encoding('gpt2')
    input_ids = enc.encode(start_text, allowed_special={'<|endoftext|>'})
    input_ids = torch.tensor(input_ids, dtype=torch.long, device=device).unsqueeze(0)

    generated_text = start_text

    with torch.no_grad():
        for _ in range(max_length):
            outputs = model(input_ids)
            logits = outputs[0][:, -1, :]  # Get logits of the last token

            # Apply temperature and top-k sampling
            logits = logits / temperature
            if top_k is not None:
                logits = top_k_logits(logits, top_k)

            probs = torch.nn.functional.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            next_token_id = next_token.item()

            # Add the predicted token to the sequence
            input_ids = torch.cat((input_ids, next_token), dim=1)
            generated_text += enc.decode([next_token_id])

            # Stop if end-of-text token is generated
            if next_token_id == enc.eot_token:
                break

    return generated_text

def generate_chat_text(model, start_text, max_length=50, temperature=1.0, top_k=None):
    model.eval()  # Set model to evaluation mode

    # Tokenize the starting text
    enc = tiktoken.get_encoding('gpt2')
    input_ids = enc.encode(start_text, allowed_special={'<|endoftext|>'})
    input_ids = torch.tensor(input_ids, dtype=torch.long, device=device).unsqueeze(0)

    generated_text = start_text

    with torch.no_grad():
        for i in range(max_length):
            outputs = model(input_ids)
            logits = outputs[0][:, -1, :]  # Get logits of the last token

            # Apply temperature and top-k sampling
            logits = logits / temperature
            if top_k is not None:
                logits = top_k_logits(logits, top_k)

            probs = torch.nn.functional.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            next_token_id = next_token.item()

            # Add the predicted token to the sequence
            input_ids = torch.cat((input_ids, next_token), dim=1)
            generated_text += enc.decode([next_token_id])

            # stop if a question or answer prompt generated
            if i >= 2 and (generated_text[-2:] == 'Q:' or generated_text[-2:] == 'A:'):
                generated_text = generated_text[:-3]
                break

            # Stop if end-of-text token is generated
            if next_token_id == enc.eot_token:
                break

    return generated_text

def top_k_logits(logits, k):
    """ Set all logits but the k highest to -infinity """
    values, indices = torch.topk(logits, k)
    out = logits.clone()
    out[out < values[..., -1, None]] = -float('Inf')
    return out

In [11]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

def test():
    model.eval()  # Set model to evaluation mode

    # Tokenize the starting text
    tokenizer = tiktoken.get_encoding('gpt2')

    generated = ''

    while True:
        try:
            input_text = input("> ")
            if input_text == 'exit':
                print("-------------------------------------------")
                print(generated)
                print("-------------------------------------------")
                break
            initial_length = len(generated)
            generated += "\nQ: " + input_text + "\nA:"
            new_length = len(generated)
            statement_length = new_length - initial_length


            if new_length > 1024:
                generated = generated[-1024:]
                initial_length = 1024 - statement_length


            generated = generate_chat_text(model, generated, max_length=100, temperature=1.0, top_k=50)

            print(generated[initial_length:].replace('<|endoftext|>','\n'))

        except Exception as e:
            print(e)

test()

> Hello, who am i talking to?
before new_length=34
after len(generated)=34

Q: Hello, who am i talking to?
A: I can not hear you.
> Can you hear me now?
before new_length=81
after len(generated)=81

Q: Can you hear me now?
A: Yes, sure where are you from?
> I am planning a trip to Cambridge.
before new_length=152
after len(generated)=152

Q: I am planning a trip to Cambridge.
A: I have found a flight with Alaska Airlines it leaves at 6:40 pm and Best Western Plus.
> What airport doe it arrive at?
before new_length=276
after len(generated)=276

Q: What airport doe it arrive at?
A: August 31st February 7 through 23rd.
> And is that close to Cambridge?
before new_length=351
after len(generated)=351

Q: And is that close to Cambridge?
A: Sorry, did you say that there's no layover?
> Is it easy to get to Cambridge from there?
before new_length=444
after len(generated)=444

Q: Is it easy to get to Cambridge from there?
A: sorry there are no trains departing from Birmingham New Zealand.
> Can

KeyboardInterrupt: Interrupted by user