In [1]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from dataclasses import dataclass


class CausalSelfAttention(nn.Module):

    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        # key, query, value projections for all heads, but in a batch
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
        # output projection
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
        self.c_proj.NANGPT_SCALE_INIT = 1
        # regularization
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size)).view(1, 1, config.block_size, config.block_size))

    def forward(self, x):
        B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)

        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
        # nh is "number of heads", hs is "head size", and C (number of channels) = nh * hs
        # e.g. in GPT-2 (124M), n_head=12, hs=64, so nh*hs=C=768 channels in the Transformer
        qkv = self.c_attn(x)
        q, k, v = qkv.split(self.n_embd, dim=2)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)

        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)

        y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
        # output projection
        y = self.c_proj(y)
        return y


class MLP(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.c_fc    = nn.Linear(config.n_embd, 4 * config.n_embd)
        self.gelu    = nn.GELU(approximate='tanh')
        self.c_proj  = nn.Linear(4 * config.n_embd, config.n_embd)
        self.c_proj.NANOGPT_SCALE_INIT = 1

    def forward(self, x):
        x = self.c_fc(x)
        x = self.gelu(x)
        x = self.c_proj(x)
        return x

class Block(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.n_embd)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(config.n_embd)
        self.mlp = MLP(config)

    def forward(self, x):
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x


@dataclass
class GPTConfig:
    block_size: int = 1024 # max sequence length
    vocab_size: int = 50257 # number of tokens: 50,000 BPE merges + 256 bytes tokens + 1 <|endoftext|> token
    n_layer: int = 12 # number of layers
    n_head: int = 12 # number of heads
    n_embd: int = 768 # embedding dimension


class GPT(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.config = config

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            wpe = nn.Embedding(config.block_size, config.n_embd),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f = nn.LayerNorm(config.n_embd),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        # weight sharing
        self.transformer.wte.weight = self.lm_head.weight

        # weight initialization
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            std = 0.02
            if hasattr(module, 'NANGPT_SCALE_INIT'):
                std *= (2 * self.config.n_layer) ** -0.5
            torch.nn.init.normal_(module.weight, mean = 0.0, std = std)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std = 0.02)



    def forward(self, idx, targets=None):
        # idx is of shape (B, T)
        B, T = idx.size()
        assert T <= self.config.block_size, f"Cannot forward sequence of length {T}, block size is only {self.config.block_size}"
        # forward the token and posisition embeddings
        pos = torch.arange(0, T, dtype=torch.long, device=idx.device) # shape (T)
        pos_emb = self.transformer.wpe(pos) # position embeddings of shape (T, n_embd)
        tok_emb = self.transformer.wte(idx) # token embeddings of shape (B, T, n_embd)
        x = tok_emb + pos_emb
        # forward the blocks of the transformer
        for block in self.transformer.h:
            x = block(x)
        # forward the final layernorm and the classifier
        x = self.transformer.ln_f(x)
        logits = self.lm_head(x) # (B, T, vocab_size)
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss

    @classmethod
    def from_pretrained(cls, model_type):
        """Loads pretrained GPT-2 model weights from huggingface"""
        assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
        from transformers import GPT2LMHeadModel
        print("loading weights from pretrained gpt: %s" % model_type)

        # n_layer, n_head and n_embd are determined from model_type
        config_args = {
            'gpt2':         dict(n_layer=12, n_head=12, n_embd=768),  # 124M params
            'gpt2-medium':  dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
            'gpt2-large':   dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
            'gpt2-xl':      dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
        }[model_type]
        config_args['vocab_size'] = 50257 # always 50257 for GPT model checkpoints
        config_args['block_size'] = 1024 # always 1024 for GPT model checkpoints
        # create a from-scratch initialized minGPT model
        config = GPTConfig(**config_args)
        model = GPT(config)
        sd = model.state_dict()
        sd_keys = sd.keys()
        sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')] # discard this mask / buffer, not a param

        # init a huggingface/transformers model
        model_hf = GPT2LMHeadModel.from_pretrained(model_type)
        sd_hf = model_hf.state_dict()

        # copy while ensuring all of the parameters are aligned and match in names and shapes
        sd_keys_hf = sd_hf.keys()
        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')] # ignore these, just a buffer
        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')] # same, just the mask (buffer)
        transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
        # basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear
        # this means that we have to transpose these weights when we import them
        assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
        for k in sd_keys_hf:
            if any(k.endswith(w) for w in transposed):
                # special treatment for the Conv1D weights we need to transpose
                assert sd_hf[k].shape[::-1] == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k].t())
            else:
                # vanilla copy over the other parameters
                assert sd_hf[k].shape == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k])

        return model


@dataclass
class Config:
    vocab_size: int = 50257
    max_seq_len: int = 2048
    dim: int = 768
    num_layers: int = 12
    num_heads: int = 12
    dropout: float = 0.1

class MultiHeadAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.n_head = config.num_heads
        self.n_embd = config.dim

        # Linear projections for Q, K, V
        self.c_attn = nn.Linear(config.dim, 3 * config.dim) # [n_embd, 3 * n_embd]
        self.c_proj = nn.Linear(config.dim, config.dim) # [n_embd, n_embd]

        self.attn_dropout = nn.Dropout(config.dropout)
        self.resid_dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        B, T, C = x.size() # [B, T, n_embd]

        # Linear projection and split into Q, K, V
        q, k, v = self.c_attn(x).split(self.n_embd, dim=2) # [B, T, n_embd] each

        # Reshape for multi-head attention
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # [B, n_head, T, n_embd/n_head]
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # [B, n_head, T, n_embd/n_head]
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # [B, n_head, T, n_embd/n_head]

        # Attention scores
        att = (q @ k.transpose(-2, -1)) * (1.0 / (k.size(-1) ** 0.5)) # [B, n_head, T, T]
        att = F.softmax(att, dim=-1) # [B, n_head, T, T]
        att = self.attn_dropout(att) # [B, n_head, T, T]

        # Weighted sum of values
        y = att @ v # [B, n_head, T, n_embd/n_head]

        # Reshape and project
        y = y.transpose(1, 2).contiguous().view(B, T, C) # [B, T, n_embd]
        y = self.c_proj(y) # [B, T, n_embd]
        y = self.resid_dropout(y) # [B, T, n_embd]

        return y

class FeedForward(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.c_fc = nn.Linear(config.dim, 4 * config.dim) # [n_embd, 4 * n_embd]
        self.c_proj = nn.Linear(4 * config.dim, config.dim) # [4 * n_embd, n_embd]
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        x = self.c_fc(x) # [B, T, 4 * n_embd]
        x = F.gelu(x) # [B, T, 4 * n_embd]
        x = self.c_proj(x) # [B, T, n_embd]
        x = self.dropout(x) # [B, T, n_embd]
        return x

class TransformerBlock(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.dim) # [n_embd]
        self.attn = MultiHeadAttention(config)
        self.ln_2 = nn.LayerNorm(config.dim) # [n_embd]
        self.mlp = FeedForward(config)

    def forward(self, x):
        x = x + self.attn(self.ln_1(x)) # [B, T, n_embd]
        x = x + self.mlp(self.ln_2(x)) # [B, T, n_embd]
        return x

class DecoderOnlyTransformer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.wte = nn.Embedding(config.vocab_size, config.dim) # [vocab_size, n_embd]
        self.wpe = nn.Embedding(config.max_seq_len, config.dim) # [max_seq_len, n_embd]
        self.drop = nn.Dropout(config.dropout)
        self.blocks = nn.ModuleList([TransformerBlock(config) for _ in range(config.num_layers)])
        self.ln_f = nn.LayerNorm(config.dim) # [n_embd]
        self.lm_head = nn.Linear(config.dim, config.vocab_size, bias=False) # [n_embd, vocab_size]

        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, idx):
        B, T = idx.size() # [B, T]

        # Positional embeddings
        pos = torch.arange(0, T, dtype=torch.long, device=idx.device).unsqueeze(0) # [1, T]

        # Token and position embeddings
        tok_emb = self.wte(idx) # [B, T, n_embd]
        pos_emb = self.wpe(pos) # [1, T, n_embd]

        # Combine embeddings and apply dropout
        x = self.drop(tok_emb + pos_emb) # [B, T, n_embd]

        # Transformer blocks
        for block in self.blocks:
            x = block(x) # [B, T, n_embd]

        # Final layer norm and linear projection
        x = self.ln_f(x) # [B, T, n_embd]
        logits = self.lm_head(x) # [B, T, vocab_size]

        return logits

In [2]:
!pip install tiktoken
!pip install torchinfo

Collecting tiktoken
  Downloading tiktoken-0.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Downloading tiktoken-0.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m50.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.8.0
Collecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl.metadata (21 kB)
Downloading torchinfo-1.8.0-py3-none-any.whl (23 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.8.0


In [3]:
import os
import time
import math
import sys
import random
import tiktoken
import numpy as np
import torch
import torch.nn.functional as F

from pprint import pprint
from tqdm import tqdm
from torchinfo import summary

In [4]:
def set_all_seeds(seed):
    """
    Ensures reproducible behaviour by resetting all seeds with the seed given by `seed`.
    Moreover, additional parameters are set to ensure deterministic behaviour.

    Reference:
    [1] https://pytorch.org/docs/stable/notes/randomness.html, Accessed: 2021-07-19

    Args:
        seed: The desired seed to be set
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
    # torch.backends.cudnn.deterministic = True
    # torch.backends.cudnn.benchmark = False

In [5]:
class DataLoaderLite:
    def __init__(self, B, T):
        self.B = B
        self.T = T
        self.tokenizer = tiktoken.get_encoding('gpt2')

        # at init load tokens from disk and store them in memory
        with open(R'/content/input.txt', 'r') as f:
            text = f.read()

        tokens = self.tokenizer.encode(text)
        self.tokens = torch.tensor(tokens)
        self.num_batches = len(self.tokens) // (B * T)
        print(f'loaded {len(self.tokens)} tokens')
        print(f'1 epoch = {self.num_batches} batches')

        # state
        self.current_position = 0

    def __len__(self):
        return len(self.tokens) // (self.B * self.T) - 1   # Subtracting 1 as the last batch will be used for text generation and not for training

    def next_batch(self):
        B, T = self.B, self.T
        buf = self.tokens[self.current_position: self.current_position + B * T + 1]
        x = (buf[:-1]).view(B, T) # inputs
        y = (buf[1:]).view(B, T) # targets
        # advance the position in the tensor
        self.current_position += B*T
        # if loading the next batch would be out of bounds, reset
        if self.current_position + (B * T + 1) > len(self.tokens):
            self.current_position = 0
        return x, y

In [6]:
config = GPTConfig()
# # update config here with
# params = {"n_layer": 24, "n_head": 16}
# config = replace(config, **params)
print(config)
model = GPT(config)
model = model.to(torch.device("cpu"))

# Generate random input
input_ids = torch.randint(0, config.vocab_size, (16, 128)).to(torch.device("cpu"))

# Forward pass
logits = model(input_ids)
print("Input shape:", input_ids.shape)
print("Output shape:", logits[0].shape)

# summary
print(summary(model, input_data=input_ids, device=torch.device("cpu")))

del model, logits, input_ids, config

GPTConfig(block_size=1024, vocab_size=50257, n_layer=12, n_head=12, n_embd=768)
Input shape: torch.Size([16, 128])
Output shape: torch.Size([16, 128, 50257])
Layer (type:depth-idx)                             Output Shape              Param #
GPT                                                [16, 128, 50257]          --
├─ModuleDict: 1-1                                  --                        --
│    └─Embedding: 2-1                              [128, 768]                786,432
│    └─Embedding: 2-2                              [16, 128, 768]            38,597,376
│    └─ModuleList: 2-3                             --                        --
│    │    └─Block: 3-1                             [16, 128, 768]            7,087,872
│    │    └─Block: 3-2                             [16, 128, 768]            7,087,872
│    │    └─Block: 3-3                             [16, 128, 768]            7,087,872
│    │    └─Block: 3-4                             [16, 128, 768]            7,087,

In [7]:
model_history = {
    "train_loss": [],
    "steps": [],
    "lr": [],
    "tokens/sec": []
}


class WarmUpLR():
    def __init__(self, optimizer, initial_lr, num_steps_to_warm_up=10):
        """
        Args:
            optimizer: The used optimizer
            initial_lr: The initial learning rate
            num_steps_to_warm_up (optional): The number of epochs the learning rate should be warmed up
        """
        self.optimizer = optimizer
        self.initial_lr = initial_lr
        self.num_steps_to_warm_up = num_steps_to_warm_up
        self.last_step = 0

    def step(self):
        if self.last_step < self.num_steps_to_warm_up:
            lr = self.initial_lr / (self.num_steps_to_warm_up - self.last_step)
        else:
            lr = self.initial_lr

        self.last_step += 1

        for param_group in self.optimizer.param_groups:
            param_group['lr'] = lr


class WarmUpCosineLR(torch.optim.lr_scheduler.LambdaLR):
    """
        Linear warmup and then cosine decay.
        Linearly increases learning rate from 0 to 1 over `warmup_steps` training steps.
        Decreases learning rate from 1. to 0. over remaining `t_total - warmup_steps` steps following a cosine curve.
        If `cycles` (default=0.5) is different from default, learning rate follows cosine function after warmup.
    """

    def __init__(self, optimizer, warmup_steps, t_total, cycles=.5, last_epoch=-1):
        """_summary_

        Args:
            optimizer : optimizer used
            warmup_steps : steps to warm up
            t_total : total steps
            cycles : Learning rate tranversal. Defaults to 0.5.
            last_epoch : last epoch. Defaults to -1.
        """
        self.warmup_steps = warmup_steps
        self.t_total = t_total
        self.cycles = cycles
        super().__init__(optimizer, self.lr_lambda, last_epoch=last_epoch)

    def lr_lambda(self, step):
        if step < self.warmup_steps:
            return float(step) / float(max(1.0, self.warmup_steps))
        # progress after warmup
        progress = float(step - self.warmup_steps) / float(max(1, self.t_total - self.warmup_steps))
        return max(0.0, 0.5 * (1. + math.cos(math.pi * float(self.cycles) * 2.0 * progress)))


def train(data_loader, steps, model, optimizer, scheduler, criterion=None, device=torch.device("cpu")):
    train_bar = tqdm(range(steps),
                     total=steps,
                     colour="blue",
                     file=sys.stdout,
                     bar_format="{l_bar}{bar:30}{r_bar}"
                    )
    # running_train_loss = 0
    with open('training_metrics.txt', 'w') as f:
        for step in train_bar:
            t0 = time.time()
            train_bar.set_description(f"Training [S={step+1:02d}]")
            sequence, target = data_loader.next_batch()
            sequence, target = sequence.to(device), target.to(device)

            # Forward pass
            with torch.autocast(device_type="cuda", dtype=torch.float16):
                outputs, loss = model(sequence, target)
            # loss = loss_fn(outputs.view(-1, outputs.size(-1)), target.view(-1))    # logits.view(-1, logits.size(-1)), targets.view(-1)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()

            optimizer.step()

            if scheduler:
                scheduler.step()

            torch.cuda.synchronize()
            t1 = time.time()
            batch_loss = loss.item()
            tokens_per_sec = (data_loader.B * data_loader.T) / (t1 - t0)
            model_history["train_loss"].append(batch_loss)
            model_history["steps"].append(step)
            model_history["lr"].append(optimizer.param_groups[0]['lr'])
            model_history["tokens/sec"].append(tokens_per_sec)
            train_bar.set_postfix_str(f"loss={batch_loss:.6f} | tokens/sec: {tokens_per_sec:.2f}")

            f.write(f"Step: {step}, Loss: {batch_loss:.6f}, Tokens/sec: {tokens_per_sec:.2f}\n")

    return sequence, loss


In [8]:
set_all_seeds(42)

# config
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
seed = 42
batch_size=16
seq_len = 128
num_return_sequences = 10
max_length = 30
epochs = 40

data_loader = DataLoaderLite(B = batch_size, T = seq_len)
steps = data_loader.num_batches * epochs

# model
config = GPTConfig()
# params = {"n_layer": 24, "n_head": 16}
# config = replace(config, **params)
model = GPT(config)
model = model.to(device)

# hparams
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4)
# scheduler = WarmUpLR(optimizer=optimizer, initial_lr=5e-4, num_steps_to_warm_up=200)
scheduler = WarmUpCosineLR(optimizer=optimizer, warmup_steps=0.15*steps, t_total=steps)

loaded 338025 tokens
1 epoch = 165 batches


In [9]:
sequence, train_loss = train(data_loader, steps, model, optimizer, scheduler, device=device)
print(f"Train Loss: {train_loss}")

Training [S=6600]: 100%|[34m██████████████████████████████[0m| 6600/6600 [19:31<00:00,  5.63it/s, loss=0.021756 | tokens/sec: 11933.08]
Train Loss: 0.021756356582045555


In [10]:
torch.save({"step": steps,
            "model_state_dict": model.state_dict(),
            "optimizer_state_dict": optimizer.state_dict(),
            "loss": train_loss,
            },
           os.path.join(f"gpt-config.pth"))

In [11]:
from matplotlib.pyplot import ylabel
import plotly.graph_objs as go
from plotly.subplots import make_subplots

# Create subplots with 3 rows, 1 column
fig = make_subplots(
    rows=3, cols=1,
    # shared_xaxes=True,
    vertical_spacing=0.1,
    subplot_titles=("Training Loss", "Learning Rate", "Tokens per Second"),
    # row_heights=[0.33, 0.33, 0.33]  # Optional, adjust row heights
)

# Add Training Loss plot
fig.add_trace(
    go.Scatter(x=list(range(len(model_history["train_loss"]))),
               y=model_history["train_loss"],
               mode='lines',
               name='Training Loss',
               line=dict(color='red')),
    row=1, col=1
)

# Add Learning Rate plot
fig.add_trace(
    go.Scatter(x=list(range(len(model_history["lr"]))),
               y=model_history["lr"],
               mode='lines',
               name='Learning Rate',
               line=dict(color='blue')),
    row=2, col=1
)

# Add Tokens per second plot
fig.add_trace(
    go.Scatter(x=list(range(len(model_history["tokens/sec"]))),
               y=model_history["tokens/sec"],
               mode='lines',
               name='Tokens/sec',
               line=dict(color='green')),
    row=3, col=1
)

# Update layout with axis labels and titles
fig.update_layout(
    title="Model Training Metrics",
    xaxis_title="Steps",
    showlegend=True,
    height=900
)

# Show the plot
fig.show()

In [12]:
device = "cuda" if torch.cuda.is_available() else "cpu"
seed = 42
batch_size=16
seq_len = 128
num_return_sequences = 10
max_length = 30
epochs = 25

# model
config = GPTConfig()
model = GPT(config)

ckpt = torch.load("gpt-config.pth", map_location="cpu")
model.load_state_dict(ckpt["model_state_dict"])
model.to(device)

print("weights loaded!")



You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.



weights loaded!


In [13]:
def generate_text(text, model, max_length, num_return_sequences):
    print(F"Input Text: {text}")

    tokenizer = tiktoken.get_encoding('gpt2')
    tokens = tokenizer.encode(text)
    tokens = torch.tensor(tokens).unsqueeze(0).repeat(num_return_sequences, 1)
    x = tokens.to(device)

    model.eval()

    # for s in range(num_return_sequences):
    # generated_text = []
    for _ in range(max_length):

        with torch.no_grad():
            logits, _ = model(x)
        # take the logits at the last position
        logits = logits[:, -1, :] # (B, vocab_size)
        # get the probabilities
        probs = F.softmax(logits, dim=-1)
        # do top-k sampling of 50 (huggingface pipeline default)
        # topk_probs here becomes (5, 50), topk_indices is (5, 50)
        topk_probs, topk_indices = torch.topk(probs, 50, dim=-1)
        # select a token from the top-k probabilities
        # note: multinomial does not demand the input to sum to 1
        ix = torch.multinomial(topk_probs, 1) # (B, 1)
        # gather the corresponding indices
        xcol = torch.gather(topk_indices, -1, ix) # (B, 1)
        # append to the sequence
        x = torch.cat((x, xcol), dim=1)

    # tokens = x[s, :max_length].tolist()
    # generated_text.append(tokenizer.decode(tokens))

        # generated_text.append(tokenizer.decode_batch(x[s, :max_length].tolist()))

    generated_text = []
    for i in range(num_return_sequences):
        tokens = x[i, :max_length].tolist()
        decoded = tokenizer.decode(tokens)
        generated_text.append(decoded + "\n>>>\n")

    return "".join(generated_text)

# pprint(generate_text(sequence, tokenizer, model, max_length, num_return_sequences))
pprint(generate_text("Hear this, O Senate, hear our grief!",
                     model,
                     max_length=64,
                     num_return_sequences=5
                     )
)

Input Text: Hear this, O Senate, hear our grief!
('Hear this, O Senate, hear our grief!\n'
 '\n'
 'GLOUCESTER:\n'
 'Peace, good cousin, for my tongue!\n'
 'My un opposite fear\n'
 "DU God'st are not to to the watch.\n"
 '\n'
 'KING EDWARD IV:\n'
 'I must I be no more:\n'
 'Stay\n'
 '>>>\n'
 'Hear this, O Senate, hear our grief! would be better,\n'
 'For sorrow we were nothing but so law as I should not say by mother:\n'
 'Nay, rather tell thee, officer,\n'
 'To speak with some thousand or every one day than my words:\n'
 'But, yet amen.\n'
 '\n'
 '\n'
 '>>>\n'
 'Hear this, O Senate, hear our grief! would I stay behind your news?\n'
 '\n'
 'PARIS:\n'
 'My lord, my lord, and to your king\n'
 'To buy so lasting it.\n'
 '\n'
 'GLOUCESTER:\n'
 "Stay, sir, or worms!' comfort, and not speak itself not\n"
 '>>>\n'
 'Hear this, O Senate, hear our grief! not, sir, but you\n'
 'Are either to you all old bride there?\n'
 '\n'
 'CL women are not so excuse,I speak no, garment from me not:\n'
 "But b