In [34]:
"""
We first implement a GPT-1-like language model to generate text. The training data looks like (y0, y1, y2) -> (y1, y2, y3), so each position contributes its own loss term (3 in this example). The model will be trained on chunks of data from Hemingway's most well-known novel. The default settings below produce a model with about 700K parameters. The model should be expected to work better and better as we increase the sizes of the parameters (starting with d_model below). Colab should allow us to scale to a 10M-parameter model without having to pay.

At the end, after training, you'll decode with this model and generate text. Although there are specialized metrics for this task (e.g. perplexity), we will simplify the comparisons by just considering training and validation losses.

The goal of this notebook is to inject noise into the model from various angles, using noise structures that could plausibly lead to interesting or surprising results.

"""

import torch
import torch.nn as nn
from torch.nn import functional as F

# hyperparameters
batch_size = 64 # number of independent sequences processed in parallel per batch
block_size = 256 # maximum context length (in tokens) used for predictions
max_iters = 5000 # total number of optimization steps in the training loop
eval_interval = 500 # how often we evaluate during optimization: every 500 iterations
learning_rate = 3e-4 # initial AdamW learning rate; can be set to different values
"""
Use 'mps' if on a mac as below:

device = 'mps' if torch.backends.mps.is_available() else 'cpu'
"""
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# How many batches we average over each time we evaluate
eval_iters = 200
d_model = 96 # embedding width; could increase this (e.g. to 384, a multiple of n_head) to make the model bigger.
n_head = 6 # each head's key, query, and value dimension is d_model // n_head.
n_layer = 6 # number of stacked decoder blocks that mix the embeddings --- `n_layer` is "Nx" in the paper.
dropout = 0.2 # dropout probability used by the attention and feed-forward layers
# ------------

torch.manual_seed(1337) # fix the torch RNG seed so runs are repeatable

<torch._C.Generator at 0x7c5d92738510>

In [None]:
!gdown 'https://drive.google.com/uc?export=download&id=1RlmRmXiWVKpZq98ftdtOIdM2lsA1uw3j'

Downloading...
From: https://drive.google.com/uc?export=download&id=1RlmRmXiWVKpZq98ftdtOIdM2lsA1uw3j
To: /content/hemingway.txt
  0% 0.00/133k [00:00<?, ?B/s]100% 133k/133k [00:00<00:00, 75.7MB/s]


In [27]:
# Read the whole corpus into memory as one string.
with open('hemingway.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Character-level vocabulary: every distinct character in the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)

# Lookup tables between characters and their integer ids.
stoi = dict(zip(chars, range(vocab_size)))
itos = dict(enumerate(chars))


def encode(s):
    """Encoder: take a string, output a list of integer ids (one per character)."""
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: take a list of integer ids, output the corresponding string."""
    return ''.join([itos[i] for i in l])


# Train and validation splits: the first 90% of the encoded corpus is train, the rest val.
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data))
train_data, val_data = data[:n], data[n:]

In [28]:
# data loading
def get_batch(split):
    """Sample a random batch of (input, target) windows from one data split.

    Args:
        split: 'train' selects ``train_data``; any other value selects ``val_data``.

    Returns:
        A tuple ``(xb, yb)`` of tensors moved to ``device``. Each stacks ``batch_size``
        windows of length ``block_size``; ``yb`` holds the same windows as ``xb``
        shifted one position to the right (the next-token targets).
    """
    source = val_data if split != 'train' else train_data
    # Random window starts; the upper bound leaves room for the one-step-shifted target.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs = torch.stack([source[s:s + block_size] for s in starts])
    targets = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
    return inputs.to(device), targets.to(device)

In [29]:
@torch.no_grad()
def estimate_loss(model):
    """Estimate the mean loss on the train and validation splits.

    For each split, draws ``eval_iters`` random batches with ``get_batch`` and
    averages the loss returned by ``model(xb, yb)``. Runs without gradient
    tracking and with the model in eval mode.

    Args:
        model: an ``nn.Module`` whose call returns ``(logits, loss)``.

    Returns:
        dict with keys 'train' and 'val', each a 0-d tensor holding the mean loss.
    """
    # Remember the caller's mode so evaluation does not silently flip an
    # eval-mode model into training mode.
    was_training = model.training
    model.eval()
    out = {}
    try:
        for split in ['train', 'val']:
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                xb, yb = get_batch(split)
                _, loss = model(xb, yb)
                losses[k] = loss.item()
            out[split] = losses.mean()
    finally:
        # Restore the original mode even if a batch raised.
        model.train(was_training)
    return out

In [36]:
"""
BASELINE

"""

class Head(nn.Module):
    """ a single causal (masked) self-attention head """

    def __init__(self, d_head):
        super().__init__()
        self.d_head = d_head
        # Bias-free projections from the model width (d_model) down to this head's width.
        self.W_K = nn.Linear(d_model, d_head, bias=False)
        self.W_Q = nn.Linear(d_model, d_head, bias=False)
        self.W_V = nn.Linear(d_model, d_head, bias=False)
        # Lower-triangular matrix of ones; zeros mark the "future" positions to hide.
        causal_mask = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer('tril', causal_mask)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply masked scaled dot-product attention to x of shape (B, T, d_model).

        Returns a tensor of shape (B, T, d_head).
        """
        _, seq_len, _ = x.shape
        keys = self.W_K(x)     # (B, T, d_head)
        queries = self.W_Q(x)  # (B, T, d_head)
        values = self.W_V(x)   # (B, T, d_head)

        # Attention scores ("affinities"), scaled by 1/sqrt(d_head).
        scale = self.d_head ** -0.5
        scores = (queries @ keys.transpose(-2, -1)) * scale  # (B, T, T)

        # A position may only attend to itself and earlier positions.
        is_future = self.tril[:seq_len, :seq_len] == 0
        scores = scores.masked_fill(is_future, float('-inf'))

        weights = self.dropout(F.softmax(scores, dim=-1))  # (B, T, T)
        # Weighted aggregation of the values.
        return weights @ values  # (B, T, d_head)

class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel

    Args:
        num_heads: number of attention heads run side by side.
        d_head: output width of each individual head.
    """

    def __init__(self, num_heads, d_head):
        super().__init__()
        self.heads = nn.ModuleList([Head(d_head) for _ in range(num_heads)])
        # Learned linear map from the concatenated head outputs back to d_model.
        # The concatenation has width num_heads * d_head, which equals d_model only
        # when d_model is divisible by num_heads; sizing the input explicitly keeps
        # the layer valid for other d_model values as well.
        self.proj = nn.Linear(num_heads * d_head, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Run every head on x and project the concatenation: (B, T, d_model) -> (B, T, d_model)."""
        # Concatenate the different representations per head along the feature axis.
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        # Project the concatenation.
        out = self.dropout(self.proj(out))
        return out

class FeedFoward(nn.Module):
    """
    Token-level feed-forward network: expand to 4 * d_model, apply ReLU,
    project back to d_model, then apply dropout.
    """

    def __init__(self, d_model):
        super().__init__()
        hidden_width = 4 * d_model
        layers = [
            nn.Linear(d_model, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, d_model),
            nn.Dropout(dropout),
        ]
        self.ff = nn.Sequential(*layers)

    def forward(self, x):
        out = self.ff(x)
        return out

class DecoderBlock(nn.Module):
    """
    One transformer decoder block: self-attention ("communication") followed by a
    feed-forward network ("computation"). Blocks are stacked one after another.
    """

    def __init__(self, d_model, n_head):
        # d_model: embedding dimension, n_head: the number of attention heads
        super().__init__()
        # Each head works on an equally sized, smaller slice of the model width.
        head_width = d_model // n_head
        self.sa = MultiHeadAttention(n_head, head_width)
        self.ff = FeedFoward(d_model)
        self.ln1 = nn.LayerNorm(d_model)
        self.ln2 = nn.LayerNorm(d_model)

    def forward(self, x):
        """
        Pre-LayerNorm residual block. This differs from the original
        "Attention Is All You Need" layout, which normalizes after the residual add:
            x = self.ln1(x + self.sa(x))
            x = self.ln2(x + self.ff(x))
        Here the normalization is applied to each sub-layer's input instead
        (see Figure 1 of https://arxiv.org/pdf/2002.04745.pdf).
        """
        attended = self.sa(self.ln1(x))
        x = x + attended
        transformed = self.ff(self.ln2(x))
        return x + transformed


# class GPT(nn.Module):
#     def __init__(self):
#         super().__init__()
#         # each token directly reads off the logits for the next token from a lookup table
#         self.token_embedding_table = nn.Embedding(vocab_size, d_model)
#         self.position_embedding_table = nn.Embedding(block_size, d_model)
#         self.blocks = nn.Sequential(
#             *[DecoderBlock(d_model, n_head=n_head) for _ in range(n_layer)]
#         )
#          # final layer norm
#         self.ln = nn.LayerNorm(d_model)
#         self.ff = nn.Linear(d_model, vocab_size)

#     def forward(self, idx, targets=None):
#         B, T = idx.shape

#         # idx and targets are both (B,T) tensor of integers
#         # (B,T,d_model)
#         tok_emb = self.token_embedding_table(idx)
#         # (T,d_model)
#         pos_emb = self.position_embedding_table(torch.arange(T, device=device))
#         # Add positional encodings.
#         # (B,T,C)
#         x = tok_emb + pos_emb

#         # Mix up the token representations over and over via the blocks
#         # (B,T,C)
#         x = self.blocks(x)
#         # (B,T,C)
#         x = self.ln(x)
#         # (B,T,vocab_size)
#         logits = self.ff(x)

#         if targets is None:
#             loss = None
#         else:
#             B, T, C = logits.shape
#             logits = logits.view(B*T, C)
#             targets = targets.view(B*T)
#             loss = F.cross_entropy(logits, targets)

#         return logits, loss

#     def generate(self, idx, max_new_tokens):
#         """
#         idx is (B, T) array of indices in the current context.
#         This will generate B total paths in parrallel.
#         """
#         self.eval()
#         for _ in range(max_new_tokens):
#             # crop idx to the last block_size tokens
#             # The model only has kowledge of the context of maximum size block_size.
#             idx_cond = idx[:, -block_size:]
#             # get the predictions
#             # (B, T, vocab_size)
#             logits, loss = self(idx_cond)
#             # focus only on the last time step
#             logits = logits[:, -1, :] # becomes (B, vocab_size)
#             # apply softmax to get probabilities
#             probs = F.softmax(logits, dim=-1) # (B, vocab_size)
#             # sample from the distribution
#             idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
#             # append sampled index to the running sequence
#             idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
#         self.train()
#         return idx




In [37]:
"""
ALTERNATIVE_1: Gaussian Noise Injection

- Idea: Inject Gaussian noise directly into the word embeddings or positional embeddings.
        This can help the model learn more robust representations by forcing it to handle
        slight variations in the input space.

- Implementation: After obtaining the word embeddings, add a small amount of Gaussian noise
                  (with mean 0 and a chosen variance) before feeding them into the transformer layers.

- self.noise_std: A hyperparameter that controls the standard deviation of the Gaussian noise added to the token embeddings.
                  A higher value means more noise, which increases the regularization effect.

- Noise Addition: During training (self.training), Gaussian noise with mean 0 and standard deviation self.noise_std is added
                  to the token embeddings.

- Embedding Modification:
  - tok_emb: The token embeddings generated from the input indices.
  - pos_emb: The position embeddings that are added to the token embeddings.
  - Noise Addition: Before passing the combined embeddings through the transformer blocks, Gaussian noise is added to perturb the embeddings slightly.

- Training vs. Inference:
  - Training: Noise is added only during training. This regularizes the model by forcing it to be robust to small perturbations in the embedding space.
  - Inference: During inference (evaluation mode), no noise is added to ensure stable predictions.


"""

class GPT(nn.Module):
    """GPT-style decoder-only language model with Gaussian noise on the input embeddings.

    Args:
        noise_std: standard deviation of the zero-mean Gaussian noise added to the
            summed token + position embeddings while the module is in training mode.
            A value of 0 (or less) disables the noise.
    """

    def __init__(self, noise_std=0.01):
        super().__init__()
        # Embedding layers
        self.token_embedding_table = nn.Embedding(vocab_size, d_model)
        self.position_embedding_table = nn.Embedding(block_size, d_model)
        self.blocks = nn.Sequential(
            *[DecoderBlock(d_model, n_head=n_head) for _ in range(n_layer)]
        )
        self.ln = nn.LayerNorm(d_model)
        self.ff = nn.Linear(d_model, vocab_size)
        self.noise_std = noise_std  # Standard deviation of the Gaussian noise

    def forward(self, idx, targets=None):
        """Compute next-token logits and, if targets are given, the cross-entropy loss.

        Args:
            idx: (B, T) tensor of token ids, with T <= block_size.
            targets: optional (B, T) tensor of next-token ids.

        Returns:
            ``(logits, loss)``. Without targets, logits is (B, T, vocab_size) and loss
            is None. With targets, logits is flattened to (B*T, vocab_size) and loss
            is the mean cross-entropy over all positions.
        """
        B, T = idx.shape

        # Token and position embeddings
        tok_emb = self.token_embedding_table(idx)  # (B, T, d_model)
        # Build the position indices on the input's own device rather than the
        # notebook-global `device`, so the model also works after being moved.
        positions = torch.arange(T, device=idx.device)
        pos_emb = self.position_embedding_table(positions)  # (T, d_model)
        x = tok_emb + pos_emb  # (B, T, d_model)

        # Add Gaussian noise to the embeddings (training mode only)
        if self.training and self.noise_std > 0:
            noise = torch.randn_like(x) * self.noise_std
            x = x + noise

        # Pass through transformer blocks
        x = self.blocks(x)
        x = self.ln(x)
        logits = self.ff(x)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Autoregressively sample ``max_new_tokens`` tokens after the context ``idx``.

        Args:
            idx: (B, T) tensor of token ids used as the starting context.
            max_new_tokens: number of tokens to append to every row.

        Returns:
            (B, T + max_new_tokens) tensor holding the context followed by the samples.
        """
        # Sample in eval mode, then put the module back in whatever mode the caller
        # had it in (instead of unconditionally switching to training mode).
        was_training = self.training
        self.eval()
        try:
            # Sampling needs no gradients; skip building the autograd graph.
            with torch.no_grad():
                for _ in range(max_new_tokens):
                    # The model only knows about the last block_size tokens.
                    idx_cond = idx[:, -block_size:]
                    logits, _ = self(idx_cond)
                    logits = logits[:, -1, :]  # last time step: (B, vocab_size)
                    probs = F.softmax(logits, dim=-1)
                    idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
                    idx = torch.cat((idx, idx_next), dim=1)
        finally:
            self.train(was_training)
        return idx


In [31]:
"""
ALTERNATIVE_7: Perturbed Token Mixup
- Idea: Combine (or mix) embeddings of different tokens from the same sequence or from different sequences
        with some noise. This method is inspired by the mixup technique used in image processing, promoting
        smoother transitions between different token representations.

-Implementation: For each token in a sequence, mix its embedding with that of a random token from the same or
                 a different sequence, adding some noise during the combination.

origin of idea: https://medium.com/@lhungting/mixup-a-trivial-but-powerful-image-augmentation-technique-4e2d0725b8e3#:~:text=MixUp%20augmentation%20linearly%20combines%20an,sampled%20from%20a%20Beta%20distribution.



- Mixup Probability (mixup_prob): This parameter determines how often the Perturbed Token Mixup is applied during training.
                                  A higher probability means more frequent mixups.

- Lambda (lam): A random mixing coefficient that determines the ratio between the original and the permuted token embeddings.
              The closer lam is to 1, the more the mixed embedding resembles the original one.

- Noise Addition (noise_std): This parameter controls the standard deviation of the Gaussian noise added to the mixed embeddings.
                            The noise helps the model to handle variations in the input representations.

- Mixup Mechanism: If the model is in training mode (self.training) and a random value is less than mixup_prob, the token embeddings are mixed
                with those of another random batch. This mixup creates new training examples by combining inputs from different sequences,
                effectively augmenting the training data.

- Integration with Transformer Blocks: After applying the mixup and noise, the perturbed embeddings are passed through the transformer blocks as usual.

"""
class GPT(nn.Module):
    """GPT-style decoder-only language model with "perturbed token mixup" on the embeddings.

    In training mode, with probability ``mixup_prob`` per forward pass, every sequence's
    embeddings are linearly mixed with those of another sequence of the batch (chosen by
    a random permutation) and Gaussian noise is added. In eval mode the forward pass is
    unperturbed.
    """

    def __init__(self):
        super().__init__()
        # Embedding layers
        self.token_embedding_table = nn.Embedding(vocab_size, d_model)
        self.position_embedding_table = nn.Embedding(block_size, d_model)
        self.blocks = nn.Sequential(
            *[DecoderBlock(d_model, n_head=n_head) for _ in range(n_layer)]
        )
        self.ln = nn.LayerNorm(d_model)
        self.ff = nn.Linear(d_model, vocab_size)

    def forward(self, idx, targets=None, mixup_prob=0.2, noise_std=0.01):
        """Compute next-token logits and, if targets are given, the cross-entropy loss.

        Args:
            idx: (B, T) tensor of token ids, with T <= block_size.
            targets: optional (B, T) tensor of next-token ids.
            mixup_prob: probability of applying mixup on a training-mode forward pass.
            noise_std: standard deviation of the Gaussian noise added after mixing.

        Returns:
            ``(logits, loss)``. Without targets, logits is (B, T, vocab_size) and loss
            is None. With targets, logits is flattened to (B*T, vocab_size) and loss
            is the mean cross-entropy; when mixup was applied, the loss is mixed
            between the original and the permuted targets with the same coefficient
            that was used for the embeddings.
        """
        B, T = idx.shape

        # Token and position embeddings
        tok_emb = self.token_embedding_table(idx)  # (B, T, d_model)
        # Build the position indices on the input's own device rather than the
        # notebook-global `device`, so the model also works after being moved.
        positions = torch.arange(T, device=idx.device)
        pos_emb = self.position_embedding_table(positions)  # (T, d_model)
        x = tok_emb + pos_emb  # (B, T, d_model)

        # Apply Perturbed Token Mixup.
        # The gate is drawn from torch's RNG: the `random` module was never imported
        # (the original call raised NameError), and torch's RNG is the one seeded by
        # torch.manual_seed, so runs stay repeatable.
        perm = None
        lam = 1.0
        if self.training and torch.rand(1).item() < mixup_prob:
            # Partner sequence for every row of the batch
            perm = torch.randperm(B, device=idx.device)
            lam = torch.rand(1).item()  # Lambda for mixing
            x = lam * x + (1 - lam) * x[perm]
            # Add Gaussian noise for perturbation
            x = x + noise_std * torch.randn_like(x)

        # Pass through transformer blocks
        x = self.blocks(x)
        x = self.ln(x)
        logits = self.ff(x)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            loss = F.cross_entropy(logits, targets.reshape(B * T))
            if perm is not None:
                # Mixup blends the labels together with the inputs: without this the
                # model is asked to predict only the original targets from inputs that
                # may be dominated by the partner sequence (small lam).
                partner_loss = F.cross_entropy(logits, targets[perm].reshape(B * T))
                loss = lam * loss + (1 - lam) * partner_loss

        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Autoregressively sample ``max_new_tokens`` tokens after the context ``idx``.

        Args:
            idx: (B, T) tensor of token ids used as the starting context.
            max_new_tokens: number of tokens to append to every row.

        Returns:
            (B, T + max_new_tokens) tensor holding the context followed by the samples.
        """
        # Sample in eval mode, then put the module back in whatever mode the caller
        # had it in (instead of unconditionally switching to training mode).
        was_training = self.training
        self.eval()
        try:
            # Sampling needs no gradients; skip building the autograd graph.
            with torch.no_grad():
                for _ in range(max_new_tokens):
                    # The model only knows about the last block_size tokens.
                    idx_cond = idx[:, -block_size:]
                    logits, _ = self(idx_cond)
                    logits = logits[:, -1, :]  # last time step: (B, vocab_size)
                    probs = F.softmax(logits, dim=-1)
                    idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
                    idx = torch.cat((idx, idx_next), dim=1)
        finally:
            self.train(was_training)
        return idx


In [None]:
# """
# ALTERNATIVE_9: Noisy Residual Connections

# - Idea: Add noise to the residual connections that skip layers. This can make the learning process more resilient to errors in the intermediate layers
# and encourage the model to learn more robust representations.
# - Implementation: Inject Gaussian or uniform noise into the residual connection before adding it back to the layer’s output.

# - noise_std: This parameter controls the amount of noise added to the residual connections. You can adjust it based on your experiment.
# - torch.randn_like(x): Generates a tensor of the same shape as x with values drawn from a standard normal distribution.
# - Adding Noise: Noise is added after the residual connection is computed, making the output slightly perturbed.
# This noise is added twice: once after the self-attention block and once after the feed-forward block.

# """

# class DecoderBlock(nn.Module):
#     """
#     Transformer decoder block: communication followed by computation.
#     These are stacked on top of each other one after another.
#     """

#     def __init__(self, d_model, n_head, noise_std=0.1):
#         super().__init__()
#         d_head = d_model // n_head
#         self.sa = MultiHeadAttention(n_head, d_head)
#         self.ff = FeedFoward(d_model)
#         self.ln1 = nn.LayerNorm(d_model)
#         self.ln2 = nn.LayerNorm(d_model)
#         self.noise_std = noise_std  # Standard deviation for the Gaussian noise

#     def forward(self, x):
#         # Apply self-attention and add Gaussian noise to the residual connection
#         residual = x
#         x = self.ln1(x)
#         x = self.sa(x)
#         x = residual + x + torch.randn_like(x) * self.noise_std

#         # Apply feed-forward network and add Gaussian noise to the residual connection
#         residual = x
#         x = self.ln2(x)
#         x = self.ff(x)
#         x = residual + x + torch.randn_like(x) * self.noise_std

#         return x


In [23]:
# """
# ALTERNATIVE_10: DropConnect in Transformer Weights
# - Idea: Instead of dropping out entire neurons or units, randomly drop out individual connections (weights)
# within the transformer layers. This is known as DropConnect and can lead to sparser, more efficient learning.
# - Implementation: Apply dropout to the weights of the transformer layers (not the activations), randomly setting a fraction of the weights to zero during training.

# - DropConnect Function: This function applies DropConnect by randomly setting a portion of the weights to zero based on the drop_prob.

# - Modifications in Head and MultiHeadAttention: DropConnect is applied to the weights of the linear layers in the Head and MultiHeadAttention
#   classes. This regularizes the attention mechanism by zeroing out random weights.

# - Modifications in FeedForward: Similarly, DropConnect is applied to the weights in the feed-forward network, adding regularization to the transformation layers.

# - drop_prob: This parameter controls the probability of each weight being set to zero. You can tune this parameter based on your experiment.-

# """

# def dropconnect(layer, drop_prob):
#     """Apply DropConnect to the weights of the given layer."""
#     if not layer.training or drop_prob == 0:
#         return layer.weight
#     # Create a binary mask with the same shape as the weights
#     mask = torch.bernoulli(torch.ones_like(layer.weight) * (1 - drop_prob))
#     # Apply the mask to the weights
#     return layer.weight * mask

# class Head(nn.Module):
#     """ one head of self-attention """

#     def __init__(self, d_head, drop_prob=0.1):
#         super().__init__()
#         self.d_head = d_head
#         self.drop_prob = drop_prob
#         self.W_K = nn.Linear(d_model, d_head, bias=False)
#         self.W_Q = nn.Linear(d_model, d_head, bias=False)
#         self.W_V = nn.Linear(d_model, d_head, bias=False)
#         self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
#         self.dropout = nn.Dropout(dropout)

#     def forward(self, x):
#         B, T, d = x.shape
#         # Apply DropConnect to the linear layers' weights
#         k = F.linear(x, dropconnect(self.W_K, self.drop_prob))   # (B, T, d_head)
#         q = F.linear(x, dropconnect(self.W_Q, self.drop_prob))   # (B, T, d_head)
#         v = F.linear(x, dropconnect(self.W_V, self.drop_prob))   # (B, T, d_head)

#         scores = q @ k.transpose(-2, -1) * self.d_head ** -0.5  # (B, T, T)
#         scores = scores.masked_fill(self.tril[:T, :T] == 0, float('-inf'))  # (B, T, T)
#         a = F.softmax(scores, dim=-1)  # (B, T, T)
#         a = self.dropout(a)
#         out = a @ v  # (B, T, d_head)
#         return out

# class MultiHeadAttention(nn.Module):
#     """ multiple heads of self-attention in parallel """

#     def __init__(self, num_heads, d_head, drop_prob=0.1):
#         super().__init__()
#         self.heads = nn.ModuleList([Head(d_head, drop_prob) for _ in range(num_heads)])
#         self.proj = nn.Linear(d_model, d_model)
#         self.dropout = nn.Dropout(dropout)
#         self.drop_prob = drop_prob

#     def forward(self, x):
#         out = torch.cat([h(x) for h in self.heads], dim=-1)
#         # Apply DropConnect to the projection layer's weights
#         out = F.linear(out, dropconnect(self.proj, self.drop_prob))
#         out = self.dropout(out)
#         return out

# class FeedFoward(nn.Module):
#     """
#     A simple linear layer followed by a non-linearity; this is applied at the token level.
#     """

#     def __init__(self, d_model, drop_prob=0.1):
#         super().__init__()
#         self.drop_prob = drop_prob
#         d_ff = 4 * d_model
#         self.fc1 = nn.Linear(d_model, d_ff)
#         self.fc2 = nn.Linear(d_ff, d_model)
#         self.dropout = nn.Dropout(dropout)

#     def forward(self, x):
#         # Apply DropConnect to the weights of the linear layers
#         x = F.linear(x, dropconnect(self.fc1, self.drop_prob))
#         x = F.relu(x)
#         x = F.linear(x, dropconnect(self.fc2, self.drop_prob))
#         return self.dropout(x)

# class DecoderBlock(nn.Module):
#     """
#     Transformer decoder block: communication followed by computation.
#     These are stacked on top of each other one after another.
#     """

#     def __init__(self, d_model, n_head, drop_prob=0.1):
#         super().__init__()
#         d_head = d_model // n_head
#         self.sa = MultiHeadAttention(n_head, d_head, drop_prob)
#         self.ff = FeedFoward(d_model, drop_prob)
#         self.ln1 = nn.LayerNorm(d_model)
#         self.ln2 = nn.LayerNorm(d_model)

#     def forward(self, x):
#         x = x + self.sa(self.ln1(x))
#         x = x + self.ff(self.ln2(x))
#         return x

# class GPT(nn.Module):
#     def __init__(self):
#         super().__init__()
#         self.token_embedding_table = nn.Embedding(vocab_size, d_model)
#         self.position_embedding_table = nn.Embedding(block_size, d_model)
#         self.blocks = nn.Sequential(
#             *[DecoderBlock(d_model, n_head=n_head) for _ in range(n_layer)]
#         )
#         self.ln = nn.LayerNorm(d_model)
#         self.ff = nn.Linear(d_model, vocab_size)

#     def forward(self, idx, targets=None):
#         B, T = idx.shape
#         tok_emb = self.token_embedding_table(idx)
#         pos_emb = self.position_embedding_table(torch.arange(T, device=device))
#         x = tok_emb + pos_emb
#         x = self.blocks(x)
#         x = self.ln(x)
#         logits = self.ff(x)

#         if targets is None:
#             loss = None
#         else:
#             B, T, C = logits.shape
#             logits = logits.view(B * T, C)
#             targets = targets.view(B * T)
#             loss = F.cross_entropy(logits, targets)

#         return logits, loss

#     def generate(self, idx, max_new_tokens):
#         self.eval()
#         for _ in range(max_new_tokens):
#             idx_cond = idx[:, -block_size:]
#             logits, loss = self(idx_cond)
#             logits = logits[:, -1, :]
#             probs = F.softmax(logits, dim=-1)
#             idx_next = torch.multinomial(probs, num_samples=1)
#             idx = torch.cat((idx, idx_next), dim=1)
#         self.train()
#         return idx




In [38]:
class EarlyStopping:
    """Signal that training should stop once validation loss drifts too far above train loss.

    Each call compares the relative gap ``(validation_loss - train_loss) / train_loss``
    with ``min_delta``. Every call whose gap exceeds ``min_delta`` increments
    ``counter``; the counter is cumulative (it is never reset), and once it reaches
    ``tolerance`` the ``early_stop`` flag is set to True.

    Args:
        tolerance: number of over-threshold evaluations allowed before stopping.
        min_delta: largest relative train/validation gap that is still acceptable.
    """

    def __init__(self, tolerance=5, min_delta=0):

        self.tolerance = tolerance
        self.min_delta = min_delta
        self.counter = 0          # over-threshold evaluations seen so far
        self.early_stop = False   # becomes True once counter reaches tolerance

    def __call__(self, train_loss, validation_loss):
        """Record one evaluation; sets ``early_stop`` when the tolerance is exhausted."""
        if train_loss == 0:
            # The relative gap is undefined here (plain floats would raise
            # ZeroDivisionError). Treat any positive validation loss as an
            # unbounded gap and an exact 0/0 as no gap.
            gap_exceeded = validation_loss > 0
        else:
            gap_exceeded = (validation_loss - train_loss) / train_loss > self.min_delta
        if gap_exceeded:
            self.counter += 1
            if self.counter >= self.tolerance:
                self.early_stop = True

In [39]:
model = GPT().to(device)
# Print the number of parameters in the model
print(sum(p.numel() for p in model.parameters()) / 1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
# Multiply the learning rate by 0.9 on every scheduler.step() call
# (called once per evaluation below, not once per optimization step).
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)
early_stopping = EarlyStopping(tolerance=1, min_delta=0.2)

# `step` (rather than `iter`) so the builtin iter() is not shadowed in the
# notebook's global namespace for every later cell.
for step in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if step % eval_interval == 0 or step == max_iters - 1:
        if step:
            # No decay at step 0, so training starts at the configured learning rate.
            scheduler.step()
        losses = estimate_loss(model)
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
        early_stopping(losses['train'], losses['val'])
        if early_stopping.early_stop:
            print("We stop at epoch {}".format(step))
            break

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()


0.706046 M parameters
step 0: train loss 4.3288, val loss 4.3201
step 500: train loss 2.2168, val loss 2.2290
step 1000: train loss 2.0454, val loss 2.0479
step 1500: train loss 1.7986, val loss 1.8035
step 2000: train loss 1.6482, val loss 1.6666
step 2500: train loss 1.5444, val loss 1.5728
step 3000: train loss 1.4684, val loss 1.5051
step 3500: train loss 1.4042, val loss 1.4594
step 4000: train loss 1.3643, val loss 1.4282


KeyboardInterrupt: 

In [None]:
# Start the model from token id 0 (the first character of the sorted vocabulary)
# and generate 100 new tokens.
# This is technically doing generations in batches, but here we have a batch size of 1 and 1 element to start in the batch
context = torch.zeros((1, 1), dtype=torch.long, device=device)
# Sample once so the printed text and the saved text are the same sample.
generated_text = decode(model.generate(context, max_new_tokens=100)[0].tolist())
print(generated_text)
# Write with an explicit encoding (the corpus was read as UTF-8) and close the file.
with open('fake_hemingway.txt', 'w', encoding='utf-8') as out_file:
    out_file.write(generated_text)

In [None]:
# Save only the model's state_dict (not the whole module object) to 'gpt.pt'.
torch.save(model.state_dict(), 'gpt.pt')