In [1]:
%pip install tiktoken datasets lightning wandb

Collecting tiktoken
  Downloading tiktoken-0.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl.metadata (19 kB)
Collecting lightning
  Downloading lightning-2.2.4-py3-none-any.whl.metadata (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.4/53.4 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting wandb
  Downloading wandb-0.16.6-py3-none-any.whl.metadata (10 kB)
Collecting regex>=2022.1.18 (from tiktoken)
  Downloading regex-2024.4.28-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=12.0.0 (from datasets)
  Downloading pyarrow-16.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.0 kB)
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3

In [1]:
import wandb
# Authenticate with Weights & Biases before any logger is created; the call
# returns True when a login is already active (see the cell output).
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mkevinv3796[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [2]:
import tiktoken
from datasets import load_dataset
import torch
from torch.utils.data import Dataset, DataLoader
import lightning as L
from typing import Any
from torch.utils.data import random_split
import os
import pickle
from lightning.pytorch.loggers import WandbLogger
from lightning.pytorch.tuner import Tuner
import wandb
import torch
import numpy as np
def get_device():
    """Return the CUDA device when one is available, otherwise the CPU device."""
    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    return torch.device(device_name)

class TokenEmbedding(torch.nn.Module):
    """
    Lookup table that turns integer token ids into dense vectors.

    Input dimension is: (batch_size, sequence_length)
    Output dimension is: (batch_size, sequence_length, d_model)
    """

    def __init__(self, d_model, number_of_tokens):
        super().__init__()
        # One learnable row of width d_model for every entry of the vocabulary.
        self.embedding_layer = torch.nn.Embedding(number_of_tokens, d_model)

    def forward(self, x):
        """Replace every token id in x with its embedding vector."""
        embedded_tokens = self.embedding_layer(x)
        return embedded_tokens


class PositionalEncoding(torch.nn.Module):
    """
    Pytorch module that creates a sinusoidal positional encoding matrix and adds it to the
    transformer's input embeddings to provide a sense of position of the sequence elements.

    The matrix is registered as a non-persistent buffer: it follows the module across
    devices (`.to(...)`, `.cuda()`, Lightning device placement) but is not written to the
    state dict, so checkpoints keep the same keys as before.

    Input dimension is: (batch_size, sequence_length, d_model)
    Output dimension is: (batch_size, sequence_length, d_model)
    """

    def __init__(self, d_model, max_sequence_length):
        super().__init__()
        self.d_model = d_model
        self.max_sequence_length = max_sequence_length
        self.register_buffer(
            "positional_encoding",
            self.create_positional_encoding(),
            persistent=False,
        )

    def create_positional_encoding(self):
        """
        Creates a positional encoding matrix of size (max_sequence_length, d_model).

        Column pair (i, i + 1), with i even, holds sin and cos of pos / 10000 ** (i / d_model).
        Because i already is the even column index (i = 2k), the exponent is i / d_model;
        multiplying i by 2 again would make the frequencies decay twice as fast as intended.

        Returns:
            A float32 CPU tensor of shape (max_sequence_length, d_model).
        """
        # Column vector of positions and row vector of even column indices; broadcasting
        # yields one angle per (position, column pair).
        positions = np.arange(self.max_sequence_length, dtype=np.float64)[:, None]
        even_indices = np.arange(0, self.d_model, 2, dtype=np.float64)
        angles = positions / np.power(10000.0, even_indices / self.d_model)

        positional_encoding = np.zeros((self.max_sequence_length, self.d_model))
        # Sin goes to the even columns, cos to the odd columns.
        positional_encoding[:, 0::2] = np.sin(angles)
        # For an odd d_model there is one fewer odd column than even columns.
        positional_encoding[:, 1::2] = np.cos(angles[:, : self.d_model // 2])

        return torch.from_numpy(positional_encoding).float()

    def forward(self, x):
        """
        Adds the positional encoding to the input embeddings at the corresponding positions.
        """
        # x.size(1) is the sequence length, so only the first sequence_length rows of the
        # table are used; the result broadcasts over the batch dimension.
        positional_encoding = self.positional_encoding[:x.size(1), :]
        return x + positional_encoding


class MaskedSelfAttention(torch.nn.Module):
    """
    Pytorch module for a single causal self attention head.
    This layer is used in the MaskedMultiHeadedSelfAttention module.

    Two masks are combined: the padding mask supplied by the caller and a causal
    (lower-triangular) mask, so that position t can only attend to positions <= t.
    Without the causal mask a model trained to predict token t+1 at position t could
    simply read that token from its input.

    Input dimension is: (batch_size, sequence_length, embedding_dimension)
    Output dimension is: (batch_size, sequence_length, head_dimension)
    """

    def __init__(self, embedding_dimension, head_dimension):
        super().__init__()
        self.embedding_dimension = embedding_dimension
        self.head_dimension = head_dimension
        self.query_layer = torch.nn.Linear(embedding_dimension, self.head_dimension)
        self.key_layer = torch.nn.Linear(embedding_dimension, self.head_dimension)
        self.value_layer = torch.nn.Linear(embedding_dimension, self.head_dimension)
        self.softmax = torch.nn.Softmax(dim=-1)

    def forward(self, x, mask):
        """
        Compute the causal self attention.

        x dimension is: (batch_size, sequence_length, embedding_dimension)
        output dimension is: (batch_size, sequence_length, head_dimension)
        mask dimension is: (batch_size, sequence_length)

        mask values are: 0 or 1. 0 means the token is masked, 1 means the token is not masked.
        """

        # query, key, value dimensions are: (batch_size, sequence_length, head_dimension)
        query = self.query_layer(x)
        key = self.key_layer(x)
        value = self.value_layer(x)

        # Scaled dot-product scores.
        # attention_weights dimensions are: (batch_size, sequence_length, sequence_length)
        attention_weights = torch.matmul(query, key.transpose(-2, -1))
        attention_weights = attention_weights / (self.head_dimension ** 0.5)

        batch_size, sequence_length = attention_weights.shape[0], attention_weights.shape[2]

        # Padding mask: a key may be attended to only if it is a real token.
        # Shape (batch_size, 1, sequence_length) broadcasts over the query dimension.
        key_is_visible = mask.reshape(batch_size, 1, sequence_length) != 0

        # Causal mask: query position q (row) may see key position k (column) only if k <= q.
        causal = torch.tril(
            torch.ones(
                sequence_length,
                sequence_length,
                dtype=torch.bool,
                device=attention_weights.device,
            )
        )

        # (batch_size, 1, seq) & (seq, seq) -> (batch_size, seq, seq)
        allowed = key_is_visible & causal

        # Fill disallowed entries with a very low value so the softmax output is 0 for them.
        # Half precision cannot represent -1e30, hence the smaller magnitude there.
        # https://discuss.pytorch.org/t/runtimeerror-value-cannot-be-converted-to-type-at-half-without-overflow-1e-30/109768
        _MASKING_VALUE = -1e+30 if attention_weights.dtype == torch.float32 else -1e+4
        attention_weights = attention_weights.masked_fill(~allowed, _MASKING_VALUE)

        # Softmax makes sure all scores are between 0 and 1 and the sum of scores is 1.
        # attention_scores dimensions are: (batch_size, sequence_length, sequence_length)
        attention_scores = self.softmax(attention_weights)

        # Weighted sum of the values.
        # Output dimensions are: (batch_size, sequence_length, head_dimension)
        return torch.bmm(attention_scores, value)


class MaskedMultiHeadedSelfAttention(torch.nn.Module):
    """
    Runs several independent attention heads side by side and merges their results.

    Input dimension is: (batch_size, sequence_length, embedding_dimension)
    Output dimension is: (batch_size, sequence_length, embedding_dimension)
    """

    def __init__(self, embedding_dimension, number_of_heads):
        super().__init__()
        self.embedding_dimension = embedding_dimension
        # Integer division: each head works on an equal slice of the embedding width.
        self.head_dimension = embedding_dimension // number_of_heads
        self.number_of_heads = number_of_heads

        # One attention module per head.
        heads = []
        for _ in range(number_of_heads):
            heads.append(MaskedSelfAttention(embedding_dimension, self.head_dimension))
        self.self_attentions = torch.nn.ModuleList(heads)

        # Projection that mixes the concatenated head outputs back to the embedding width.
        concatenated_dimension = number_of_heads * self.head_dimension
        self.output_layer = torch.nn.Linear(concatenated_dimension, embedding_dimension)

    def forward(self, x, mask):
        """
        Apply every head to the same input and project the concatenated result.

        x dimensions are: (batch_size, sequence_length, embedding_dimension)
        mask dimensions are: (batch_size, sequence_length)
        mask values are: 0 or 1. 0 means the token is masked, 1 means the token is not masked.
        """
        # Each entry has dimensions (batch_size, sequence_length, head_dimension).
        per_head_outputs = []
        for head in self.self_attentions:
            per_head_outputs.append(head(x, mask))

        # Join along the feature axis:
        # (batch_size, sequence_length, number_of_heads * head_dimension)
        merged = torch.cat(per_head_outputs, dim=2)

        # (batch_size, sequence_length, embedding_dimension)
        return self.output_layer(merged)


class FeedForward(torch.nn.Module):
    """
    Position-wise feed forward block: expand, apply ReLU, project back.

    The input is mapped from embedding_dimension to feed_forward_dimension and back again,
    so the output has the same shape as the input.
    """

    def __init__(self, embedding_dimension, feed_forward_dimension):
        super().__init__()
        self.embedding_dimension = embedding_dimension
        self.feed_forward_dimension = feed_forward_dimension
        # Expansion followed by contraction.
        self.linear_1 = torch.nn.Linear(embedding_dimension, feed_forward_dimension)
        self.linear_2 = torch.nn.Linear(feed_forward_dimension, embedding_dimension)

    def forward(self, x):
        """
        Run x through linear_1, ReLU and linear_2.
        """
        hidden = self.linear_1(x)
        activated = torch.nn.functional.relu(hidden)
        return self.linear_2(activated)


class DecoderLayer(torch.nn.Module):
    """
    Pytorch module for a single decoder layer.

    A decoder layer is a multi-headed self attention sub-block followed by a feed forward
    sub-block. Both sub-blocks normalise their input first and are wrapped in a residual
    connection; dropout is applied to the feed forward output while training.

    Input dimension is: (batch_size, sequence_length, embedding_dimension)
    Output dimension is: (batch_size, sequence_length, embedding_dimension)
    """

    def __init__(
            self,
            embedding_dimension,
            number_of_heads,
            feed_forward_dimension,
            dropout_rate
    ):
        super().__init__()
        self.embedding_dimension = embedding_dimension
        self.number_of_heads = number_of_heads
        self.feed_forward_dimension = feed_forward_dimension
        self.dropout_rate = dropout_rate

        # Sub-modules are created in this order so parameter initialisation is reproducible.
        self.multi_headed_self_attention = MaskedMultiHeadedSelfAttention(embedding_dimension, number_of_heads)
        self.feed_forward = FeedForward(embedding_dimension, feed_forward_dimension)
        self.dropout = torch.nn.Dropout(dropout_rate)
        self.layer_normalization_1 = torch.nn.LayerNorm(embedding_dimension)
        self.layer_normalization_2 = torch.nn.LayerNorm(embedding_dimension)

    def forward(self, x, mask):
        """
        Compute the decoder layer.

        x dimensions are: (batch_size, sequence_length, embedding_dimension)
        mask dimensions are: (batch_size, sequence_length)
        mask values are: 0 or 1. 0 means the token is masked, 1 means the token is not masked.
        """
        # Attention sub-block: normalise, attend, add back onto the input.
        attended = self.multi_headed_self_attention(self.layer_normalization_1(x), mask)
        after_attention = x + attended

        # Feed forward sub-block: normalise, transform, optionally drop out.
        transformed = self.feed_forward(self.layer_normalization_2(after_attention))
        if self.training:
            transformed = self.dropout(transformed)

        # Second residual connection.
        return after_attention + transformed


class DecoderStack(torch.nn.Module):
    """
    A sequence of identical decoder layers; each layer consumes the previous layer's output.
    """

    def __init__(
            self,
            embedding_dimension,
            number_of_layers,
            number_of_heads,
            feed_forward_dimension,
            dropout_rate,
            max_sequence_length
    ):
        super().__init__()
        self.embedding_dimension = embedding_dimension
        self.number_of_layers = number_of_layers
        self.number_of_heads = number_of_heads
        self.feed_forward_dimension = feed_forward_dimension
        self.dropout_rate = dropout_rate
        self.max_sequence_length = max_sequence_length

        # The attribute keeps its historical name `encoder_layers` because that name is
        # part of the state dict keys; the layers themselves are decoder layers.
        layers = []
        for _ in range(number_of_layers):
            layers.append(
                DecoderLayer(embedding_dimension, number_of_heads, feed_forward_dimension, dropout_rate)
            )
        self.encoder_layers = torch.nn.ModuleList(layers)

    def forward(self, x, mask):
        """Pass x through every layer in order, handing the same mask to each one."""
        hidden_states = x
        for layer in self.encoder_layers:
            hidden_states = layer(hidden_states, mask)
        return hidden_states


class LMHead(torch.nn.Module):
    """
    Language model head: a single linear projection from the embedding dimension to one
    logit per vocabulary token.
    """

    def __init__(self, embedding_dimension, number_of_tokens):
        super().__init__()
        self.embedding_dimension = embedding_dimension
        self.number_of_tokens = number_of_tokens
        self.linear = torch.nn.Linear(embedding_dimension, number_of_tokens)

    def forward(self, x):
        """
        Project hidden states onto the vocabulary.

        x dimensions are: (batch_size, sequence_length, embedding_dimension)
        output dimensions are: (batch_size, sequence_length, number_of_tokens)
        """
        return self.linear(x)


class LanguageModel(torch.nn.Module):
    """
    Decoder-only language model: token embedding, positional encoding, layer normalisation,
    a stack of decoder layers and a linear head that produces one logit per vocabulary token.
    """

    def __init__(
            self,
            number_of_tokens,  # The number of tokens in the vocabulary
            max_sequence_length=512,  # The maximum sequence length to use for attention
            embedding_dimension=512,  # The dimension of the token embeddings
            number_of_layers=6,  # The number of decoder layers to use
            number_of_heads=4,  # The number of attention heads to use
            feed_forward_dimension=None,  # The dimension of the feed forward layer
            dropout_rate=0.1  # The dropout rate to use
    ):
        super().__init__()
        self.number_of_tokens = number_of_tokens
        self.max_sequence_length = max_sequence_length
        self.embedding_dimension = embedding_dimension
        self.number_of_layers = number_of_layers
        self.number_of_heads = number_of_heads

        # GPT-2 paper uses 4 * embedding_dimension for the feed forward dimension,
        # which is the fallback when the caller does not choose a width.
        self.feed_forward_dimension = (
            embedding_dimension * 4 if feed_forward_dimension is None else feed_forward_dimension
        )

        self.dropout_rate = dropout_rate

        # Input side: token ids -> vectors, plus position information, then normalisation.
        self.token_embedding = TokenEmbedding(embedding_dimension, number_of_tokens)
        self.positional_encoding = PositionalEncoding(embedding_dimension, max_sequence_length)
        self.layer_normalization = torch.nn.LayerNorm(embedding_dimension)

        # Body: the stack of decoder layers.
        self.decoder = DecoderStack(
            embedding_dimension=embedding_dimension,
            number_of_layers=number_of_layers,
            number_of_heads=number_of_heads,
            feed_forward_dimension=self.feed_forward_dimension,
            dropout_rate=dropout_rate,
            max_sequence_length=max_sequence_length
        )

        # Output side: hidden states -> vocabulary logits.
        self.lm_head = LMHead(embedding_dimension, number_of_tokens)

    def forward(self, x, mask):
        """
        Compute vocabulary logits for every position.

        x and mask are both passed as (batch_size, sequence_length) by the callers in this
        notebook; the result has dimensions (batch_size, sequence_length, number_of_tokens).
        """
        # (batch_size, sequence_length, embedding_dimension)
        embedded = self.token_embedding(x)

        # Same shape, with the positional encoding added in.
        positioned = self.positional_encoding(embedded)

        # Post embedding layer normalization
        normalized = self.layer_normalization(positioned)

        hidden_states = self.decoder(normalized, mask)
        return self.lm_head(hidden_states)


class AutoregressiveWrapper(torch.nn.Module):
    """
    Wraps a GPT-style model so it can be trained on, and sampled for, next-token prediction.
    """

    def __init__(self, gpt_model):
        super().__init__()
        self.model = gpt_model
        # Exposed here so callers do not need to reach into the wrapped model.
        self.max_sequence_length = self.model.max_sequence_length

    def forward(self, x, mask):
        """
        Autoregressive forward pass.

        x and mask have shape (batch_size, sequence_length). The model is fed everything
        but the last token; the targets are the same sequence shifted left by one.

        Returns:
            (output, target): the model output for the shortened input, and the token ids
            it should predict, shape (batch_size, sequence_length - 1).
        """
        inputs = x[:, :-1]
        target = x[:, 1:]

        # The mask is shortened in the same way as the inputs so the shapes keep matching.
        output = self.model(inputs, mask[:, :-1])
        return output, target

    def next_token_probabilities(self, x, mask, temperature=1.0):
        """
        Calculate the token probabilities for the next token in the sequence.

        Only the logits of the last position are used. They are divided by `temperature`
        (skipped when it equals 1.0) before the softmax.
        """
        last_position_logits = self.model(x, mask)[:, -1]

        if temperature == 1.0:
            scaled_logits = last_position_logits
        else:
            scaled_logits = last_position_logits / temperature

        return torch.softmax(scaled_logits, dim=-1)

class LitGPT(L.LightningModule):
    """
    Lightning wrapper that trains an autoregressive language model with cross-entropy loss.

    Args:
        autoregressive_model: Module called as `model(x=..., mask=...)` that returns a
            `(logits, targets)` pair, as AutoregressiveWrapper does.
        config: Namespace-like object. Every entry of `config.__dict__` is copied onto this
            module as an attribute (for example `self.learning_rate`).
    """

    def __init__(self, autoregressive_model, config, *args: Any, **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)
        self.model = autoregressive_model
        # NOTE(review): no ignore_index is set, so padded target positions contribute to
        # the loss like any other token.
        self.loss_function = torch.nn.CrossEntropyLoss()
        # Mirror the config on the module: self.<key> = config.<key>.
        for key, value in vars(config).items():
            setattr(self, key, value)
        # NOTE(review): this also stores `autoregressive_model` in the hyperparameters;
        # Lightning warns about that and suggests `ignore=[...]`. It is kept as is because
        # dropping it would change what `load_from_checkpoint` needs to be given.
        self.save_hyperparameters()

    def evaluate_batch_loss(self, batch):
        """
        Compute the mean cross-entropy loss for one `(input_ids, attention_mask)` batch.
        """
        input_ids, attention_mask = batch
        outputs, targets = self.model(x=input_ids, mask=attention_mask)
        # CrossEntropyLoss expects (N, num_classes) logits and (N,) class indices, so both
        # tensors are flattened over the batch and sequence dimensions.
        targets_flat = targets.reshape(-1)
        outputs_flat = outputs.reshape(-1, outputs.shape[2])
        return self.loss_function(outputs_flat, targets_flat)

    def training_step(self, batch, batch_idx):
        """Compute, log (as "train/loss") and return the training loss for one batch."""
        train_loss = self.evaluate_batch_loss(batch)
        self.log("train/loss", train_loss)
        return train_loss

    def validation_step(self, batch, batch_idx):
        """Compute, log (as "val/loss") and return the validation loss for one batch."""
        val_loss = self.evaluate_batch_loss(batch)
        self.log("val/loss", val_loss)
        return val_loss

    def configure_optimizers(self):
        """Adam over all parameters, using the learning rate copied from the config."""
        optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
        return optimizer

    def forward(self, x, mask, *args: Any, **kwargs: Any) -> Any:
        """
        Delegate to the wrapped model and return its result unchanged.

        The previous implementation returned an undefined name and therefore raised a
        NameError on every call; it also printed the input shapes on each invocation.
        """
        return self.model(x, mask)

In [3]:
class CodeSnippetDataset(Dataset):
    """
    Map-style dataset over raw code snippets; each snippet is tokenized when it is accessed.
    """

    def __init__(self, snippets, tokenizer):
        self.snippets = snippets
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.snippets)

    def __getitem__(self, idx):
        """
        Return `(input_ids, attention_mask)` for snippet `idx`.

        The mask holds a 1 for every token because nothing is padded at this stage.
        """
        token_ids = self.tokenizer.encode(self.snippets[idx], allowed_special="all")
        all_attended = [1 for _ in token_ids]
        return token_ids, all_attended

class PyCodeDataModule(L.LightningDataModule):
    """
    Lightning data module for the "ArtifactAI/arxiv_python_research_code" dataset.

    Args:
        tokenizer: Tokenizer exposing `n_vocab`; it is handed to CodeSnippetDataset.
        config: Namespace-like object whose `__dict__` entries are copied onto this module.
            The code below reads `total_samples`, `valid_size`, `batch_size`, `max_length`
            and `padding_token_id`, plus an optional `num_workers`.
        data_dir: Directory in which the intermediate dataset pickle is stored.
    """

    # Name of the pickle file that hands the dataset from prepare_data() to setup().
    _DATASET_FILE_NAME = "saved_dataset.pkl"
    # Ceiling for DataLoader worker processes (the value that used to be hard-coded).
    _MAX_DATALOADER_WORKERS = 95

    def __init__(self, tokenizer, config, data_dir: str = os.getcwd()):
        super().__init__()
        self.data_dir = data_dir
        self.tokenizer = tokenizer
        self.vocab = tokenizer.n_vocab
        # Mirror the config on the module: self.<key> = config.<key>.
        for key, value in vars(config).items():
            setattr(self, key, value)

        self.save_hyperparameters()

    def _dataset_path(self) -> str:
        """Location of the pickled dataset inside `data_dir`."""
        return os.path.join(self.data_dir, self._DATASET_FILE_NAME)

    def _resolve_num_workers(self) -> int:
        """Use `num_workers` from the config when given, else one worker per CPU (capped)."""
        configured = getattr(self, "num_workers", None)
        if configured is not None:
            return configured
        return min(self._MAX_DATALOADER_WORKERS, os.cpu_count() or 1)

    def prepare_data(self) -> None:
        """
        Downloads the first `total_samples` training examples and stores them, wrapped in a
        CodeSnippetDataset, on disk. Runs once, on Rank 0.

        Tokenization itself is not done here; CodeSnippetDataset tokenizes on access.
        """
        raw_data = load_dataset(
            "ArtifactAI/arxiv_python_research_code",
            split=f"train[:{self.total_samples}]",
        )
        torch_data = raw_data.with_format("torch")
        tokenized_dataset = CodeSnippetDataset(torch_data["code"], self.tokenizer)
        # lightning recommends you save to disk and load in the setup function to be
        # compatible with distributed training
        os.makedirs(self.data_dir, exist_ok=True)
        with open(self._dataset_path(), "wb") as f:
            pickle.dump(tokenized_dataset, f)

    def collate_fn(self, batch, max_length=None):
        """
        Collate function for the dataloaders.

        Every `(input_ids, attention_mask)` pair is truncated to `max_length` and then
        right-padded to exactly `max_length` with `padding_token_id` (mask value 0).

        Args:
            batch: Iterable of `(input_ids, attention_mask)` pairs of Python lists.
            max_length: Target length; defaults to `self.max_length` so the bound method
                can be given to a DataLoader directly (a lambda could not be pickled for
                worker processes that are spawned rather than forked).

        Returns:
            Two `torch.long` tensors of shape (len(batch), max_length).
        """
        if max_length is None:
            max_length = self.max_length

        input_ids_list, attention_mask_list = [], []

        for input_ids, attention_mask in batch:
            input_ids = input_ids[:max_length]
            attention_mask = attention_mask[:max_length]

            padding_length = max_length - len(input_ids)
            input_ids_list.append(input_ids + [self.padding_token_id] * padding_length)
            attention_mask_list.append(attention_mask + [0] * padding_length)

        input_ids_tensor = torch.tensor(input_ids_list, dtype=torch.long)
        attention_mask_tensor = torch.tensor(attention_mask_list, dtype=torch.long)

        return input_ids_tensor, attention_mask_tensor

    def setup(self, stage: str) -> None:
        """
        Build the train/validation split for the "fit" stage; other stages are no-ops.
        """
        if stage == "fit":
            # SECURITY: pickle.load executes arbitrary code from the file. That is
            # acceptable only because the file is the one written by prepare_data().
            with open(self._dataset_path(), "rb") as f:
                tokenized_dataset = pickle.load(f)

            valid_size = self.valid_size
            # Fractions are passed to random_split; the fixed seed keeps the split
            # identical between runs.
            self.train_dataset, self.val_dataset = random_split(
                tokenized_dataset,
                [1 - valid_size, valid_size],
                generator=torch.Generator().manual_seed(42),
            )

        if stage == "test":
            pass

        if stage == "predict":
            pass

    def _make_dataloader(self, dataset) -> DataLoader:
        """Shared DataLoader construction for the train and validation splits."""
        return DataLoader(
            dataset,
            batch_size=self.batch_size,  # must be self.batch_size for the tuner to work
            collate_fn=self.collate_fn,
            num_workers=self._resolve_num_workers(),
        )

    def train_dataloader(self) -> Any:
        return self._make_dataloader(self.train_dataset)

    def val_dataloader(self) -> Any:
        return self._make_dataloader(self.val_dataset)

In [4]:
from types import SimpleNamespace
#wandb.init()

# Tokenizer; its end-of-text token doubles as the padding token.
enc = tiktoken.encoding_for_model("gpt-4")
eos_token_id = enc.eot_token
eos_token = enc.decode([eos_token_id])

# Every entry below is copied onto both the LightningModule and the DataModule.
config = SimpleNamespace(
    total_samples=10000,
    valid_size=0.2,
    batch_size=32,
    max_length=512,
    epochs=100,
    # The learning-rate finder reported ~3.98e-3 (see the tuner cell). The previous
    # value of 4e-2 was ten times that, so it is rounded to 4e-3 here.
    learning_rate=4e-3,
    max_grad_norm=0.5,
    embedding_dimension=256,
    number_of_heads=4,
    number_of_layers=3,
    dropout_rate=0.1,
    padding_token_id=eos_token_id,
    padding_token=eos_token,
)


litgpt = LitGPT(
    autoregressive_model=AutoregressiveWrapper(
        LanguageModel(
            embedding_dimension=config.embedding_dimension,
            number_of_tokens=enc.n_vocab,
            number_of_heads=config.number_of_heads,
            number_of_layers=config.number_of_layers,
            dropout_rate=config.dropout_rate,
            max_sequence_length=config.max_length,
        )
    ),
    config=config,
)

dm = PyCodeDataModule(tokenizer=enc, config=config)

/usr/local/lib/python3.10/dist-packages/lightning/pytorch/utilities/parsing.py:199: Attribute 'autoregressive_model' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['autoregressive_model'])`.


In [5]:
# Callbacks handed to the Trainer in a later cell.
from lightning.pytorch.callbacks.early_stopping import EarlyStopping
from lightning.pytorch.callbacks import ModelSummary, DeviceStatsMonitor, ModelCheckpoint, StochasticWeightAveraging, ModelPruning

# Optional: stochastic weight averaging, starting at epoch 75 (config.epochs is 100).
# Intended to give a smoother loss landscape and avoid getting stuck in local minima.
swa_cb = StochasticWeightAveraging(swa_lrs=1e-2, swa_epoch_start=75)
# Stop training once the logged "val/loss" stops decreasing.
earlystopping_cb = EarlyStopping(monitor='val/loss', mode='min')
# max_depth=-1 lists every nested submodule; this does not usually need to be set unless
# you want to modify the default summary behaviour.
model_summary_cb = ModelSummary(max_depth=-1)
# L1 unstructured pruning with amount=0.5.
# NOTE(review): presumably applied repeatedly during training, which would compound the
# sparsity quickly - confirm against the ModelPruning documentation.
pruning_cb = ModelPruning("l1_unstructured", amount=0.5)

In [6]:
# Selects the 'medium' internal precision for float32 matrix multiplications.
# NOTE(review): presumably chosen to trade accuracy for speed on the GPU - confirm.
torch.set_float32_matmul_precision('medium')

In [7]:
wandb_logger = WandbLogger(project="transformer-lightning")
trainer = L.Trainer(
    profiler="simple",
    logger=wandb_logger,
    accelerator="auto",
    devices="auto",
    strategy="auto",
    precision="16-mixed",  # 32-true by default. Other options: bf16-mixed, 16-true, 64-true (high memory, more sensitive)
    # Taken from the config (0.5) instead of repeating the number here, so the value
    # recorded with the run and the value actually used cannot drift apart.
    gradient_clip_val=config.max_grad_norm,
    #fast_dev_run=True, #turn this off after debugging model and data modules code
    max_epochs=config.epochs,
    callbacks=[
        model_summary_cb,
        swa_cb,
        earlystopping_cb,
        pruning_cb,
    ],
)

Using 16bit Automatic Mixed Precision (AMP)
Trainer already configured with model summary callbacks: [<class 'lightning.pytorch.callbacks.model_summary.ModelSummary'>]. Skipping setting a default `ModelSummary` callback.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [8]:
#tuner = Tuner(trainer)
#tuner.scale_batch_size(litgpt, datamodule = dm, mode = "binsearch") result = 58, using 32 because of cuda out of memory errors
#tuner.lr_find(litgpt, datamodule = dm) #result = 0.003981071705534969, using 4e-2

In [9]:
# Run training and validation; the data module's prepare_data()/setup() hooks provide the
# data, and the output below shows the model summary and the profiler report.
trainer.fit(model=litgpt, datamodule=dm)

Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

    | Name                                                                                           | Type                           | Params
----------------------------------------------------------------------------------------------------------------------------------------------------
0   | model                                                                                          | AutoregressiveWrapper          | 53.8 M
1   | model.model                                                                                    | LanguageModel                  | 53.8 M
2   | model.model.token_embedding                                                                    | TokenEmbedding                 | 25.7 M
3   | model.model.token_embedding.embedding_layer                                                    | Embedding                      | 25.7 M
4   | model.model.positional_encoding                                                        

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

FIT Profiler Report

--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|  Action                                                                                                                                                         	|  Mean duration (s)	|  Num calls      	|  Total time (s) 	|  Percentage %   	|
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
|  Total                                                                                                                                                          	|  -              	|  14950          	|

## Generation

In [None]:
# Restore the trained LightningModule. LitGPT stores its constructor arguments via
# save_hyperparameters(), so no extra arguments should be needed here.
CHECKPOINT_PATH = "/path/to/checkpoint.ckpt"  # TODO: point this at a real checkpoint
lit_model = LitGPT.load_from_checkpoint(CHECKPOINT_PATH)

# disable randomness, dropout, etc...
lit_model.eval()

# The generation code calls `max_sequence_length` and `next_token_probabilities`, which
# live on the wrapped AutoregressiveWrapper, so that is what `model` refers to. It is moved
# to the device the Generator places its input tensors on.
model = lit_model.model.to(get_device())

In [None]:
#this can be done with the DataModule as well but since we didn't want to make a dataloader for the test samples we use the manual method
#already ran this one for this run
def pad_left(sequence, final_length, padding_token):
    """
    Prepend `padding_token` to `sequence` until it is `final_length` items long.

    A sequence that is already at least `final_length` long comes back with no padding
    added (it is not truncated).
    """
    missing = final_length - len(sequence)
    padding = [padding_token] * missing
    return padding + sequence

class Generator:
    """
    Samples text from an autoregressive model, one token at a time.

    Args:
        model: Object exposing `eval()`, `max_sequence_length` and
            `next_token_probabilities(x=..., mask=..., temperature=...)`.
        tokenizer: Object exposing `encode(text, allowed_special=...)`, `decode(ids)` and
            `eot_token` (the end-of-text token id).
    """

    def __init__(
            self,
            model,
            tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def generate(
            self,
            max_tokens_to_generate: int,
            prompt: str = None,
            temperature: float = 1.0,
            eos_token: int = None,
            padding_token: int = None):
        """
        Generate up to `max_tokens_to_generate` tokens after `prompt`.

        Args:
            max_tokens_to_generate: Upper bound on the number of sampled tokens.
            prompt: Text to continue. When None, generation starts from the tokenizer's
                end-of-text token.
            temperature: Passed through to `next_token_probabilities`.
            eos_token: Token id that stops generation early when sampled; None disables
                early stopping.
            padding_token: Token id used for left padding. Defaults to the tokenizer's
                end-of-text token. (The default used to be read from a config attribute
                that does not exist, which raised as soon as the class was defined.)

        Returns:
            The decoded text of the whole sequence: left padding, prompt and sampled tokens.
        """
        if padding_token is None:
            padding_token = self.tokenizer.eot_token

        self.model.eval()

        if prompt is None:
            start_tokens = [self.tokenizer.eot_token]
        else:
            start_tokens = self.tokenizer.encode(prompt, allowed_special="all")

        # Left-pad so the most recent tokens always sit at the end of the window.
        input_tensor = torch.tensor(
            pad_left(
                sequence=start_tokens,
                final_length=self.model.max_sequence_length + 1,
                padding_token=padding_token
            ),
            dtype=torch.long
        ).to(get_device())

        # Add the batch dimension: (sequence_length,) -> (1, sequence_length)
        if input_tensor.dim() == 1:
            input_tensor = input_tensor[None, :]

        out = input_tensor
        # Sampling needs no gradients; without no_grad every step would keep its autograd
        # graph alive and memory would grow with the number of generated tokens.
        with torch.no_grad():
            for _ in range(max_tokens_to_generate):
                # The model only ever sees the last max_sequence_length tokens.
                x = out[:, -self.model.max_sequence_length:]

                # Padding positions are hidden from attention.
                mask = torch.ones_like(x)
                mask[x == padding_token] = 0

                # Compute the next token probabilities
                next_token_probabilities = self.model.next_token_probabilities(
                    x=x,
                    temperature=temperature,
                    mask=mask
                )

                # Sample the next token from the probability distribution
                next_token = torch.multinomial(next_token_probabilities, num_samples=1)

                # Append the next token to the output
                out = torch.cat([out, next_token], dim=1)

                # If the end of sequence token is reached, stop generating tokens
                if eos_token is not None and next_token == eos_token:
                    break

        generated_tokens = out[0].tolist()
        return self.tokenizer.decode(generated_tokens)


In [None]:
# Generate text
max_tokens_to_generate = 512
# `enc` is the tiktoken encoder created in the config cell; no variable named
# `tokenizer` exists at notebook level.
generator = Generator(model, enc)
txt = """\
# create some data
x = np.random.randn(100)
y = np.random.randn(100)

# create scatter plot with x, y
"""
generated_text = generator.generate(
    max_tokens_to_generate=max_tokens_to_generate,
    prompt=txt,
    padding_token=eos_token_id
)
# The decoded text still contains the left padding, so strip the padding token's text.
print(generated_text.replace(eos_token, ''))