In [1]:
import math
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader#, Dataset
from transformers import AutoTokenizer
from tokenizers import ByteLevelBPETokenizer
from transformers import GPT2Tokenizer
import os
import sys
import numpy as np
sys.path.append(os.path.abspath(os.path.join("..")))

from urnng.data import Dataset

ModuleNotFoundError: No module named 'urnng'

In [None]:
class InputEmbedding(nn.Module):
    def __init__(self, embed_dim: int, vocab_size: int):
        """
        Initialize the InputEmbedding module.
        
        Args:
            embed_dim (int): The dimensionality of the input embedding
            vocab_size (int): The size of the vocabulary
        """
        super().__init__()
        # Store the dimensionality and vocabulary size
        
        self.embed_dim = embed_dim
        self.vocab_size = vocab_size
        
        # Create embedding layer to map vocabulary to an embed_dim-dimensionalspace
        # The embedding layer should have shape (vocab_size, embed_dim)
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        
    def forward(self, x):
        """
        Perform the forward ass of the InputEmbedding module.
        
        Args:
            x (tensor): The input tensor.
            
        Returns:
            tensor: The embedded input tensor after scaling it by the square root of the dimensionality.
        """
        # Embed the input tensor using the embedding layer
        # Shape: (batch_size, seq_len) -> (batch_size, seq_len, embed_dim)
        embedded_input = self.embedding(x)
        scaled_embedded_input = embedded_input * torch.sqrt(torch.tensor(self.embed_dim))
        return scaled_embedded_input

In [None]:
class PositionalEncoding(nn.Module):
    def __init__(self, embed_dim: int = 512, max_seq_len: int = 100, dropout: float = 0.1,):
        """Initialize the PositionalEncoding module."""
        super().__init__()
        self.embed_dim = embed_dim
        self.max_seq_len = max_seq_len
        self.dropout = nn.Dropout(dropout)
        # Precompute the positional encoding matrix
        self.positional_encoding = self._precompute_positional_encoding(max_seq_len, embed_dim)
        
    def _precompute_positional_encoding(self, max_seq_len, embed_dim):
        """Precompute the positional encoding matrix."""
        with torch.no_grad():
            # Create a positional encoding matrix of shape (max_seq_len,embed_dim)
            positional_encoding = torch.zeros(max_seq_len, embed_dim)
            # Create a tensor 'pos' with values [0, 1, 2, ..., max_seq_len - 1] (max_seq_len, 1)
            position = torch.arange(0, max_seq_len, dtype=torch.float).unsqueeze(1)
            # Compute the positional encoding matrix
            division_term = torch.exp(torch.arange(0, embed_dim, 2).float() * (-torch.log(torch.tensor(10000.0)) / embed_dim))
            positional_encoding[:, 0::2] = torch.sin(position * division_term)
            positional_encoding[:, 1::2] = torch.cos(position * division_term)
            # Shape (max_seq_len, embed_dim) -> (1, max_seq_len, embed_dim)
            positional_encoding = positional_encoding.unsqueeze(0)
        
        return positional_encoding
    
    def forward(self, x):
        """Perform the forward pass of the PositionalEncoding module."""
        # Add the positional encoding matrix to the input tensor
        x = x + self.positional_encoding[:, : x.size(1)].to(x.device)
        # Apply dropout to the input tensor
        x = self.dropout(x)
        return x

In [None]:
class LayerNormalization(nn.Module):
    def __init__(self, embed_dim: int, eps: float = 1e-6):
        """Initialize the LayerNormalization module."""
        super().__init__()
        self.eps = eps
        # Create two learnable parameters to scale and shift the normalized input
        self.gain = nn.Parameter(torch.Tensor(embed_dim).uniform_())
        self.bias = nn.Parameter(torch.Tensor(embed_dim).normal_())
        
    def forward(self, x):
        """Perform the forward pass of the LayerNormalization module."""
        # Compute the mean and standard deviation of the input tensor
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        # Zero center by subtracting the mean from the input tensor
        return (x - mean) / (std + self.eps) * self.gain + self.bias

In [None]:
class FeedForwardBlock(nn.Module):
    def __init__(self, embed_dim: int, intermediate_size: int, dropout: float = 0.1):
        """Initialize the FeedForwardBlock module."""
        super().__init__()
        self.fc1 = nn.Linear(embed_dim, intermediate_size)
        self.fc2 = nn.Linear(intermediate_size, embed_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        x_intermediate = self.dropout(F.relu(self.fc1(x)))
        x_output = self.fc2(x_intermediate)
        return x_output

In [None]:
def generate_square_subsequent_mask(size: int, device: torch.device = "cpu"):
    """Generate a square mask for the sequence."""
    mask = torch.tril(torch.ones(size, size, dtype=torch.bool, device=device), diagonal=0)
    # Turn boolean mask into float mask
    mask = mask.long()
    return mask.unsqueeze(0) # Add batch dimension

In [None]:
class MultiHeadAttention(nn.Module):
    def __init__(self, embed_dim = 512, num_heads: int = 8, attn_dropout: float = 0.1, ff_dropout: float = 0.1, max_len=100):
        super().__init__()
        self.num_heads = num_heads
        assert embed_dim % self.num_heads == 0, "Invalid heads and embed_dim configuration; embed_dim must be evenly divisble by num_heads (there should be no remainder)"
        self.key = nn.Linear(embed_dim, embed_dim)
        self.value = nn.Linear(embed_dim, embed_dim)
        self.query = nn.Linear(embed_dim, embed_dim)
        self.proj = nn.Linear(embed_dim, embed_dim)
        self.attn_dropout = nn.Dropout(attn_dropout)
        self.proj_dropout = nn.Dropout(ff_dropout)
        # Create a buffer to store the mask wth no gradient
        # Shape: (1, max_len, max_len)
        self.register_buffer(
            "mask",
            torch.triu(torch.ones(max_len, max_len, dtype=torch.bool), diagonal=1)
        )
        
    def forward(self, x, mask = None):
        batch_size, seq_len, _ = x.size()
        # Apply linear transformations to the input tensor
        # Take input tensor and apply linear transformations
        # then split the tensor into num_heads and head_dim
        # transpose the tensor into the correct order
        q = self.query(x).view(batch_size, seq_len, self.num_heads, -1).transpose(1, 2)
        k = self.key(x).view(batch_size, seq_len, self.num_heads, -1).transpose(1, 2)
        v = self.value(x).view(batch_size, seq_len, self.num_heads, -1).transpose(1, 2)
        
        attention = torch.einsum('bhid,bhjd->bhij', q, k) / math.sqrt(q.size(-1))
        
        if mask is not None:
            attention = attention.masked_fill(mask == 0, float("-inf"))
            
        attention = self.attn_dropout(F.softmax(attention, dim=-1))

        y = torch.einsum('bhij,bhjd->bhid', attention, v)
        
        y = y.transpose(1, 2).contiguous().view(batch_size, seq_len, -1)
        
        # Apply linear transformation and dropout
        # Shape: (batch_size, seq_len, embed_dim) -> (batch_size, seq_len, embed_dim)
        return self.proj_dropout(self.proj(y))

In [None]:
class ResidualConnection(nn.Module):
    def __init__(self, embed_dim, dropout: float = 0.1):
        """Initialize the ResidualConnection module."""
        super().__init__()
        self.layer_norm = LayerNormalization(embed_dim=embed_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x, sublayer):
        """Perform the forward pass of the ResidualConnection module."""
        # Apply layer normalization
        # (batch_size, seq_len, embed_dim) -> (batch_size, seq_len, embed_dim)
        normalized_x = self.layer_norm(x)
        
        # Apply sublayer (e.g. feedforward block)
        # (batch_size, seq_len, embed_dim) -> (batch_size, seq_len, embed_dim)
        sublayer_output = sublayer(normalized_x)
        
        # Add residual connection and apply dropout
        # (batch_size, seq_len, embed_dim) + (batch_size, seq_len, embed_dim) -> (batch_size, seq_len, embed_dim)
        residual_output = x + self.dropout(sublayer_output)
        return residual_output

In [2]:
class ProjectionHead(nn.Module):
    def __init__(self, embed_dim: int, vocab_size: int):
        """Initialize the ProjectionHead module."""
        super().__init__()
        self.fc = nn.Linear(embed_dim, vocab_size)
        
    def forward(self, x):
        """Perform forward pass of the ProjectionHead module."""
        # Apply linear transformation to the input tensor
        # (batch_size, seq_len, embed_dim) -> (batch_size, seq_len, vocab_size)
        return self.fc(x)

In [3]:
class DecoderBlock(nn.Module):
    def __init__(
        self,
        embed_dim: int = 512,
        num_heads: int =8,
        ff_dim: int = 2048,
        attn_dropout: float = 0.1,
        ff_dropout: float = 0.1,
        dropout: float = 0.1,
        max_len: int = 512,
    ):
        super().__init__()
        # Initialize the multi-head self-attention mechanism
        self.MultiHeadAttention = MultiHeadAttention(
            embed_dim=embed_dim,
            num_heads=num_heads,
            attn_dropout=attn_dropout,
            ff_dropout=ff_dropout,
            max_len=max_len,
        )
        # Initialize the feed-forward block
        self.feed_forward = FeedForwardBlock(
            embed_dim=embed_dim,
            intermediate_size=ff_dim,
            dropout=ff_dropout,
        )
        # Initialize residual connections
        self.residual_connection1 = ResidualConnection(embed_dim=embed_dim, dropout=dropout)
        self.residual_connection2 = ResidualConnection(embed_dim=embed_dim, dropout=dropout)
        
    def forward(self, x, attention_mask=None):
        # Apply self-attention mechanism with residual connection
        x_with_attention = self.residual_connection1(x, lambda x: self.MultiHeadAttention(x, mask=attention_mask))
        # Apply feed-forward block with residual connection
        x_with_ff = self.residual_connection2(x_with_attention, self.feed_forward)
        return x_with_ff

In [4]:
class GPT(nn.Module):
    def __init__(
        self,
        vocab_size: int,
        embed_dim: int = 512,
        max_len: int = 512,
        embed_dropout: float = 0.1,
        num_blocks: int = 6,
        num_heads: int = 8,
        ff_dim: int = 2048,
        attn_dropout: float =0.1,
        ff_dropout: float = 0.1
    ):
        super().__init__()
        self.max_len = max_len
        self.token_embedding = InputEmbedding(
            embed_dim=embed_dim,
            vocab_size=vocab_size
        )
        self.positional_embedding = PositionalEncoding(
            embed_dim=embed_dim,
            max_seq_len=max_len,
            dropout=embed_dropout,
        )
        self.blocks = nn.ModuleList([DecoderBlock(
            embed_dim=embed_dim,
            num_heads=num_heads,
            ff_dim=ff_dim,
            attn_dropout=attn_dropout,
            ff_dropout=ff_dropout,
            max_len=max_len,
        ) for _ in range(num_blocks)])
        
        self.projection_head = ProjectionHead(embed_dim=embed_dim, vocab_size=vocab_size)
        
    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor = None):
        # Shape: (batch_size, seq_len) - > (seq_len)
        seq_len = input_ids.size(1)
        assert seq_len <= self.max_len, "Sequence longer than model capacity"
        
        # Token embedding
        # Shape: (batch_size, seq_len) -> (batch_size, seq_len, embed_dim)
        x = self.token_embedding(input_ids)
        
        # Add positional embedding
        x = self.positional_embedding(x)
        
        # Forward through decoder blocks
        # Output of each block iis the hidden state of the transformer
        # Shape: (batch_size, seq_len, embed_dim) -> (batch_size, seq_len, embed_dim)
        for block in self.blocks:
            x = block(x, attention_mask=attention_mask)
            
        # Linear layer for output logits
        # Shape: (batch_size, seq_len, embed_dim) -> (batch_size, seq_len, vocab_size)
        x = self.projection_head(x)
        
        return x

In [5]:
np.random.seed(3435)
torch.manual_seed(3435)
train_data = Dataset("../urnng/data/tokenized_data/ptb-train.pkl")
test_data = Dataset("../urnng/data/tokenized_data/ptb-test.pkl")
val_data = Dataset("../urnng/data/tokenized_data/ptb-val.pkl")  
vocab_size = int(train_data.vocab_size)
print("Vocab size: ", vocab_size)

NameError: name 'Dataset' is not defined

In [17]:
# Define model parameters
# vocab_size = 50257 # Example vocab size; specific to GPT2 tokenizer
# vocab_size = 12000
embed_dim = 768
max_len = 1024 # This can be adjusted based on the use case
embed_dropout = 0.1
num_blocks = 6 # This can be adjusted based on the use case
num_heads = 8 # This can be adjusted based on the use case
ff_dim = 2048 # This can be adjusted based on the use case
attn_dropout = 0.1
ff_dropout = 0.1

# Initialize GPT model
model = GPT(
    vocab_size=vocab_size,
    embed_dim=embed_dim,
    max_len=max_len,
    embed_dropout=embed_dropout,
    num_blocks=num_blocks,
    num_heads=num_heads,
    ff_dim=ff_dim,
    attn_dropout=attn_dropout,
    ff_dropout=ff_dropout
)

In [18]:
sample_data = [
 "Mary had a little lamb",
 "Its fleece was white as snow",
 "And everywhere that Mary went",
 "The lamb was sure to go",
]

## Once you've figured out how to load in the data, put it through GPTDataset and train

In [19]:
class GPTDataset(Dataset):
    def __init__(self, data: list, tokenizer, max_length: int):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.end_token = tokenizer.eos_token_id #token_to_id("</s>")
        print(self.end_token)
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        text = self.data[idx]
        input_txt = self.tokenizer(text, truncation=True, return_tensors="pt")["input_ids"].squeeze(0)
        text_len = input_txt.size(0)
        if text_len < self.max_length:
            padding_len = self.max_length - text_len
            padding = torch.tensor([self.end_token] * padding_len)
            input_ids = torch.cat((input_txt, padding), dim=0)
            label = torch.cat((input_txt[1:], torch.tensor([self.end_token]), padding), dim=0)
        else:
            input_ids = input_txt[:self.max_length]
            label = torch.cat((input_txt[1:self.max_length], torch.tensor([self.end_token])), dim=0)
        return input_ids, label

In [20]:
tokenizer = ByteLevelBPETokenizer("../urnng/tokenizers/rnng/vocab.json", "../urnng/tokenizers/rnng/merges.txt")
# files = ["../data/processed/segmented.txt"]
# tokenizer.train(files=files, vocab_size=12726, min_frequency=500, special_tokens=[
#     "<s>", "<pad>", "</s>", "<unk>", "<mask>"
# ])
# tokenizer.save_model(".")
gpt2_tokenizer = GPT2Tokenizer.from_pretrained(".")
# gpt2_tokenizer.save_pretrained(".")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [21]:
train_dataset = GPTDataset(
    data = sample_data,
    tokenizer = gpt2_tokenizer,
    max_length = 200
)

12726


In [103]:
# input_ids, label = train_dataset[2]
# input_ids = input_ids.unsqueeze(0)
# label = label.unsqueeze(0)
# print("Label:", label)
# print("Input IDs:", input_ids)
# print("Label Shape:", label.shape)
# print("Input IDs Shape:", input_ids.shape)

### Training pipeline

In [56]:
if torch.cuda.is_available():
    device = "cuda"
    print("Cuda is available. Using GPU.")
else:
    device = "cpu"
    print("Cuda is not available. Using CPU.")

Cuda is available. Using GPU.


In [57]:
lr = 5e-5
batch_size = 256
num_epochs = 18
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,)

In [81]:
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    total_tokens = 0
    for i in range(len(train_data)):
        optimizer.zero_grad()
        sents, length, batch_size, gold_actions, gold_spans, gold_binary_trees, other_data = train_data[100]
#         sents = sents[:1, :]
        
#         print("Inputs: ")
#         print([tokenizer.decode([idx.item()]) for idx in sents.squeeze()])
        
        batch_size, length = sents.size(0), sents.size(1)

        sents = sents.to(device)
        pad_tokens = torch.ones(sents.size(0), 1, dtype=sents.dtype).to(device)
        labels = torch.cat((sents[:, 1:], pad_tokens), dim=1)

        mask = generate_square_subsequent_mask(sents.size(1), device=device)

        logits = model(input_ids=sents, attention_mask=mask)
        preds = torch.argmax(logits, dim=2)
        
#         print("Preds: ")
#         print([tokenizer.decode([idx.item()]) for idx in preds.squeeze()])
#         print()
        log_probs_word = F.log_softmax(logits, 2)
        log_probs_word = torch.gather(log_probs_word, 2, labels.unsqueeze(2)).squeeze(2).sum(1)
        log_probs_word = log_probs_word.sum()
        print(log_probs_word)
        
        nll_loss = -log_probs_word.mean()
        print(nll_loss)
        nll_loss.backward()
        total_loss += nll_loss.item()*batch_size
        total_tokens += batch_size*length
        
        optimizer.step()
        break
    break
#     avg_nll_loss = total_loss/total_tokens
#     perplexity = torch.exp(torch.tensor(avg_nll_loss))
#     print(f'Epoch: {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_loader)}, Perplexity: {perplexity.item()}')

tensor(-10244.0820, device='cuda:0', grad_fn=<SumBackward0>)
tensor(10244.0820, device='cuda:0', grad_fn=<NegBackward0>)


In [69]:
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    
    for batch in train_loader:
        optimizer.zero_grad()
        # Unpack input and label from each batch and move to device
        input_ids, labels = batch
        input_ids, labels = input_ids.to(device), labels.to(device)
        
        mask = generate_square_subsequent_mask(input_ids.size(1), device=device)
        
        # Forward pass
        logits = model(input_ids=input_ids, attention_mask=mask)
        
        # Flatten
        logits_flat = logits.view(-1, logits.size(-1))
        labels_flat = labels.view(-1)
        
        # Compute loss
        loss = criterion(logits_flat, labels_flat)
        
        # Backward pass and optimization step
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    print(f'Epoch: {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_loader)}')

/dev/shm/shtsai/PyTorch/1.12.1/foss-2022a-CUDA-11.7.0/pytorch-v1.12.1/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [174,0,0], thread: [32,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/dev/shm/shtsai/PyTorch/1.12.1/foss-2022a-CUDA-11.7.0/pytorch-v1.12.1/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [174,0,0], thread: [33,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/dev/shm/shtsai/PyTorch/1.12.1/foss-2022a-CUDA-11.7.0/pytorch-v1.12.1/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [174,0,0], thread: [34,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/dev/shm/shtsai/PyTorch/1.12.1/foss-2022a-CUDA-11.7.0/pytorch-v1.12.1/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelectLargeIndex: block: [174,0,0], thread: [35,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/dev/shm/shtsai/PyTorch/1.12.1/foss-2022a-CUDA-11.7.0/pytorch-v1.12.1/aten/src/ATen/native/cuda/Indexing.cu:975: indexSelect

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [74]:
vocab_size = 50257
embed_dim = 768
max_len = 1024
embed_dropout = 0.1
num_blocks = 12 # or 24 for GPT-2 XL
num_heads = 12 # or 24 for GPT-2 XL
ff_dim = 3072
attn_dropout = 0.1
ff_dropout = 0.1

model = GPT(
    vocab_size=vocab_size,
    embed_dim=embed_dim,
    max_len=max_len,
    embed_dropout=embed_dropout,
    num_blocks=num_blocks,
    num_heads=num_heads,
    ff_dim=ff_dim,
    attn_dropout=attn_dropout,
    ff_dropout=ff_dropout
)

In [75]:
model_name = "gpt2"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [136]:
input_txt = "Machine Learning with PyTorch can do amazing"
input_ids = tokenizer(input_txt, return_tensors="pt")["input_ids"].to(device)

print(input_ids)
print(input_ids.shape)

tensor([[37573, 18252,   351,  9485, 15884,   354,   460,   466,  4998]],
       device='cuda:0')
torch.Size([1, 9])


In [137]:
model = model.to(device)
iterations = []
n_steps = 10
choices_per_step = 5

with torch.no_grad():
    for _ in range(n_steps):
        iteration = dict()
        iteration["Input"] = tokenizer.decode(input_ids[0])
        output = model(input_ids=input_ids)
        
        # Select logits of the first batch, last token, and apply softmax to get probability distribution
        next_token_logits = output[0, -1, :]
        next_token_probs = torch.softmax(next_token_logits, dim=-1)
        sorted_ids = torch.argsort(next_token_probs, dim=-1, descending=True)
        
        # Store tokens with highest probability in a table
        for choice_idx in range(choices_per_step):
            token_id = sorted_ids[choice_idx]
            token_prob = next_token_probs[token_id].cpu().numpy()
            token_choice = (
                f"{tokenizer.decode(token_id)} ({100 * token_prob:.2f}%)"
            )
            iteration[f"Choice {choice_idx+1}"] = token_choice
        iterations.append(iteration)
        
        # Append predicted next token to input
        input_ids = torch.cat([input_ids, sorted_ids[None, 0, None]], dim=-1)

sample_inference = pd.DataFrame(iterations)

In [138]:
sample_inference.head()

Unnamed: 0,Input,Choice 1,Choice 2,Choice 3,Choice 4,Choice 5
0,Machine Learning with PyTorch can do amazing,visuals (87.56%),230 (10.46%),262 (1.93%),ABE (0.03%),Order (0.01%)
1,Machine Learning with PyTorch can do amazing v...,dissatisf (86.26%),moan (5.94%),Blades (4.90%),Whe (2.15%),awaited (0.43%)
2,Machine Learning with PyTorch can do amazing v...,application (52.74%),Sisters (47.26%),dodged (0.00%),eloqu (0.00%),largely (0.00%)
3,Machine Learning with PyTorch can do amazing v...,ac (78.47%),deities (20.22%),agency (1.02%),manifold (0.12%),228 (0.10%)
4,Machine Learning with PyTorch can do amazing v...,Hundred (99.95%),queries (0.03%),tease (0.02%),unused (0.00%),weighed (0.00%)


In [149]:
def generate_text_until_end(
    input_text:str,
    model:GPT,
    tokenizer:AutoTokenizer,
    max_length:int=100,
    device='cpu',
):
    model = model.to(device)
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)
    end_token_id = tokenizer.eos_token_id
    generated_ids = input_ids.flatten().clone()
    
    with torch.no_grad():
        while True:
            output = model(input_ids=input_ids)
            next_token_logits = output[:, -1, :]
            next_token_id = torch.argmax(next_token_logits, dim=-1)
            generated_ids = torch.cat([generated_ids, next_token_id], dim=-1)
            
            input_ids = next_token_id.unsqueeze(0)
            
            if next_token_id == end_token_id or len(generated_ids) >= max_length:
                break
                
    generated_text = tokenizer.decode(generated_ids, skip_special_tokens=True)
    return generated_text

In [151]:
generated_text = generate_text_until_end(
 input_text="I like to eat",
 model=model,
 tokenizer=tokenizer,
 max_length=20,
 device=device,
)
print(generated_text)

I like to eat believableADD safelyuez posterior Niagara vegetable bowel € Humph Dick characteristic Queens pound enzymeッ
