In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import os
import matplotlib.pyplot as plt
import torchtext

import tiktoken
import pandas as pd
import numpy as np

from torch.nn import functional as F
import torch.nn as nn
from tqdm.notebook import tqdm
import pickle
import ast

In [2]:
import os

#dataset https://nlp.stanford.edu/projects/nmt/

#tiktoken api https://github.com/openai/tiktoken
cl100k_base = tiktoken.get_encoding("cl100k_base")

# In production, load the arguments directly instead of accessing private attributes
# See openai_public.py for examples of arguments for specific encodings
tokenizer = tiktoken.Encoding(
    # If you're changing the set of special tokens, make sure to use a different name
    # It should be clear from the name what behaviour to expect.
    name="cl100k_im",
    pat_str=cl100k_base._pat_str,
    mergeable_ranks=cl100k_base._mergeable_ranks,
    special_tokens={
        **cl100k_base._special_tokens,
        "<|PAD|>": 100277,
        "<|START|>": 100278,
        "<|END|>": 100279,
        "<|DEL|>": 100280,

    }
)
print(tokenizer.n_vocab) #this is the number of tokens in our tokenizer
print(tokenizer._special_tokens) #prints out our special tokens 

specials = {"<|PAD|>","<|START|>","<|END|>", "<|DEL|>"}

print(dir(tokenizer))

pad_token = 100277
start_token = 100278
end_token = 100279
del_token = 100280

100281
{'<|endoftext|>': 100257, '<|fim_prefix|>': 100258, '<|fim_middle|>': 100259, '<|fim_suffix|>': 100260, '<|endofprompt|>': 100276, '<|PAD|>': 100277, '<|START|>': 100278, '<|END|>': 100279, '<|DEL|>': 100280}
['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_core_bpe', '_encode_bytes', '_encode_only_native_bpe', '_encode_single_piece', '_mergeable_ranks', '_pat_str', '_special_tokens', 'decode', 'decode_batch', 'decode_bytes', 'decode_bytes_batch', 'decode_single_token_bytes', 'decode_tokens_bytes', 'encode', 'encode_batch', 'encode_ordinary', 'encode_ordinary_batch', 'encode_single_token', 'encode_with_unstable', 'eot_token', 'max_token_value', 'n_vocab', 'name', 'special_tokens_set', '

In [3]:
#GLOBALS

block_size = 16 #This is the value of T
batch_size = 32 #This it the value of B
n_embed = 512
dropout = 0.1
n_heads = 8
n_layers = 6

learning_rate = 1e-5

eval_interval = 500
eval_iters = 200
max_iters = 50000

beam_size = 7

vocab_size = tokenizer.n_vocab
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"device is: {device}")

device is: cuda


In [4]:
def create_dataset():
    num_examples = 750000

    en_max = 0
    en_length = []
    with open(os.getcwd()+'\\data\\train_en.txt', 'r', encoding='utf8') as f:
        sentences_en = []
        for i in tqdm(range(num_examples)):
            line = f.readline()
            line = line.replace("\n", "")
            len_pad = 0
            sentence = line
            tok_sentence = tokenizer.encode(sentence, allowed_special = specials)
            en_length.append(len(tok_sentence))
            if len(tok_sentence) > en_max:
                en_max = len(tok_sentence)
                print(en_max)

            if len(tok_sentence) <= block_size:
                len_pad = block_size - len(tok_sentence)
                tok_sentence = tok_sentence + len_pad*[pad_token]
                assert len(tok_sentence) == block_size, print(len(tok_sentence))
                sentences_en.append(tok_sentence)
            else:
                sentences_en.append(block_size*[del_token])

    en_length = torch.tensor(en_length).float()     
    print(en_max)    
    print(f"Length of sentences: {len(sentences_en)}")

    de_max = 0
    de_length = []
    with open(os.getcwd()+'\\data\\train_de.txt', 'r', encoding='utf8') as f:
        sentences_de = []
        for i in tqdm(range(num_examples)):
            line = f.readline()
            line = line.replace("\n", "")
            len_pad = 0
            sentence = "<|START|> " + (line) + " <|END|>"
            tok_sentence = tokenizer.encode(sentence, allowed_special = specials)
            de_length.append(len(tok_sentence))
            if len(tok_sentence) > de_max:
                de_max = len(tok_sentence)
                print(de_max)

            if len(tok_sentence) <= block_size:
                len_pad = block_size - len(tok_sentence)
                tok_sentence = tok_sentence + len_pad*[pad_token]
                assert len(tok_sentence) == block_size, print(len(tok_sentence))
                sentences_de.append(tok_sentence)
            else:
                sentences_de.append(block_size*[del_token])

    de_length = torch.tensor(de_length).float()               
    print(de_max) 
    print(f"Length of sentences: {len(sentences_de)}")

    print("Removing sentences whos length is greater than our block_size")

    #combine the arrays together
    sentences = np.array([sentences_en, sentences_de])
    #check for indices in both sentences that have rows containing the DEL token
    idx = np.where(sentences == del_token)

    #delete every row that contains the DEL token
    sentences = np.delete(sentences, idx[1], axis = 1)

    #splitting to german and english

    sentences_en = torch.tensor(sentences[0], dtype=torch.long)
    sentences_de = torch.tensor(sentences[1], dtype=torch.long)

    print(f"Length of new english sentences: {len(sentences_en)}")
    print(f"Length of new german sentences: {len(sentences_de)}")

    print(f"Average length of english tokenized sentence: {torch.mean(en_length):.4f}, and with std: {torch.std(en_length):.4f}")
    print(f"Average length of german tokenized sentence: {torch.mean(de_length):.4f}, and with std: {torch.std(de_length):.4f}")

    with open(os.getcwd()+'\\data\\english_sentences.pkl', 'wb') as f:
        pickle.dump(sentences_en, f)

    with open(os.getcwd()+'\\data\\german_sentences.pkl', 'wb') as f:
        pickle.dump(sentences_de, f)


In [5]:
def create_dataset():
    with open(os.getcwd()+'\\data\\deu.txt', 'r', encoding='utf8') as f:
        text = f.read()
        print(len(f.readlines()))
        lines = text.strip().split('\n')
        pairs = [line.split('\t') for line in lines]
        
    eng = [pairs[i][0] for i in range(len(pairs))]
    ger = [pairs[i][1] for i in range(len(pairs))]
    en_max = 0
    en_length = []
    
    sentences_en = []
    for i in tqdm(range(152818)):
        line = eng[i]
        line = line.replace("\n", "")
        len_pad = 0
        sentence = line
        tok_sentence = tokenizer.encode(sentence, allowed_special = specials)
        en_length.append(len(tok_sentence))
        if len(tok_sentence) > en_max:
            en_max = len(tok_sentence)
            #print(en_max)

        if len(tok_sentence) <= block_size:
            len_pad = block_size - len(tok_sentence)
            tok_sentence = tok_sentence + len_pad*[pad_token]
            assert len(tok_sentence) == block_size, print(len(tok_sentence))
            sentences_en.append(tok_sentence)
        else:
            sentences_en.append(block_size*[del_token])

    en_length = torch.tensor(en_length).float()     
    print(en_max)    
    print(f"Length of sentences: {len(sentences_en)}")

    de_max = 0
    de_length = []
    sentences_de = []
    for i in tqdm(range(152818)):
        line = ger[i]
        line = line.replace("\n", "")
        len_pad = 0
        sentence = "<|START|> " + (line) + " <|END|>"
        tok_sentence = tokenizer.encode(sentence, allowed_special = specials)
        de_length.append(len(tok_sentence))
        if len(tok_sentence) > de_max:
            de_max = len(tok_sentence)
            #print(de_max)

        if len(tok_sentence) <= block_size:
            len_pad = block_size - len(tok_sentence)
            tok_sentence = tok_sentence + len_pad*[pad_token]
            assert len(tok_sentence) == block_size, print(len(tok_sentence))
            sentences_de.append(tok_sentence)
        else:
            sentences_de.append(block_size*[del_token])

    de_length = torch.tensor(de_length).float()               
    print(de_max) 
    print(f"Length of sentences: {len(sentences_de)}")

    print("Removing sentences whos length is greater than our block_size")

    #combine the arrays together
    sentences = np.array([sentences_en, sentences_de])
    #check for indices in both sentences that have rows containing the DEL token
    idx = np.where(sentences == del_token)

    #delete every row that contains the DEL token
    sentences = np.delete(sentences, idx[1], axis = 1)

    #splitting to german and english

    sentences_en = torch.tensor(sentences[0], dtype=torch.long)
    sentences_de = torch.tensor(sentences[1], dtype=torch.long)

    print(f"Length of new english sentences: {len(sentences_en)}")
    print(f"Length of new german sentences: {len(sentences_de)}")

    print(f"Average length of english tokenized sentence: {torch.mean(en_length):.4f}, and with std: {torch.std(en_length):.4f}")
    print(f"Average length of german tokenized sentence: {torch.mean(de_length):.4f}, and with std: {torch.std(de_length):.4f}")

    with open(os.getcwd()+'\\data\\english_sentences.pkl', 'wb') as f:
        pickle.dump(sentences_en, f)

    with open(os.getcwd()+'\\data\\german_sentences.pkl', 'wb') as f:
        pickle.dump(sentences_de, f)


In [6]:
create = False
if create:
    create_dataset()

In [7]:
#TRAIN AND VAL DATASETS

with open(os.getcwd()+'\\data\\english_sentences.pkl', 'rb') as f:
    english_sentences = pickle.load(f)

with open(os.getcwd()+'\\data\\german_sentences.pkl', 'rb') as f:
    german_sentences = pickle.load(f)

In [8]:
#BATCH LOADER
n = int(0.9*len(english_sentences))

train_data_en = english_sentences[:n]
val_data_en = english_sentences[n:]

train_data_de = german_sentences[:n]
val_data_de = german_sentences[n:]

print(train_data_en[0].shape)

def get_batch(split):
    xdata = train_data_en if "train" else train_data_en
    ydata = train_data_de if "train" else val_data_de
    idx = torch.randint(len(xdata), (batch_size,))
    #print(idx)
    x = torch.stack([xdata[i] for i in idx])
    y = torch.stack([ydata[i] for i in idx])

    #shifting our targets by 1 to the right
    t = y[:, 1:]
    #to pad the last dimension of the input tensor, pad has the form (padding_left, padding_right)
    t = F.pad(input = t, pad = (0,1,0,0), mode = 'constant', value = pad_token)

    x, y, t = x.to(device), y.to(device), t.to(device)

    return x, y, t


torch.Size([16])


In [9]:
x, y, t = get_batch("train")


In [10]:
import math
def PositionalEncoding(seq_len, n_embd):
        
    pos_enc = torch.zeros(seq_len, n_embd)
    position = torch.arange(0, seq_len, dtype = torch.float32).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, n_embd, 2) * (-math.log(10000.0) / n_embd))
    pos_enc[:, 0::2] = torch.sin(position * div_term)
    pos_enc[:, 1::2] = torch.cos(position * div_term)

    return pos_enc.to(device)

def get_padding_mask_matrix(x, x_embed):
    """
    x is the (B, T) tokenized matrix with padding included
    x_embed is the embedded matrix that we will convert all rows to zero based on the corresponding row index = padding index
    """

    #locate every index in each tokenized sentence which contains the pad index
    pad_indices = torch.nonzero(x == pad_token).squeeze().to(device) #This will return a (N, 2) where the first column represents the sentence (batch_index) and the second column represents the corresponding index which is the pad index (This 2nd column represents which row we will set to all zeros)

    #initialize a torch.ones of the shape of the embedding matrix
    mask = torch.ones(x_embed.shape).to(device)

    #For each row in the pad_indices matrix, we go to pad_indices[0] to grab the current batch example, and we go to the corresponding row of the batch example using the value of pad_indices[1]. We turn every column of that row into zeros
    #ex: if the current row is [1, 4], then we go to the 2nd batch example, go to the 4th row, and wipe it clean with zeroes
    mask[pad_indices[:,0], pad_indices[:,1], :] = 0

    #element-wise product
    x_padded = x_embed * mask

    return x_padded.to(device)


def apply_padding_mask(x, padding_token):
    """
    Apply padding mask to input sequence, ignoring padding tokens.
    
    Args:
        input_sequence (torch.Tensor): Input sequence of shape (B, T).
        padding_token: Padding token value used in the input sequence.
        
    Returns:
        torch.Tensor: Padding mask of shape (B, T) with 0s for padding tokens and 1s for non-padding tokens.
    """
    padding_mask = (x != padding_token).bool()
    return padding_mask

In [None]:
class BeamNode():
    def __init__(self, tokens, score):
        self.tokens = tokens #pytorch tensor
        self.score = score #float
        self.is_finished = False

    def update(self, score):
        self.score = score

    def get_token(self):
        return self.tokens

    def print_(self):
        print(f"tokenized sentence: {self.tokens}, ", f"log_prob score: {self.score}")

In [497]:
class Head(nn.Module):
    def __init__(self, head_size, decoder = False):
        super().__init__()
        self.decoder = decoder
        self.head_size = head_size
        self.Wk = nn.Linear(n_embed, head_size, bias = False)
        self.Wq = nn.Linear(n_embed, head_size, bias = False)
        self.Wv = nn.Linear(n_embed, head_size, bias = False)

        if self.decoder:
            self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, data):
        x = data['x']
        padding_mask = data['mask']

        B, T, C = x.shape
        #assume input is of size (B, T, C)
        K = self.Wk(x) #(B, T, head_size)
        Q = self.Wq(x) #(B, T, head_size)
        V = self.Wv(x) #(B, T, head_size)

        attention_scores = Q @ K.transpose(-2, -1) * 1/(self.head_size)**(1/2) #(B, T, T)

        attention_scores = attention_scores.masked_fill_(padding_mask.unsqueeze(1) == False, float('-inf'))

        if self.decoder:
            attention_scores = attention_scores.masked_fill(self.tril[:T, :T] == 0, float('-inf')) #(B, T, T)
            
        attention_scores = F.softmax(attention_scores, dim = -1) #(B, T, T)

        scores = self.dropout(attention_scores) #(B, T, T)
        out = scores @ V #(B, T, T) @ (B, T, head_size) = (B, T, head_size)

        return out

class crossHead(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.head_size = head_size
        self.Wk = nn.Linear(n_embed, head_size, bias = False)
        self.Wq = nn.Linear(n_embed, head_size, bias = False)
        self.Wv = nn.Linear(n_embed, head_size, bias = False)

        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)
    
    def forward(self, data):
        x = data['x']
        enc_out = data['enc_out']
        padding_mask = data['mask']

        B, T, C = x.shape
        #assume x is of shape (B, T, C)
        #assume enc_out is of shape (B, T, C)

        K = self.Wk(enc_out) #(B, T, head_size)
        Q = self.Wq(x) #(B, T, head_size)
        V = self.Wv(enc_out) #(B, T, head_size)

        attention_scores = Q @ K.transpose(-2, -1) * 1/(self.head_size)**(1/2) #(B, T, T)

        attention_scores = attention_scores.masked_fill_(padding_mask.unsqueeze(1) == False, float('-inf'))

        attention_scores = attention_scores.masked_fill(self.tril[:T, :T] == 0, float('-inf')) #(B, T, T)
        attention_scores = F.softmax(attention_scores, dim = -1) #(B, T, T)


        scores = self.dropout(attention_scores) #(B, T, T)
        out = scores @ V #(B, T, head_size)

        return out


class MultiHeadSelfAttention(nn.Module):
    def __init__(self, head_size, decoder):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size, decoder) for _ in range(n_heads)])
        #output of heads is of size (B, T, n_heads*head_size)
        self.proj = nn.Linear(head_size * n_heads, n_embed)

        self.dropout = nn.Dropout(dropout)

    def forward(self, data):

        out = torch.cat([h(data) for h in self.heads], dim = -1)
        out = self.proj(out)
        out = self.dropout(out)

        return out

class MultiHeadCrossAttention(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.heads = nn.ModuleList([crossHead(head_size) for _ in range(n_heads)])
        self.proj = nn.Linear(n_heads*head_size, n_embed)
        self.dropout = nn.Dropout(dropout)

    def forward(self, data):
        x = torch.cat([h(data) for h in self.heads], dim = -1)
        x = self.proj(x)
        x = self.dropout(x)

        return x

class Embedding(nn.Module):
    def __init__(self):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, n_embed, padding_idx = pad_token, device = device)

    def forward(self, x):
        #assume x is of shape (B, T)
        return self.embedding(x.long()) * n_embed**(1/2)

class FeedForward(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(n_embed, 4*n_embed)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(4*n_embed, n_embed)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.dropout(x)

        return x

class EncoderBlock(nn.Module):
    def __init__(self):
        super().__init__()
        head_size = n_embed // n_heads

        self.sa = MultiHeadSelfAttention(head_size, decoder = False)
        self.ffw = FeedForward()
        self.layernorm1 = nn.LayerNorm(n_embed)
        self.layernorm2 = nn.LayerNorm(n_embed)

    def forward(self, data):
        x = data['x']
        enc_mask = data['mask']
        #assume input x is of size (B, T, C)
        x = self.layernorm1(x) #(B, T, C)
        x = x + self.sa(data) #(B, T, C)
        x = self.layernorm2(x) #(B, T, C)
        x = x = self.ffw(x) #(B, T, C)

        return {'x': x, 'mask': enc_mask}

class DecoderCrossBlock(nn.Module):
    #one implementation of the multi head cross attention block in the decoder
    def __init__(self):
        super().__init__()
        head_size = n_embed // n_heads
        self.heads = MultiHeadCrossAttention(head_size)
        self.layernorm1 = nn.LayerNorm(n_embed)
        self.layernorm2 = nn.LayerNorm(n_embed)
        self.ffw = FeedForward()

    def forward(self, data):
        #assume parameters[0] is input of shape (B, T, C), It is the output of the decoder self attention layer
        #assume parameters is a list of length 2: first element is the output of the previous hidden layer, and the 2nd element is the output of the encoder
        #print(data)
        x = data['x']
        #print(x)
        enc_out = data['enc_out']
        dec_mask = data['mask']

        x = self.layernorm1(x) #(B, T, C)
        x = x + self.heads(data) #(B, T, C)
        x = self.layernorm2(x) #(B, T, C)
        x = x + self.ffw(x) #(B, T, C)

        return {'x': x, 'enc_out': enc_out, 'mask': dec_mask}

class DecoderSelfBlock(nn.Module):
    #one implementation of the multi head self attention block in the decoder
    def __init__(self):
        super().__init__()
        head_size = n_embed//n_heads
        self.sa = MultiHeadSelfAttention(head_size, decoder = True)
        self.layernorm1 = nn.LayerNorm(n_embed)
        self.layernorm2 = nn.LayerNorm(n_embed)
        self.ffw = FeedForward()

    def forward(self, data):
        
        x = data['x']
        dec_mask = data['mask']
        #assume x is of shape (B, T, C)
        x = self.layernorm1(x)
        x = x + self.sa(data)
        x = self.layernorm2(x)
        x = x + self.ffw(x)

        return {'x': x, 'mask': dec_mask}

class Transformer(nn.Module):

    def __init__(self):
        super().__init__()
        self.tok_embedding_matrix_x = Embedding()
        self.tok_embedding_matrix_y = Embedding()

        #positional embedding is a function that requires no backpropagation, so we don't need to initialize it in here
        
        self.EncoderBlocks = nn.Sequential(*[EncoderBlock() for _ in range(n_layers)])
        self.DecoderSelfBlocks = nn.Sequential(*[DecoderSelfBlock() for _ in range(n_layers)])
        self.DecoderCrossBlocks = nn.Sequential(*[DecoderCrossBlock() for _ in range(n_layers)])

        self.final_layernorm = nn.LayerNorm(n_embed)
        self.final_linear = nn.Linear(n_embed, vocab_size)
        
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)


    #looks to be very brokenish? idk
    def greedy_translate(self, x):
        #assume input is a english sentence
        tokenized_sentence = tokenizer.encode(x, allowed_special = specials)
        #make sure the sentence is less than out block size
        assert len(tokenized_sentence) <= block_size, print("this sentence is greater than our block_size")

        len_pad = block_size - len(tokenized_sentence)
        tokenized_sentence = torch.tensor(tokenized_sentence + len_pad*[pad_token]).view(1, -1).to(device)
        print(tokenized_sentence)

        run = True

        #initialize an output of size block_size with pad tokens
        out = block_size*[pad_token]
        out[0] = start_token
        out = torch.tensor(out).view(1, -1).to(device)
        i = 1
        while run is True:
            T = out.shape[1]

            logits, loss = self(tokenized_sentence, out)
            logits = logits[:, i, :] #becomes (B, C)
            #apply softmax to get the probabilities
            probs = F.softmax(logits, dim =-1) # (B, C)
            #sample from the distribution
            # print("top7 prob: ", torch.topk(probs, 20)[0])
            # print("top7 idx: ", torch.topk(probs, 20)[1])
            # print('ger:',tokenizer.decode((torch.topk(probs, 7)[1].int().tolist())[0]))
            idx_next = torch.max(probs, dim = -1)[1] # (B, 1)
            #out = torch.cat((out, idx_next), dim = 1) #(B, T+1)
            out[0][i] = idx_next.item()
            print('next token: ',idx_next)
            #print(out[0].tolist())
            #print(tokenizer.decode(out[0].tolist()))

            i += 1
            if idx_next == end_token or i == block_size:
                run = False

        #post processing
        translated_sentence = out[0].tolist()
        translated_sentence = tokenizer.decode(translated_sentence)
        translated_sentence = translated_sentence.replace("<|PAD|>",'')
        return translated_sentence
    
    def beam_translate(self, x):
        tokenized_sentence = tokenizer.encode(x, allowed_special = specials)
        #make sure the sentence is less than out block size
        assert len(tokenized_sentence) <= block_size, print("this sentence is greater than our block_size")

        len_pad = block_size - len(tokenized_sentence)
        tokenized_sentence = torch.tensor(tokenized_sentence + len_pad*[pad_token]).view(1, -1).to(device)

        out = block_size*[pad_token]
        out[0] = start_token
        out = torch.tensor(out).view(1, -1).to(device)

        #initialize the beam nodes, each node will contain a tokenized pytorch tensor of tokens along with their corresponding log_score
        beams = [BeamNode(torch.tensor(out, requires_grad=False), 0.0) for _ in range(beam_size)]
        final_beams = beams #create a reference copy of the beam list, we will use this to re-extract the best beam from this var
        logits, loss = self(tokenized_sentence, out) #generate the second token (first is SOS token)

        #first pass
        t = 1

        logits = logits[:, t, :] #becomes (B, C)

        #probs = F.softmax(logits, dim = -1)
        log_probs = F.log_softmax(logits, dim = -1)
        topk_probs, topk_indices = torch.topk(log_probs, beam_size) #grab the log_probs and indices of the best beam_size tokens

        for i in range(len(beams)):
            beams[i].tokens[0][t] = topk_indices[0][i].reshape(1) #change the 2nd token inplace (prob a better way to do this i guess)
            beams[i].update(beams[i].score + topk_probs[0][i]) #update the score in the beam

        #remaining passes
        for t in range(2,block_size):
            candidates = [] #at each time iteration, refresh the candidate list. This list will contain all the candidate token sentences and their corresponding log_scores as a 2 dimensional tuple
            for i in range(len(beams)): #for each active beam (some beams can be deactivated if they reach the 'EOS' token)
                logits, _ = self(tokenized_sentence,beams[i].tokens) #run the model on the current beam's tokens
                logits = logits[:, t, :] #B, C 
                log_probs = F.log_softmax(logits, dim = -1) #calculate log probs
                topk_probs, topk_indices = torch.topk(log_probs, beam_size) ##grab the log_probs and indices of the best beam_size tokens

                for j in range(beam_size): #for each active candidate tokenized sentence (active means not closed, i.e 'EOS token is reached), grab the beam_size best possible next token and calculate their updated scores
                    candidate_token = beams[i].get_token()[0][:t] #specify the candidate token as the original/CURRENT set of tokens UP TO time t, (basically we just take the original current beam tokens and remove all the <|PAD|> tokens)
                    candidate_token = torch.cat((candidate_token, topk_indices[0][j].reshape(1))) #append the candidate token onto the curent set of tokens (up to time t)
                    candidate_score = beams[i].score #grab the original set of tokens score
                    candidates.append((candidate_token, candidate_score + topk_probs[0][j])) #append the new set of tokens and how high the score is

            candidates_sorted = sorted(candidates, key=lambda x: x[1], reverse=True)[:len(beams)] #sort the candidate list by score. best is first

            for i in range(len(beams)): #update all of the active beams to contain the best set of appended tokenized tensors
                beams[i].update(candidates_sorted[i][1])
                beams[i].tokens[0][:len(candidates_sorted[i][0])] = candidates_sorted[i][0]

                if end_token in beams[i].tokens[0]: # if EOS token is appended, then we deactivate the beam
                    beams[i].is_finished = True

            beams = [beam for beam in beams if beam.is_finished != True] #for the next iteration of t, only work on the active beams
            print(len(beams))

        best_ = max(final_beams, key = lambda x: x.score).tokens #grab the best beam after beam search is over
        translated_sentence = tokenizer.decode(best_.tolist()[0])
        translated_sentence = translated_sentence.replace("<|PAD|>",'')

        return translated_sentence
    
    def forward(self, x, y, targets = None):
        Bx, Tx = x.shape
        Cx = n_embed
        
        tok_embed_x = self.tok_embedding_matrix_x(x)
        pos_embed_x = PositionalEncoding(Tx, Cx)

        By, Ty, = y.shape
        Cy = n_embed

        tok_embed_y = self.tok_embedding_matrix_y(y)
        pos_embed_y = PositionalEncoding(Ty, Cy)

        tok_pos_embed_x = tok_embed_x + pos_embed_x
        tok_pos_embed_y = tok_embed_y + pos_embed_y
        
        # masked_tok_embed_x = get_padding_mask_matrix(x, tok_pos_embed_x)
        # masked_tok_embed_y = get_padding_mask_matrix(y, tok_pos_embed_y)

        enc_mask = apply_padding_mask(x, pad_token)
        dec_mask = apply_padding_mask(y, pad_token)

        x = tok_pos_embed_x
        y = tok_pos_embed_y

        x_in = {'x': x, 'mask': enc_mask}
        y_in = {'x': y, 'mask': dec_mask}

        #encoder
        enc_out = self.EncoderBlocks(x_in)['x']

        #decoder self
        y = self.DecoderSelfBlocks(y_in)['x']

        #decoder cross
        #its ideal to send in one parameter only (i.e self, x) when passing parameters through stacked layers in an nn.Sequential, so we have to combine our previous hidden state output along with the enc_out into one object

        in_ = {'x': y, 'enc_out': enc_out, 'mask': dec_mask}
        y = self.DecoderCrossBlocks(in_)

        #grab the transformed decoder input from the cross attention layer
        y = y['x']
        #remaining layers
        y = self.final_layernorm(y)
        logits = self.final_linear(y)

        if targets is not None:
            logits = logits.view(By*Ty, -1)
            targets = targets.view(targets.shape[0]*targets.shape[1])
            loss = F.cross_entropy(logits, targets)

        else:
            loss = None

        return logits, loss



In [12]:
model = Transformer()
m = model.to(device)

print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

210.848185 M parameters


In [13]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.1)

loss_fn = nn.CrossEntropyLoss(ignore_index = pad_token)
#loss_fn = nn.CrossEntropyLoss()
# for var_name in optimizer.state_dict():
#     print(var_name, "\t", optimizer.state_dict()[var_name])

In [14]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            xb, yb, tb = get_batch(split)
            logits, loss = model(xb, yb)

            B, T, = yb.shape
            C = n_embed

            logits = logits.view(B*T, -1)
            tb = tb.view(tb.shape[0]*tb.shape[1])
            loss = loss_fn(logits, tb)
            
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [78]:
model.train()
for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        lr_ = optimizer.param_groups[0]["lr"]
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}, lr {lr_}")

    # sample a batch of data
    xb, yb, tb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)

    B, T, = yb.shape
    C = n_embed

    logits = logits.view(B*T, -1)
    tb = tb.view(tb.shape[0]*tb.shape[1])
    loss = loss_fn(logits, tb)


    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

step 0: train loss 0.1646, val loss 0.1658, lr 0.0001
step 500: train loss 0.1669, val loss 0.1698, lr 0.0001
step 1000: train loss 0.1728, val loss 0.1686, lr 0.0001
step 1500: train loss 0.1678, val loss 0.1698, lr 0.0001
step 2000: train loss 0.1670, val loss 0.1707, lr 0.0001
step 2500: train loss 0.1708, val loss 0.1701, lr 0.0001
step 3000: train loss 0.1695, val loss 0.1678, lr 0.0001
step 3500: train loss 0.1703, val loss 0.1684, lr 0.0001
step 4000: train loss 0.1703, val loss 0.1693, lr 0.0001
step 4500: train loss 0.1715, val loss 0.1691, lr 0.0001
step 5000: train loss 0.1676, val loss 0.1684, lr 0.0001
step 5500: train loss 0.1695, val loss 0.1710, lr 0.0001
step 6000: train loss 0.1749, val loss 0.1699, lr 0.0001
step 6500: train loss 0.1724, val loss 0.1701, lr 0.0001
step 7000: train loss 0.1734, val loss 0.1740, lr 0.0001
step 7500: train loss 0.1712, val loss 0.1684, lr 0.0001
step 8000: train loss 0.1675, val loss 0.1661, lr 0.0001
step 8500: train loss 0.1729, val l

In [16]:
# filepath = os.getcwd()+"\\machine_model\\model.pt"
# torch.save(model.state_dict(), filepath)
# print("model saved at:", filepath)

model saved at: d:\Documents\Github\robots\machine_model\model.pt


rough work

In [498]:
filepath = os.getcwd()+"\\machine_model\\model.pt"
model = Transformer()
model.load_state_dict(torch.load(filepath))
model.eval()
m = model.to(device)

inference testing

In [95]:
english_sentences[10000]

tensor([    40,   4265,   1093,    264,  33566,     13, 100277, 100277, 100277,
        100277, 100277, 100277, 100277, 100277, 100277, 100277])

In [96]:
german_sentences[1500]

tensor([100278,  26946,  28826,  15826,  99014,    342,  82284,     13,    220,
        100279, 100277, 100277, 100277, 100277, 100277, 100277])

In [99]:
tokenizer.decode(english_sentences[5000].tolist())

"I don't see it.<|PAD|><|PAD|><|PAD|><|PAD|><|PAD|><|PAD|><|PAD|><|PAD|><|PAD|><|PAD|>"

In [98]:
tokenizer.decode(german_sentences[5000].tolist())

'<|START|> Ich sehe es nicht. <|END|><|PAD|><|PAD|><|PAD|><|PAD|><|PAD|><|PAD|><|PAD|>'

In [499]:
sentence = "Please don't use my name"
#sentence = 'hello'
uh_oh = m.greedy_translate(sentence)

tensor([[  5618,   1541,    956,   1005,    856,    836, 100277, 100277, 100277,
         100277, 100277, 100277, 100277, 100277, 100277, 100277]],
       device='cuda:0')
next token:  tensor([14895], device='cuda:0')
next token:  tensor([7141], device='cuda:0')
next token:  tensor([8969], device='cuda:0')
next token:  tensor([308], device='cuda:0')
next token:  tensor([82312], device='cuda:0')
next token:  tensor([713], device='cuda:0')
next token:  tensor([7674], device='cuda:0')
next token:  tensor([15010], device='cuda:0')
next token:  tensor([0], device='cuda:0')
next token:  tensor([6529], device='cuda:0')
next token:  tensor([41762], device='cuda:0')
next token:  tensor([13], device='cuda:0')
next token:  tensor([13], device='cuda:0')
next token:  tensor([0], device='cuda:0')
next token:  tensor([100279], device='cuda:0')


In [501]:
sentence = "Please don't use my name"
#sentence = 'hello'
uh_oh = m.beam_translate(sentence)

  beams = [BeamNode(torch.tensor(out, requires_grad=False), 0.0) for _ in range(beam_size)]


7
7
7
7
7
7
7
7
6
4
4
4
4
0


In [502]:
uh_oh

'<|START|>cksih nicht n meinen seinenihchen.<|END|>'

TOKENIZER TESTING

In [159]:
#preprocessing the data needs to be done extremely carefully
#tiktoken is a byte processor which means they encode/tokenize by the bytes and not by the actual 'words' in the sentence
#for example:
print(tokenizer.encode("<|PAD|> I'd ", allowed_special = specials))
print(tokenizer.encode("Id ", allowed_special = specials))
print(tokenizer.encode("I'd", allowed_special = specials))
print(tokenizer.encode(" "))
#all have different encodings

#358 represents " I" and 4265 represents "'d"
#220 represents ' ' and 769 represents "Id"
#40 represents "I"

#its important to encode things as close to the original sentence as possible to avoid mistokenization of the input sentence
#for example, we want the model to learn "I'd" so we want 40 4265 and not [358 4265] as "I'd" or [100277 358 4268] as "I'd"

[100277, 358, 4265, 220]
[769, 220]
[40, 4265]
[220]


CLASS TESTING

In [None]:
#GLOBALS

block_size = 64 #This is the value of T
batch_size = 16 #This it the value of B
n_embed = 128
dropout = 0.2
n_heads = 8
n_layers = 6

learning_rate = 3e-4

eval_interval = 500
eval_iters = 200
max_iters = 5000

vocab_size = tokenizer.n_vocab
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"device is: {device}")

In [96]:
#BATCH LOADER
n = int(0.9*len(english_sentences))

train_data_en = english_sentences[:n]
val_data_en = english_sentences[n:]

train_data_de = german_sentences[:n]
val_data_de = german_sentences[n:]

def get_batch(split):
    xdata = train_data_en if "train" else train_data_en
    ydata = train_data_de if "train" else val_data_de
    idx = torch.randint(len(xdata), (1,))
    #print(idx)
    x = torch.stack([xdata[i] for i in idx])
    y = torch.stack([ydata[i] for i in idx])

    #shifting our targets by 1 to the right
    t = y[:, 1:]
    #to pad the last dimension of the input tensor, pad has the form (padding_left, padding_right)
    t = F.pad(input = t, pad = (0,1,0,0), mode = 'constant', value = 0)

    #x, y, t = x.to(device), y.to(device), t.to(device)

    return x, y, t


In [97]:
xb, yb, t = get_batch('train')

xb.shape

torch.Size([1, 64])

In [98]:
xb

tensor([[100277,    220,    972,   4800,    433,   3782,    311,   1522,    430,
          99572,    323,    813,   1274,    304,    279,   3814,   7860,    835,
            567,     12,    567,    835,    567,    892,  20802,    872,    282,
          36036,   3871,   1174,    323,   1101,    315,    872,  24875,   2652,
            379,  12791,   1174,   1524,    682,    279,   3814,   7860,    835,
            567,     12,    567,    835,    567,    892,   1051,    814,  23738,
            872,    282,  36036,   3871,    662,    220, 100277, 100277, 100277,
         100277]])

In [102]:
x_ = tok_embedding_matrix(xb)

In [105]:
z = get_padding_mask_matrix(xb, x_)

In [106]:
torch.all(x_ == z)

tensor(True)

In [101]:
tok_embedding_matrix = nn.Embedding(vocab_size, n_embed, padding_idx = 100277, device = device)

x = tok_embedding_matrix(xb) + PositionalEncoding(block_size, n_embed)
y = tok_embedding_matrix(yb) + PositionalEncoding(block_size, n_embed)

x = get_padding_mask_matrix(xb, x)
y = get_padding_mask_matrix(yb, y)

In [94]:
e = Head(head_size = 64, decoder = False)
e = MultiHeadSelfAttention(64, decoder = False)
e = EncoderBlock()

In [None]:
out_e = e(x)

In [52]:
d = Head(head_size = 64, decoder = True)
d = MultiHeadSelfAttention(head_size = 64, decoder = True)
d = DecoderSelfBlock()

In [53]:
out_d = d(x)

In [55]:
c = crossHead(head_size = 64)
c = MultiHeadCrossAttention(head_size = 64)
c = DecoderCrossBlock()

In [56]:
out_c = c([out_d, out_e])

In [58]:
float('-inf') in out_c[0]

False

In [59]:
float('-inf') in out_c[1]

False

In [None]:
logits, loss = model(xb, yb)

In [None]:
logits

In [None]:
a = torch.cat((out_e, out_d), dim = 0)

In [None]:
a.shape

In [None]:
m = Transformer()

In [None]:
m(xb, yb, t)

In [None]:
encoder = tiktoken.get_encoding("cl100k_base")


In [None]:
tokenizer.encode("<|PAD|>", allowed_special = specials)

In [None]:
tokenizer.decode(english_sentences[0].tolist())

In [None]:
encoding.decode([83, 1609, 5963, 374, 2294, 0])

padding testing

In [None]:
#BATCH LOADER
n = int(0.9*len(english_sentences))

train_data_en = english_sentences[:n]
val_data_en = english_sentences[n:]

train_data_de = german_sentences[:n]
val_data_de = german_sentences[n:]

def get_batch(split):
    xdata = train_data_en if "train" else train_data_en
    ydata = train_data_de if "train" else val_data_de
    idx = torch.randint(len(xdata), (2,))
    #print(idx)
    x = torch.stack([xdata[i] for i in idx])
    y = torch.stack([ydata[i] for i in idx])

    #shifting our targets by 1 to the right
    t = y[:, 1:]
    #to pad the last dimension of the input tensor, pad has the form (padding_left, padding_right)
    t = F.pad(input = t, pad = (0,1,0,0), mode = 'constant', value = 0)

    #x, y, t = x.to(device), y.to(device), t.to(device)

    return x, y, t


In [None]:
xb, yb, t = get_batch('train')

xb.shape

In [None]:
tok_embedding_matrix = nn.Embedding(vocab_size, 3)

x = tok_embedding_matrix(xb)
y = tok_embedding_matrix(yb)

In [None]:
xb

In [None]:
zero_indices = torch.nonzero(xb == 0).squeeze()

In [None]:
zero_indices

In [None]:
pad = torch.ones(x.shape)

In [None]:
pad

In [None]:
pad[zero_indices[:,0],zero_indices[:,1], :] = 0

In [None]:
pad

In [None]:
x

In [None]:
x_padded = x * pad

In [None]:
x_padded

beam testing

In [490]:
class BeamNode():
    def __init__(self, tokens, score):
        self.tokens = tokens #pytorch tensor
        self.score = score #float
        self.is_finished = False

    def update(self, score):
        self.score = score

    def get_token(self):
        return self.tokens

    def print_(self):
        print(f"tokenized sentence: {self.tokens}, ", f"log_prob score: {self.score}")
    

In [494]:
beam_size = 7
x = "Please don't use my name"

In [495]:
model.eval()
tokenized_sentence = tokenizer.encode(x, allowed_special = specials)
#make sure the sentence is less than out block size
assert len(tokenized_sentence) <= block_size, print("this sentence is greater than our block_size")

len_pad = block_size - len(tokenized_sentence)
tokenized_sentence = torch.tensor(tokenized_sentence + len_pad*[pad_token]).view(1, -1).to(device)

out = block_size*[pad_token]
out[0] = start_token
out = torch.tensor(out).view(1, -1).to(device)
beams = [BeamNode(torch.tensor(out, requires_grad=False), 0.0) for _ in range(beam_size)]
beams_ = beams
logits, loss = model(tokenized_sentence, out)

#first pass
t = 1

logits = logits[:, t, :] #becomes (B, C)

probs = F.softmax(logits, dim = -1)
log_probs = F.log_softmax(logits, dim = -1)
#print(torch.topk(probs, beam_size))
topk_probs, topk_indices = torch.topk(log_probs, beam_size)

for i in range(len(beams)):
    beams[i].tokens[0][t] = topk_indices[0][i].reshape(1)
    beams[i].update(beams[i].score + topk_probs[0][i])


  beams = [BeamNode(torch.tensor(out, requires_grad=False), 0.0) for _ in range(beam_size)]


In [496]:
#second pass
#t = 4
for t in range(2,block_size):
    candidates = []
    for i in range(len(beams)):
        logits, _ = model(tokenized_sentence,beams[i].tokens)
        logits = logits[:, t, :]
        log_probs = F.log_softmax(logits, dim = -1)
        topk_probs, topk_indices = torch.topk(log_probs, beam_size)

        for j in range(beam_size):
            candidate_token = beams[i].get_token()[0][:t]
            candidate_token = torch.cat((candidate_token, topk_indices[0][j].reshape(1)))
            candidate_score = beams[i].score
            candidates.append((candidate_token, candidate_score + topk_probs[0][j]))

    candidates_sorted = sorted(candidates, key=lambda x: x[1], reverse=True)[:len(beams)]

    for i in range(len(beams)):
        beams[i].update(candidates_sorted[i][1])
        beams[i].tokens[0][:len(candidates_sorted[i][0])] = candidates_sorted[i][0]

        if end_token in beams[i].tokens[0]:
            beams[i].is_finished = True

    beams = [beam for beam in beams if beam.is_finished != True]
    print(len(beams))

best_ = max(beams_, key = lambda x: x.score).tokens

print(tokenizer.decode(best_.tolist()[0]))

7
7
7
7
7
7
7
7
6
4
4
4
4
0
<|START|>cksih nicht n meinen seinenihchen.<|END|><|PAD|><|PAD|><|PAD|><|PAD|><|PAD|>


In [477]:
'<|START|>cksih nicht n meinengecheniten! zu ihn..!<|END|>'

tensor([[100278,  14895,   7141,   8969,    308,  82312,  60328,   7141,   7674,
             13, 100279, 100277, 100277, 100277, 100277, 100277]],
       device='cuda:0')

In [475]:
beams_[2].score

tensor(-13.6709, device='cuda:0', grad_fn=<AddBackward0>)

In [450]:
for i in range(len(beams)):
    beams[i].print_()

tokenized sentence: tensor([[100278,    451,    409,   6383,  68482, 100277, 100277, 100277, 100277,
         100277, 100277, 100277, 100277, 100277, 100277, 100277]],
       device='cuda:0'),  log_prob score: -6.096290588378906
tokenized sentence: tensor([[100278,  14895,   7141,   8969,  82312, 100277, 100277, 100277, 100277,
         100277, 100277, 100277, 100277, 100277, 100277, 100277]],
       device='cuda:0'),  log_prob score: -6.659762382507324
tokenized sentence: tensor([[100278,  14895,   7141,   8969,   8261, 100277, 100277, 100277, 100277,
         100277, 100277, 100277, 100277, 100277, 100277, 100277]],
       device='cuda:0'),  log_prob score: -6.750249862670898


In [447]:
block_size

16

In [427]:
tokenizer.decode([100278,    451,    409,   6383,    409,   2234, 100277, 100277, 100277,
         100277, 100277, 100277, 100277, 100277, 100277, 100277])

'<|START|>de de Ver deung<|PAD|><|PAD|><|PAD|><|PAD|><|PAD|><|PAD|><|PAD|><|PAD|><|PAD|><|PAD|>'

In [391]:
a = [1,2,3,4]
b = [-1,-2,-3,-4,-5]

a[:len(b)] = b 
a

[-1, -2, -3, -4, -5]

In [173]:
#third pass
t = 4

candidates = []
for i in range(len(beams)):
    #calculate all the potential candidates for a certain beam
    logits, _ = model(tokenized_sentence,beams[i].tokens)
    logits = logits[:, t, :]
    log_probs = F.log_softmax(logits, dim = -1)
    topk_probs, topk_indices = torch.topk(log_probs, beam_size)

    candidate_token = beams[i].tokens.clone()
    candidate_score = beams[i].score
    for j in range(beam_size):
        candidate_token[0][t] = topk_indices[0][j].reshape(1)
        candidates.append((candidate_token, candidate_score + topk_probs[0][j]))

candidates_sorted = sorted(candidates, key=lambda x: x[1], reverse=True)[:beam_size]

for i in range(beam_size):
    beams[i].replace(candidates_sorted[i][0], candidates_sorted[i][1])

  log_probs = F.log_softmax(logits)


In [174]:
for i in range(beam_size):
    beams[i].print_()

tokenized sentence: tensor([[100278,    380,    753,  18268,  32076, 100277, 100277, 100277, 100277,
         100277, 100277, 100277, 100277, 100277, 100277, 100277]],
       device='cuda:0'),  log_prob score: -5.329845428466797
tokenized sentence: tensor([[100278,   3675,  24459,  15148,   4838, 100277, 100277, 100277, 100277,
         100277, 100277, 100277, 100277, 100277, 100277, 100277]],
       device='cuda:0'),  log_prob score: -5.539424896240234
tokenized sentence: tensor([[100278,     72,  59419,  59419,  59419, 100277, 100277, 100277, 100277,
         100277, 100277, 100277, 100277, 100277, 100277, 100277]],
       device='cuda:0'),  log_prob score: -5.891436576843262
tokenized sentence: tensor([[100278,   3675,  24459,  15148,   4838, 100277, 100277, 100277, 100277,
         100277, 100277, 100277, 100277, 100277, 100277, 100277]],
       device='cuda:0'),  log_prob score: -6.766961097717285
tokenized sentence: tensor([[100278,   3675,  24459,  15148,   4838, 100277, 100277,

In [165]:
candidates_ = sorted(candidates, key=lambda x: x[1], reverse=True)

candidates_[:5][0][1]

tensor(-3.1571, device='cuda:0', grad_fn=<AddBackward0>)

In [108]:
a = 2
b = a
a += 1
print(a)
print(b)
beams[0].tokens[0][2] = 3

candidate = beams[0].tokens.clone()
candidate[0][4] = 3
print(candidate)
print(beams[0].tokens)

3
2
tensor([[100278,   3675,      3, 100277,      3, 100277, 100277, 100277, 100277,
         100277, 100277, 100277, 100277, 100277, 100277, 100277]],
       device='cuda:0')
tensor([[100278,   3675,      3, 100277, 100277, 100277, 100277, 100277, 100277,
         100277, 100277, 100277, 100277, 100277, 100277, 100277]],
       device='cuda:0')


In [106]:
beams[0].tokens[0]

tensor([100278,   3675,      3, 100277, 100277, 100277, 100277, 100277, 100277,
        100277, 100277, 100277, 100277, 100277, 100277, 100277],
       device='cuda:0')

In [25]:
#assume input is a english sentence
tokenized_sentence = tokenizer.encode(x, allowed_special = specials)
#make sure the sentence is less than out block size
assert len(tokenized_sentence) <= block_size, print("this sentence is greater than our block_size")

len_pad = block_size - len(tokenized_sentence)
tokenized_sentence = torch.tensor(tokenized_sentence + len_pad*[pad_token]).view(1, -1).to(device)
print(tokenized_sentence)

run = True

#initialize an output of size block_size with pad tokens
out = block_size*[pad_token]
out[0] = start_token
out = torch.tensor(out).view(1, -1).to(device)
i = 1
while run is True:
    T = out.shape[1]

    logits, loss = model(tokenized_sentence, out)
    logits = logits[:, i, :] #becomes (B, C)
    #apply softmax to get the probabilities
    probs = F.softmax(logits, dim =-1) # (B, C)
    #sample from the distribution
    # print("top7 prob: ", torch.topk(probs, 20)[0])
    # print("top7 idx: ", torch.topk(probs, 20)[1])
    # print('ger:',tokenizer.decode((torch.topk(probs, 7)[1].int().tolist())[0]))
    idx_next = torch.max(probs, dim = -1)[1] # (B, 1)
    #out = torch.cat((out, idx_next), dim = 1) #(B, T+1)
    out[0][i] = idx_next.item()
    print('next token: ',idx_next)
    #print(out[0].tolist())
    #print(tokenizer.decode(out[0].tolist()))

    i += 1
    if idx_next == end_token or i == block_size:
        run = False

tensor([[ 15339,   1070, 100277, 100277, 100277, 100277, 100277, 100277, 100277,
         100277, 100277, 100277, 100277, 100277, 100277, 100277]],
       device='cuda:0')


In [21]:
logits, loss = model(xb, yb)

logits = logits[:, 0, :] #becomes (B, C)

print(logits.shape)
#apply softmax to get the probabilities
probs = F.softmax(logits, dim =-1) # (B, C)
#sample from the distribution
print("top7 prob: ", torch.topk(probs, 20)[0])
print("top7 idx: ", torch.topk(probs, 20)[1])
# print('ger:',tokenizer.decode((torch.topk(probs, 7)[1].int().tolist())[0]))
idx_next = torch.max(probs, dim = -1)[1] # (B, 1)

torch.Size([1, 100281])
top7 prob:  tensor([[5.6157e-05, 5.5625e-05, 5.3637e-05, 5.3094e-05, 5.3035e-05, 5.0744e-05,
         4.9988e-05, 4.9316e-05, 4.8993e-05, 4.7038e-05, 4.6860e-05, 4.5850e-05,
         4.5161e-05, 4.4819e-05, 4.4678e-05, 4.3743e-05, 4.3328e-05, 4.3037e-05,
         4.3021e-05, 4.2896e-05]], device='cuda:0', grad_fn=<TopkBackward0>)
top7 idx:  tensor([[75122, 49842, 65741,  2642, 41264, 25839, 71966, 31732, 18017, 21124,
         63982, 49981, 87437, 18853, 24884, 29664, 69622, 71430, 29328,  2452]],
       device='cuda:0')
