In [309]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import os
import matplotlib.pyplot as plt
import torchtext

import tiktoken
import pandas as pd
import numpy as np

from torch.nn import functional as F
import torch.nn as nn
from tqdm.notebook import tqdm
import pickle
import ast

In [310]:
import os

#dataset https://nlp.stanford.edu/projects/nmt/

#tiktoken api https://github.com/openai/tiktoken
cl100k_base = tiktoken.get_encoding("cl100k_base")

# In production, load the arguments directly instead of accessing private attributes
# See openai_public.py for examples of arguments for specific encodings
tokenizer = tiktoken.Encoding(
    # If you're changing the set of special tokens, make sure to use a different name
    # It should be clear from the name what behaviour to expect.
    name="cl100k_im",
    pat_str=cl100k_base._pat_str,
    mergeable_ranks=cl100k_base._mergeable_ranks,
    special_tokens={
        **cl100k_base._special_tokens,
        "<|PAD|>": 0,
        "<|START|>": 100278,
        "<|END|>": 100279,
        "<|DEL|>": 100280,
        "!": 100281
    }
)
print(tokenizer.n_vocab) #this is the number of tokens in our tokenizer
print(tokenizer._special_tokens) #prints out our special tokens 

specials = {"<|PAD|>","<|START|>","<|END|>", "<|DEL|>", "!"}

print(dir(tokenizer))

100282
{'<|endoftext|>': 100257, '<|fim_prefix|>': 100258, '<|fim_middle|>': 100259, '<|fim_suffix|>': 100260, '<|endofprompt|>': 100276, '<|PAD|>': 0, '<|START|>': 100278, '<|END|>': 100279, '<|DEL|>': 100280, '!': 100281}
['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_core_bpe', '_encode_bytes', '_encode_only_native_bpe', '_encode_single_piece', '_mergeable_ranks', '_pat_str', '_special_tokens', 'decode', 'decode_batch', 'decode_bytes', 'decode_bytes_batch', 'decode_single_token_bytes', 'decode_tokens_bytes', 'encode', 'encode_batch', 'encode_ordinary', 'encode_ordinary_batch', 'encode_single_token', 'encode_with_unstable', 'eot_token', 'max_token_value', 'n_vocab', 'name', 'special_tokens

In [332]:
#GLOBALS

block_size = 64 #This is the value of T
batch_size = 16 #This it the value of B
n_embed = 128
dropout = 0.2
n_heads = 8
n_layers = 6

learning_rate = 3e-4

eval_interval = 500
eval_iters = 200
max_iters = 15000

vocab_size = tokenizer.n_vocab
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"device is: {device}")

device is: cuda


In [312]:
def create_dataset():
    num_examples = 100000

    en_max = 0
    en_length = []
    with open(os.getcwd()+'\\data\\train_en.txt', 'r', encoding='utf8') as f:
        sentences_en = []
        for i in tqdm(range(num_examples)):
            line = f.readline()
            line = line.replace("\n", "")
            len_pad = 0
            sentence = "<|PAD|> " + (line) + " <|PAD|>"
            tok_sentence = tokenizer.encode(sentence, allowed_special = specials)
            en_length.append(len(tok_sentence))
            if len(tok_sentence) > en_max:
                en_max = len(tok_sentence)
                print(en_max)

            if len(tok_sentence) <= block_size:
                len_pad = block_size - len(tok_sentence)
                tok_sentence = tok_sentence + len_pad*[0]
                assert len(tok_sentence) == block_size, print(len(tok_sentence))
                sentences_en.append(tok_sentence)
            else:
                sentences_en.append(block_size*[100280])

    en_length = torch.tensor(en_length).float()     
    print(en_max)    
    print(f"Length of sentences: {len(sentences_en)}")

    de_max = 0
    de_length = []
    with open(os.getcwd()+'\\data\\train_de.txt', 'r', encoding='utf8') as f:
        sentences_de = []
        for i in tqdm(range(num_examples)):
            line = f.readline()
            line = line.replace("\n", "")
            len_pad = 0
            sentence = "<|START|> " + (line) + " <|END|>"
            tok_sentence = tokenizer.encode(sentence, allowed_special = specials)
            de_length.append(len(tok_sentence))
            if len(tok_sentence) > de_max:
                de_max = len(tok_sentence)
                print(de_max)

            if len(tok_sentence) <= block_size:
                len_pad = block_size - len(tok_sentence)
                tok_sentence = tok_sentence + len_pad*[0]
                assert len(tok_sentence) == block_size, print(len(tok_sentence))
                sentences_de.append(tok_sentence)
            else:
                sentences_de.append(block_size*[100280])

    de_length = torch.tensor(de_length).float()               
    print(de_max) 
    print(f"Length of sentences: {len(sentences_de)}")

    print("Removing sentences whos length is greater than our block_size")

    #combine the arrays together
    sentences = np.array([sentences_en, sentences_de])
    #check for indices in both sentences that have rows containing the DEL token
    idx = np.where(sentences == 100280)

    #delete every row that contains the DEL token
    sentences = np.delete(sentences, idx[1], axis = 1)

    #splitting to german and english

    sentences_en = torch.tensor(sentences[0], dtype=torch.long)
    sentences_de = torch.tensor(sentences[1], dtype=torch.long)

    print(f"Length of new english sentences: {len(sentences_en)}")
    print(f"Length of new german sentences: {len(sentences_de)}")

    print(f"Average length of english tokenized sentence: {torch.mean(en_length)}, and with std: {torch.std(en_length)}")
    print(f"Average length of german tokenized sentence: {torch.mean(de_length)}, and with std: {torch.std(de_length)}")

    with open(os.getcwd()+'\\data\\english_sentences.pkl', 'wb') as f:
        pickle.dump(sentences_en, f)

    with open(os.getcwd()+'\\data\\german_sentences.pkl', 'wb') as f:
        pickle.dump(sentences_de, f)


In [313]:
create = False
if create:
    create_dataset()

In [314]:
#TRAIN AND VAL DATASETS

with open(os.getcwd()+'\\data\\english_sentences.pkl', 'rb') as f:
    english_sentences = pickle.load(f)

with open(os.getcwd()+'\\data\\german_sentences.pkl', 'rb') as f:
    german_sentences = pickle.load(f)

In [315]:
#BATCH LOADER
n = int(0.9*len(english_sentences))

train_data_en = english_sentences[:n]
val_data_en = english_sentences[n:]

train_data_de = german_sentences[:n]
val_data_de = german_sentences[n:]

def get_batch(split):
    xdata = train_data_en if "train" else train_data_en
    ydata = train_data_de if "train" else val_data_de
    idx = torch.randint(len(xdata), (batch_size,))
    #print(idx)
    x = torch.stack([xdata[i] for i in idx])
    y = torch.stack([ydata[i] for i in idx])

    #shifting our targets by 1 to the right
    t = y[:, 1:]
    #to pad the last dimension of the input tensor, pad has the form (padding_left, padding_right)
    t = F.pad(input = t, pad = (0,1,0,0), mode = 'constant', value = 0)

    x, y, t = x.to(device), y.to(device), t.to(device)

    return x, y, t


In [323]:
import math
def PositionalEncoding(seq_len, n_embd):
        
    pos_enc = torch.zeros(seq_len, n_embd)
    position = torch.arange(0, seq_len, dtype = torch.float32).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, n_embd, 2) * (-math.log(10000.0) / n_embd))
    pos_enc[:, 0::2] = torch.sin(position * div_term)
    pos_enc[:, 1::2] = torch.cos(position * div_term)

    return pos_enc.to(device)

def get_padding_mask_matrix(x, x_embed):
    """
    x is the (B, T) tokenized matrix with padding included
    x_embed is the embedded matrix that we will convert all rows to zero based on the corresponding row index = padding index
    """

    #locate every index in each tokenized sentence which contains the pad index
    pad_indices = torch.nonzero(xb == 0).squeeze().to(device) #This will return a (N, 2) where the first column represents the sentence (batch_index) and the second column represents the corresponding index which is the pad index (This 2nd column represents which row we will set to all zeros)

    #initialize a torch.ones of the shape of the embedding matrix
    mask = torch.ones(x_embed.shape).to(device)

    #For each row in the pad_indices matrix, we go to pad_indices[0] to grab the current batch example, and we go to the corresponding row of the batch example using the value of pad_indices[1]. We turn every column of that row into zeros
    #ex: if the current row is [1, 4], then we go to the 2nd batch example, go to the 4th row, and wipe it clean with zeroes
    mask[pad_indices[0], pad_indices[1], :] = 0

    #element-wise product
    x_padded = x_embed * mask

    return x_padded


In [328]:
class Head(nn.Module):
    def __init__(self, head_size, decoder = False):
        super().__init__()
        self.decoder = decoder
        self.head_size = head_size
        self.Wk = nn.Linear(n_embed, head_size, bias = False)
        self.Wq = nn.Linear(n_embed, head_size, bias = False)
        self.Wv = nn.Linear(n_embed, head_size, bias = False)

        if self.decoder:
            self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape
        #assume input is of size (B, T, C)
        K = self.Wk(x) #(B, T, head_size)
        Q = self.Wq(x) #(B, T, head_size)
        V = self.Wv(x) #(B, T, head_size)

        attention_scores = Q @ K.transpose(-2, -1) * 1/(self.head_size)**(1/2) #(B, T, T)

        if self.decoder:
            attention_scores = attention_scores.masked_fill(self.tril[:T, :T] == 0, float('-inf')) #(B, T, T)
        attention_scores = F.softmax(attention_scores, dim = -1) #(B, T, T)
        scores = self.dropout(attention_scores) #(B, T, T)
        out = scores @ V #(B, T, T) @ (B, T, head_size) = (B, T, head_size)

        return out

class crossHead(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.head_size = head_size
        self.Wk = nn.Linear(n_embed, head_size, bias = False)
        self.Wq = nn.Linear(n_embed, head_size, bias = False)
        self.Wv = nn.Linear(n_embed, head_size, bias = False)

        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x, enc_out):
        B, T, C = x.shape
        #assume x is of shape (B, T, C)
        #assume enc_out is of shape (B, T, C)

        K = self.Wk(enc_out) #(B, T, head_size)
        Q = self.Wq(x) #(B, T, head_size)
        V = self.Wv(enc_out) #(B, T, head_size)

        attention_scores = Q @ K.transpose(-2, -1) * 1/(self.head_size)**(1/2) #(B, T, T)
        # print("attn",attention_scores.shape)
        attention_scores = attention_scores.masked_fill(self.tril[:T, :T] == 0, float('-inf')) #(B, T, T)
        attention_scores = F.softmax(attention_scores, dim = -1) #(B, T, T)
        scores = self.dropout(attention_scores) #(B, T, T)
        out = scores @ V #(B, T, head_size)

        return out


class MultiHeadSelfAttention(nn.Module):
    def __init__(self, head_size, decoder):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size, decoder) for _ in range(n_heads)])
        #output of heads is of size (B, T, n_heads*head_size)
        self.proj = nn.Linear(head_size * n_heads, n_embed)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim = -1)
        out = self.proj(out)
        out = self.dropout(out)

        return out

class MultiHeadCrossAttention(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.heads = nn.ModuleList([crossHead(head_size) for _ in range(n_heads)])
        self.proj = nn.Linear(n_heads*head_size, n_embed)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_out):
        x = torch.cat([h(x, enc_out) for h in self.heads], dim = -1)
        x = self.proj(x)
        x = self.dropout(x)

        return x

class FeedForward(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(n_embed, 4*n_embed)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(4*n_embed, n_embed)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.dropout(x)

        return x

class EncoderBlock(nn.Module):
    def __init__(self):
        super().__init__()
        head_size = n_embed // n_heads

        self.sa = MultiHeadSelfAttention(head_size, decoder = False)
        self.ffw = FeedForward()
        self.layernorm1 = nn.LayerNorm(n_embed)
        self.layernorm2 = nn.LayerNorm(n_embed)

    def forward(self, x):
        #assume input x is of size (B, T, C)
        x = self.layernorm1(x) #(B, T, C)
        x = x + self.sa(x) #(B, T, C)
        x = self.layernorm2(x) #(B, T, C)
        x = x = self.ffw(x) #(B, T, C)

        return x

class DecoderCrossBlock(nn.Module):
    #one implementation of the multi head cross attention block in the decoder
    def __init__(self):
        super().__init__()
        head_size = n_embed // n_heads
        self.heads = MultiHeadCrossAttention(head_size)
        self.layernorm1 = nn.LayerNorm(n_embed)
        self.layernorm2 = nn.LayerNorm(n_embed)
        self.ffw = FeedForward()

    def forward(self, parameters):
        #assume parameters[0] is input of shape (B, T, C), It is the output of the decoder self attention layer
        #assume parameters is a list of length 2: first element is the output of the previous hidden layer, and the 2nd element is the output of the encoder
        
        x = parameters[0]
        enc_out = parameters[1]
        
        x = self.layernorm1(x) #(B, T, C)
        x = x + self.heads(x, enc_out) #(B, T, C)
        x = self.layernorm2(x) #(B, T, C)
        x = x + self.ffw(x) #(B, T, C)

        return [x, enc_out]

class DecoderSelfBlock(nn.Module):
    #one implementation of the multi head self attention block in the decoder
    def __init__(self):
        super().__init__()
        head_size = n_embed//n_heads
        self.sa = MultiHeadSelfAttention(head_size, decoder = True)
        self.layernorm1 = nn.LayerNorm(n_embed)
        self.layernorm2 = nn.LayerNorm(n_embed)
        self.ffw = FeedForward()

    def forward(self, x):
        #assume x is of shape (B, T, C)
        x = self.layernorm1(x)
        x = x + self.sa(x)
        x = self.layernorm2(x)
        x = x + self.ffw(x)

        return x

class Transformer(nn.Module):

    def __init__(self):
        super().__init__()
        self.tok_embedding_matrix_x = nn.Embedding(vocab_size, n_embed) * n_embed*(1/2)
        #positional embedding is a function that requires no backpropagation, so we don't need to initialize it in here
        self.tok_embedding_matrix_y = nn.Embedding(vocab_size, n_embed) * n_embed*(1/2)
        
        self.EncoderBlocks = nn.Sequential(*[EncoderBlock() for _ in range(n_layers)])
        self.DecoderSelfBlocks = nn.Sequential(*[DecoderSelfBlock() for _ in range(n_layers)])
        self.DecoderCrossBlocks = nn.Sequential(*[DecoderCrossBlock() for _ in range(n_layers)])

        self.final_layernorm = nn.LayerNorm(n_embed)
        self.final_linear = nn.Linear(n_embed, vocab_size)
        
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    #looks to be very broken
    def translate(self, x):
        #assume input is a english sentence
        tokenized_sentence = tokenizer.encode(x, allowed_special = specials)
        #make sure the sentence is less than out block size
        assert len(tokenized_sentence) <= block_size, print("this sentence is greater than our block_size")

        len_pad = block_size - len(tokenized_sentence)
        tokenized_sentence = torch.tensor(tokenized_sentence + len_pad*[0]).view(1, -1).to(device)
        run = True
        input = [100278]
        while run is True:
            input = torch.tensor(input).view(1, -1).to(device)
            T = input.shape[1]
            logits, loss = self(tokenized_sentence[:,:T], input)
            logits = logits[:, -1, :] #becomes (B, C)
            #apply softmax to get the probabilities
            probs = F.softmax(logits, dim =-1) # (B, C)
            #sample from the distribution
            idx_next = torch.multinomial(probs, num_samples = 1) # (B, 1)
            input = torch.cat((input, idx_next), dim = 1) #(B, T+1)

            if idx_next == 100279:
                run = False

        return input[0].tolist()
            
    def forward(self, x, y, targets = None):
        Bx, Tx = x.shape
        Cx = n_embed
        
        tok_embed_x = self.tok_embedding_matrix_x(x)
        pos_embed_x = PositionalEncoding(Tx, Cx)

        By, Ty, = y.shape
        Cy = n_embed

        tok_embed_y = self.tok_embedding_matrix_y(y)
        pos_embed_y = PositionalEncoding(Ty, Cy)

        tok_pos_embed_x = tok_embed_x + pos_embed_x
        tok_pos_embed_y = tok_embed_y + pos_embed_y
        
        masked_tok_embed_x = get_padding_mask_matrix(x, tok_pos_embed_x)
        masked_tok_embed_y = get_padding_mask_matrix(y, tok_pos_embed_y)

        x = masked_tok_embed_x
        y = masked_tok_embed_y

        #encoder
        enc_out = self.EncoderBlocks(x)

        #decoder self
        y = self.DecoderSelfBlocks(y)
        #decoder cross
        #its ideal to send in one parameter only (i.e self, x) when passing parameters through stacked layers in an nn.Sequential, so we have to combine our previous hidden state output along with the enc_out into one object
        y = self.DecoderCrossBlocks([y, enc_out])

        #grab the transformed decoder input from the cross attention layer
        y = y[0]
        #remaining layers
        y = self.final_layernorm(y)
        logits = self.final_linear(y)

        if targets is not None:
            logits = logits.view(By*Ty, -1)
            targets = targets.view(targets.shape[0]*targets.shape[1])

            loss = F.cross_entropy(logits, targets)

        else:
            loss = None

        return logits, loss



In [329]:
model = Transformer()
m = model.to(device)

print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

42.17081 M parameters


In [330]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [326]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            x, y, t = get_batch(split)
            logits, loss = model(x, y, t)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [333]:
for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb, tb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb, tb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()


step 0: train loss 2.3025, val loss 2.3255
step 500: train loss 2.2582, val loss 2.2900
step 1000: train loss 2.2697, val loss 2.2266
step 1500: train loss 2.2147, val loss 2.1929
step 2000: train loss 2.1917, val loss 2.1668
step 2500: train loss 2.1639, val loss 2.1969
step 3000: train loss 2.1508, val loss 2.1401
step 3500: train loss 2.1145, val loss 2.1128
step 4000: train loss 2.0518, val loss 2.0902
step 4500: train loss 2.0528, val loss 2.0786
step 5000: train loss 2.0561, val loss 2.0532
step 5500: train loss 2.0276, val loss 2.0301
step 6000: train loss 2.0433, val loss 2.0288
step 6500: train loss 1.9840, val loss 2.0237
step 7000: train loss 1.9779, val loss 1.9638
step 7500: train loss 1.9323, val loss 1.9767
step 8000: train loss 1.9607, val loss 1.9611
step 8500: train loss 1.9514, val loss 1.9296
step 9000: train loss 1.9210, val loss 1.9312
step 9500: train loss 1.9184, val loss 1.9175
step 10000: train loss 1.8972, val loss 1.8785
step 10500: train loss 1.8978, val lo

In [None]:
filepath = os.getcwd()+"\\machine_model\\model.pt"
torch.save(model.state_dict(), filepath)
print("model saved at:", filepath)

model saved at: d:\Documents\Github\robots\machine_model\model.pt


rough work

In [None]:
model = Transformer()
model.load_state_dict(torch.load(filepath))
model.eval()
m = model.to(device)

In [None]:
english_sentences[0]

In [None]:
sentence = 'iron cement is a ready for use paste which is laid as a fillet by putty knife or finger in the mould edges ( corners ) of the steel ingot mould .'
uh_oh = m.translate(sentence)

In [90]:
tokenizer.decode(uh_oh)

'<|START|> Das Ferieniferösen allinten mit einem Restaurant Mitarbeiter mit den Platz mit allen Flughafen . <|END|>'

In [None]:
#GLOBALS

block_size = 64 #This is the value of T
batch_size = 16 #This it the value of B
n_embed = 128
dropout = 0.2
n_heads = 8
n_layers = 6

learning_rate = 3e-4

eval_interval = 500
eval_iters = 200
max_iters = 5000

vocab_size = tokenizer.n_vocab
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"device is: {device}")

In [272]:
#BATCH LOADER
n = int(0.9*len(english_sentences))

train_data_en = english_sentences[:n]
val_data_en = english_sentences[n:]

train_data_de = german_sentences[:n]
val_data_de = german_sentences[n:]

def get_batch(split):
    xdata = train_data_en if "train" else train_data_en
    ydata = train_data_de if "train" else val_data_de
    idx = torch.randint(len(xdata), (batch_size,))
    #print(idx)
    x = torch.stack([xdata[i] for i in idx])
    y = torch.stack([ydata[i] for i in idx])

    #shifting our targets by 1 to the right
    t = y[:, 1:]
    #to pad the last dimension of the input tensor, pad has the form (padding_left, padding_right)
    t = F.pad(input = t, pad = (0,1,0,0), mode = 'constant', value = 0)

    #x, y, t = x.to(device), y.to(device), t.to(device)

    return x, y, t


In [273]:
xb, yb, t = get_batch('train')

xb.shape

torch.Size([16, 64])

In [279]:
tok_embedding_matrix = nn.Embedding(vocab_size, n_embed)

x = tok_embedding_matrix(xb)
y = tok_embedding_matrix(yb)

In [106]:
e = Head(head_size = 64, decoder = False)
e = MultiHeadSelfAttention(64, decoder = False)
e = EncoderBlock()

In [107]:
out_e = e(x)

In [124]:
d = Head(head_size = 64, decoder = True)
#d = MultiHeadSelfAttention(head_size = 64, decoder = True)
#d = DecoderSelfBlock()

In [None]:
c = crossHead(head_size = 64)
c = MultiHeadCrossAttention(head_size = 64)
c = DecoderCrossBlock()
d = DecoderCrossBlock()

In [None]:
a = torch.cat((out_e, out_d), dim = 0)

In [None]:
a.shape

torch.Size([32, 64, 512])

In [None]:
m = Transformer()

In [None]:
m(xb, yb, t)

torch.Size([16, 64])


(tensor([[ 0.3266,  0.9439, -0.6072,  ..., -0.8942, -0.2462,  0.5541],
         [ 0.3558, -0.0816, -1.0313,  ...,  0.2293, -0.4682,  1.1330],
         [ 1.0091, -0.0226, -0.3073,  ..., -1.1772,  0.4133,  1.5344],
         ...,
         [ 0.7896,  0.3576, -0.5905,  ...,  0.0613, -0.5474,  0.6708],
         [ 0.1137,  0.1101, -0.9164,  ...,  0.1007, -0.6001,  0.5187],
         [ 0.6944,  0.8457, -0.8022,  ..., -0.2061, -0.3018,  0.6769]],
        grad_fn=<ViewBackward0>),
 tensor(11.1942, grad_fn=<NllLossBackward0>))

In [None]:
encoder = tiktoken.get_encoding("cl100k_base")


In [None]:
tokenizer.encode("<|PAD|>", allowed_special = specials)

[0]

In [None]:
tokenizer.decode(english_sentences[0].tolist())

TypeError: Encoding.decode() got an unexpected keyword argument 'allowed_special'

In [None]:
encoding.decode([83, 1609, 5963, 374, 2294, 0])

'tiktoken is great!'

In [None]:
# block_size = 10
# num_examples = 5

# en_max = 0 
# with open(os.getcwd()+'\\data\\train_en.txt', 'r', encoding='utf8') as f:
#     idx_en = []
#     sentences_en = []
#     for i in tqdm(range(num_examples)):
#         line = f.readline()
#         line = line.replace("\n", "")
#         len_pad = 0
#         sentence = "<|PAD|> " + (line) + " <|PAD|>"
#         print(sentence)
#         print(tokenizer.encode(sentence, allowed_special = specials))
#         tok_sentence = tokenizer.encode(sentence, allowed_special = specials)
#         print(len(tok_sentence))
#         if len(tok_sentence) > en_max:
#             en_max = len(tok_sentence)
#             print(en_max)

#         if len(tok_sentence) <= block_size:
#             len_pad = block_size - len(tok_sentence)
#             tok_sentence = tok_sentence + len_pad*[100277]
#             assert len(tok_sentence) == block_size, print(len(tok_sentence))
#             #idx_en.append(i)
#             sentences_en.append(tok_sentence)
#         else:
#             sentences_en.append(block_size*[100280])

# print(en_max)    
# print(f"Length of sentences: {len(sentences_en)}")


# de_max = 0 
# with open(os.getcwd()+'\\data\\train_de.txt', 'r', encoding='utf8') as f:
#     idx_de = []
#     sentences_de = []
#     for i in tqdm(range(num_examples)):
#         line = f.readline()
#         line = line.replace("\n", "")
#         len_pad = 0
#         sentence = "<|START|> " + (line) + " <|END|>"
#         tok_sentence = tokenizer.encode(sentence, allowed_special = specials)
#         if len(tok_sentence) > de_max:
#             de_max = len(tok_sentence)
#             print(de_max)

#         if len(tok_sentence) <= block_size:
#             len_pad = block_size - len(tok_sentence)
#             tok_sentence = tok_sentence + len_pad*[100277]
#             assert len(tok_sentence) == block_size, print(len(tok_sentence))
#             #idx_en.append(i)
#             sentences_de.append(tok_sentence)
#         else:
#             sentences_de.append(block_size*[100280])
            
# print(de_max)  
# print(f"Length of sentences: {len(sentences_de)}")

  0%|          | 0/5 [00:00<?, ?it/s]

<|PAD|> iron cement is a ready for use paste which is laid as a fillet by putty knife or finger in the mould edges ( corners ) of the steel ingot mould . <|PAD|>
[0, 11245, 24532, 374, 264, 5644, 369, 1005, 25982, 902, 374, 17551, 439, 264, 1488, 1169, 555, 2231, 1919, 22145, 477, 14654, 304, 279, 51370, 13116, 320, 24359, 883, 315, 279, 9699, 6892, 354, 51370, 662, 220, 0]
38
38
<|PAD|> iron cement protects the ingot against the hot , abrasive steel casting process . <|PAD|>
[0, 11245, 24532, 36236, 279, 6892, 354, 2403, 279, 4106, 1174, 94804, 9699, 25146, 1920, 662, 220, 0]
18
<|PAD|> a fire restant repair cement for fire places , ovens , open fireplaces etc . <|PAD|>
[0, 264, 4027, 2800, 519, 13023, 24532, 369, 4027, 7634, 1174, 297, 21778, 1174, 1825, 4027, 27170, 5099, 662, 220, 0]
21
<|PAD|> Construction and repair of highways and ... <|PAD|>
[0, 24987, 323, 13023, 315, 60395, 323, 2564, 220, 0]
10
<|PAD|> An announcement must be commercial character . <|PAD|>
[0, 1556, 17480, 2

  0%|          | 0/5 [00:00<?, ?it/s]

60
60
Length of sentences: 5


In [None]:
# sentences = np.array([sentences_en, sentences_de])

# idx = np.where(sentences == 100280)
# print(idx[1])

[0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 0 0 0 0 0 0 0
 0 0 0 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 4 4 4 4
 4 4 4 4 4 4]


In [None]:
# sentences = np.delete(sentences, idx[1], axis = 1)

In [None]:
# sentences

array([], shape=(2, 0, 10), dtype=int32)

In [209]:
def split_zero_indices(lst):
    tensor = torch.tensor(lst)
    zero_indices = torch.nonzero(tensor == 0).squeeze()
    split_indices = torch.nonzero(torch.diff(zero_indices) != 1).squeeze() + 1
    split_lists = torch.split(zero_indices, split_indices)

    return [list(indices.numpy()) for indices in split_lists]

# Example usage
list1 = [[1, 2, 4, 0, 0, 0],[0, 4, 0, 2, 5, 8]]
list2 = [0, 4, 0, 2, 5, 8]


In [212]:
p = torch.tensor(list1)
zero_indices = torch.nonzero(p == 0).squeeze()

In [225]:
zero_indices

tensor([[0, 3],
        [0, 4],
        [0, 5],
        [1, 0],
        [1, 2]])

In [219]:
asdf = [3,4,5]

In [237]:
mask = torch.ones(2, 6, 5)

In [238]:
mask

tensor([[[1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.]],

        [[1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.]]])

In [234]:
zero_indices[:,0]

tensor([0, 0, 0, 1, 1])

In [235]:
zero_indices[:,1]

tensor([3, 4, 5, 0, 2])

In [242]:
mask[zero_indices[:,0],zero_indices[:,1], :] = 0

In [240]:
mask

tensor([[[1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0.],
         [1., 1., 1., 1., 1.],
         [0., 0., 0., 0., 0.],
         [1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1.]]])

padding testing

In [293]:
#BATCH LOADER
n = int(0.9*len(english_sentences))

train_data_en = english_sentences[:n]
val_data_en = english_sentences[n:]

train_data_de = german_sentences[:n]
val_data_de = german_sentences[n:]

def get_batch(split):
    xdata = train_data_en if "train" else train_data_en
    ydata = train_data_de if "train" else val_data_de
    idx = torch.randint(len(xdata), (2,))
    #print(idx)
    x = torch.stack([xdata[i] for i in idx])
    y = torch.stack([ydata[i] for i in idx])

    #shifting our targets by 1 to the right
    t = y[:, 1:]
    #to pad the last dimension of the input tensor, pad has the form (padding_left, padding_right)
    t = F.pad(input = t, pad = (0,1,0,0), mode = 'constant', value = 0)

    #x, y, t = x.to(device), y.to(device), t.to(device)

    return x, y, t


In [294]:
xb, yb, t = get_batch('train')

xb.shape

torch.Size([2, 64])

In [295]:
tok_embedding_matrix = nn.Embedding(vocab_size, 3)

x = tok_embedding_matrix(xb)
y = tok_embedding_matrix(yb)

In [296]:
xb

tensor([[    0, 45276,   916,   551, 32769, 17382, 84223,  4409,  1174, 84223,
          1174, 59792,  1174,  3723, 15422,   482,   220,  2287, 27307,  8544,
           662,   220,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0],
        [    0, 31672, 58427,   921, 97911, 75762,  1139,   279,  2217,  8863,
          7620, 10487,   389,   701,  6500,   662,   578, 20206,  7860,   835,
           567,    12,   567,   835,   567,   304,   374, 18641,   449, 29966,
         45979,  1174, 45979, 35257,  1174,  9708,    75, 17646, 14355,  1322,
          1174,  5099,   662,   220,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0

In [297]:
zero_indices = torch.nonzero(xb == 0).squeeze()

In [298]:
zero_indices

tensor([[ 0,  0],
        [ 0, 22],
        [ 0, 23],
        [ 0, 24],
        [ 0, 25],
        [ 0, 26],
        [ 0, 27],
        [ 0, 28],
        [ 0, 29],
        [ 0, 30],
        [ 0, 31],
        [ 0, 32],
        [ 0, 33],
        [ 0, 34],
        [ 0, 35],
        [ 0, 36],
        [ 0, 37],
        [ 0, 38],
        [ 0, 39],
        [ 0, 40],
        [ 0, 41],
        [ 0, 42],
        [ 0, 43],
        [ 0, 44],
        [ 0, 45],
        [ 0, 46],
        [ 0, 47],
        [ 0, 48],
        [ 0, 49],
        [ 0, 50],
        [ 0, 51],
        [ 0, 52],
        [ 0, 53],
        [ 0, 54],
        [ 0, 55],
        [ 0, 56],
        [ 0, 57],
        [ 0, 58],
        [ 0, 59],
        [ 0, 60],
        [ 0, 61],
        [ 0, 62],
        [ 0, 63],
        [ 1,  0],
        [ 1, 44],
        [ 1, 45],
        [ 1, 46],
        [ 1, 47],
        [ 1, 48],
        [ 1, 49],
        [ 1, 50],
        [ 1, 51],
        [ 1, 52],
        [ 1, 53],
        [ 1, 54],
        [ 

In [299]:
pad = torch.ones(x.shape)

In [300]:
pad

tensor([[[1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1

In [301]:
pad[zero_indices[:,0],zero_indices[:,1], :] = 0

In [302]:
pad

tensor([[[0., 0., 0.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.],
         [0

In [303]:
x

tensor([[[-1.3116,  0.0313, -0.2236],
         [-0.8909, -0.7864,  0.5444],
         [ 0.0336,  1.1090,  0.9984],
         [ 0.6013,  0.4913,  0.5734],
         [ 0.0890,  1.4799, -0.5040],
         [ 0.4353, -1.7232, -1.2105],
         [ 1.6703, -1.3551, -0.1944],
         [-0.2170,  0.1159,  0.5018],
         [ 0.2051, -3.0432, -0.6218],
         [ 1.6703, -1.3551, -0.1944],
         [ 0.2051, -3.0432, -0.6218],
         [-1.0707, -1.5301,  0.3355],
         [ 0.2051, -3.0432, -0.6218],
         [ 1.0630, -1.0203, -0.1490],
         [-1.1775,  1.1844,  0.0845],
         [-0.0749,  1.0951, -0.5779],
         [-1.9409, -1.5697, -0.8750],
         [ 1.0254,  0.7691,  1.1471],
         [ 0.3585, -0.5508, -1.4044],
         [-0.9088, -0.8208, -1.3070],
         [-0.6819, -1.0715, -0.7244],
         [-1.9409, -1.5697, -0.8750],
         [-1.3116,  0.0313, -0.2236],
         [-1.3116,  0.0313, -0.2236],
         [-1.3116,  0.0313, -0.2236],
         [-1.3116,  0.0313, -0.2236],
         [-1

In [304]:
x_padded = x * pad

In [305]:
x_padded

tensor([[[-0.0000,  0.0000, -0.0000],
         [-0.8909, -0.7864,  0.5444],
         [ 0.0336,  1.1090,  0.9984],
         [ 0.6013,  0.4913,  0.5734],
         [ 0.0890,  1.4799, -0.5040],
         [ 0.4353, -1.7232, -1.2105],
         [ 1.6703, -1.3551, -0.1944],
         [-0.2170,  0.1159,  0.5018],
         [ 0.2051, -3.0432, -0.6218],
         [ 1.6703, -1.3551, -0.1944],
         [ 0.2051, -3.0432, -0.6218],
         [-1.0707, -1.5301,  0.3355],
         [ 0.2051, -3.0432, -0.6218],
         [ 1.0630, -1.0203, -0.1490],
         [-1.1775,  1.1844,  0.0845],
         [-0.0749,  1.0951, -0.5779],
         [-1.9409, -1.5697, -0.8750],
         [ 1.0254,  0.7691,  1.1471],
         [ 0.3585, -0.5508, -1.4044],
         [-0.9088, -0.8208, -1.3070],
         [-0.6819, -1.0715, -0.7244],
         [-1.9409, -1.5697, -0.8750],
         [-0.0000,  0.0000, -0.0000],
         [-0.0000,  0.0000, -0.0000],
         [-0.0000,  0.0000, -0.0000],
         [-0.0000,  0.0000, -0.0000],
         [-0