In [473]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import os
import matplotlib.pyplot as plt
import torchtext

import tiktoken
import pandas as pd
import numpy as np

from torch.nn import functional as F
import torch.nn as nn
from tqdm.notebook import tqdm
import pickle
import ast

In [500]:
#GLOBALS

block_size = 256 #This is the value of T
batch_size = 16 #This it the value of B
n_embed = 512
dropout = 0.2
n_heads = 8

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"device is: {device}")

device is: cuda


In [406]:
import os

#dataset https://nlp.stanford.edu/projects/nmt/

#tiktoken api https://github.com/openai/tiktoken
cl100k_base = tiktoken.get_encoding("cl100k_base")

# In production, load the arguments directly instead of accessing private attributes
# See openai_public.py for examples of arguments for specific encodings
tokenizer = tiktoken.Encoding(
    # If you're changing the set of special tokens, make sure to use a different name
    # It should be clear from the name what behaviour to expect.
    name="cl100k_im",
    pat_str=cl100k_base._pat_str,
    mergeable_ranks=cl100k_base._mergeable_ranks,
    special_tokens={
        **cl100k_base._special_tokens,
        "<|PAD|>": 0,
        "<|START|>": 100278,
        "<|END|>": 100279,
        "<|DEL|>": 100280,
        "!": 100281
    }
)
print(tokenizer.n_vocab) #this is the number of tokens in our tokenizer
print(tokenizer._special_tokens) #prints out our special tokens 

specials = {"<|PAD|>","<|START|>","<|END|>", "<|DEL|>", "!"}

100282
{'<|endoftext|>': 100257, '<|fim_prefix|>': 100258, '<|fim_middle|>': 100259, '<|fim_suffix|>': 100260, '<|endofprompt|>': 100276, '<|PAD|>': 0, '<|START|>': 100278, '<|END|>': 100279, '<|DEL|>': 100280, '!': 100281}


In [490]:
def create_dataset():
    num_examples = 100000

    en_max = 0 
    with open(os.getcwd()+'\\data\\train_en.txt', 'r', encoding='utf8') as f:
        idx_en = []
        sentences_en = []
        for i in tqdm(range(num_examples)):
            line = f.readline()
            line = line.replace("\n", "")
            len_pad = 0
            sentence = "<|PAD|> " + (line) + " <|PAD|>"
            tok_sentence = tokenizer.encode(sentence, allowed_special = specials)
            if len(tok_sentence) > en_max:
                en_max = len(tok_sentence)
                print(en_max)

            if len(tok_sentence) <= block_size:
                len_pad = block_size - len(tok_sentence)
                tok_sentence = tok_sentence + len_pad*[0]
                assert len(tok_sentence) == block_size, print(len(tok_sentence))
                sentences_en.append(tok_sentence)
            else:
                sentences_en.append(block_size*[100280])

    print(en_max)    
    print(f"Length of sentences: {len(sentences_en)}")

    print(en_max)    
    print(f"Length of sentences: {len(sentences_en)}")

    de_max = 0 
    with open(os.getcwd()+'\\data\\train_de.txt', 'r', encoding='utf8') as f:
        idx_de = []
        sentences_de = []
        for i in tqdm(range(num_examples)):
            line = f.readline()
            line = line.replace("\n", "")
            len_pad = 0
            sentence = "<|START|> " + (line) + " <|END|>"
            tok_sentence = tokenizer.encode(sentence, allowed_special = specials)
            if len(tok_sentence) > de_max:
                de_max = len(tok_sentence)
                print(de_max)

            if len(tok_sentence) <= block_size:
                len_pad = block_size - len(tok_sentence)
                tok_sentence = tok_sentence + len_pad*[0]
                assert len(tok_sentence) == block_size, print(len(tok_sentence))
                sentences_de.append(tok_sentence)
            else:
                sentences_de.append(block_size*[100280])
                
    print(de_max) 
    print(f"Length of sentences: {len(sentences_de)}")

    print("Removing sentences whos length is greater than our block_size")

    #combine the arrays together
    sentences = np.array([sentences_en, sentences_de])
    #check for indices in both sentences that have rows containing the DEL token
    idx = np.where(sentences == 100280)

    #delete every row that contains the DEL token
    sentences = np.delete(sentences, idx[1], axis = 1)

    #splitting to german and english

    sentences_en = torch.tensor(sentences[0], dtype=torch.long)
    sentences_de = torch.tensor(sentences[1], dtype=torch.long)

    print(f"Length of new english sentences: {len(sentences_en)}")
    print(f"Length of new german sentences: {len(sentences_de)}")

    with open(os.getcwd()+'\\data\\english_sentences.pkl', 'wb') as f:
        pickle.dump(sentences_en, f)

    with open(os.getcwd()+'\\data\\german_sentences.pkl', 'wb') as f:
        pickle.dump(sentences_de, f)


In [491]:
create = True
if create:
    create_dataset()

  0%|          | 0/100000 [00:00<?, ?it/s]

38
50
60
67
68
82
95
122
125
132
141
154
177
191
203
295
295
Length of sentences: 100000
295
Length of sentences: 100000


  0%|          | 0/100000 [00:00<?, ?it/s]

60
73
83
91
105
120
130
135
161
163
187
239
267
284
284
Length of sentences: 100000
Removing sentences whos length is greater than our block_size
Length of new english sentences: 99996
Length of new german sentences: 99996


In [492]:
#TRAIN AND VAL DATASETS

with open(os.getcwd()+'\\data\\english_sentences.pkl', 'rb') as f:
    english_sentences = pickle.load(f)

with open(os.getcwd()+'\\data\\german_sentences.pkl', 'rb') as f:
    german_sentences = pickle.load(f)

In [493]:
100280 in english_sentences

False

In [495]:
english_sentences.dtype

torch.int64

In [498]:
#BATCH LOADER
n = int(0.9*len(english_sentences))

train_data_en = english_sentences[:n]
val_data_en = english_sentences[n:]

train_data_de = german_sentences[:n]
val_data_de = german_sentences[n:]

def get_batch(split):
    xdata = train_data_en if "train" else train_data_en
    ydata = train_data_de if "train" else val_data_de
    idx = torch.randint(len(xdata), (batch_size,))
    print(idx)
    x = torch.stack([xdata[i] for i in idx])
    y = torch.stack([ydata[i] for i in idx])

    #shifting our targets by 1 to the right
    y = y[:, 1:]
    #to pad the last dimension of the input tensor, pad has the form (padding_left, padding_right)
    y = F.pad(input = yt, pad = (0,1,0,0), mode = 'constant', value = 0)

    x, y = x.to(device), y.to(device)

    return x, y

#xb, yb = get_batch('train')

In [499]:
#Pytorch's positional encoding https://pytorch.org/tutorials/beginner/transformer_tutorial.html
import math

class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        Arguments:
            x: Tensor, shape ``[seq_len, batch_size, embedding_dim]``
            x: (T, B, C)
            We have to change our shape dimensions in to (T, B, C) and then change it back to (B, T, C) when done

        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [501]:
class Head(nn.Module):
    def __init__(self, head_size, decoder = False):
        super().__init__()

        self.Wk = nn.Linear(n_embed, head_size)
        self.Wq = nn.Linear(n_embed, head_size)
        self.Wv = nn.Linear(n_embed, head_size)

        if decoder:
            self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        #assume input is of size (B, T, C)
        K = self.Wk(x) #(B, T, head_size)
        Q = self.Wq(x) #(B, T, head_size)
        V = self.Wv(x) #(B, T, head_size)

        attention_scores = Q @ K.transpose(-2, -1) * 1/(head_size)**(1/2) #(B, T, T)

        if decoder:
            attention_scores = attention_scores.masked_fill(self.tril[:T, :T] == 0, float('-inf')) #(B, T, T)

        attention_scores = F.softmax(attention_scores, dim = -1) #(B, T, T)
        scores = self.dropout(attention_scores) #(B, T, T)
        out = scores @ V #(B, T, T) @ (B, T, head_size) = (B, T, head_size)

        return out

class MultiHeadSelfAttention(nn.Module):
    def __init__(self, head_size, decoder):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size, decoder) for _ in range(n_heads)])
        #output of heads is of size (B, T, num_heads*head_size)
        self.proj = nn.Linear(head_size * num_heads, n_embed)

        self.dropout = nn.Dropout(dropout)

        def forward(self, x):
            out = torch.cat([h(x) for h in self.heads], dim = -1)
            out = proj(out)
            out = self.dropout(out)

            return out

class FeedForward(nn.Module):
    def __init__(self):
        self.fc1 = nn.Linear(n_embed, 4*n_embed)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(4*n_embed, n_embed)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.dropout(x)

        return x

class EncoderBlock(nn.Module):
    def __init__(self):
        super().__init__()
        head_size = n_embed // n_heads

        self.sa = MultiHeadSelfAttention(head_size, decoder = False)
        self.ffw = FeedForward()
        self.layernorm1 = nn.LayerNorm(n_embed)
        self.layernorm2 = nn.LayerNorm(n_embed)

    def forward(self, x):
        #assume input x is of size (B, T, C)
        x = layernorm1(x) #B, T, C
        x = x + self.sa(x) #B, T, C
        x = layernorm2(x) #B, T, C
        x = x = ffw(x) #B, T, C

        return x

rough work

In [301]:
encoder = tiktoken.get_encoding("cl100k_base")


In [403]:
tokenizer.encode("<|PAD|>", allowed_special = specials)

[0]

In [303]:
encoding.decode([83, 1609, 5963, 374, 2294, 0])

'tiktoken is great!'

In [416]:
block_size = 10
num_examples = 5

en_max = 0 
with open(os.getcwd()+'\\data\\train_en.txt', 'r', encoding='utf8') as f:
    idx_en = []
    sentences_en = []
    for i in tqdm(range(num_examples)):
        line = f.readline()
        line = line.replace("\n", "")
        len_pad = 0
        sentence = "<|PAD|> " + (line) + " <|PAD|>"
        print(sentence)
        print(tokenizer.encode(sentence, allowed_special = specials))
        tok_sentence = tokenizer.encode(sentence, allowed_special = specials)
        print(len(tok_sentence))
        if len(tok_sentence) > en_max:
            en_max = len(tok_sentence)
            print(en_max)

        if len(tok_sentence) <= block_size:
            len_pad = block_size - len(tok_sentence)
            tok_sentence = tok_sentence + len_pad*[100277]
            assert len(tok_sentence) == block_size, print(len(tok_sentence))
            #idx_en.append(i)
            sentences_en.append(tok_sentence)
        else:
            sentences_en.append(block_size*[100280])

print(en_max)    
print(f"Length of sentences: {len(sentences_en)}")


de_max = 0 
with open(os.getcwd()+'\\data\\train_de.txt', 'r', encoding='utf8') as f:
    idx_de = []
    sentences_de = []
    for i in tqdm(range(num_examples)):
        line = f.readline()
        line = line.replace("\n", "")
        len_pad = 0
        sentence = "<|START|> " + (line) + " <|END|>"
        tok_sentence = tokenizer.encode(sentence, allowed_special = specials)
        if len(tok_sentence) > de_max:
            de_max = len(tok_sentence)
            print(de_max)

        if len(tok_sentence) <= block_size:
            len_pad = block_size - len(tok_sentence)
            tok_sentence = tok_sentence + len_pad*[100277]
            assert len(tok_sentence) == block_size, print(len(tok_sentence))
            #idx_en.append(i)
            sentences_de.append(tok_sentence)
        else:
            sentences_de.append(block_size*[100280])
            
print(de_max)  
print(f"Length of sentences: {len(sentences_de)}")

  0%|          | 0/5 [00:00<?, ?it/s]

<|PAD|> iron cement is a ready for use paste which is laid as a fillet by putty knife or finger in the mould edges ( corners ) of the steel ingot mould . <|PAD|>
[0, 11245, 24532, 374, 264, 5644, 369, 1005, 25982, 902, 374, 17551, 439, 264, 1488, 1169, 555, 2231, 1919, 22145, 477, 14654, 304, 279, 51370, 13116, 320, 24359, 883, 315, 279, 9699, 6892, 354, 51370, 662, 220, 0]
38
38
<|PAD|> iron cement protects the ingot against the hot , abrasive steel casting process . <|PAD|>
[0, 11245, 24532, 36236, 279, 6892, 354, 2403, 279, 4106, 1174, 94804, 9699, 25146, 1920, 662, 220, 0]
18
<|PAD|> a fire restant repair cement for fire places , ovens , open fireplaces etc . <|PAD|>
[0, 264, 4027, 2800, 519, 13023, 24532, 369, 4027, 7634, 1174, 297, 21778, 1174, 1825, 4027, 27170, 5099, 662, 220, 0]
21
<|PAD|> Construction and repair of highways and ... <|PAD|>
[0, 24987, 323, 13023, 315, 60395, 323, 2564, 220, 0]
10
<|PAD|> An announcement must be commercial character . <|PAD|>
[0, 1556, 17480, 2

  0%|          | 0/5 [00:00<?, ?it/s]

60
60
Length of sentences: 5


In [417]:
np.array(sentences_en).shape

(5, 10)

In [418]:
sentences = np.array([sentences_en, sentences_de])

In [419]:
sentences = np.array([sentences_en, sentences_de])

idx = np.where(sentences == 100280)
print(idx[1])

[0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 0 0 0 0 0 0 0
 0 0 0 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 4 4 4 4
 4 4 4 4 4 4]


In [420]:
sentences = np.delete(sentences, idx[1], axis = 1)

In [421]:
sentences

array([], shape=(2, 0, 10), dtype=int32)