In [2]:
import os

import torch
import torch.nn as nn
import torch.nn.functional as F

In [3]:
# GLOBALS -- every tunable value used by the rest of the notebook lives here.

# Sequence / batch geometry
block_size = 64   # T: length every sentence is padded to
batch_size = 16   # B: number of sentence pairs per batch

# Model size
n_embed = 128     # C: width of the token embeddings and of every block
n_heads = 8       # attention heads per multi-head layer
n_layers = 6      # number of blocks in each stack
dropout = 0.2     # probability passed to every nn.Dropout

# Optimisation
learning_rate = 3e-4

# Training / evaluation schedule
max_iters = 50000     # optimizer steps in the training loop
eval_interval = 500   # evaluate every this many steps
eval_iters = 200      # batches averaged per split when estimating the loss

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"device is: {device}")

In [4]:
# Read the whole corpus once; here it is only used to discover the character set.
# os.path.join keeps the path valid on every OS (the backslash form only works on Windows).
corpus_path = os.path.join(os.getcwd(), 'data', 'albhed.txt')
with open(corpus_path, 'r', encoding='utf8') as f:
    raw_text = f.read()

chars = sorted(set(raw_text))
print(''.join(chars))

# create a mapping from characters to integers
# Id 0 is reserved for the padding token, so real characters are numbered from 1.
s_to_i = { ch:i+1 for i,ch in enumerate(chars) }
s_to_i["<PAD>"] = 0
i_to_s = { i:s for s,i in s_to_i.items()}

# Ids run from 0 (<PAD>) to len(chars) inclusive, so the vocabulary has len(chars) + 1
# entries. Using len(chars) alone leaves the largest character id out of range for the
# embedding table and the output layer.
vocab_size = len(s_to_i)


 !'(),-.:?ABCDEFGHIJKLMNOPRSTUVWYabcdefghijklmnopqrstuvwxyz


In [5]:
def encode(x):
    """Map an iterable of tokens (characters or "<PAD>") to their integer ids via s_to_i."""
    return [s_to_i[token] for token in x]


def decode(y):
    """Map an iterable of integer ids back to a list of tokens via i_to_s."""
    return [i_to_s[token_id] for token_id in y]

In [6]:
def parse_text(text):
    """Split a line of the form ``outside (inside)`` into its two parts.

    Returns a tuple ``(inside, outside)``, both stripped of surrounding whitespace:

    * ``outside`` is everything before the first ``'('`` (the whole text if there is none).
    * ``inside`` is everything after that ``'('`` up to the next ``')'`` (or to the end of
      the text if there is no ``')'``), with any further ``'('`` characters removed.

    Anything after the first closing parenthesis is ignored.
    Note the return order: the parenthesised part comes first.
    """
    outside, opener, remainder = text.partition('(')

    inside = ''
    if opener:
        # Stop at the first ')' and drop nested '(' characters.
        inside = remainder.partition(')')[0].replace('(', '')

    return inside.strip(), outside.strip()

In [7]:
def create_tensor_of_words(word):
    """Encode one sentence as a (1, block_size) integer tensor, right-padded with <PAD>.

    word: the sentence as a string; each character is looked up with `encode`.
    Returns a tensor of shape (1, block_size).
    Raises AssertionError if the sentence has more than block_size characters.
    """
    # Check against block_size (not a literal 64) so the padded length follows the global,
    # and build a real message: `print(...)` as an assert message evaluates to None.
    assert len(word) <= block_size, (
        f"sentence has {len(word)} characters but block_size is {block_size}: {word!r}"
    )

    len_pad = block_size - len(word)
    t = torch.tensor(encode([*word] + len_pad*["<PAD>"])).unsqueeze(0)

    return t

In [8]:
# Build two aligned integer tensors of shape (N, block_size): row k of `english_sentence`
# and row k of `albhed_sentence` come from the same line of the corpus.
# Rows are collected in lists and concatenated once at the end; calling torch.cat inside
# the loop re-copies everything gathered so far on every line.
# Each list starts with an empty (0, block_size) tensor so an empty corpus still yields
# (0, block_size) results.
english_rows = [torch.empty((0, block_size), dtype = torch.long)]
albhed_rows = [torch.empty((0, block_size), dtype = torch.long)]

with open(os.path.join(os.getcwd(), 'data', 'albhed.txt'), 'r', encoding='utf8') as f:
    for line in f:
        line = " ".join(line.split())  # collapse whitespace runs and trim both ends
        if line == '':
            continue

        # Drop everything up to the FIRST colon only (presumably a speaker label -- TODO
        # confirm against the data file). maxsplit=1 keeps later colons that belong to
        # the sentence itself instead of silently truncating it.
        text = line.split(":", 1)[1] if ':' in line else line

        # parse_text returns (text inside the parentheses, text before them).
        en, a = parse_text(text)
        english_rows.append(create_tensor_of_words(en))
        albhed_rows.append(create_tensor_of_words(a))

english_sentence = torch.cat(english_rows)
albhed_sentence = torch.cat(albhed_rows)


In [32]:
#DATASET AND DATALOADERS
# First 90% of the sentence pairs are used for training, the remaining 10% for validation.
# Both languages are cut at the same index so row k of the *_en tensor still pairs with
# row k of the *_a tensor.
n = int(0.9*len(english_sentence))

train_data_en = english_sentence[:n]
val_data_en = english_sentence[n:]

train_data_a = albhed_sentence[:n]
val_data_a = albhed_sentence[n:]

In [52]:
def get_batch(split):
    """Sample a random batch of aligned sentence pairs.

    split: 'train' selects the training tensors; any other value selects validation.

    Returns (x, y, t), all of shape (batch_size, block_size) and moved to `device`:
      x -- rows of the *_en tensor (encoder input)
      y -- the matching rows of the *_a tensor (decoder input)
      t -- y shifted one position to the left, with 0 (<PAD>) filling the last column,
           i.e. the next-token targets for y
    """
    datax = train_data_en if split == 'train' else val_data_en
    datay = train_data_a if split == 'train' else val_data_a

    # Every row is already one padded sentence, so sample row indices. Slicing windows of
    # block_size rows would stack whole sentences into a (B, T, T) tensor.
    ix = torch.randint(len(datax), (batch_size,))
    x = datax[ix]
    y = datay[ix]

    # Drop the first token and pad one 0 on the right of the last dimension.
    t = F.pad(input = y[:, 1:], pad = (0, 1), mode = 'constant', value = 0)

    x, y, t = x.to(device), y.to(device), t.to(device)
    return x, y, t


In [None]:
import math
def PositionalEncoding(seq_len, n_embd):
    """Build the fixed sinusoidal position table.

    seq_len: number of positions (rows).
    n_embd:  embedding width (columns); may be even or odd.

    Returns a float tensor of shape (seq_len, n_embd) on `device`. Even columns hold
    sin(position * freq) and odd columns hold cos(position * freq), with
    freq = exp(-log(10000) * k / n_embd) for k = 0, 2, 4, ...
    """
    position = torch.arange(0, seq_len, dtype = torch.float32).unsqueeze(1)  # (seq_len, 1)
    div_term = torch.exp(torch.arange(0, n_embd, 2) * (-math.log(10000.0) / n_embd))  # (ceil(n_embd/2),)
    angles = position * div_term  # (seq_len, ceil(n_embd/2))

    pos_enc = torch.zeros(seq_len, n_embd)
    pos_enc[:, 0::2] = torch.sin(angles)
    # With an odd n_embd there is one fewer odd column than even column, so trim the
    # frequencies to match; for an even n_embd the slice is a no-op.
    pos_enc[:, 1::2] = torch.cos(angles[:, : n_embd // 2])

    return pos_enc.to(device)

def get_padding_mask_matrix(x, x_embed, pad_idx = 0):
    """
    Zero out the embedding vectors that sit at padding positions.

    x is the (B, T) tokenized matrix with padding included
    x_embed is the (B, T, C) embedded matrix; every row whose token in x equals pad_idx is set to all zeros
    pad_idx is the token id used for padding (0 in this notebook, where s_to_i["<PAD>"] = 0)

    Returns a tensor with the same shape, dtype and device as x_embed.
    """
    # (B, T) boolean "keep" mask -> (B, T, 1) so it broadcasts over the embedding dimension.
    # Building the mask by comparison also works when zero or exactly one position is padded.
    keep = (x != pad_idx).unsqueeze(-1).to(device = x_embed.device, dtype = x_embed.dtype)

    #element-wise product
    return x_embed * keep


In [12]:
class Head(nn.Module):
    """One head of scaled dot-product self-attention.

    head_size: width of the key/query/value projections.
    decoder:   when True, a lower-triangular (causal) mask is applied so position t
               only attends to positions <= t.
    """

    def __init__(self, head_size, decoder = False):
        super().__init__()
        self.decoder = decoder
        self.head_size = head_size

        # Bias-free projections from the model width down to the head width.
        self.Wk = nn.Linear(n_embed, head_size, bias = False)
        self.Wq = nn.Linear(n_embed, head_size, bias = False)
        self.Wv = nn.Linear(n_embed, head_size, bias = False)

        if decoder:
            # Registered as a buffer: it follows .to(device) but is not a parameter.
            causal = torch.ones(block_size, block_size).tril()
            self.register_buffer('tril', causal)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """x: (B, T, C) -> (B, T, head_size)."""
        _, T, _ = x.shape

        keys = self.Wk(x)       # (B, T, head_size)
        queries = self.Wq(x)    # (B, T, head_size)
        values = self.Wv(x)     # (B, T, head_size)

        # Scaled dot-product scores, (B, T, T).
        weights = (queries @ keys.transpose(-2, -1)) / self.head_size ** 0.5

        if self.decoder:
            blocked = self.tril[:T, :T] == 0
            weights = weights.masked_fill(blocked, float('-inf'))

        weights = self.dropout(F.softmax(weights, dim = -1))
        return weights @ values

class crossHead(nn.Module):
    """One head of encoder-decoder (cross) attention.

    Queries are projected from the decoder stream `x`; keys and values are projected
    from the encoder output `enc_out`.
    """
    def __init__(self, head_size):
        super().__init__()
        self.head_size = head_size
        self.Wk = nn.Linear(n_embed, head_size, bias = False)
        self.Wq = nn.Linear(n_embed, head_size, bias = False)
        self.Wv = nn.Linear(n_embed, head_size, bias = False)

        # Not used in forward(). Kept registered only so state_dicts that already contain
        # a `tril` entry for this module still load.
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_out):
        """
        x:       (B, T_dec, C) decoder hidden states
        enc_out: (B, T_enc, C) encoder output
        returns: (B, T_dec, head_size)
        """
        K = self.Wk(enc_out) #(B, T_enc, head_size)
        Q = self.Wq(x) #(B, T_dec, head_size)
        V = self.Wv(enc_out) #(B, T_enc, head_size)

        attention_scores = Q @ K.transpose(-2, -1) / self.head_size**(1/2) #(B, T_dec, T_enc)

        # No causal mask here. The source sentence is fully known, so every decoder
        # position may attend to every encoder position. A lower-triangular mask would
        # hide later source tokens from early target positions, and would not even fit
        # when T_dec != T_enc.
        # NOTE(review): padded encoder positions are not masked out of the softmax either.
        attention_scores = F.softmax(attention_scores, dim = -1) #(B, T_dec, T_enc)
        scores = self.dropout(attention_scores) #(B, T_dec, T_enc)
        out = scores @ V #(B, T_dec, head_size)

        return out


class MultiHeadSelfAttention(nn.Module):
    """n_heads independent `Head`s run in parallel, concatenated and projected to n_embed.

    head_size: width of each head.
    decoder:   forwarded to every Head; True enables the causal mask.
    """

    def __init__(self, head_size, decoder):
        super().__init__()
        head_list = [Head(head_size, decoder) for _ in range(n_heads)]
        self.heads = nn.ModuleList(head_list)
        # Concatenated head outputs have width n_heads*head_size; map them back to n_embed.
        self.proj = nn.Linear(head_size * n_heads, n_embed)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """x: (B, T, C) -> (B, T, n_embed)."""
        per_head = [head(x) for head in self.heads]
        merged = torch.cat(per_head, dim = -1)
        return self.dropout(self.proj(merged))

class MultiHeadCrossAttention(nn.Module):
    """n_heads `crossHead`s run in parallel, concatenated and projected to n_embed.

    head_size: width of each head.
    """

    def __init__(self, head_size):
        super().__init__()
        head_list = [crossHead(head_size) for _ in range(n_heads)]
        self.heads = nn.ModuleList(head_list)
        self.proj = nn.Linear(n_heads*head_size, n_embed)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_out):
        """x: decoder states, enc_out: encoder output -> (B, T, n_embed)."""
        per_head = [head(x, enc_out) for head in self.heads]
        merged = torch.cat(per_head, dim = -1)
        return self.dropout(self.proj(merged))

class Embedding(nn.Module):
    """Token embedding table (vocab_size x n_embed) whose output is scaled by sqrt(n_embed).

    Index 0 is declared as the padding index of the underlying nn.Embedding.
    """

    def __init__(self):
        super().__init__()
        self.embedding = nn.Embedding(
            vocab_size,
            n_embed,
            padding_idx = 0,
            device = device,
        )

    def forward(self, x):
        """x: (B, T) token ids -> (B, T, n_embed) scaled embeddings."""
        token_ids = x.long()
        scale = n_embed**(1/2)
        return self.embedding(token_ids) * scale

class FeedForward(nn.Module):
    """Position-wise MLP: n_embed -> 4*n_embed -> n_embed with a ReLU in between, then dropout."""

    def __init__(self):
        super().__init__()
        hidden = 4*n_embed
        self.fc1 = nn.Linear(n_embed, hidden)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden, n_embed)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply fc1 -> ReLU -> fc2 -> dropout; the last dimension stays n_embed."""
        expanded = self.relu(self.fc1(x))
        return self.dropout(self.fc2(expanded))

class EncoderBlock(nn.Module):
    """One encoder layer: unmasked multi-head self-attention followed by a feed-forward net.

    Each sub-layer is preceded by a LayerNorm and wrapped in a residual connection
    (the residual adds the normalized tensor), matching the decoder blocks in this file.
    """
    def __init__(self):
        super().__init__()
        head_size = n_embed // n_heads

        self.sa = MultiHeadSelfAttention(head_size, decoder = False)
        self.ffw = FeedForward()
        self.layernorm1 = nn.LayerNorm(n_embed)
        self.layernorm2 = nn.LayerNorm(n_embed)

    def forward(self, x):
        """x: (B, T, C) -> (B, T, C)."""
        x = self.layernorm1(x) #(B, T, C)
        x = x + self.sa(x) #(B, T, C)
        x = self.layernorm2(x) #(B, T, C)
        # Residual around the feed-forward net. Writing `x = x = self.ffw(x)` would throw
        # away the attention output instead of adding to it.
        x = x + self.ffw(x) #(B, T, C)

        return x

class DecoderCrossBlock(nn.Module):
    """One decoder layer built around multi-head cross attention plus a feed-forward net.

    Input and output are both a two-element list ``[x, enc_out]`` so that several of these
    blocks can be chained inside an nn.Sequential, which passes a single object between layers.
    """

    def __init__(self):
        super().__init__()
        head_size = n_embed // n_heads
        self.heads = MultiHeadCrossAttention(head_size)
        self.layernorm1 = nn.LayerNorm(n_embed)
        self.layernorm2 = nn.LayerNorm(n_embed)
        self.ffw = FeedForward()

    def forward(self, parameters):
        """
        parameters[0]: (B, T, C) decoder hidden state from the previous layer
        parameters[1]: encoder output, passed through unchanged
        returns:       [updated hidden state, encoder output]
        """
        hidden, enc_out = parameters[0], parameters[1]

        # Norm -> cross attention with residual on the normalized tensor.
        normed = self.layernorm1(hidden)
        attended = normed + self.heads(normed, enc_out)

        # Norm -> feed-forward with residual on the normalized tensor.
        normed = self.layernorm2(attended)
        hidden = normed + self.ffw(normed)

        return [hidden, enc_out]

class DecoderSelfBlock(nn.Module):
    """One decoder layer: causally masked multi-head self-attention plus a feed-forward net."""

    def __init__(self):
        super().__init__()
        head_size = n_embed//n_heads
        self.sa = MultiHeadSelfAttention(head_size, decoder = True)
        self.layernorm1 = nn.LayerNorm(n_embed)
        self.layernorm2 = nn.LayerNorm(n_embed)
        self.ffw = FeedForward()

    def forward(self, x):
        """x: (B, T, C) -> (B, T, C)."""
        # Norm -> masked self-attention with residual on the normalized tensor.
        normed = self.layernorm1(x)
        attended = normed + self.sa(normed)

        # Norm -> feed-forward with residual on the normalized tensor.
        normed = self.layernorm2(attended)
        return normed + self.ffw(normed)

class Transformer(nn.Module):
    """Encoder-decoder Transformer over token ids.

    Layout, as wired in forward():
      * source ids -> embedding + sinusoidal positions -> n_layers EncoderBlocks
      * target ids -> embedding + sinusoidal positions -> n_layers DecoderSelfBlocks
        -> n_layers DecoderCrossBlocks (attending to the encoder output)
        -> LayerNorm -> Linear to vocab_size logits

    Note that the decoder runs all of its self-attention blocks first and then all of its
    cross-attention blocks; they are not interleaved.
    """

    def __init__(self):
        super().__init__()
        self.tok_embedding_matrix_x = Embedding()
        self.tok_embedding_matrix_y = Embedding()

        #positional embedding is a function that requires no backpropagation, so we don't need to initialize it in here

        self.EncoderBlocks = nn.Sequential(*[EncoderBlock() for _ in range(n_layers)])
        self.DecoderSelfBlocks = nn.Sequential(*[DecoderSelfBlock() for _ in range(n_layers)])
        self.DecoderCrossBlocks = nn.Sequential(*[DecoderCrossBlock() for _ in range(n_layers)])

        self.final_layernorm = nn.LayerNorm(n_embed)
        self.final_linear = nn.Linear(n_embed, vocab_size)

        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise Linear and Embedding weights from N(0, 0.02); zero the Linear biases."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            # nn.Embedding zeroes the padding_idx row at construction and never updates it
            # through gradients. The re-initialisation above overwrites that row with
            # random values, so put the zeros back.
            if module.padding_idx is not None:
                with torch.no_grad():
                    module.weight[module.padding_idx].zero_()


    #looks to be very broken
    def translate(self, x):
        """Placeholder: inference/decoding is not implemented yet, so this returns None."""
        pass

    def forward(self, x, y, targets = None):
        """
        x:       (Bx, Tx) source token ids (encoder input)
        y:       (By, Ty) target-side token ids (decoder input)
        targets: optional (By, Ty) token ids to score the logits against

        Returns (logits, loss):
          * targets is None -> logits has shape (By, Ty, vocab_size) and loss is None
          * otherwise       -> logits is flattened to (By*Ty, vocab_size) and loss is the
                               cross-entropy against the flattened targets
        """
        Bx, Tx = x.shape
        Cx = n_embed

        tok_embed_x = self.tok_embedding_matrix_x(x)
        pos_embed_x = PositionalEncoding(Tx, Cx)

        By, Ty, = y.shape
        Cy = n_embed

        tok_embed_y = self.tok_embedding_matrix_y(y)
        pos_embed_y = PositionalEncoding(Ty, Cy)

        # The (T, C) position table broadcasts over the batch dimension.
        tok_pos_embed_x = tok_embed_x + pos_embed_x
        tok_pos_embed_y = tok_embed_y + pos_embed_y

        # Masking happens after the positions are added so padded rows end up all-zero.
        masked_tok_embed_x = get_padding_mask_matrix(x, tok_pos_embed_x)
        masked_tok_embed_y = get_padding_mask_matrix(y, tok_pos_embed_y)

        x = masked_tok_embed_x
        y = masked_tok_embed_y

        #encoder
        enc_out = self.EncoderBlocks(x)

        #decoder self
        y = self.DecoderSelfBlocks(y)
        #decoder cross
        #its ideal to send in one parameter only (i.e self, x) when passing parameters through stacked layers in an nn.Sequential, so we have to combine our previous hidden state output along with the enc_out into one object
        y = self.DecoderCrossBlocks([y, enc_out])

        #grab the transformed decoder input from the cross attention layer
        y = y[0]
        #remaining layers
        y = self.final_layernorm(y)
        logits = self.final_linear(y)

        if targets is not None:
            logits = logits.view(By*Ty, -1)
            targets = targets.view(targets.shape[0]*targets.shape[1])
            loss = F.cross_entropy(logits, targets)

        else:
            loss = None

        return logits, loss



In [None]:
# Build the network and move its parameters and buffers to the selected device.
# `m` is kept as a second name for the value returned by `.to(device)`.
model = Transformer()
m = model.to(device)

# Report the total number of parameter elements, in millions.
param_count = sum(p.numel() for p in m.parameters())
print(param_count/1e6, 'M parameters')

In [None]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
# NOTE(review): no visible cell calls scheduler.step(), so the learning rate stays constant.
# With gamma=0.1 every call would divide the learning rate by 10.
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.1)

# Ignore padding positions when this criterion is used. The pad id in this notebook is
# s_to_i["<PAD>"] (0); 100277 is not a token id in this vocabulary.
loss_fn = nn.CrossEntropyLoss(ignore_index = s_to_i["<PAD>"])

# Dump the optimizer state once (state_dict() builds a fresh dict on every call).
for var_name, value in optimizer.state_dict().items():
    print(var_name, "\t", value)

state 	 {}
param_groups 	 [{'lr': 0.0003, 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0.01, 'amsgrad': False, 'foreach': None, 'maximize': False, 'capturable': False, 'differentiable': False, 'fused': None, 'initial_lr': 0.0003, 'params': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 1

In [None]:
@torch.no_grad()
def estimate_loss():
    """Average the cross-entropy loss over eval_iters random batches for each split.

    Puts the model in eval mode while measuring and back in train mode before returning.
    Returns a dict with keys 'train' and 'val', each holding a 0-dim tensor with the mean loss.
    """
    model.eval()
    mean_losses = {}

    for split in ('train', 'val'):
        batch_losses = torch.zeros(eval_iters)

        for k in range(eval_iters):
            xb, yb, tb = get_batch(split)
            logits, _ = model(xb, yb)

            # Flatten (B, T, V) logits and (B, T) targets for F.cross_entropy.
            B, T = yb.shape
            flat_logits = logits.view(B*T, -1)
            flat_targets = tb.view(tb.shape[0]*tb.shape[1])

            batch_losses[k] = F.cross_entropy(flat_logits, flat_targets).item()

        mean_losses[split] = batch_losses.mean()

    model.train()
    return mean_losses

In [None]:
model.train()
# `step` rather than `iter`, so the builtin iter() is not shadowed for the rest of the notebook.
for step in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if step % eval_interval == 0 or step == max_iters - 1:
        losses = estimate_loss()
        # Report the estimate; otherwise the evaluation cost is paid and the result discarded.
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb, tb = get_batch('train')

    # evaluate the loss
    # Passing the targets makes forward() flatten the logits and compute the
    # cross-entropy itself, so there is no need to repeat that here.
    _, loss = model(xb, yb, tb)

    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

In [22]:
# Draw one random training batch so the next cells can decode and eyeball it.
xb, yb, tb = get_batch("train")

In [25]:
# Decode the first row of xb (encoder input) back to text; padding ids appear as "<PAD>".
''.join(decode(xb.tolist()[0]))

'Whoa! It moves!<PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD>'

In [23]:
# Decode the first row of yb (decoder input) back to text.
''.join(decode(yb.tolist()[0]))

'Fruy! Ed sujac!<PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD>'

In [24]:
# Decode the first row of tb (targets): the same row as yb, shifted left by one token.
''.join(decode(tb.tolist()[0]))

'ruy! Ed sujac!<PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD>'

In [49]:
# trainset = AlBhedDataset([train_data_en, train_data_a])
# valset = AlBhedDataset([val_data_en, val_data_a])

# train_dataloader = DataLoader(trainset, batch_size = 128, shuffle = True)
# val_dataloader = DataLoader(valset, batch_size = 128, shuffle = True)

In [None]:
model.train()

# Epoch-based variant of the training loop.
# An "epoch" here is as many random batches as it takes to cover the training set once.
# The epoch count is derived from max_iters so the total number of optimizer steps stays
# close to that budget.
# TODO(review): replace with an explicit epoch constant and a DataLoader if preferred.
steps_per_epoch = max(1, len(train_data_en) // batch_size)
num_epochs = max(1, max_iters // steps_per_epoch)

for epoch in range(num_epochs):
    # Evaluate periodically and always on the last epoch.
    if epoch % eval_interval == 0 or epoch == num_epochs - 1:
        losses = estimate_loss()
        lr_ = optimizer.param_groups[0]["lr"]
        print(f"epoch {epoch}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}, lr {lr_}")

    for _ in range(steps_per_epoch):

        # sample a batch of data
        xb, yb, tb = get_batch('train')

        # evaluate the loss (forward() computes the cross-entropy when targets are given)
        _, loss = model(xb, yb, tb)

        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

In [51]:
from torch.utils.data import Dataset, DataLoader, Subset

class AlBhedDataset(Dataset):
    """Map-style dataset over two aligned tensors of encoded sentences.

    data: a two-element sequence ``[data_en, data_a]``; row k of each tensor forms one pair.
    """
    def __init__(self, data):
        self.data_en = data[0]
        self.data_a = data[1]

    def __len__(self):
        return len(self.data_en)

    def __getitem__(self, idx):
        """Return one sample as a dict.

        "input":  row idx of data_en
        "output": row idx of data_a
        "target": "output" shifted left by one token, with a 0 appended at the end
        """
        source = self.data_en[idx]
        output = self.data_a[idx]

        # Build the target from THIS row only (not from the whole data_a matrix).
        # `...` keeps the shift on the last dimension, so a slice index works too.
        #to pad the last dimension of the input tensor, pad has the form (padding_left, padding_right)
        target = F.pad(input = output[..., 1:], pad = (0, 1), mode = 'constant', value = 0)

        # A fresh dict per call: a dict stored on self would be shared, and overwritten,
        # by every sample fetched from this dataset.
        return {"input": source, "output": output, "target": target}