In [194]:
from datasets import load_dataset
import torch 
import torch.nn as nn 
from tqdm import tqdm 
import pandas as pd 
from torch.utils.data import Dataset

In [23]:
# ds = load_dataset("codeparrot/github-code", streaming=True, split="train", languages = ["Markdown"], keep_in_memory=False)

# print(next(iter(ds)))


{'code': '---\nname: Dreadnought\ntype: AV\nspeed: 15cm\narmour: 3+\ncc: 4+\nff: 4+\nspecial_rules:\n  - walker\nnotes:\n  |\n    Armed with either a Missile Launcher and Twin Lascannon, or an Assault Cannon and Power Fist.\nweapons:\n  -\n    id: missile-launcher\n    multiplier: 0–1\n  -\n    id: twin-lascannon\n    multiplier: 0–1\n  -\n    id: assault-cannon\n    multiplier: 0–1\n  -\n    id: power-fist\n    multiplier: 0–1\n---', 'repo_name': 'dsusco/tp.net-armageddon.org', 'path': '_site/_units/dreadnought.md', 'language': 'Markdown', 'license': 'isc', 'size': 409}


In [35]:
data = pd.read_csv('datasets/shakespeare-plays/Shakespeare_data.csv')

# Preprocess

In [40]:
processed_corpus = []


for i, row in data.iterrows():
    processed_corpus.append(row['PlayerLine'].lower())


In [42]:
print(processed_corpus[:10])

['act i', 'scene i. london. the palace.', 'enter king henry, lord john of lancaster, the earl of westmoreland, sir walter blunt, and others', 'so shaken as we are, so wan with care,', 'find we a time for frighted peace to pant,', 'and breathe short-winded accents of new broils', 'to be commenced in strands afar remote.', 'no more the thirsty entrance of this soil', "shall daub her lips with her own children's blood,", 'nor more shall trenching war channel her fields,']


In [368]:
class ShakespeareDataset(Dataset):
    def __init__(self, block_size : int, split : str ,  to_lower : bool = True):
        self.data = pd.read_csv('datasets/shakespeare-plays/Shakespeare_data.csv')
        self.processed_corpus = []
        self.block_size = block_size
        

        for i, row in self.data.iterrows():
            if to_lower:
                self.processed_corpus.append(row['PlayerLine'].lower())
            else : 
                self.processed_corpus.append(row['PlayerLine'])
        
        if split == 'train':
            self.processed_corpus = self.processed_corpus[:int(0.9*len(self.processed_corpus))]
        elif split == 'test':
            self.processed_corpus = self.processed_corpus[int(0.9*len(self.processed_corpus)):]

        #Merge all the lines into one big string
        self.processed_corpus = '\n'.join(self.processed_corpus)
     
        vocab_chars = set()
        for line in self.processed_corpus:
            for char in line:
                if char not in vocab_chars:
                    vocab_chars.add(char)
        
        self.vocab_size = len(sorted(vocab_chars))
        self.token_embedding_table = nn.Embedding(self.vocab_size, self.vocab_size)
        self.stoi = {char: i for i, char in enumerate(sorted(vocab_chars))}
        self.itos = {i: char for i, char in enumerate(sorted(vocab_chars))}
        # self.encode = lambda x: [stoi[char] for char in x]
        # self.decode = lambda x: [itos[i] for i in x] 

    def decode(self, x):
        if isinstance(x, torch.Tensor):
            x = x.tolist()
        return [self.itos[i] for i in x]
    
    def encode(self, x):
        return [int(self.stoi[char]) for char in x]

    def __getitem__(self, idx):
        #print(self.encode(self.processed_corpus[idx : idx + self.block_size + 3]))
        context = self.processed_corpus[idx : idx + self.block_size + 1]
        prediction = self.processed_corpus[idx + self.block_size + 1]
        encoded_context = self.encode(context)
        encoded_prediction = self.encode(prediction)

        encoded_prediction = torch.Tensor(encoded_prediction).type(torch.int32)
  
        encoded_prediction = self.token_embedding_table(encoded_prediction).squeeze(0)
   
        encoded_context = torch.Tensor(encoded_context).type(torch.int32)
        print(encoded_context.shape)
        
        return encoded_context, encoded_prediction
    
    def __len__(self):
        return len(self.processed_corpus) - self.block_size - 1 

In [369]:
train_dataset = ShakespeareDataset(block_size = 32, split = 'train')
test_dataset = ShakespeareDataset(block_size = 32, split = 'test')

In [233]:
train_dataset[2]

(tensor([44.,  2., 33.,  1., 43., 27., 29., 38., 29.,  2., 33., 10.,  2., 36.,
         39., 38., 28., 39., 38., 10.,  2., 44., 32., 29.,  2., 40., 25., 36.,
         25., 27., 29., 10.,  1.]),
 tensor([29.]))

In [234]:
train_dataset[3]

(tensor([ 2., 33.,  1., 43., 27., 29., 38., 29.,  2., 33., 10.,  2., 36., 39.,
         38., 28., 39., 38., 10.,  2., 44., 32., 29.,  2., 40., 25., 36., 25.,
         27., 29., 10.,  1., 29.]),
 tensor([38.]))

# Transformer model

In [235]:
processed_corpus[0:19]

['act i',
 'scene i. london. the palace.',
 'enter king henry, lord john of lancaster, the earl of westmoreland, sir walter blunt, and others',
 'so shaken as we are, so wan with care,',
 'find we a time for frighted peace to pant,',
 'and breathe short-winded accents of new broils',
 'to be commenced in strands afar remote.',
 'no more the thirsty entrance of this soil',
 "shall daub her lips with her own children's blood,",
 'nor more shall trenching war channel her fields,',
 'nor bruise her flowerets with the armed hoofs',
 'of hostile paces: those opposed eyes,',
 'which, like the meteors of a troubled heaven,',
 'all of one nature, of one substance bred,',
 'did lately meet in the intestine shock',
 'and furious close of civil butchery',
 'shall now, in mutual well-beseeming ranks,',
 'march all one way and be no more opposed',
 'against acquaintance, kindred and allies:']

In [236]:
test = train_dataset[0]

In [303]:
wei = torch.randn(32,33,33)

In [310]:
(torch.tril(torch.ones(33,33))==0).shape

torch.Size([33, 33])

In [None]:
wei.masked_fill((torch.tril(torch.ones(33,33)))==0, float('-inf'))

In [370]:
class SingleAttentionHead(nn.Module):
    def __init__(self, emb_size : int, head_size : int):
        super().__init__()
        self.emb_size = emb_size
        self.head_size = head_size
        #self.block_size = block_size
        self.q_linear = nn.Linear(self.emb_size, self.head_size)
        self.v_linear = nn.Linear(self.emb_size, self.head_size)
        self.k_linear = nn.Linear(self.emb_size, self.head_size)

        self.dropout = nn.Dropout(0.0)
    
    def forward(self, x):
        B,T,C = x.shape
        k = self.k_linear(x)
        q = self.q_linear(x) # B, T, C

        wei = q @ k.transpose(-2,-1) / self.head_size # B, T, T

        wei = wei.masked_fill((torch.tril(torch.ones(T,T))==0).to(x.device), float('-inf')).to(x.device)
        wei = torch.softmax(wei, dim=-1)
        wei = self.dropout(wei)

        v = self.v_linear(x)
        out = wei @ v # B, T, C


        return out.to(x.device)
    


class MultiHeadAttention(nn.Module):
    def __init__(self, head_nb : int, emb_size : int):
        super().__init__()
        self.head_nb = head_nb
        self.emb_size = emb_size
        self.heads = nn.ModuleList([SingleAttentionHead(self.emb_size, self.emb_size//self.head_nb) for _ in range(head_nb)])
        self.proj = nn.Linear(emb_size, emb_size)
        self.dropout = nn.Dropout(0.0)

    def forward(self, x):
        out = torch.cat([head(x) for head in self.heads], dim=-1)
        out = self.proj(out)
        out = self.dropout(out)
        return out
    
class TransformerBlock(nn.Module):
    def __init__(self, emb_size : int, head_nb : int):
        super().__init__()

        self.MHA = MultiHeadAttention(head_nb, emb_size)
        self.feed_forward = nn.Sequential(
                nn.Linear(emb_size, 4*emb_size),
                nn.ReLU(),
                nn.Linear(4*emb_size, emb_size),
                nn.Dropout(0.0)
        )
          
        self.ln = nn.LayerNorm(emb_size)

    def forward(self, x):
        out = self.ln(x)
        out = self.MHA(x)
        out = out + x 
        out = self.ln(x)
        out = self.feed_forward(out)
        out = out + x
        return out

In [371]:
class LLM(nn.Module):
    def __init__(self, vocab_size : int, emb_size : int, head_nb : int, block_nb : int, block_size : int):
        super().__init__()
        self.emb_size = emb_size
        self.head_nb = head_nb
        self.block_nb = block_nb
        self.block_size = block_size

        self.tok_emb = nn.Embedding(vocab_size, emb_size)
        self.pos_emb = nn.Embedding(self.block_size, emb_size)
        self.blocks = nn.ModuleList([TransformerBlock(self.emb_size, self.head_nb) for _ in range(self.block_nb)])
        
        self.ln = nn.LayerNorm(self.emb_size)
        self.lm_head = nn.Linear(self.emb_size, vocab_size)

    def forward(self, x):
        B,T = x.shape
        
        tok_emb =  self.tok_emb(x)
        pos_emb = self.pos_emb(torch.arange(T, device = x.device))
        
        out = tok_emb + pos_emb
        for block in self.blocks:
            out = block(out)
        #out = self.blocks(out)
        logits = self.lm_head(out)

        return logits

In [372]:
def train(model, optimizer, train_loader, device):
    model.train()
    total_acc = 0
    with tqdm(range(len(train_loader))) as pbar :
        for idx, (context,predi) in enumerate(train_loader):
            context, predi = context.to(device), predi.to(device)
            optimizer.zero_grad()
            print(context)
            logits = model(context)
            print(logits.shape, predi.shape)
            loss = nn.CrossEntropyLoss()(logits, predi)
            loss.backward()
            optimizer.step()
            if i % 100 == 0:
                print(loss.item())
            total_acc += (logits.argmax(-1) == predi:,1:]).sum().item()
            pbar.update(1)

    print(f'[TRAIN] Accuracy : {total_acc/len(train_dataset)} Loss : {loss.item()}')
        

SyntaxError: invalid syntax (2587352704.py, line 16)

In [None]:
def test(model, test_loader, device):
    model.eval()
    with torch.no_grad():
        total_acc = 0 
        with tqdm(range(len(test_loader))) as pbar :
            for idx, (context, predi) in enumerate(test_loader):
                context, predi = context.to(device), predi.to(device)
                print(context)
                logits = model(context)
                loss = nn.CrossEntropyLoss()(logits[:,:-1].reshape(-1, vocab_size), x[:,1:].reshape(-1))
                #accuracy
                pred = logits.argmax(dim=-1)
                acc = (pred == predi).float().mean()
                total_acc += acc.item()

                if i % 100 == 0:
                    print(loss.item())
                pbar.update(1)
        print(f'[TEST] Accuracy : {total_acc/len(test_dataset)} Loss : {loss.item()}')

In [5]:
from transformers import AutoTokenizer 

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
test = "Je m'appellle Henry ! haha lol"
test_encoded = tokenizer(test)
print(test_encoded)
text_decoded = tokenizer.decode(test_encoded['input_ids'])
print(text_decoded)

{'input_ids': [101, 15333, 1049, 1005, 10439, 5349, 2571, 2888, 999, 5292, 3270, 8840, 2140, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
[CLS] je m'appellle henry! haha lol [SEP]


In [9]:
t = torch.randn(2,3,1)

In [None]:
device = 'mps'


block_size = 32 # -> context length 
vocab_size = train_dataset.vocab_size
emb_size = 512
head_nb = 8
block_nb = 6

model = LLM(vocab_size, emb_size, head_nb, block_nb, block_size).to(device)

In [373]:
epoch = 20
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=True)

for i in range(epoch):
    print(f'EPOCH {i+1}')
    train(model, optimizer, train_loader, device)
    test(test_loader, device)

EPOCH 1


  0%|          | 0/123045 [00:00<?, ?it/s]

torch.Size([33])
torch.Size([33])
torch.Size([33])
torch.Size([33])
torch.Size([33])
torch.Size([33])
torch.Size([33])
torch.Size([33])
torch.Size([33])
torch.Size([33])
torch.Size([33])
torch.Size([33])
torch.Size([33])
torch.Size([33])
torch.Size([33])
torch.Size([33])
torch.Size([33])
torch.Size([33])
torch.Size([33])
torch.Size([33])
torch.Size([33])
torch.Size([33])
torch.Size([33])
torch.Size([33])
torch.Size([33])
torch.Size([33])
torch.Size([33])
torch.Size([33])
torch.Size([33])
torch.Size([33])
torch.Size([33])
torch.Size([33])
tensor([[33, 28, 43,  ..., 28, 42, 29],
        [45, 42,  2,  ...,  1,  5, 44],
        [43,  2, 45,  ...,  2, 43, 33],
        ...,
        [37,  2, 32,  ..., 40, 42, 25],
        [44, 33, 39,  ..., 29, 25, 44],
        [43, 32, 25,  ..., 25, 43, 44]], device='mps:0', dtype=torch.int32)
torch.Size([32, 33])
torch.Size([32, 33, 51]) torch.Size([32, 51])


: 

In [356]:
len(train_loader)

123045