In [190]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
import random
import pickle
from tqdm import tqdm
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
from transformers import AutoTokenizer, BertTokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [203]:


tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
new_vocab = dict(list(tokenizer.get_vocab().items())[:VOCAB_SIZE])


print("Padding token:", tokenizer.pad_token)
print("EOS token:", tokenizer.sep_token)

Padding token: [PAD]
EOS token: [SEP]


In [204]:
data = pd.read_csv("datasets/text/good.csv")
text = data["text"].tolist()
text_ids = []
for t in tqdm(text):
    text_ids.append(tokenizer.encode(t))

print(tokenizer.decode(text_ids[-1]))


  0%|          | 0/30279 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (823 > 512). Running this sequence through the model will result in indexing errors
 80%|████████  | 24260/30279 [02:48<00:37, 160.70it/s]

In [158]:
with open ("datasets/text/good_ids.pkl", "wb") as f:
    pickle.dump(text_ids, f)

In [178]:
text_ids = []  
BLOCK_SIZE = 50
VOCAB_SIZE = tokenizer.vocab_size

with open("datasets/text/good_ids.pkl", "rb") as f:
    text_ids = pickle.load(f)

for i in range (len(text_ids)):
    if len(text_ids[i]) < BLOCK_SIZE + 2:
        text_ids[i] +=  (BLOCK_SIZE - len(text_ids[i]))*[tokenizer.pad_token_id]

N = len(text_ids)
train_size = int(0.9 * N)
test_size = N - train_size
train_text_ids = text_ids[:train_size]
val_text_ids = text_ids[train_size:]

print("Train size: ", train_size)
print("Test size: ", test_size)




Train size:  27251
Test size:  3028


In [179]:
print(tokenizer.decode(train_text_ids[0]))

[CLS] Nycticebus linglom is a fossil strepsirrhine primate from the Miocene of Thailand. Known only from a single tooth, an upper third molar, it is thought to be related to the living slow lorises genus Nycticebus, but the material is not sufficient to assign the species to Nycticebus with certainty, and the species name therefore uses open nomenclature. With a width of 1. 82 mm, this tooth is very small for a primate. It is triangular in shape, supported by a single root, and shows three main cusps, in addition to various crests. The absence of a fourth cusp, the hypocone, distinguishes it from various other prosimian primates. Nycticebus linglom was described in 1997 by French paleontologists Pierre Mein and Leonard Ginsburg in a report on the fossil mammals of Li Mae Long, a Miocene site in Thailand. The animal is known from a single tooth, and on the basis of comparisons with other prosimian primates Mein and Ginsburg concluded that it is most closely related to the living slow lo

In [187]:
N_EMB = 300
N_LAYERS = 4
N_HEADS = 5
DROPOUT = 0.2
print(tokenizer.vocab_size)

def estimate_loss(model, val_data, block_size, batch_size):
    model.eval()
    with torch.no_grad():
        x, y = get_batch(val_data, block_size, batch_size)
        x, y = x.to(device), y.to(device)
        _, loss = model(x, y)
    model.train()
    return loss.item()

def generate_square_subsequent_mask(sz):
    mask = (torch.tril(torch.ones(sz, sz)) == 1).float()
    mask = mask.masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask
    
class LanguageModel(nn.Module):

    def __init__(self, vocab_size, n_emb, block_size, n_layers, n_heads, dropout=0.2):
        super(LanguageModel, self).__init__()

        self.token_embedding_table = nn.Embedding(vocab_size, n_emb)
        self.position_embedding_table = nn.Embedding(block_size, n_emb)

        encoder_layer = nn.TransformerEncoderLayer(d_model=n_emb, nhead=n_heads, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)
        
        self.feed_forward = nn.Sequential(
            nn.Linear(n_emb, 4 * n_emb),
            nn.ReLU(),
            nn.Linear(4 * n_emb, n_emb)
        )
        
        self.lm_head = nn.Linear(n_emb, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        token_emb = self.token_embedding_table(idx)
        position_emb = self.position_embedding_table(torch.arange(T, device=device))
        
        x = token_emb + position_emb
        x_transform = x.clone()
        mask = generate_square_subsequent_mask(T).to(device)
        
        x_transform = self.transformer_encoder(x_transform.permute(1, 0, 2), mask=mask)
        x_transform = x_transform.permute(1, 0, 2)
        x = x + x_transform
        
        x = self.feed_forward(x)
        logits = self.lm_head(x)

        if targets is not None:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)
            return logits, loss
        else:
            return logits, None


    def generate(self, idx, max_new_tokens, block_size, temperature=1.0, stop_token=False):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]
            logits, _ = self.forward(idx_cond)
            
            # Scale logits by the temperature
            logits = logits[:, -1, :] / temperature
            
            probs = F.softmax(logits, dim=-1)
            idx_new = torch.multinomial(probs, num_samples=1)
            idx = torch.cat([idx, idx_new], dim=-1)
            if stop_token and idx_new.item() == tokenizer.sep_token_id:
                break
        return idx





# Create model, optimizer
model = LanguageModel(vocab_size=VOCAB_SIZE, block_size=BLOCK_SIZE, n_emb=N_EMB, n_layers=N_LAYERS, \
    n_heads=N_HEADS, dropout=DROPOUT).to(device)

print(f'Number of parameters {sum(p.numel() for p in model.parameters() if p.requires_grad)}')




28996
Number of parameters 13302788


In [181]:
def get_batch(data, block_size, batch_size):
    idx = torch.randint(0, len(data) - 2, (batch_size,))
    x = torch.zeros((batch_size, block_size), dtype=torch.long)
    y = torch.zeros((batch_size, block_size), dtype=torch.long)
    for j, i in enumerate(idx):
        if len(data[i]) < block_size + 2:
            data[i] = data[i] + (block_size + 2 - len(data[i])) * tokenizer.pad_token_id
        random_start = random.randint(0, len(data[i]) - block_size - 2)

        x[j] = torch.tensor(data[i][random_start:random_start + block_size], dtype=torch.long)
        y[j] = torch.tensor(data[i][random_start + 1:random_start + block_size + 1], dtype=torch.long)
    return x, y


a, b = get_batch(val_text_ids, block_size=BLOCK_SIZE, batch_size=1)
c, d = get_batch(train_text_ids, block_size=BLOCK_SIZE, batch_size=1)

print(a.shape, b.shape)

print(tokenizer.decode(a[0].tolist()).replace('##', ''))
print(tokenizer.decode(b[0].tolist()).replace('##', ''))

torch.Size([1, 50]) torch.Size([1, 50])
held the title from April 3, 2005, to January 10, 2006, for a total of 282 days. Triple H holds the record for longest combined reigns at 616 days. The shortest reigning champion was Randy Orton in his fourth reign
the title from April 3, 2005, to January 10, 2006, for a total of 282 days. Triple H holds the record for longest combined reigns at 616 days. The shortest reigning champion was Randy Orton in his fourth reign,


In [183]:
EARLY_STOP = 20
N_EPOCHS = 1000
BATCH_SIZE = 32
LEARNING_RATE = 3e-4
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

last_val_loss = 1e9

for steps in range(N_EPOCHS):
    model.train()
    xb, yb = get_batch(train_text_ids, block_size=BLOCK_SIZE, batch_size=BATCH_SIZE)
    xb = xb.to(device)
    yb = yb.to(device)
    logits, loss = model(xb, yb)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if steps % 100 == 0:
        print('Step:', steps, 'Training Loss:', loss.item())
        val_loss = estimate_loss(model, val_text_ids, block_size=BLOCK_SIZE, batch_size=BATCH_SIZE)
        print('Validation loss:', val_loss)
        if val_loss >= last_val_loss:
            early_stop -= 1
            if early_stop == 0:
                print('Early stop!')
                break
        else:
            early_stop = 20
            last_val_loss = val_loss

IndexError: index out of range in self

In [28]:
torch.save(model, 'good_wiki_transformer_2.pth')

In [139]:
starting_tokens = 'The wife'

encoded_start = tokenizer.encode(starting_tokens)
encoded_start.pop(-1)
len_starting_tokens = len(encoded_start)

idx = torch.tensor(encoded_start).reshape(1, len_starting_tokens).to(device)
model.eval()
generation = model.generate(idx, max_new_tokens=100, block_size=block_size, temperature=0.9, stop_token=True)[0].tolist()
model.train()
print(tokenizer.decode(generation))


[CLS] The wife Controllergat Historia Housing Gregory Tad employed Imageitating [unused97]aws treatments Yachttary hugged token nightclubenstein Walkover Voyager Good 157 Entertainment eighty Moor Ronald Directionrestle runnerilitatingOX ensure supplied Titus fame DCreenfari destructivemation AngeloȚ busy saloonensis Cecil trails Clinton combustion”headander Hiltonlis ეvić Institut online noting Alonepeculativeچ hardcover Ninja ά Fear formal highlygarttures Atkinson flashlight Frankunda BAFTA discussion bulk harbor operative exercise contents referendum mean Mick lifts Sharkwana attackers discouraged slashlie mild Scream listed rushedllyScript Citizens ₉ tens
