In [77]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
import random
import pickle
from tqdm import tqdm
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Context length in tokens: the width of every training window and the size of
# the model's position-embedding table.
block_size = 50

In [5]:
# Flatten the "body" column of the CSV into a single newline-separated text
# file, which is the input the tokenizer trainer reads from disk.
comments_csv_path = "datasets/text/reddit_comments.csv"
corpus_txt_path = "datasets/text/reddit_comments.txt"

data = pd.read_csv(comments_csv_path)
body = data["body"].tolist()
text = "\n".join(body)

with open(corpus_txt_path, "w", encoding='utf-8') as corpus_file:
    corpus_file.write(text)


In [6]:
# Train a byte-level BPE tokenizer on the dumped corpus and write its
# vocabulary/merges files next to the data.
special_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]

tokenizer = ByteLevelBPETokenizer()
tokenizer.train(
    files=["datasets/text/reddit_comments.txt"],
    vocab_size=5000,
    min_frequency=2,
    special_tokens=special_tokens,
)
tokenizer.save_model("datasets/text/")

# Round-trip a sample sentence as a quick sanity check.
output = tokenizer.encode("I love programming.")
print("Encoded string: ", output.ids)
print("Decoded string: ", tokenizer.decode(output.ids))

Encoded string:  [45, 981, 4365, 338, 2260, 18]
Decoded string:  I love programming.


In [68]:
# Tokenise every comment and terminate each one with the end-of-sequence id.
data = pd.read_csv("datasets/text/reddit_comments.csv")
text = data["body"].tolist()

# Look the terminator up once, straight from the vocabulary. Re-encoding the
# literal string "</s>" for every comment repeats the same work a million times
# and depends on how the tokenizer splits that text.
eos_id = tokenizer.token_to_id("</s>")

text_ids = [tokenizer.encode(t).ids + [eos_id] for t in tqdm(text)]


100%|██████████| 1000000/1000000 [01:39<00:00, 10024.63it/s]


In [69]:

# Sequential 90/10 split: the first 90% of comments train, the rest validate.
N = len(text_ids)
train_size = int(0.9 * N)
test_size = N - train_size
train_text_ids, val_text_ids = text_ids[:train_size], text_ids[train_size:]

for label, size in (("Train size: ", train_size), ("Test size: ", test_size)):
    print(label, size)

Train size:  900000
Test size:  100000


In [127]:
# Cache the tokenised comments so later sessions can skip re-encoding.
text_ids_path = "datasets/text/reddit_text_ids.pkl"
with open(text_ids_path, "wb") as pkl_file:
    pickle.dump(text_ids, pkl_file)

In [7]:

# Restore the tokenizer and the cached token ids from disk, then rebuild the
# same sequential 90/10 train/validation split.
vocab_file_path = "datasets/text/vocab.json"
merges_file_path = "datasets/text/merges.txt"
tokenizer = ByteLevelBPETokenizer(vocab_file_path, merges_file_path)

# NOTE: unpickling can execute arbitrary code -- only load files this notebook
# wrote itself. The context manager closes the handle, which the previous
# `pickle.load(open(...))` form never did.
with open("datasets/text/reddit_text_ids.pkl", "rb") as f:
    text_ids = pickle.load(f)

N = len(text_ids)
train_size = int(0.9 * N)
test_size = N - train_size
train_text_ids = text_ids[:train_size]
val_text_ids = text_ids[train_size:]

In [157]:
# Vocabulary size as reported by the tokenizer; it is passed to LanguageModel
# below, where it sizes the token-embedding table and the output head.
vocab_size = tokenizer.get_vocab_size()

def estimate_loss(model, val_data, block_size, batch_size, eval_iters=1):
    """Estimate the validation loss from randomly sampled batches.

    Relies on the notebook-level ``get_batch`` function and ``device``.

    Args:
        model: Module whose call ``model(x, y)`` returns ``(logits, loss)``.
        val_data: Dataset handed to ``get_batch``.
        block_size: Window length handed to ``get_batch``.
        batch_size: Number of windows per sampled batch.
        eval_iters: Number of batches to average over. The default of 1
            keeps the original single-batch behaviour; larger values give a
            less noisy estimate.

    Returns:
        float: Mean loss over the sampled batches.

    Raises:
        ValueError: If ``eval_iters`` is smaller than 1.
    """
    if eval_iters < 1:
        raise ValueError("eval_iters must be at least 1")

    # Remember the caller's mode so evaluation has no lasting side effect,
    # instead of unconditionally forcing the model into training mode.
    was_training = model.training
    model.eval()
    total = 0.0
    try:
        with torch.no_grad():
            for _ in range(eval_iters):
                x, y = get_batch(val_data, block_size, batch_size)
                x, y = x.to(device), y.to(device)
                _, loss = model(x, y)
                total += loss.item()
    finally:
        # Restore the mode even if sampling or the forward pass raised.
        model.train(was_training)
    return total / eval_iters

def generate_square_subsequent_mask(sz):
    """Build an additive causal attention mask of shape ``(sz, sz)``.

    Entry ``[i, j]`` is ``0.0`` where ``j <= i`` (position ``i`` may attend to
    ``j``) and ``-inf`` where ``j > i`` (future positions are blocked).

    Args:
        sz: Sequence length.

    Returns:
        torch.Tensor: Float tensor of shape ``(sz, sz)``.
    """
    # Start fully blocked, then keep -inf only strictly above the diagonal;
    # triu zeroes everything on and below it.
    blocked = torch.full((sz, sz), float('-inf'))
    return torch.triu(blocked, diagonal=1)
    
class LanguageModel(nn.Module):
    """Next-token language model built on a causally masked ``nn.TransformerEncoder``.

    Token and learned position embeddings are summed, passed through the
    masked encoder stack, added back to the embeddings, then sent through a
    two-layer feed-forward block and a linear head producing vocabulary logits.

    Relies on the notebook-level ``generate_square_subsequent_mask`` helper,
    on ``block_size`` (only when ``max_positions`` is not given) and on
    ``tokenizer`` (only when ``generate`` is called with ``stop_token`` set).
    """

    def __init__(self, vocab_size, n_emb, n_layers, n_heads, dropout=0.2, max_positions=None):
        """
        Args:
            vocab_size: Number of token ids; sizes the embedding and the head.
            n_emb: Embedding / model width.
            n_layers: Number of encoder layers.
            n_heads: Attention heads per encoder layer.
            dropout: Dropout probability inside the encoder layers.
            max_positions: Longest sequence the position table supports.
                Defaults to the notebook-level ``block_size`` so existing call
                sites keep working unchanged.
        """
        super().__init__()

        if max_positions is None:
            max_positions = block_size

        self.token_embedding_table = nn.Embedding(vocab_size, n_emb)
        self.position_embedding_table = nn.Embedding(max_positions, n_emb)

        encoder_layer = nn.TransformerEncoderLayer(d_model=n_emb, nhead=n_heads, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)

        self.feed_forward = nn.Sequential(
            nn.Linear(n_emb, 4 * n_emb),
            nn.ReLU(),
            nn.Linear(4 * n_emb, n_emb)
        )

        self.lm_head = nn.Linear(n_emb, vocab_size)

    def forward(self, idx, targets=None):
        """Compute logits and, when ``targets`` is given, the cross-entropy loss.

        Args:
            idx: Long tensor of token ids, shape ``(B, T)``.
            targets: Optional long tensor of next-token ids, shape ``(B, T)``.

        Returns:
            ``(logits, None)`` with logits of shape ``(B, T, vocab)`` when
            ``targets`` is None, otherwise ``(logits, loss)`` with logits
            flattened to ``(B * T, vocab)`` and a scalar loss.
        """
        B, T = idx.shape
        # Follow the input's device rather than a notebook global, so the
        # module works wherever its inputs and parameters live.
        positions = torch.arange(T, device=idx.device)

        x = self.token_embedding_table(idx) + self.position_embedding_table(positions)
        mask = generate_square_subsequent_mask(T).to(idx.device)

        # The encoder is built with the default batch_first=False, so it takes
        # (T, B, C); permute in and back out.
        x_transform = self.transformer_encoder(x.permute(1, 0, 2), mask=mask)
        x = x + x_transform.permute(1, 0, 2)

        x = self.feed_forward(x)
        logits = self.lm_head(x)

        if targets is None:
            return logits, None

        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        loss = F.cross_entropy(logits, targets.reshape(B * T))
        return logits, loss

    @torch.no_grad()  # sampling never needs gradients; avoids building a graph per step
    def generate(self, idx, max_new_tokens, block_size, temperature=1.0, stop_token=False):
        """Autoregressively sample continuations of ``idx``.

        Args:
            idx: Long tensor of prompt token ids, shape ``(B, T)``.
            max_new_tokens: Maximum number of tokens to append.
            block_size: Number of trailing tokens fed to the model each step.
            temperature: Divisor applied to the logits before the softmax.
            stop_token: When truthy, stop once every row has sampled the
                ``</s>`` token (id looked up in the notebook-level
                ``tokenizer``). Rows that finish early keep receiving sampled
                tokens until the last row finishes.

        Returns:
            Long tensor of shape ``(B, T + n)`` with ``n <= max_new_tokens``.
        """
        # Resolve the stop id once instead of on every sampling step.
        stop_id = tokenizer.token_to_id('</s>') if stop_token else None
        finished = torch.zeros(idx.shape[0], dtype=torch.bool, device=idx.device)

        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]
            logits, _ = self(idx_cond)

            # Scale the last position's logits by the temperature
            logits = logits[:, -1, :] / temperature

            probs = F.softmax(logits, dim=-1)
            idx_new = torch.multinomial(probs, num_samples=1)
            idx = torch.cat([idx, idx_new], dim=-1)

            if stop_id is not None:
                # Per-row tracking works for any batch size; the previous
                # `.item()` call only worked for a single sequence.
                finished |= idx_new.squeeze(-1) == stop_id
                if bool(finished.all()):
                    break
        return idx

# Hyperparameters
# (vocab_size comes from the tokenizer above; the former `vocab_size = vocab_size`
# line was a no-op and has been dropped.)
n_emb = 500
n_layers = 4
n_heads = 4
dropout = 0.1

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Create the model on the selected device and report its trainable size
model = LanguageModel(vocab_size, n_emb, n_layers, n_heads, dropout).to(device)
print(f'Number of parameters {sum(p.numel() for p in model.parameters() if p.requires_grad)}')




Number of parameters 19250692


In [149]:
def get_batch(data, block_size, batch_size, pad_id=None):
    """Sample a random batch of next-token prediction pairs.

    Each row picks one sequence uniformly from ``data`` and a random window of
    ``block_size + 1`` tokens inside it; ``x`` is the first ``block_size``
    tokens of the window and ``y`` is the same window shifted by one.
    Sequences shorter than the window are right-padded on a copy, so ``data``
    itself is never modified.

    Args:
        data: List of token-id lists.
        block_size: Number of tokens per input row.
        batch_size: Number of rows to sample.
        pad_id: Id used for right-padding. When None it is looked up as
            ``"<pad>"`` in the notebook-level ``tokenizer``, and only if
            padding is actually needed.

    Returns:
        tuple[torch.Tensor, torch.Tensor]: ``(x, y)`` long tensors, each of
        shape ``(batch_size, block_size)``.

    Raises:
        ValueError: If ``data`` is empty.
    """
    if len(data) == 0:
        raise ValueError("data must contain at least one sequence")

    window = block_size + 1  # the inputs plus the one extra token needed for the shifted targets
    x = torch.zeros((batch_size, block_size), dtype=torch.long)
    y = torch.zeros((batch_size, block_size), dtype=torch.long)

    # randint's upper bound is exclusive, so len(data) covers every sequence
    # (the previous `len(data) - 2` could never pick the last two).
    rows = torch.randint(0, len(data), (batch_size,)).tolist()
    for j, i in enumerate(rows):
        seq = data[i]
        if len(seq) < window:
            if pad_id is None:
                pad_id = tokenizer.token_to_id("<pad>")
            # Concatenation builds a new list; the caller's dataset stays intact.
            seq = seq + [pad_id] * (window - len(seq))

        # random.randint is inclusive on both ends; the last valid start lets
        # the final token of the sequence appear as a target.
        start = random.randint(0, len(seq) - window)
        chunk = torch.tensor(seq[start:start + window], dtype=torch.long)
        x[j] = chunk[:-1]
        y[j] = chunk[1:]
    return x, y

# Smoke-test the sampler on both splits. This cell uses its own batch size:
# the training `batch_size` is only assigned in a later cell, so relying on it
# here breaks Restart & Run All.
demo_batch_size = 32

a, b = get_batch(val_text_ids, block_size, demo_batch_size)
c, d = get_batch(train_text_ids, block_size, demo_batch_size)
print(a.shape, b.shape)
print(c.shape, d.shape)

# Decode the first validation row: inputs, then the targets shifted by one.
print(tokenizer.decode(a[0].tolist()))
print(tokenizer.decode(b[0].tolist()))

torch.Size([32, 50]) torch.Size([32, 50])
torch.Size([32, 50]) torch.Size([32, 50])
I misread that as "only one of my wife" and got really confused.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
 misread that as "only one of my wife" and got really confused.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>


In [75]:
# Show the vocabulary id assigned to the `</s>` token.
print(tokenizer.token_to_id('</s>'))

2


In [203]:
# --- Training configuration ---
n_epochs = 15000       # NOTE: counts optimisation steps (one random batch each), not passes over the data
learning_rate = 3e-4
batch_size = 32
early_stop = 20        # evaluations without improvement left before stopping
last_val_loss = 1e9    # best validation loss seen so far
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for steps in range(n_epochs):
    model.train()
    xb, yb = get_batch(train_text_ids, block_size, batch_size)
    xb = xb.to(device)
    yb = yb.to(device)
    logits, loss = model(xb, yb)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Evaluate and log every 100 steps only.
    if steps % 100 != 0:
        continue

    print('Step:', steps, 'Training Loss:', loss.item())
    val_loss = estimate_loss(model, val_text_ids, block_size, batch_size)
    print('Validation loss:', val_loss)

    # Only a strictly lower loss counts as an improvement. Written this way a
    # NaN validation loss uses up patience; the former `val_loss >= last_val_loss`
    # test was False for NaN, which stored NaN as the best loss and disabled
    # early stopping for the rest of the run.
    if val_loss < last_val_loss:
        early_stop = 20
        last_val_loss = val_loss
        continue

    early_stop -= 1
    if early_stop == 0:
        print('Early stop!')
        break

Step: 0 Training Loss: 3.024886131286621
Validation loss: 2.9100911617279053
Step: 100 Training Loss: 2.4201560020446777
Validation loss: 2.700127601623535
Step: 200 Training Loss: 2.4169325828552246
Validation loss: 2.68891978263855
Step: 300 Training Loss: 2.4036340713500977
Validation loss: 2.3761820793151855
Step: 400 Training Loss: 2.2797653675079346
Validation loss: 2.272076368331909
Step: 500 Training Loss: 2.8898205757141113
Validation loss: 2.2783169746398926
Step: 600 Training Loss: 2.5931007862091064
Validation loss: 2.6256775856018066
Step: 700 Training Loss: 2.6782519817352295
Validation loss: 2.6166839599609375
Step: 800 Training Loss: 2.364764451980591
Validation loss: 2.3968307971954346
Step: 900 Training Loss: 2.8317925930023193
Validation loss: 2.4520225524902344
Step: 1000 Training Loss: 2.5702385902404785
Validation loss: 2.295271873474121
Step: 1100 Training Loss: 2.4665656089782715
Validation loss: 2.7514758110046387
Step: 1200 Training Loss: 2.779698371887207
Val

In [206]:
# Saves the whole module object (not just its state_dict) to disk.
# NOTE(review): loading a fully pickled module presumably requires the
# LanguageModel class to be defined/importable at load time -- confirm before
# moving this checkpoint to another project.
torch.save(model, 'reddit_comments.pth')

In [209]:
# Sample `n_comments` continuations of a fixed prompt from the trained model.
starting_tokens = 'technology'
n_comments = 20

encoded_start = tokenizer.encode(starting_tokens).ids
print(encoded_start)
len_starting_tokens = len(encoded_start)

# Shape (1, prompt_len): a batch holding the single prompt.
idx = torch.tensor(encoded_start).reshape(1, len_starting_tokens).to(device)
model.eval()

# Sampling needs no gradients; disabling autograd avoids tracking a graph for
# each of the up-to-200 forward passes per comment.
with torch.no_grad():
    for _ in range(n_comments):
        generation = model.generate(idx, max_new_tokens=200, block_size=block_size, temperature=1, stop_token=True)[0].tolist()
        print(tokenizer.decode(generation))
        print('-----------------------------------')




[643, 374, 82, 3407]
technology, but I know it's actually not a years ago, which you are glorious enough to suck someone your own children. Why would you make it a bit to break today their own community?

Personal-ww I've been selling my ancase once you staring around hardens.. Com�ie_pension. It out of a few weeks, but don't seem to feel like Harden. I'm so glad we can" here punched it? Actually. I'd say shitty is unbrawnable while assuming gay wouldn't have been to a lot more effective unexpected. Not that he just wasn't really happy to whal. Don/don't trade on the wall, he's just so desperately badly he is misarrassed.

Is this dead argument with her ass owned abro-RIession


If you hold her in that rocks to her
-----------------------------------
technology are allasions power insurance out, but since I just think that he was going to be the easters of the homach. What's she rey and NK. Harte was running about 2 seasons later in the entire days. We's been 16 because it might fit in