In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
import random
import pickle
from tqdm import tqdm
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

# Prefer the GPU when CUDA is available; otherwise everything runs on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Sequence length in tokens: sizes the model's position-embedding table and
# is the window length passed to get_batch.
block_size = 50

In [5]:
# Dump every comment body into one newline-separated text file; the
# tokenizer-training cell uses that file as its corpus.
data = pd.read_csv("datasets/text/reddit_comments.csv")
body = data.loc[:, "body"].tolist()
text = "\n".join(body)

with open("datasets/text/reddit_comments.txt", mode="w", encoding='utf-8') as f:
    # end="" keeps the file contents exactly equal to `text`.
    print(text, end="", file=f)


In [2]:
# Train a byte-level BPE tokenizer on the dumped comment corpus.
special_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]

tokenizer = ByteLevelBPETokenizer()
tokenizer.train(
    files=["datasets/text/reddit_comments.txt"],
    vocab_size=15000,
    min_frequency=2,
    special_tokens=special_tokens,
)

# Persist the trained tokenizer next to the dataset (the reload cell reads
# vocab.json and merges.txt from this directory).
tokenizer.save_model("datasets/text/")

# Round-trip a sample sentence as a quick sanity check.
output = tokenizer.encode("I love programming.")
print("Encoded string: ", output.ids)  # token ids of the sample sentence
print("Decoded string: ", tokenizer.decode(output.ids))  # should reproduce the input text

Encoded string:  [45, 981, 5166, 2260, 18]
Decoded string:  I love programming.


In [3]:
# Tokenize every comment and terminate each one with the "</s>" id.
data = pd.read_csv("datasets/text/reddit_comments.csv")
text = data["body"].tolist()

# Look the terminator id up once instead of re-encoding "</s>" for every
# comment; token_to_id is the lookup the rest of the notebook uses.
eos_id = tokenizer.token_to_id("</s>")
if eos_id is None:
    raise ValueError('tokenizer has no "</s>" token; train or load it with the special tokens first')

text_ids = []

for t in tqdm(text):
    text_ids.append(tokenizer.encode(t).ids + [eos_id])


100%|██████████| 1000000/1000000 [01:36<00:00, 10334.85it/s]


In [4]:

# Sequential 90/10 split: the first 90% of comments train the model, the
# remaining 10% are held out for validation.
N = len(text_ids)
train_size = int(0.9 * N)
test_size = N - train_size
train_text_ids, val_text_ids = text_ids[:train_size], text_ids[train_size:]

for label, size in (("Train size: ", train_size), ("Test size: ", test_size)):
    print(label, size)

Train size:  900000
Test size:  100000


In [5]:
# Cache the tokenized corpus on disk; the reload cell reads this file back
# instead of re-tokenizing.
with open("datasets/text/reddit_text_ids.pkl", mode="wb") as pkl_file:
    pickle.Pickler(pkl_file).dump(text_ids)

In [7]:

# Restore the tokenizer from its saved vocabulary/merges files and reload the
# cached token ids, so training and tokenization need not be re-run.
vocab_file_path = "datasets/text/vocab.json"
merges_file_path = "datasets/text/merges.txt"
tokenizer = ByteLevelBPETokenizer(vocab_file_path, merges_file_path)

# Use a context manager so the file handle is closed deterministically.
# NOTE: pickle can execute arbitrary code while loading; only load files that
# this notebook wrote itself.
with open("datasets/text/reddit_text_ids.pkl", "rb") as f:
    text_ids = pickle.load(f)

# Same sequential 90/10 train/validation split as when the ids were created.
N = len(text_ids)
train_size = int(0.9 * N)
test_size = N - train_size
train_text_ids = text_ids[:train_size]
val_text_ids = text_ids[train_size:]

In [8]:
# Vocabulary size as reported by the tokenizer; passed to LanguageModel below,
# where it sizes the token embedding and the output projection.
vocab_size = tokenizer.get_vocab_size()

def estimate_loss(model, val_data, block_size, batch_size, eval_iters=1):
    """Estimate the model's loss on randomly sampled validation batches.

    Args:
        model: Module called as ``model(x, y)`` and returning ``(logits, loss)``.
        val_data: Token-id sequences handed to ``get_batch``.
        block_size: Window length (in tokens) of every sampled example.
        batch_size: Number of examples per sampled batch.
        eval_iters: Number of batches to average over. The default of 1
            matches the original single-batch estimate; larger values give a
            less noisy number.

    Returns:
        The mean loss over the sampled batches, as a Python float.

    Raises:
        ValueError: If ``eval_iters`` is smaller than 1.
    """
    if eval_iters < 1:
        raise ValueError("eval_iters must be at least 1")

    # Remember the current mode so evaluation leaves the model as it found it.
    was_training = model.training
    model.eval()
    try:
        total = 0.0
        with torch.no_grad():
            for _ in range(eval_iters):
                x, y = get_batch(val_data, block_size, batch_size)
                x, y = x.to(device), y.to(device)
                _, loss = model(x, y)
                total += loss.item()
    finally:
        # Restore the mode even if sampling or the forward pass raised.
        model.train(was_training)
    return total / eval_iters

def generate_square_subsequent_mask(sz):
    """Return the ``(sz, sz)`` additive causal attention mask.

    Entries on and below the main diagonal are ``0.0`` and entries above it
    are ``-inf``, so that when the mask is added to attention scores a
    position cannot attend to later positions. The result is a float32
    tensor on the default (CPU) device.
    """
    blocked = torch.full((sz, sz), float('-inf'), dtype=torch.float32)
    # Keep -inf strictly above the diagonal; triu zeroes everything else.
    return torch.triu(blocked, diagonal=1)
    
class LanguageModel(nn.Module):
    """Token-level language model built on a causally masked ``nn.TransformerEncoder``.

    Token and learned position embeddings are summed, passed through the
    encoder stack under a causal mask, added back to the embeddings
    (residual), then sent through a feed-forward block and a linear head that
    produces one logit per vocabulary entry.

    Args:
        vocab_size: Number of token ids (embedding rows and output logits).
        n_emb: Embedding / model width.
        n_layers: Number of encoder layers.
        n_heads: Number of attention heads per layer.
        dropout: Dropout probability inside the encoder layers.
        max_len: Longest sequence the position table supports. ``None`` (the
            default) falls back to the notebook-level ``block_size``, which is
            the original behaviour.
    """

    def __init__(self, vocab_size, n_emb, n_layers, n_heads, dropout=0.2, max_len=None):
        super().__init__()

        if max_len is None:
            # Backward-compatible default: size the table from the global.
            max_len = block_size

        self.token_embedding_table = nn.Embedding(vocab_size, n_emb)
        self.position_embedding_table = nn.Embedding(max_len, n_emb)

        encoder_layer = nn.TransformerEncoderLayer(d_model=n_emb, nhead=n_heads, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)

        self.feed_forward = nn.Sequential(
            nn.Linear(n_emb, 4 * n_emb),
            nn.ReLU(),
            nn.Linear(4 * n_emb, n_emb)
        )

        self.lm_head = nn.Linear(n_emb, vocab_size)

    @staticmethod
    def _causal_mask(size, device):
        """Additive ``(size, size)`` mask: 0 on/below the diagonal, -inf above it."""
        # Built directly on the target device to avoid a host-to-device copy
        # on every forward pass.
        blocked = torch.full((size, size), float('-inf'), device=device)
        return torch.triu(blocked, diagonal=1)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: Integer tensor of token ids with shape ``(B, T)``; ``T`` must
                not exceed the size of the position-embedding table.
            targets: Optional integer tensor of shape ``(B, T)`` holding the
                expected next token for every position.

        Returns:
            ``(logits, loss)``. Without targets, ``logits`` has shape
            ``(B, T, vocab_size)`` and ``loss`` is ``None``. With targets,
            ``logits`` is flattened to ``(B * T, vocab_size)`` and ``loss`` is
            the mean cross-entropy over all positions.
        """
        B, T = idx.shape

        token_emb = self.token_embedding_table(idx)
        # Follow the input's device rather than a module-level global so the
        # model works wherever its inputs and parameters live.
        positions = torch.arange(T, device=idx.device)
        position_emb = self.position_embedding_table(positions)

        x = token_emb + position_emb
        mask = self._causal_mask(T, idx.device)

        # The encoder layers use the default (seq, batch, emb) layout, so
        # swap the first two axes on the way in and back on the way out.
        x_transform = self.transformer_encoder(x.permute(1, 0, 2), mask=mask)
        x = x + x_transform.permute(1, 0, 2)

        x = self.feed_forward(x)
        logits = self.lm_head(x)

        if targets is None:
            return logits, None

        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        targets = targets.view(B * T)
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, block_size, temperature=1.0, stop_token=False):
        """Sample up to ``max_new_tokens`` tokens autoregressively.

        Runs without gradient tracking, since sampling never backpropagates.

        Args:
            idx: Integer tensor ``(B, T)`` holding the prompt token ids.
            max_new_tokens: Upper bound on the number of sampled tokens.
            block_size: Number of most recent tokens fed to the model at each
                step (the context window).
            temperature: Divides the logits before the softmax; values below
                1 sharpen the distribution, values above 1 flatten it.
            stop_token: When true, stop as soon as every sequence in the
                batch has produced ``'</s>'`` (looked up in the notebook-level
                ``tokenizer``). Rows that finish early keep receiving tokens
                until the last row finishes.

        Returns:
            The prompt with the sampled tokens appended, shape
            ``(B, T + n)`` with ``n <= max_new_tokens``.
        """
        # Resolve the stop id once, and only when it is actually needed.
        stop_id = tokenizer.token_to_id('</s>') if stop_token else None
        finished = torch.zeros(idx.shape[0], dtype=torch.bool, device=idx.device)

        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]
            logits, _ = self(idx_cond)

            # Only the last position predicts the next token; scale it by
            # the temperature before sampling.
            logits = logits[:, -1, :] / temperature

            probs = F.softmax(logits, dim=-1)
            idx_new = torch.multinomial(probs, num_samples=1)
            idx = torch.cat([idx, idx_new], dim=-1)

            if stop_id is not None:
                # Track completion per row so batches larger than one work.
                finished |= idx_new.squeeze(-1) == stop_id
                if bool(finished.all()):
                    break
        return idx

# Hyperparameters
# vocab_size is not set here: it was read from the tokenizer earlier in this cell.
n_emb = 300      # embedding / model width
n_layers = 4     # number of encoder layers
n_heads = 4      # attention heads per encoder layer
dropout = 0.1    # dropout probability inside the encoder layers

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Create the model (the optimizer is created in the training cell).
model = LanguageModel(vocab_size, n_emb, n_layers, n_heads, dropout).to(device)
n_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'Number of parameters {n_trainable_params}')




Number of parameters 16125692


In [26]:
def get_batch(data, block_size, batch_size, pad_id=None):
    """Sample a batch of (input, target) windows for next-token training.

    Each example picks one sequence uniformly at random from ``data`` and
    then a random window of ``block_size + 1`` consecutive tokens from it.
    The first ``block_size`` tokens are the input and the same window shifted
    right by one token is the target, so a sequence's final token can appear
    as a target. Sequences shorter than the window are right-padded on a
    copy; ``data`` itself is never modified.

    Args:
        data: Non-empty list of token-id lists.
        block_size: Number of tokens in every input/target row.
        batch_size: Number of rows to sample.
        pad_id: Id used for right-padding. ``None`` (the default) looks up
            ``"<pad>"`` in the notebook-level ``tokenizer``, and only when a
            sequence actually needs padding.

    Returns:
        ``(x, y)``: two ``torch.long`` tensors of shape
        ``(batch_size, block_size)`` with ``y[:, t]`` being the token that
        follows ``x[:, t]``.

    Raises:
        ValueError: If ``data`` is empty.
    """
    if len(data) == 0:
        raise ValueError("get_batch needs at least one sequence to sample from")

    # block_size inputs plus the one extra token needed for the shifted target.
    window = block_size + 1

    x = torch.zeros((batch_size, block_size), dtype=torch.long)
    y = torch.zeros((batch_size, block_size), dtype=torch.long)

    # The upper bound is exclusive, so every sequence is eligible.
    picks = torch.randint(0, len(data), (batch_size,)).tolist()
    for row, i in enumerate(picks):
        seq = data[i]
        if len(seq) < window:
            if pad_id is None:
                pad_id = tokenizer.token_to_id("<pad>")
            # Concatenation builds a new list, leaving the dataset untouched.
            seq = seq + [pad_id] * (window - len(seq))

        # randint is inclusive on both ends; the last valid start puts the
        # final token of the sequence in the last target position.
        start = random.randint(0, len(seq) - window)
        chunk = torch.tensor(seq[start:start + window], dtype=torch.long)
        x[row] = chunk[:-1]
        y[row] = chunk[1:]
    return x, y

# Smoke-test get_batch on both splits, then decode one validation
# input/target pair to eyeball the one-token shift between them.
a, b = get_batch(val_text_ids, block_size, 1)
c, d = get_batch(train_text_ids, block_size, 1)
for inputs, targets in ((a, b), (c, d)):
    print(inputs.shape, targets.shape)

for sample in (a, b):
    print(tokenizer.decode(sample[0].tolist()))

torch.Size([1, 50]) torch.Size([1, 50])
torch.Size([1, 50]) torch.Size([1, 50])
 me seriously rip u. If i ever meet u irl, i will bet a 100 bucks on that you either are in a abusive psychological terror relationship or a mental wreck. But gl in the future, and dont let the pussy eat your mind,
 seriously rip u. If i ever meet u irl, i will bet a 100 bucks on that you either are in a abusive psychological terror relationship or a mental wreck. But gl in the future, and dont let the pussy eat your mind, you


In [28]:
# Show the id the tokenizer assigns to "</s>", the token appended to the end
# of every comment and used as the stop condition in generate().
print(tokenizer.token_to_id('</s>'))

2


In [29]:
# Optimisation settings. Despite its name, n_epochs counts optimisation
# steps: each iteration trains on one freshly sampled batch.
n_epochs = 15000
learning_rate = 3e-4
batch_size = 32
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Early stopping: validate every `eval_interval` steps and stop once
# `patience` consecutive validations fail to beat the best loss seen so far.
eval_interval = 100
patience = 100
early_stop = patience   # validations left before giving up
last_val_loss = 1e9     # best validation loss so far

for steps in range(n_epochs):
    model.train()
    xb, yb = get_batch(train_text_ids, block_size, batch_size)
    xb, yb = xb.to(device), yb.to(device)
    logits, loss = model(xb, yb)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if steps % eval_interval != 0:
        continue

    print('Step:', steps, 'Training Loss:', loss.item())
    val_loss = estimate_loss(model, val_text_ids, block_size, batch_size)
    print('Validation loss:', val_loss)

    if val_loss >= last_val_loss:
        # No improvement: use up one unit of patience.
        early_stop -= 1
        if early_stop == 0:
            print('Early stop!')
            break
    else:
        # New best: reset the patience counter.
        early_stop = patience
        last_val_loss = val_loss

Step: 0 Training Loss: 9.68064022064209
Validation loss: 8.08499526977539
Step: 100 Training Loss: 3.664595603942871
Validation loss: 3.7453010082244873
Step: 200 Training Loss: 2.981350898742676
Validation loss: 2.8595376014709473
Step: 300 Training Loss: 3.253497838973999
Validation loss: 3.011801242828369
Step: 400 Training Loss: 3.3305392265319824
Validation loss: 2.9725754261016846
Step: 500 Training Loss: 2.5168373584747314
Validation loss: 3.45869779586792
Step: 600 Training Loss: 3.5765695571899414
Validation loss: 3.0200717449188232
Step: 700 Training Loss: 3.1930768489837646
Validation loss: 3.3541150093078613
Step: 800 Training Loss: 2.3372642993927
Validation loss: 2.8022594451904297
Step: 900 Training Loss: 3.661689043045044
Validation loss: 3.5577988624572754
Step: 1000 Training Loss: 2.6247994899749756
Validation loss: 2.8534739017486572
Step: 1100 Training Loss: 2.9267828464508057
Validation loss: 3.2338998317718506
Step: 1200 Training Loss: 2.6099724769592285
Validatio

KeyboardInterrupt: 

In [220]:
# Persist the whole model object (not just its state_dict).
# NOTE(review): a file saved this way presumably needs the LanguageModel class
# to be defined/importable when it is loaded again — confirm that is intended.
torch.save(model, 'reddit_comments.pth')

In [41]:
# Sample several comments that all start from the same prompt.
starting_tokens = 'Politics'
n_comments = 20

encoded_start = tokenizer.encode(starting_tokens).ids
print(encoded_start)

# Build the (1, prompt_len) prompt directly on the target device, with an
# explicit integer dtype as required by the embedding lookup.
idx = torch.tensor([encoded_start], dtype=torch.long, device=device)

model.eval()
# Sampling never backpropagates, so skip autograd bookkeeping entirely.
with torch.no_grad():
    for _ in range(n_comments):
        generation = model.generate(
            idx,
            max_new_tokens=200,
            block_size=block_size,
            temperature=0.5,
            stop_token=True,
        )[0].tolist()
        print(tokenizer.decode(generation))
        print('-----------------------------------')




[14987]
Politics is the most likely to be a debut. I'm not your opinion. They’re not telling me. I think he should be a great player. 

I don’t think he’s a man, but I’m not just a great idea that I’ll have to be a shame to do what you’re doing.

&amp;#x200B;

&amp;#x200B;

&gt;I’m not a lot of this. 

&amp;#x200B;

I would probably be playing up and that’s what you’re doing. 

&amp;#x200B;

The whole world is not a big way.

Also, I’m not sure if you’re not a joke about it, but I’m not watching you. I think you’ll be interested.
-----------------------------------
Politics and the time is that you can't be able to get a problem with the person to do it.

&amp;#x200B;

They can't get a shit about it. She's not the case of the game. 

They are not a lot of people that have to start people, but I think the current person is still going to do.
-----------------------------------
Politics of the "cone" and the "decurious" or a problem" is the only one. It's a lot easier to be the best team