In [79]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
import random
import pickle
from tqdm import tqdm
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
from transformers import AutoTokenizer, BertTokenizer

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [80]:
# Pretrained BERT tokenizer; every later cell encodes/decodes text through it.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# The line labelled "EOS token" reports the tokenizer's separator token.
print(f"Padding token: {tokenizer.pad_token}")
print(f"EOS token: {tokenizer.sep_token}")

Padding token: [PAD]
EOS token: [SEP]


In [87]:
# Read the article corpus and turn every article into a list of token ids.
data = pd.read_csv("datasets/text/good.csv")
text = data["text"].tolist()
text_ids = [tokenizer.encode(article) for article in tqdm(text)]

# Sanity check: decode the last article back into text.
print(tokenizer.decode(text_ids[-1]))


  0%|          | 0/30279 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (823 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 30279/30279 [03:20<00:00, 150.73it/s]

[CLS] the zytglogge bernese german is a landmark medieval tower in bern, switzerland. built in the early 13th century, it has served the city as guard tower, prison, clock tower, centre of urban life and civic memorial. despite the many redecorations and renovations it has undergone in its 800 years of existence, the zytglogge is one of bern's most recognisable symbols and the oldest monument of the city, and with its 15th century astronomical clock, a major tourist attraction. it is a heritage site of national significance, and part of the old city of bern, a unesco world cultural heritage site. when it was built around 121820, the zytglogge served as the gate tower of bern's western fortifications. these were erected after the city's first westward expansion following its de facto independence from the empire. at that time, the zytglogge was a squat building of only 16 metres 52 ft in height. when the rapid growth of the city and the further expansion of the fortifications up to the 




In [88]:
# Cache the encoded corpus on disk so the encoding pass does not have to be repeated.
with open("datasets/text/good_ids.pkl", "wb") as ids_file:
    pickle.dump(text_ids, ids_file)

In [107]:
# Number of tokens in one training window (the model's context length).
BLOCK_SIZE = 75
# Size of the tokenizer vocabulary; used as the model's embedding/output size.
VOCAB_SIZE = tokenizer.vocab_size

In [134]:
# Reload the token ids cached by the pickle.dump cell so the encoding pass
# does not have to be rerun.
# NOTE: pickle.load can execute arbitrary code; only open caches you wrote yourself.
with open("datasets/text/good_ids.pkl", "rb") as f:
    text_ids = pickle.load(f)

VOCAB_SIZE = tokenizer.vocab_size

# Right-pad every short sequence up to BLOCK_SIZE + 2 tokens, i.e. the length
# the original check tested for. The previous code padded only up to
# BLOCK_SIZE, so sequences of length BLOCK_SIZE or BLOCK_SIZE + 1 received no
# padding at all and shorter ones stayed two tokens short of the threshold.
MIN_SEQ_LEN = BLOCK_SIZE + 2
text_ids = [
    ids + [tokenizer.pad_token_id] * max(0, MIN_SEQ_LEN - len(ids))
    for ids in text_ids
]

# Ordered 90/10 split: the first 90% of sequences train, the rest validate.
N = len(text_ids)
train_size = int(0.9 * N)
test_size = N - train_size
train_text_ids = text_ids[:train_size]
val_text_ids = text_ids[train_size:]

print("Train size: ", train_size)
print("Test size: ", test_size)
print("Vocab size: ", VOCAB_SIZE)




Train size:  27251
Test size:  3028
Vocab size:  30522


In [98]:
# Spot-check: decode the second training sequence back into readable text.
print(tokenizer.decode(train_text_ids[1]))

[CLS] oryzomys pliocaenicus is a fossil rodent from the hemphillian late miocene of kansas, central united states. it is known from a single mandible lower jaw with the back part missing. all three molars are present, but very worn. together, the molars are 3. 6 mm long. the fossil was discovered in 1935 and described in 1939 as a possible species of oryzomys in open nomenclature. later authors doubted this allocation and suggested that it may instead belong in bensonomys or jacobsomys, but the material may not allow a definite identification. the only known specimen of oryzomys pliocaenicus is a mandible lower jaw found in the spring of 1935 by david dunkle in edson quarry, sherman county, kansas. it is in the collections of the museum of comparative zoology at harvard university as specimen mcz 6202. edson quarry is in the ogallala formation and the hemphillian north american land mammal age. claude w. hibbard described the mandible as oryzomys pliocaenicus in a 1939 paper. hibbard w

In [109]:
# Model hyperparameters, passed to LanguageModel when the model is built below.
N_EMB = 300  # embedding width; also the encoder's d_model
N_LAYERS = 5  # number of stacked Transformer encoder layers
N_HEADS = 5  # attention heads per layer (nn.MultiheadAttention needs N_EMB % N_HEADS == 0)
DROPOUT = 0.2  # dropout probability inside the encoder layers

print(tokenizer.vocab_size)

def estimate_loss(model, val_data, block_size, batch_size, eval_iters=1):
    """Estimate the validation loss on randomly sampled batches.

    Args:
        model: module whose call ``model(x, y)`` returns ``(logits, loss)``.
        val_data: list of token-id sequences handed to ``get_batch``.
        block_size: window length requested from ``get_batch``.
        batch_size: number of windows per batch.
        eval_iters: number of random batches to average over. The default of 1
            reproduces the original single-batch estimate; larger values give
            a less noisy number for early stopping.

    Returns:
        float: mean loss over the sampled batches.

    Raises:
        ValueError: if ``eval_iters`` is smaller than 1.
    """
    if eval_iters < 1:
        raise ValueError("eval_iters must be at least 1")

    # Remember the caller's mode so evaluation does not silently flip a model
    # that was deliberately left in eval mode back to training mode.
    was_training = model.training
    model.eval()
    total_loss = 0.0
    try:
        with torch.no_grad():
            for _ in range(eval_iters):
                x, y = get_batch(val_data, block_size, batch_size)
                x, y = x.to(device), y.to(device)
                _, loss = model(x, y)
                total_loss += loss.item()
    finally:
        # Restore the mode even if sampling or the forward pass raised.
        model.train(was_training)
    return total_loss / eval_iters

def generate_square_subsequent_mask(sz):
    """Return the additive causal attention mask of shape ``(sz, sz)``.

    Entry ``[i, j]`` is ``0.0`` when ``j <= i`` and ``-inf`` when ``j > i``,
    so once added to attention scores a position cannot attend to later ones.
    """
    blocked = torch.full((sz, sz), float('-inf'))
    # triu keeps the -inf entries strictly above the diagonal and zero-fills the rest.
    return torch.triu(blocked, diagonal=1)
    
class LanguageModel(nn.Module):
    """Causally masked Transformer language model over token ids.

    Token and learned position embeddings are summed, passed through a stack
    of ``nn.TransformerEncoderLayer`` blocks under a causal mask, added back
    to the embeddings (residual), sent through a feed-forward block and
    finally projected to vocabulary logits.

    Args:
        vocab_size: number of distinct token ids.
        n_emb: embedding width, also used as the encoder's ``d_model``.
        block_size: maximum sequence length (size of the position table).
        n_layers: number of encoder layers.
        n_heads: attention heads per encoder layer.
        dropout: dropout probability inside the encoder layers.
    """

    def __init__(self, vocab_size, n_emb, block_size, n_layers, n_heads, dropout=0.2):
        super().__init__()

        self.token_embedding_table = nn.Embedding(vocab_size, n_emb)
        self.position_embedding_table = nn.Embedding(block_size, n_emb)

        # batch_first keeps its default (False), so forward() feeds the encoder (T, B, C).
        encoder_layer = nn.TransformerEncoderLayer(d_model=n_emb, nhead=n_heads, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)

        self.feed_forward = nn.Sequential(
            nn.Linear(n_emb, 4 * n_emb),
            nn.ReLU(),
            nn.Linear(4 * n_emb, n_emb)
        )

        self.lm_head = nn.Linear(n_emb, vocab_size)

    @staticmethod
    def _causal_mask(size, device):
        """Additive (size, size) mask: 0.0 on/below the diagonal, -inf above it."""
        return torch.triu(torch.full((size, size), float('-inf'), device=device), diagonal=1)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: integer tensor of token ids with shape ``(B, T)``.
            targets: optional integer tensor with ``B * T`` elements holding
                the expected next token for every position of ``idx``.

        Returns:
            ``(logits, None)`` with logits of shape ``(B, T, vocab)`` when
            ``targets`` is None, otherwise ``(logits, loss)`` where logits are
            flattened to ``(B * T, vocab)`` and loss is the mean cross entropy.

        Raises:
            ValueError: if ``T`` exceeds the position table size.
        """
        B, T = idx.shape
        max_len = self.position_embedding_table.num_embeddings
        if T > max_len:
            raise ValueError(f"sequence length {T} exceeds block size {max_len}")

        # Build helper tensors on the input's device rather than on a
        # notebook-level global, so the model works wherever it is placed.
        positions = torch.arange(T, device=idx.device)
        x = self.token_embedding_table(idx) + self.position_embedding_table(positions)

        mask = self._causal_mask(T, idx.device)
        # The encoder expects (T, B, C); permute in and back out.
        x_transform = self.transformer_encoder(x.permute(1, 0, 2), mask=mask).permute(1, 0, 2)
        x = x + x_transform

        x = self.feed_forward(x)
        logits = self.lm_head(x)

        if targets is None:
            return logits, None

        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        # No ignore_index is passed, so every position, including any padding
        # tokens present in targets, contributes to the loss.
        loss = F.cross_entropy(logits, targets.reshape(B * T))
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, block_size, temperature=1.0, stop_token=False,
                 stop_token_id=None):
        """Sample continuations of ``idx`` one token at a time.

        The module's train/eval mode is not changed here; call ``eval()``
        first if dropout should be disabled.

        Args:
            idx: integer tensor of shape ``(B, T)`` with the prompt token ids.
            max_new_tokens: upper bound on the number of sampled tokens.
            block_size: only the last ``block_size`` tokens are fed to the model.
            temperature: positive divisor applied to the logits before softmax.
            stop_token: when truthy, stop once every row has sampled the stop id.
            stop_token_id: id that ends generation when ``stop_token`` is
                truthy; defaults to the notebook tokenizer's ``sep_token_id``.

        Returns:
            Tensor of shape ``(B, T + n)`` with ``n <= max_new_tokens``. With
            ``B > 1`` rows that stopped early keep receiving tokens until all
            rows have produced the stop id.

        Raises:
            ValueError: if ``temperature`` is not positive.
        """
        if temperature <= 0:
            raise ValueError("temperature must be positive")

        stop_id = None
        if stop_token:
            stop_id = tokenizer.sep_token_id if stop_token_id is None else stop_token_id

        finished = torch.zeros(idx.shape[0], dtype=torch.bool, device=idx.device)
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]
            logits, _ = self(idx_cond)

            # Scale the last position's logits by the temperature
            logits = logits[:, -1, :] / temperature

            probs = F.softmax(logits, dim=-1)
            idx_new = torch.multinomial(probs, num_samples=1)
            idx = torch.cat([idx, idx_new], dim=-1)

            if stop_id is not None:
                # Per-row bookkeeping instead of .item(), which fails for B > 1.
                finished |= idx_new.squeeze(-1) == stop_id
                if bool(finished.all()):
                    break
        return idx

# Build the network from the notebook-level hyperparameters and move it to `device`.
model = LanguageModel(
    vocab_size=VOCAB_SIZE,
    block_size=BLOCK_SIZE,
    n_emb=N_EMB,
    n_layers=N_LAYERS,
    n_heads=N_HEADS,
    dropout=DROPOUT,
)
model = model.to(device)

# Count only the parameters that will receive gradient updates.
n_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'Number of parameters {n_trainable}')


30522
Number of parameters 27055462


In [144]:
# Load the r/wallstreetbets posts; only the "title" column is used below.
wsb_df = pd.read_csv("datasets/text/r_wallstreetbets_posts.csv")
# Non in-place dropna keeps the cell idempotent when it is rerun.
# NOTE(review): with no `subset`, this drops every row that has a missing value
# in ANY column, although only "title" is read. Confirm this is intended and
# consider dropna(subset=["title"]) to keep more titles.
wsb_df = wsb_df.dropna()
VOCAB_SIZE = tokenizer.vocab_size

# One list of token ids per post title.
text_ids = [tokenizer.encode(title) for title in wsb_df["title"]]

# Right-pad every short sequence up to BLOCK_SIZE + 2 tokens, i.e. the length
# the original check tested for. The previous code padded only up to
# BLOCK_SIZE, so sequences of length BLOCK_SIZE or BLOCK_SIZE + 1 received no
# padding at all and shorter ones stayed two tokens short of the threshold.
MIN_SEQ_LEN = BLOCK_SIZE + 2
text_ids = [
    ids + [tokenizer.pad_token_id] * max(0, MIN_SEQ_LEN - len(ids))
    for ids in text_ids
]

# Ordered 90/10 split: the first 90% of sequences train, the rest validate.
N = len(text_ids)
train_size = int(0.9 * N)
test_size = N - train_size
train_text_ids = text_ids[:train_size]
val_text_ids = text_ids[train_size:]

print("Train size: ", train_size)
print("Test size: ", test_size)
print("Vocab size: ", VOCAB_SIZE)


print(tokenizer.decode(text_ids[0]))

  wsb_df = pd.read_csv("datasets/text/r_wallstreetbets_posts.csv")


Train size:  13056
Test size:  1451
Vocab size:  30522
[CLS] need explanations on level 2 data for gme, why isn ’ t the price higher if asks are only 4000 $ + [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]


In [150]:
def get_batch(data, block_size, batch_size, pad_token_id=None):
    """Sample a random batch of (input, next-token target) windows.

    For each row a sequence is drawn uniformly from ``data`` and a window of
    ``block_size + 1`` consecutive tokens is cut at a random offset; the first
    ``block_size`` tokens become the input and the last ``block_size`` tokens
    (the same window shifted by one) become the target.

    Args:
        data: non-empty list of token-id lists. It is never modified.
        block_size: number of tokens per input/target row.
        batch_size: number of rows to sample.
        pad_token_id: id used to right-pad sequences shorter than
            ``block_size + 1``. Defaults to the notebook tokenizer's
            ``pad_token_id`` and is looked up only when padding is needed.

    Returns:
        Tuple ``(x, y)`` of ``torch.long`` tensors, each of shape
        ``(batch_size, block_size)``, with ``y[:, :-1] == x[:, 1:]``.

    Raises:
        ValueError: if ``data`` is empty.
    """
    if len(data) == 0:
        raise ValueError("data must contain at least one sequence")

    # block_size inputs plus one extra token for the shifted targets.
    window = block_size + 1
    x = torch.zeros((batch_size, block_size), dtype=torch.long)
    y = torch.zeros((batch_size, block_size), dtype=torch.long)

    # randint's upper bound is exclusive, so len(data) makes every sequence
    # reachable (the old `len(data) - 2` skipped the last two and failed on
    # tiny datasets). tolist() yields plain ints for list indexing.
    seq_indices = torch.randint(0, len(data), (batch_size,)).tolist()
    for row, seq_idx in enumerate(seq_indices):
        seq = data[seq_idx]
        if len(seq) < window:
            if pad_token_id is None:
                pad_token_id = tokenizer.pad_token_id
            # Pad a copy so the caller's dataset is left untouched.
            seq = list(seq) + [pad_token_id] * (window - len(seq))

        # random.randint is inclusive on both ends; the last valid start is
        # len(seq) - window, which lets the final token appear as a target.
        start = random.randint(0, len(seq) - window)
        chunk = torch.tensor(seq[start:start + window], dtype=torch.long)
        x[row] = chunk[:-1]
        y[row] = chunk[1:]
    return x, y


# Smoke-test get_batch on both splits; c and d are drawn but not inspected.
a, b = get_batch(val_text_ids, block_size=BLOCK_SIZE, batch_size=1)
c, d = get_batch(train_text_ids, block_size=BLOCK_SIZE, batch_size=1)

print(a.shape, b.shape)

# Decode the input row and its target row; the target should read as the
# input shifted by one token. '##' word-piece markers are stripped for display.
decoded_inputs = tokenizer.decode(a[0].tolist()).replace('##', '')
decoded_targets = tokenizer.decode(b[0].tolist()).replace('##', '')
print(decoded_inputs)
print(decoded_targets)

torch.Size([1, 75]) torch.Size([1, 75])
[CLS] 18 % of u. s. workers have lost jobs or hours since coronavirus hit, poll finds [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
18 % of u. s. workers have lost jobs or hours since coronavirus hit, poll finds [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]


In [6]:
# Restore the model object written by the torch.save cell further down.
# map_location=device lets a checkpoint saved on a GPU machine load on a
# CPU-only one (and vice versa) instead of failing during deserialisation.
# NOTE: the file holds a fully pickled nn.Module, so the LanguageModel class
# must already be defined in the kernel. PyTorch 2.6+ defaults torch.load to
# weights_only=True and will refuse such a file unless weights_only=False is
# passed, which should only be done for checkpoints you trust.
model = torch.load("good_wiki_transformer_2.pth", map_location=device)

In [151]:
# Training configuration.
EARLY_STOP = 50  # patience, counted in validation checks without improvement
N_EPOCHS = 1000  # number of optimisation steps; each step uses one random batch
BATCH_SIZE = 32
LEARNING_RATE = 3e-4
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Despite its name, last_val_loss holds the best validation loss seen so far:
# it is only overwritten when a check does not come out worse-or-equal.
last_val_loss = 1e9
early_stop = EARLY_STOP

# NOTE(review): validation runs every 100 steps, so N_EPOCHS = 1000 gives only
# 10 checks. A patience of 50 can never run out with these values; confirm
# the intended settings.
for steps in range(N_EPOCHS):
    model.train()
    xb, yb = get_batch(train_text_ids, block_size=BLOCK_SIZE, batch_size=BATCH_SIZE)
    xb, yb = xb.to(device), yb.to(device)
    logits, loss = model(xb, yb)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Everything below is the periodic report / early-stopping check.
    if steps % 100 != 0:
        continue

    print('Step:', steps, 'Training Loss:', loss.item())
    val_loss = estimate_loss(model, val_text_ids, block_size=BLOCK_SIZE, batch_size=BATCH_SIZE)
    print('Validation loss:', val_loss)

    # Written as a negated >= so a NaN loss takes the same branch as before.
    improved = not (val_loss >= last_val_loss)
    if improved:
        early_stop = EARLY_STOP
        last_val_loss = val_loss
        continue

    early_stop -= 1
    if early_stop == 0:
        print('Early stop!')
        break

Step: 0 Training Loss: 17.458295822143555
Validation loss: 11.722884178161621
Step: 100 Training Loss: 1.0263479948043823
Validation loss: 1.2497804164886475
Step: 200 Training Loss: 1.0243841409683228
Validation loss: 1.1921515464782715
Step: 300 Training Loss: 0.9292372465133667
Validation loss: 0.9884623289108276
Step: 400 Training Loss: 1.1566933393478394
Validation loss: 1.2487475872039795
Step: 500 Training Loss: 0.8876675367355347
Validation loss: 0.9503032565116882
Step: 600 Training Loss: 0.9562360644340515
Validation loss: 1.2220863103866577
Step: 700 Training Loss: 1.0917285680770874
Validation loss: 1.2872898578643799


KeyboardInterrupt: 

In [125]:
# Persist the whole model object (not just a state_dict) to disk; the
# torch.load cell above reads this same file back into `model`.
torch.save(model, 'good_wiki_transformer_2.pth')

In [159]:
# Prompt text; the empty string means generation starts from the leading
# special token alone.
starting_tokens = ''

# encode() wraps the prompt in [CLS] ... [SEP]; drop the trailing [SEP] so the
# model continues the sequence instead of seeing it already terminated.
encoded_start = tokenizer.encode(starting_tokens)[:-1]
len_starting_tokens = len(encoded_start)

# Explicit long dtype, created directly on the target device.
idx = torch.tensor(encoded_start, dtype=torch.long, device=device).reshape(1, len_starting_tokens)
model.eval()
N_SAMPLES = 100
# Sampling is inference only: disable autograd so no graph is built per token.
with torch.no_grad():
    for _ in range(N_SAMPLES):
        generation = model.generate(idx, max_new_tokens=500, block_size=BLOCK_SIZE, temperature=1, stop_token=True)[0].tolist()
        print(tokenizer.decode(generation))
        print('------------------')




[CLS] time the plot shows [SEP]
------------------
[CLS] i ’ m kinda sure this nothing....mner like guys? [SEP]
------------------
[CLS] congress hits 10 day in return from wisdom [SEP]
------------------
[CLS] 23k sell on!! [SEP]
------------------
[CLS] keep in 2021 that afternoon your outlook, just missed buying you? [SEP]
------------------
[CLS] if you love you guys, why's my brrroy [SEP]
------------------
[CLS] or smart he?????? [UNK] [SEP]
------------------
[CLS] do not get “?? [SEP]
------------------
[CLS] bb ) is a wsb adventure, growing [SEP]
------------------
[CLS] another german full retardedевич making sense over $ 1000. aapl powell [SEP]
------------------
[CLS] holding gme this kid and scrolling stocks [SEP]
------------------
[CLS] be interesting! [SEP]
------------------
[CLS] people hit losses on sounds [unused772] amc " trading of waiting for my anthem on our needs right of funds to = [SEP]
------------------
[CLS] loadedრ is near the moon [UNK] [SEP]
-----------