In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
import pandas as pd

In [2]:
torch.manual_seed(101)

def get_batch(data, block_size, batch_size):
    idx = torch.randint(0, len(data) - block_size, (batch_size,))
    x = torch.stack([train_data[i:i+block_size] for i in idx])
    y = torch.stack([train_data[i+1:i+block_size+1] for i in idx])
    return x, y

def estimate_loss(model, val_data, block_size, batch_size):
    model.eval()
    with torch.no_grad():
        x, y = get_batch(val_data, block_size, batch_size)
        x, y = x.to(device), y.to(device)
        _, loss = model(x, y)
    model.train()
    return loss.item()



In [2]:


import torch.nn as nn
import torch.nn.functional as F

class Head(nn.Module):
    """ one head self-attention """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_emb, head_size, bias=False)
        self.query = nn.Linear(n_emb, head_size, bias=False)
        self.value = nn.Linear(n_emb, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)
        q = self.query(x)
        # compute attention scores
        wei = q @ k.transpose(-2, -1) / (C**0.5)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        wei = F.dropout(wei, p=dropout)
        # perform score aggregation
        v = self.value(x)
        out = wei @ v
        return out

class MultiHeadAttention(nn.Module):

    def __init__(self, n_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(n_heads)])
        self.proj = nn.Linear(n_emb, n_emb)

    def forward(self, x):
        x = torch.cat([h(x) for h in self.heads], dim=-1)
        x = self.proj(x)
        x = F.dropout(x, p=dropout)
        return x

class FeedForward(nn.Module):

    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_emb, 4*n_emb),
            nn.ReLU(),
            nn.Linear(4*n_emb, n_emb),
        )

    def forward(self, x):
        return self.net(x)
    
class Block(nn.Module):
    """ Transformer Block followed by computation
    """
    def __init__(self, n_emb, n_heads):
        super().__init__()
        self.head_size = n_emb // n_heads
        self.sa = MultiHeadAttention(n_heads, self.head_size)
        self.ff = FeedForward()
        self.ln1 = nn.LayerNorm(n_emb)
        self.ln2 = nn.LayerNorm(n_emb)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ff(self.ln2(x))
        x = F.dropout(x, p=dropout)
        return x
        
class LanguageModel(nn.Module):

    def __init__(self, vocab_size, n_emb):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_emb)
        self.position_embedding_table = nn.Embedding(block_size, n_emb)
        self.blocks = nn.Sequential(*[Block(n_emb, n_heads) for _ in range(n_layers)])
        self.feed_forward = FeedForward()
        self.lm_head = nn.Linear(n_emb, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        token_emb = self.token_embedding_table(idx)
        position_emb = self.position_embedding_table(torch.arange(T, device=device)) 
        x = token_emb + position_emb
        x = self.blocks(x) 
        x = self.feed_forward(x)
        logits = self.lm_head(x)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]
            logits, loss = self.forward(idx_cond)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_new = torch.multinomial(probs, num_samples=1)
            idx = torch.cat([idx, idx_new], dim=-1)
        return idx


In [17]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#filename = 'datasets/text/1001nights.txt'

# folder_path = 'datasets/vietnamese/vietnamese/selected/'
# number_of_files = 500
# data = []

# counter = 0
# #file_names = os.listdir(folder_path)[:number_of_files]

# for file_name in file_names:
#     print(file_name)
#     with open(folder_path + file_name, 'r', encoding='utf-8') as file:
#         data.append(file.read())
#     counter += 1
#     if counter > number_of_files:
#         break



filename = 'datasets/text/r_wallstreetbets_posts.csv'
df = pd.read_csv(filename)
data = df['title']
data.dropna(inplace=True)
data = data.values.tolist()

print(data[:10])

special_token = b'\x03'

for idx in range(len(data)):
    data[idx] = data[idx] + special_token.decode('utf-8')

text = ' '.join(data)
text_set = set(text)
stoi = {ch: i for i, ch in enumerate(sorted(text_set))}
itos = {i: ch for i, ch in enumerate(sorted(text_set))}
vocab_size = len(stoi)
print('Sample dict:', {k: stoi[k] for k in list(stoi)[:50]})
print('Sample dict:', {k: itos[k] for k in list(itos)[:50]})

n_train = int(len(data) * 0.9)
train_data = data[:n_train]
val_data = data[n_train:]

# print(len(text))
# print(text[:1000])

# stoi = {ch: i for i, ch in enumerate(sorted(set(text)))}
# itos = {i: ch for i, ch in enumerate(sorted(set(text)))}
# vocab_size = len(stoi)

# print('Vocab size:', len(stoi))
# print('Sample dict:', {k: stoi[k] for k in list(stoi)[:50]})
# print('Sample dict:', {k: itos[k] for k in list(itos)[:50]})

# encode = lambda s: [stoi[ch] for ch in s]
# decode = lambda x: ''.join([itos[i] for i in x])

# data = torch.tensor(encode(text), dtype=torch.long).to(device)

# n = int(len(data) * 0.9)
# train_data = data[:n]
# val_data = data[n:]




  df = pd.read_csv(filename)


['Whats going on with PLTR?', 'Need explanations on Level 2 data for GME, why isn’t the price higher if asks are only 4000$+', 'XRT is being used as a laundry short machine', 'Airlines?', 'Buy TRXC 🚀', '$AMTX', 'Lost 99% of its value....this stock can only go up from here....', 'Bull run AMC ENTERTAINMENT Europe market', 'AMC 2 MILLION!!', 'Overview of clean Battery Graphite Miners for US and EU Electric Vehicle, some already up +1000% in the last 6 month']
Sample dict: {'\x03': 0, '\n': 1, '\x10': 2, '\x11': 3, '\x1b': 4, ' ': 5, '!': 6, '"': 7, '#': 8, '$': 9, '%': 10, '&': 11, "'": 12, '(': 13, ')': 14, '*': 15, '+': 16, ',': 17, '-': 18, '.': 19, '/': 20, '0': 21, '1': 22, '2': 23, '3': 24, '4': 25, '5': 26, '6': 27, '7': 28, '8': 29, '9': 30, ':': 31, ';': 32, '<': 33, '=': 34, '>': 35, '?': 36, '@': 37, 'A': 38, 'B': 39, 'C': 40, 'D': 41, 'E': 42, 'F': 43, 'G': 44, 'H': 45, 'I': 46, 'J': 47, 'K': 48, 'L': 49}
Sample dict: {0: '\x03', 1: '\n', 2: '\x10', 3: '\x11', 4: '\x1b', 5: '

In [20]:
batch_size = 32
n_emb = 300

n_layers = 10
n_heads = 6
dropout = 0.2
learning_rate = 3e-4
block_size = 300
m = LanguageModel(vocab_size=vocab_size, n_emb=n_emb).to(device)
optimizer = torch.optim.Adam(m.parameters(), lr=learning_rate)

In [21]:
early_stop = 10
last_val_loss = 1e9
n_epochs = 4000

for steps in range(n_epochs):
    xb, yb = get_batch(train_data, block_size, batch_size)
    xb = xb.to(device)
    logits, loss = m(xb, yb)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if steps % 100 == 0:
        print('Step:', steps, 'Training Loss:', loss.item())
        val_loss = estimate_loss(m, val_data, block_size, batch_size)
        print('Validation loss:', val_loss)
        if val_loss >= last_val_loss:
            early_stop -= 1
            if early_stop == 0:
                print('Early stop!')
                break
        else:
            early_stop = 5
            last_val_loss = val_loss


Step: 0 Training Loss: 5.0106987953186035
Validation loss: 4.1693878173828125
Step: 100 Training Loss: 2.298553943634033
Validation loss: 2.3292412757873535
Step: 200 Training Loss: 2.2397875785827637
Validation loss: 2.2539689540863037
Step: 300 Training Loss: 2.224942207336426
Validation loss: 2.2263102531433105
Step: 400 Training Loss: 2.197479248046875
Validation loss: 2.1885271072387695
Step: 500 Training Loss: 2.1425485610961914
Validation loss: 2.13698673248291
Step: 600 Training Loss: 2.081874370574951
Validation loss: 2.0737390518188477
Step: 700 Training Loss: 2.0377748012542725
Validation loss: 2.016596794128418
Step: 800 Training Loss: 1.9833909273147583
Validation loss: 1.991495966911316
Step: 900 Training Loss: 1.9465136528015137
Validation loss: 1.9333055019378662
Step: 1000 Training Loss: 1.8941233158111572
Validation loss: 1.9011832475662231
Step: 1100 Training Loss: 1.87446129322052
Validation loss: 1.8602803945541382
Step: 1200 Training Loss: 1.8456058502197266
Valid

In [22]:
print(sum(p.numel() for p in m.parameters() if p.requires_grad))
starting_tokens = 'Em'
len_starting_tokens = len(starting_tokens)
idx = torch.tensor(encode(starting_tokens)).reshape(1, len_starting_tokens).to(device)
print(decode(m.generate(idx, max_new_tokens=1000)[0].tolist()))

11716024
Em một có chờm hục. – Của thể phải gia tám mê phặc lôới trâm qua thoạc:- Rồi ngúi vám. Huệ ba mà anh  muốc có người gặt, tông xiệu. Cho anh nói miếp hai biêng thỉ mình tay mật không nghe nước đi tể đầm vào, muốn bà chịu lời biểu chu hết? - Càng do vậy? – Câu chuyện cáu  chiếu sẻ Hưa ngờ loàih, sẽ như tỏ vù đẩy nhân hấp  những đó ứng an ban đời đo, Hiếu uôi phem bộ dầu dện thút, xong có ra ắpxúc dài sớm lên và hiên cáo, “ phìu sớm bỏ,  hổi đángười từng hoàng có, mà ca nhưng đã lia duốc. Nhình em trong của, Huệa thoát cách. Phốc có tiêm, màu cào trai đều cái đủ mười thông ra  kiêng ta khí nhắc, giống chất bớm đủ tay sẽ còn về người nhuộ, chỗ thay khàng vừa sẻ để vậy  con ông thung trời cổ cổ không? Eu chăn nhiên nỗi gio ruồi cứ lòng. Cười, bộ mùng hen nhìn xây: - Nghe- Những bảo..hà!- Vậy trởi trang sợ quới thiệm phôm, nóng không  coi được  tỏ ngồi th