In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
import random
import pickle
import math
from tqdm import tqdm
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
from transformers import AutoTokenizer, BertTokenizer
from sklearn.model_selection import train_test_split
import gc
from tokenizers import ByteLevelBPETokenizer, processors
import sqlite3

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Show the selected device, then describe every CUDA device PyTorch can see.
print(device)

if torch.cuda.is_available():
    # Print the number of CUDA devices
    print(f"Number of CUDA Devices: {torch.cuda.device_count()}\n")

    # One short report per device: name, compute capability and memory in MB.
    for i in range(torch.cuda.device_count()):
        print(f"Device {i}:")
        print(f"    Name: {torch.cuda.get_device_name(i)}")
        print(f"    Computational Capability: {torch.cuda.get_device_capability(i)}")
        print(f"    Total Memory: {torch.cuda.get_device_properties(i).total_memory / (1024**2):.2f} MB\n")
else:
    print("CUDA is not available on your system.")



cuda
Number of CUDA Devices: 1

Device 0:
    Name: NVIDIA GeForce RTX 3090
    Computational Capability: (8, 6)
    Total Memory: 24575.50 MB



In [2]:
# Directory (relative to the working directory) that holds the SQLite dump;
# model checkpoints are also written to and read from this directory.
data_path = "datasets/reddit_comments/may2015/reddit-comments-may-2015"

In [9]:


def fetch_first_n_rows(filename, table_name, num_rows=4000000):
    """Read the ``body`` column of the first ``num_rows`` rows of a SQLite table.

    Args:
        filename: Path of the SQLite database file.
        table_name: Table to read from. SQL identifiers cannot be bound as
            parameters, so the name is interpolated into the statement; it
            must therefore be a plain identifier, optionally schema-qualified
            (``schema.table``).
        num_rows: Maximum number of rows to fetch. Converted with ``int()``
            and bound to ``LIMIT`` as a query parameter.

    Returns:
        A ``pandas.DataFrame`` whose single column is named from the cursor
        description (``body``).

    Raises:
        ValueError: If ``table_name`` is not a plain identifier.
        sqlite3.Error: Propagated unchanged from SQLite.
    """
    # Reject anything that is not a bare (or dotted) identifier before it is
    # spliced into the SQL text.
    if not all(part.isidentifier() for part in str(table_name).split(".")):
        raise ValueError(f"Invalid table name: {table_name!r}")

    conn = sqlite3.connect(filename)
    try:
        cur = conn.cursor()
        try:
            # Fetch the first 'num_rows' rows; the limit is bound, not formatted.
            cur.execute(f"SELECT body FROM {table_name} LIMIT ?", (int(num_rows),))
            data = cur.fetchall()

            # Column names come from the cursor description.
            col_names = [desc[0] for desc in cur.description]
        finally:
            cur.close()
    finally:
        # Always release the connection, even when the query fails.
        conn.close()

    # Convert the list of row tuples into a DataFrame.
    return pd.DataFrame(data, columns=col_names)


# Load the comment bodies from the SQLite dump and preview the first rows.
filename = os.path.join(data_path, 'database.sqlite')
df = fetch_first_n_rows(
    filename=filename,
    table_name='May2015',
    num_rows=1_500_000_000,  # same value as int(1.5 * 10**9)
)
print(df.head(10))


                                                body
0                    くそ\n読みたいが買ったら負けな気がする\n図書館に出ねーかな
1  gg this one's over. off to watch the NFL draft...
2  Are you really implying we return to those tim...
3  No one has a European accent either  because i...
4  That the kid "..reminds me of Kevin."   so sad...
5  Haha, i was getting nauseous from it, if that ...
6  After reading this, I wholeheartedly believe y...
7     Let's do this. See you guys on the other side.
8  You can buy a mystery sampler from small batch...
9  Nihilum and LG are significantly better off in...


In [11]:
# Topic vocabulary, stored as a set of lowercase strings.
keywords = {
    term.lower()
    for term in set([
        "Economics", "Finance", "Investment", "Capital", "Assets", "Debt", "Inflation", "Recession", "Interest",
        "Stocks", "Bond", "Portfolio", "Dividend", "Market", "GDP", "Taxation", "Budget", "Risk", "Liquidity",
    ])
}

# Print the lowercase words, one per line (set iteration order is not fixed).
for word in keywords:
    print(word)




finance
budget
stocks
market
liquidity
recession
bond
assets
portfolio
gdp
capital
taxation
economics
debt
interest
risk
investment
dividend
inflation


In [10]:

# Drop rows with missing values in place on the loaded frame; `data` and
# `train_data` are further names for that same object, not copies.
df.dropna(inplace=True)
data = df
train_data = data
# The train/validation split below is disabled, so every row is used for training.
#N = df.shape[0]
#train_size = int(N * 0.95)
# train_data = data[:train_size]
# val_data = data[train_size:]

In [13]:
from transformers import BertTokenizer, AutoModel, AutoTokenizer

#tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base-v2")
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

# Ids of the special tokens, read from the tokenizer in the order they are printed.
pad_id = tokenizer.pad_token_id
unk_id = tokenizer.unk_token_id
cls_id = tokenizer.cls_token_id
sep_id = tokenizer.sep_token_id

print("PAD Token ID:", pad_id)
print("UNK Token ID:", unk_id)
print("CLS Token ID:", cls_id)
print("SEP Token ID:", sep_id)

PAD Token ID: 0
UNK Token ID: 100
CLS Token ID: 101
SEP Token ID: 102


In [14]:
# MIN_LENGTH is handed to the batch generator as `min_length`;
# BLOCK_SIZE is the window length used by both the batch generator and the model.
MIN_LENGTH = 100
BLOCK_SIZE = 200
# Size of the embedding table / output layer, taken from the tokenizer.
VOCAB_SIZE = tokenizer.vocab_size
print('Vocab size is ', VOCAB_SIZE)


Vocab size is  28996


In [17]:
import re

def contains_url(text):
    """Return True when ``text`` contains an ``http://`` or ``https://`` URL.

    The scheme must be lowercase and be followed by at least one character
    accepted by the pattern; matching is done with ``re.search``, so the URL
    may appear anywhere in ``text``.
    """
    found = re.search(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
        text,
    )
    return found is not None

def get_batch_generator(data, block_size, min_length, batch_size):
    """Yield an endless stream of ``(x, y)`` next-token training batches.

    Rows of ``data`` are visited in order (wrapping around at the end, with a
    "Resetting counter..." message). A row is used only if its ``body``:

    * does not contain a URL (``contains_url``),
    * contains at least one entry of the module-level ``keywords`` set as a
      case-insensitive *substring*,
    * encodes (module-level ``tokenizer``) to at least ``min_length`` ids.

    Ids equal to the module-level ``unk_id`` are removed, the sequence is
    right-padded with ``pad_id`` up to ``block_size + 2`` ids if needed, and a
    random window is cut from it: ``x`` holds ``block_size`` ids and ``y``
    holds the same window shifted one position to the right.

    NOTE(review): keyword matching is substring based, so e.g. "bond" also
    matches "vagabond" — confirm this is intended.

    Args:
        data: Object supporting ``len()`` and ``.iloc[i]['body']`` (a
            DataFrame with a ``body`` column in this notebook).
        block_size: Number of ids per row of ``x`` and ``y``.
        min_length: Minimum encoded length (before UNK removal) of a comment.
        batch_size: Number of rows per batch.

    Yields:
        Tuple ``(x, y)`` of ``torch.long`` tensors of shape
        ``(batch_size, block_size)``.

    Raises:
        ValueError: If a complete pass over ``data`` produced no usable row
            (including empty ``data``); the original code looped forever.
    """
    data_len = len(data)
    idx = 0  # Position of the next row to inspect
    accepted_this_pass = 0  # Rows that passed every filter since the last wrap-around

    while True:  # Infinite loop to keep generating batches
        x = torch.zeros((batch_size, block_size), dtype=torch.long)
        y = torch.zeros((batch_size, block_size), dtype=torch.long)
        i = 0

        while i < batch_size:
            if idx >= data_len:
                # A whole pass without a single accepted row can never fill a
                # batch; fail loudly instead of spinning forever.
                if accepted_this_pass == 0:
                    raise ValueError(
                        "No row in `data` passes the URL/keyword/min_length filters; "
                        "cannot build a batch."
                    )
                idx = 0  # Reset the counter if you've gone through all data
                accepted_this_pass = 0
                print("Resetting counter...")

            body = data.iloc[idx]['body']
            idx += 1  # Increment the counter

            if contains_url(body):
                continue

            # Lowercase once per comment rather than once per keyword.
            lowered_body = body.lower()
            if not any(word in lowered_body for word in keywords):
                continue

            summary_ids = tokenizer.encode(body)
            if len(summary_ids) < min_length:
                continue
            summary_ids = [token_id for token_id in summary_ids if token_id != unk_id]

            if len(summary_ids) < block_size + 2:
                summary_ids = summary_ids + [pad_id] * (block_size + 2 - len(summary_ids))

            # The sequence now has at least block_size + 2 ids, so the window
            # below always contains exactly block_size + 1 ids.
            random_start = random.randint(0, len(summary_ids) - block_size - 2)
            window = summary_ids[random_start:random_start + block_size + 1]
            x[i] = torch.tensor(window[:-1], dtype=torch.long)
            y[i] = torch.tensor(window[1:], dtype=torch.long)
            i += 1
            accepted_this_pass += 1

        yield x, y

# Single shared batch stream; every `next(batch)` in the notebook advances it.
batch = get_batch_generator(
    data=train_data,
    block_size=BLOCK_SIZE,
    min_length=MIN_LENGTH,
    batch_size=32,
)


In [16]:
# Draw one batch and show its shapes plus the decoded first input/target rows.
x, y = next(batch)
print(x.shape, y.shape)
print(
    *(tokenizer.decode(rows[0].tolist(), skip_special_tokens=False) for rows in (x, y)),
    sep="\n",
)

Token indices sequence length is longer than the specified maximum sequence length for this model (616 > 512). Running this sequence through the model will result in indexing errors


torch.Size([32, 200]) torch.Size([32, 200])
them they'd open their mouths wide as a snakes and reveal their teeth and tongue ( looked like the mouth of a freaking deep sea angler fish ), horrifying. The last time I dreamed about Sweetums though was the most interesting... The dream started the same way it always did. The impending music, Sweetums lumbering into the light of the culdersac, but this time I didn't hide. I didn't even leave my room. In fact I just sat there, waiting, watching the door for Sweetums. And as the music got to it's usual loud state with his proximity he stopped in the door way and just stared at me. Me at him and him back at me. For the first time ever he spoke, in my Dad's voice... " Well done. I'm proud of you. " Then he walked away. I've never seen him since... 7 years of recurring nightmares at
they'd open their mouths wide as a snakes and reveal their teeth and tongue ( looked like the mouth of a freaking deep sea angler fish ), horrifying. The last time I

In [None]:
# Drop the model and the last batch if they exist. A plain `del model, xb, yb`
# raises NameError when this cell runs before those names are defined
# (e.g. on a fresh kernel), so remove them tolerantly instead.
for _stale_name in ("model", "xb", "yb"):
    globals().pop(_stale_name, None)
del _stale_name

# Collect garbage first so objects that are only kept alive by reference
# cycles are freed before the CUDA cache is emptied.
gc.collect()  # call Python garbage collector
torch.cuda.empty_cache()  # clear unused memory in PyTorch

In [18]:
# Model hyper-parameters, passed to LanguageModel when the model is created.
N_LAYERS = 6    # number of transformer encoder layers
N_HEADS = 5     # attention heads per layer
N_EMB = 1200    # embedding width
DROPOUT = 0.1   # dropout rate inside the encoder layers

def generate_square_subsequent_mask(sz):
    """Build the additive causal mask for a sequence of length ``sz``.

    Returns a float ``(sz, sz)`` tensor holding ``0.0`` on and below the main
    diagonal and ``-inf`` strictly above it.
    """
    all_blocked = torch.full((sz, sz), float('-inf'))
    # Keeping only the strict upper triangle zeroes out the diagonal and below.
    return torch.triu(all_blocked, diagonal=1)

def get_sine_position_encodings(length, dim):
    """Return sinusoidal position encodings as a ``(length, dim)`` float tensor.

    Even columns hold ``sin(pos * w_k)`` and odd columns ``cos(pos * w_k)``
    with ``w_k = exp(-2k * ln(10000) / dim)``.

    Args:
        length: Number of positions (rows).
        dim: Encoding width (columns); must be positive. Odd widths are
            supported: the last sine column then has no cosine partner.

    Returns:
        ``torch.float32`` tensor of shape ``(length, dim)``.
    """
    pos = torch.arange(length, dtype=torch.float32).reshape(-1, 1)
    div_term = torch.exp(torch.arange(0, dim, 2).float() * -(math.log(10000.0) / dim))
    angles = pos * div_term  # one column per sine slot: ceil(dim / 2) columns

    pos_encodings = torch.zeros(length, dim)
    pos_encodings[:, 0::2] = torch.sin(angles)
    # There are only dim // 2 cosine slots; for odd `dim` that is one fewer
    # than the sine slots, which previously raised a shape-mismatch error.
    pos_encodings[:, 1::2] = torch.cos(angles[:, : dim // 2])
    return pos_encodings

class LanguageModel(nn.Module):
    """Causally masked Transformer language model.

    Token and learned position embeddings are summed, passed through a stack
    of ``nn.TransformerEncoderLayer`` blocks under a causal mask, added back
    to the embeddings, and projected to vocabulary logits through a
    feed-forward block and a linear head.

    Args:
        vocab_size: Number of token ids (embedding rows / output logits).
        n_emb: Embedding width.
        block_size: Maximum sequence length the position table supports.
        n_layers: Number of encoder layers.
        n_heads: Attention heads per layer.
        dropout: Dropout rate used inside the encoder layers.
    """

    def __init__(self, vocab_size, n_emb, block_size, n_layers, n_heads, dropout=0.2):
        super(LanguageModel, self).__init__()

        self.token_embedding_table = nn.Embedding(vocab_size, n_emb)
        self.position_embedding_table = nn.Embedding(block_size, n_emb)
        self.block_size = block_size

        # The encoder layer is created without `batch_first`, which is why
        # `forward` permutes to (T, B, C) before calling the encoder.
        encoder_layer = nn.TransformerEncoderLayer(d_model=n_emb, nhead=n_heads, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)

        self.feed_forward = nn.Sequential(
            nn.Linear(n_emb, 4 * n_emb),
            nn.ReLU(),
            nn.Linear(4 * n_emb, n_emb)
        )

        self.lm_head = nn.Linear(n_emb, vocab_size)

    def forward(self, idx, targets=None):
        """Compute logits and, when ``targets`` is given, the cross-entropy loss.

        Args:
            idx: Integer tensor of token ids, shape ``(B, T)``.
            targets: Optional integer tensor of shape ``(B, T)``.

        Returns:
            ``(logits, loss)``. Without targets, ``logits`` has shape
            ``(B, T, vocab_size)`` and ``loss`` is ``None``. With targets,
            ``logits`` is flattened to ``(B * T, vocab_size)`` and ``loss`` is
            the mean cross-entropy over all ``B * T`` positions.
        """
        B, T = idx.shape

        # Build helper tensors on the input's device so the module does not
        # depend on a module-level `device` variable.
        positions = torch.arange(T, device=idx.device)
        x = self.token_embedding_table(idx) + self.position_embedding_table(positions)

        # Additive causal mask: 0 on/below the diagonal, -inf above it.
        mask = torch.triu(torch.full((T, T), float('-inf'), device=idx.device), diagonal=1)

        x_transform = self.transformer_encoder(x.permute(1, 0, 2), mask=mask)
        # Residual connection around the whole encoder stack.
        x = x + x_transform.permute(1, 0, 2)

        x = self.feed_forward(x)
        logits = self.lm_head(x)

        if targets is None:
            return logits, None

        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        # reshape (not view) so non-contiguous target tensors are accepted too.
        loss = F.cross_entropy(logits, targets.reshape(B * T))
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, stop_token=False):
        """Sample up to ``max_new_tokens`` tokens after the prompt ``idx``.

        Runs without gradient tracking. Only the last ``block_size`` tokens
        are fed to the model at each step.

        Args:
            idx: Integer tensor of prompt ids, shape ``(B, T)``.
            max_new_tokens: Maximum number of tokens to append.
            temperature: Divisor applied to the last-position logits before
                the softmax.
            stop_token: When true, stop as soon as every sequence in the batch
                has sampled the module-level ``sep_id`` at least once. For a
                single sequence this stops at its first ``sep_id``.

        Returns:
            Integer tensor of shape ``(B, T + n)`` with ``n <= max_new_tokens``.
        """
        finished = torch.zeros(idx.shape[0], dtype=torch.bool, device=idx.device)

        for _step in range(max_new_tokens):
            idx_cond = idx[:, -self.block_size:]
            logits, _unused_loss = self.forward(idx_cond)

            # Scale logits by the temperature
            logits = logits[:, -1, :] / temperature

            probs = F.softmax(logits, dim=-1)
            idx_new = torch.multinomial(probs, num_samples=1)
            idx = torch.cat([idx, idx_new], dim=-1)

            if stop_token:
                # Track completion per sequence; `.item()` on the sampled
                # tensor only worked for a batch of one.
                finished |= idx_new.squeeze(-1) == sep_id
                if bool(finished.all()):
                    break
        return idx

# Create model, optimizer
# Build the network from the notebook-level hyper-parameters and move it to `device`.
model = LanguageModel(
    vocab_size=VOCAB_SIZE,
    n_emb=N_EMB,
    block_size=BLOCK_SIZE,
    n_layers=N_LAYERS,
    n_heads=N_HEADS,
    dropout=DROPOUT,
).to(device)

# Count only the parameters that will receive gradients.
print(f'Number of parameters {sum(torch.numel(p) for p in model.parameters() if p.requires_grad)}')



Number of parameters 145513684


In [20]:
# Replace the freshly built model with a previously saved one.
model_path = os.path.join(data_path, "reddit_comment.pth")
print(model_path)
# SECURITY: the checkpoint is a pickled model object (it is written with
# torch.save(model, ...)); unpickling can execute arbitrary code, so only load
# files you created yourself.
# map_location lets a checkpoint saved on a GPU load on a CPU-only machine.
model = torch.load(model_path, map_location=device)

datasets/reddit_comments/may2015/reddit-comments-may-2015\reddit_comment.pth


In [19]:
def estimate_loss(model, val_data, block_size, batch_size):
    """Return the loss of ``model`` on one batch, without tracking gradients.

    Previously the ``val_data``, ``block_size`` and ``batch_size`` arguments
    were ignored and the batch was pulled from the module-level training
    generator ``batch``. Now the batch is drawn from ``val_data``.

    Args:
        model: Module called as ``model(x, y)`` and returning ``(logits, loss)``.
        val_data: Data to evaluate on, in the form ``get_batch_generator``
            accepts (presumably a DataFrame with a ``body`` column — confirm
            against the caller). ``None`` keeps the old behaviour of drawing
            from the shared ``batch`` generator.
        block_size: Window length of the evaluation batch.
        batch_size: Number of sequences in the evaluation batch.

    Returns:
        The loss as a Python float.

    Note:
        A fresh generator is built on every call when ``val_data`` is given,
        so each call starts scanning ``val_data`` from its first row.
        The model's train/eval mode is restored to what it was on entry.
    """
    if val_data is None:
        batches = batch
    else:
        batches = get_batch_generator(
            val_data, block_size=block_size, min_length=MIN_LENGTH, batch_size=batch_size
        )

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            x, y = next(batches)
            # Follow the model's own device rather than a module-level variable.
            first_param = next(model.parameters(), None)
            if first_param is not None:
                x, y = x.to(first_param.device), y.to(first_param.device)
            _, loss = model(x, y)
    finally:
        model.train(was_training)
    return loss.item()

# Training configuration. N_EPOCHS counts optimisation steps (one batch each).
# EARLY_STOP, BATCH_SIZE, last_val_loss and early_stop are not read by the loop
# below; the batch size actually used is the one the `batch` generator was built with.
EARLY_STOP = 50
N_EPOCHS = 40000
BATCH_SIZE = 32
SAVE_FREQUENCY = 500
LEARNING_RATE = 3e-4
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

last_val_loss = 1e9
early_stop = EARLY_STOP

# The checkpoint path never changes, so build it once instead of on every save.
model_path = os.path.join(data_path, "reddit_comment_6_layers.pth")

for steps in range(N_EPOCHS):
    model.train()
    xb, yb = next(batch)
    xb = xb.to(device)
    yb = yb.to(device)
    logits, loss = model(xb, yb)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Drop the references to this step's batch. torch.cuda.empty_cache() is
    # deliberately not called here: it does not give PyTorch more usable
    # memory and forces the allocator to re-request blocks every step.
    del xb, yb

    if steps % 100 == 0:
        print('Step:', steps, 'Training Loss:', loss.item())
        # val_loss = estimate_loss(model, val_data, block_size=BLOCK_SIZE, batch_size=BATCH_SIZE)
        # print('Validation loss:', val_loss)
    if steps % SAVE_FREQUENCY == 0:
        # Saves the whole model object (step 0 included), overwriting the previous file.
        torch.save(model, model_path)
        print('Model saved at ', model_path)



Step: 0 Training Loss: 10.258922576904297
Model saved at  datasets/reddit_comments/may2015/reddit-comments-may-2015\reddit_comment_6_layers.pth


In [128]:
# Manual checkpoint: write the whole model object to the 6-layer checkpoint file.
model_path = os.path.join(
    data_path,
    "reddit_comment_6_layers.pth",
)
torch.save(obj=model, f=model_path)

In [146]:
# Prompt that every sample starts from.
starting_tokens = 'I think we are heading towards a recession.'
encoded_start = tokenizer.encode(starting_tokens)
# Drop the last encoded id so generation continues the prompt instead of
# starting after a terminator (presumably the [SEP] id the tokenizer appends — confirm).
encoded_start.pop(-1)
len_starting_tokens = len(encoded_start)

idx = torch.tensor(encoded_start).reshape(1, len_starting_tokens).to(device)
model.eval()
N_SAMPLES = 10

# Inference only: no autograd bookkeeping is needed while sampling.
with torch.no_grad():
    # Use a named counter; assigning to `_` would shadow IPython's
    # "last output" variable for the rest of the session.
    for sample_number in range(1, N_SAMPLES + 1):
        generation = model.generate(idx, max_new_tokens=2000, temperature=0.5, stop_token=True)[0].tolist()
        story = tokenizer.decode(generation, skip_special_tokens=True)

        print('Story ', sample_number, ':')
        print(story)
        print('\n')






Story  1 :
I think we are heading towards a recession. It't be a $ 5 years ( and that't get a few years ). If you't really cool and so it's a lot of your headphones, I's a room. I have a great. I't have a few years. I't. The idea of my own. I't have to the propagabond. The only way that's a few years. I think about $ 15k and the past, you'm not only a good.


Story  2 :
I think we are heading towards a recession. I's a " It's what you have to be the economy. I's not a few times, I think it. I think you't know what I't a lot of the one day. And on the food, and can't have no way. It was a great job, but it is a few years ago. You can be more than the US. This is a great. But it, but you't want to just don's a mana, the same as a REALYS, but I have to be a lot of the big corporations.


Story  3 :
I think we are heading towards a recession. I think that would be a very rarely a lot of people have it. The only way. The last night, but I think it is a little more than a bit of all of the m

In [74]:
# Draw another batch and print its first input row followed by its first target row.
xb, yb = next(batch)
print(
    tokenizer.decode(xb[0].tolist(), skip_special_tokens=False),
    tokenizer.decode(yb[0].tolist(), skip_special_tokens=False),
    sep="\n",
)

