In [163]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
import random
import pickle
import math
from tqdm import tqdm
from transformers import AutoTokenizer, BertTokenizer
import gc
import sqlite3
import contractions

# Compute device shared by the whole notebook: CUDA when PyTorch reports it as available,
# otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
# Show the selected device, then describe every CUDA GPU PyTorch can see.
print(device)

if torch.cuda.is_available():
    n_gpus = torch.cuda.device_count()
    # Print the number of CUDA devices
    print(f"Number of CUDA Devices: {n_gpus}\n")

    for gpu_index in range(n_gpus):
        # total_memory is reported in bytes; convert to MiB for display.
        total_mb = torch.cuda.get_device_properties(gpu_index).total_memory / (1024**2)
        print(f"Device {gpu_index}:")
        print(f"    Name: {torch.cuda.get_device_name(gpu_index)}")
        print(f"    Computational Capability: {torch.cuda.get_device_capability(gpu_index)}")
        print(f"    Total Memory: {total_mb:.2f} MB\n")
else:
    print("CUDA is not available on your system.")



cuda
Number of CUDA Devices: 1

Device 0:
    Name: NVIDIA GeForce RTX 3090
    Computational Capability: (8, 6)
    Total Memory: 24575.50 MB



In [3]:
# Directory of the Reddit May-2015 comment dump. The SQLite database is read from here and
# the model checkpoints are saved to / loaded from the same folder.
data_path = 'datasets/reddit_comments/may2015/reddit-comments-may-2015'

In [4]:


def fetch_first_n_rows(filename, table_name, num_rows=4000000):
    """Load the ``body`` column of the first ``num_rows`` rows of a SQLite table.

    Args:
        filename: Path of the SQLite database file.
        table_name: Name of the table to read. SQLite cannot bind identifiers as
            parameters, so the name is interpolated into the SQL text and must
            come from a trusted source.
        num_rows: Maximum number of rows to return (converted with ``int``).

    Returns:
        pandas.DataFrame with a single column named after the selected column
        (``body``), holding at most ``num_rows`` rows.

    Raises:
        sqlite3.Error: If the query fails (for example, the table does not
            exist). The cursor and connection are closed in that case too.
    """
    conn = sqlite3.connect(filename)
    try:
        cur = conn.cursor()
        try:
            # The row limit is bound as a parameter instead of being formatted into the SQL.
            cur.execute(f"SELECT body FROM {table_name} LIMIT ?", (int(num_rows),))
            data = cur.fetchall()
            # Column names come from the cursor description so the frame mirrors the query.
            col_names = [desc[0] for desc in cur.description]
        finally:
            cur.close()
    finally:
        # Always release the database handle, even when the query raised.
        conn.close()

    return pd.DataFrame(data, columns=col_names)


# Read the comment bodies from the `May2015` table of the SQLite dump under `data_path`.
# The row cap passed here is 1.5e9 — presumably large enough to mean "the whole table";
# TODO confirm against the actual table size.
filename = os.path.join(data_path, 'database.sqlite')
df = fetch_first_n_rows(filename=filename, table_name='May2015', num_rows=(int(1.5 * 10**9)))
print(df.head(10))


                                                body
0                    くそ\n読みたいが買ったら負けな気がする\n図書館に出ねーかな
1  gg this one's over. off to watch the NFL draft...
2  Are you really implying we return to those tim...
3  No one has a European accent either  because i...
4  That the kid "..reminds me of Kevin."   so sad...
5  Haha, i was getting nauseous from it, if that ...
6  After reading this, I wholeheartedly believe y...
7     Let's do this. See you guys on the other side.
8  You can buy a mystery sampler from small batch...
9  Nihilum and LG are significantly better off in...


In [525]:
# Build the vocabulary of "common" words used to filter comments.
# Each line of the word list contributes one lower-cased entry with its newline removed.
top_words = []
with open('datasets/reddit_comments/wiki-100k.txt', 'r', encoding='utf-8') as f:
    top_words.extend(line.lower().strip('\n') for line in f)

print(top_words[:100])
# Keep only the first 5000 entries; converting to a set collapses duplicates, so the
# printed size can be smaller than 5000. A single space is added as an allowed token.
top_words = set(top_words[:5000])
top_words.add(' ')
print(len(top_words))
        

['the', 'of', 'and', 'to', 'a', 'in', 'that', 'i', 'was', 'he', 'his', 'with', 'is', 'it', 'for', 'as', 'had', 'you', 'not', 'be', 'on', 'at', 'by', 'her', 'which', 'have', 'or', 'from', 'this', 'but', 'all', 'him', 'she', 'were', 'they', 'my', 'are', 'so', 'me', 'their', 'an', 'one', 'de', 'we', 'who', 'would', 'said', 'been', 'no', 'he', 'will', 'them', 'when', 'if', 'there', 'more', 'out', 'and', 'it', 'any', 'up', 'into', 'your', 'has', 'do', 'what', 'could', 'but', 'our', 'than', 'other', 'some', 'very', 'man', 'upon', 'about', 'its', 'only', 'time', 'may', 'la', 'like', 'little', 'then', 'now', 'should', 'can', 'made', 'did', 'such', 'a', 'great', 'in', 'must', 'these', 'two', 'before', 'see', 'us', 'over']
4310


In [6]:
# `data`, `train_data` and `df` are three names for the SAME DataFrame object: rows with
# missing values are dropped in place, so `df` itself is modified by this cell.
data = df
data.dropna(inplace=True)
train_data = data 

In [231]:
#tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base-v2")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Special-token ids used later when building batches and when stopping generation.
cls_id, sep_id = tokenizer.cls_token_id, tokenizer.sep_token_id
unk_id, pad_id = tokenizer.unk_token_id, tokenizer.pad_token_id

print("PAD Token ID:", pad_id)
print("UNK Token ID:", unk_id)
print("CLS Token ID:", cls_id)
print("SEP Token ID:", sep_id)

PAD Token ID: 0
UNK Token ID: 100
CLS Token ID: 101
SEP Token ID: 102


In [232]:
# Number of token ids in each training window (the width of the x / y batch tensors).
BLOCK_SIZE = 128
# Size of the tokenizer vocabulary; used as the model's embedding and output dimension.
VOCAB_SIZE = tokenizer.vocab_size
# Comments whose token-id count falls outside [MIN_LENGTH, MAX_LENGTH] are skipped by the
# batch generator.
MIN_LENGTH = 100
MAX_LENGTH = 10000
print('Vocab size is ', VOCAB_SIZE)


Vocab size is  30522


In [563]:
import re
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

def lemmatize_list(word_list):
    """Return a new list with the WordNet lemma of every entry of ``word_list``, in order."""
    lemmatize = WordNetLemmatizer().lemmatize
    return list(map(lemmatize, word_list))

def contains_url(text):
    """Report whether ``text`` contains an ``http://`` or ``https://`` URL.

    A match needs the scheme followed by at least one character accepted by the
    pattern below. Returns ``True`` or ``False``.
    """
    pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    return bool(re.search(pattern, text))

def get_batch_generator(data, block_size, min_length, max_length, batch_size, max_uncommon_ratio=0.05):
    """Yield an endless stream of ``(x, y)`` next-token training batches.

    Rows of ``data`` are visited in order, wrapping back to the first row after
    the last one. For each row the comment text is contraction-expanded,
    lower-cased, split into words / spaces / ``,.!?`` and lemmatized. A comment
    is skipped when it produces no tokens, when the share of tokens that are
    neither in ``top_words`` nor punctuation exceeds ``max_uncommon_ratio``, or
    when its encoded length is outside ``[min_length, max_length]``.

    Accepted comments are encoded with the module-level ``tokenizer``, stripped
    of unknown-token ids, right-padded with ``pad_id`` up to ``block_size + 2``
    ids, and a random window of ``block_size`` ids becomes one row of ``x``;
    the same window shifted one position to the right becomes the row of ``y``.

    Relies on the module-level names ``contractions``, ``lemmatize_list``,
    ``top_words``, ``tokenizer``, ``unk_id`` and ``pad_id``.

    Args:
        data: Table supporting ``len()`` and ``.iloc[i]`` whose rows expose a
            ``'body'`` string.
        block_size: Number of token ids per row of ``x`` / ``y``.
        min_length: Smallest accepted encoded length (special tokens included).
        max_length: Largest accepted encoded length.
        batch_size: Number of rows per yielded batch.
        max_uncommon_ratio: Largest accepted share of uncommon tokens.

    Yields:
        Tuple ``(x, y)`` of ``torch.long`` tensors, each of shape
        ``(batch_size, block_size)``.

    Raises:
        ValueError: If a complete pass over ``data`` accepts no comment at all
            (this includes empty ``data``); without the check the generator
            would loop forever without yielding.
    """
    # Compiled once: these patterns are applied to every token of every comment.
    token_pattern = re.compile(r'\w+|[ ,.!?]')
    punctuation_pattern = re.compile(r'[ ,.!?]')
    # Shortest id list from which both the x window and the shifted y window can be cut.
    padded_length = block_size + 2

    data_len = len(data)
    idx = 0  # Row to read next
    accepted_this_pass = 0  # Comments accepted since the last wrap-around

    while True:  # Infinite loop to keep generating batches
        x = torch.zeros((batch_size, block_size), dtype=torch.long)
        y = torch.zeros((batch_size, block_size), dtype=torch.long)
        i = 0

        while i < batch_size:
            if idx >= data_len:
                if accepted_this_pass == 0:
                    raise ValueError(
                        "get_batch_generator: no comment passed the filters in a full pass over the data"
                    )
                idx = 0  # Start over once every row has been visited
                accepted_this_pass = 0
                print("Resetting counter...")

            sample = data.iloc[idx]
            idx += 1

            comment = contractions.fix(sample['body']).lower()
            words = lemmatize_list(token_pattern.findall(comment))
            if not words:
                continue

            n_uncommon_words = sum(
                1 for word in words
                if word not in top_words and not punctuation_pattern.match(word)
            )
            if n_uncommon_words / len(words) > max_uncommon_ratio:
                continue

            summary_ids = tokenizer.encode(''.join(words), add_special_tokens=True)

            # The length filter is applied before unknown-token ids are removed.
            if len(summary_ids) < min_length or len(summary_ids) > max_length:
                continue

            summary_ids = [token_id for token_id in summary_ids if token_id != unk_id]

            if len(summary_ids) < padded_length:
                summary_ids = summary_ids + [pad_id] * (padded_length - len(summary_ids))

            # Both slices below contain exactly block_size ids for any start in this range.
            random_start = random.randint(0, len(summary_ids) - padded_length)
            x[i] = torch.tensor(summary_ids[random_start:random_start + block_size], dtype=torch.long)
            y[i] = torch.tensor(summary_ids[random_start + 1:random_start + block_size + 1], dtype=torch.long)
            i += 1
            accepted_this_pass += 1

        yield x, y

# Shared, stateful batch stream over `train_data`. Every `next(batch)` in the notebook
# advances the same generator. Note that the batch size is fixed to 16 here.
batch = get_batch_generator(train_data, block_size=BLOCK_SIZE, min_length=MIN_LENGTH, max_length=MAX_LENGTH, batch_size=16)


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hvutr\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [562]:
# Draw one batch and decode the first input row and the first target row so the
# one-token shift between them can be checked by eye.
x, y = next(batch)
print(x.shape, y.shape)
for first_row in (x[0], y[0]):
    print(tokenizer.decode(first_row.tolist(), skip_special_tokens=False))


torch.Size([16, 128]) torch.Size([16, 128])
[CLS] what the fuck is the h on the end for? get the fuck out here? that is not right. it is get the fuck out of here. and my self esteem is not so low that i need to show a stranger how much money i make. i just wanted to call you out for acting like a dumbass on this post about this man that had his life fucked up by the dumbass rioter. all the money you make probably come from the tax i pay anyways, so you are welcome. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
what the fuck is the h on the end for? get the fuck out here? that is not right. it is get the fuck out of here. and my self esteem is not so low that i need to show a stranger how much money i make. i just wanted to call you out for acting like a dumbass on this post about this man that had his life fucked up by the dumbass rioter. all the money you make probably come from the tax i pay a

In [None]:
# Drop the model and the last batch tensors so their memory can be reclaimed.
# The names are removed with pop() instead of `del`: `del` raises NameError when a name is
# already gone, which happens on a fresh kernel and after the training loop (it deletes
# xb / yb itself), so the cell could not be re-run safely.
STALE_NAMES = ("model", "xb", "yb")
for stale_name in STALE_NAMES:
    globals().pop(stale_name, None)
torch.cuda.empty_cache()  # clear unused memory in PyTorch
gc.collect()  # call Python garbage collector

In [564]:
# Hyperparameters handed to LanguageModel when the model is created.
# Embedding width; also the d_model of the transformer layers.
N_EMB = 1000
# Number of stacked transformer encoder layers.
N_LAYERS = 6
# Attention heads per layer (N_EMB is presumably required to be divisible by this — verify
# against the nn.MultiheadAttention documentation).
N_HEADS = 4
# Dropout probability used inside the transformer layers.
DROPOUT = 0.2

def generate_square_subsequent_mask(sz):
    """Build the additive causal attention mask for ``sz`` positions.

    Returns a float32 tensor of shape ``(sz, sz)`` whose entry ``[i, j]`` is
    ``0.0`` for ``j <= i`` and ``-inf`` for ``j > i``, so position ``i`` can only
    attend to itself and to earlier positions.
    """
    blocked = torch.full((sz, sz), float('-inf'), dtype=torch.float32)
    # Keep -inf strictly above the diagonal; everything else becomes 0.
    return torch.triu(blocked, diagonal=1)

def get_sine_position_encodings(length, dim):
    """Return the fixed sinusoidal position table of shape ``(length, dim)``.

    Even columns hold ``sin(pos * w_k)`` and odd columns ``cos(pos * w_k)`` with
    ``w_k = exp(-2k * ln(10000) / dim)``.

    Args:
        length: Number of positions (rows).
        dim: Encoding width (columns). Odd values are supported: the last
            column is then a sine column without a cosine partner.

    Returns:
        float32 tensor of shape ``(length, dim)``.
    """
    pos = torch.arange(length, dtype=torch.float32).reshape(-1, 1)
    div_term = torch.exp(torch.arange(0, dim, 2).float() * -(math.log(10000.0) / dim))
    angles = pos * div_term  # (length, ceil(dim / 2))

    pos_encodings = torch.zeros(length, dim)
    pos_encodings[:, 0::2] = torch.sin(angles)
    # There are only dim // 2 odd columns; for odd `dim` that is one fewer than the number
    # of frequencies, so the surplus one is dropped instead of raising a shape mismatch.
    pos_encodings[:, 1::2] = torch.cos(angles[:, : dim // 2])
    return pos_encodings

class LanguageModel(nn.Module):
    """Causally-masked Transformer language model.

    Token embeddings plus fixed sinusoidal position encodings are passed through
    an ``nn.TransformerEncoder`` under a causal mask; the encoder output is added
    back to its input, sent through a two-layer feed-forward block and projected
    to vocabulary logits.

    Args:
        vocab_size: Number of token ids (embedding rows and output logits).
        n_emb: Embedding width, used as ``d_model`` of the encoder layers.
        block_size: Longest context the model is fed during generation.
        n_layers: Number of encoder layers.
        n_heads: Attention heads per encoder layer.
        dropout: Dropout probability inside the encoder layers.
    """

    def __init__(self, vocab_size, n_emb, block_size, n_layers, n_heads, dropout=0.2):
        super().__init__()

        self.token_embedding_table = nn.Embedding(vocab_size, n_emb)
        # Fixed (non-trainable) table. Registering it as a non-persistent buffer makes it
        # follow the module through .to() / .cuda() instead of being pinned to a global
        # device, while keeping it out of parameters() and out of the state_dict.
        self.register_buffer(
            "position_embedding_table",
            get_sine_position_encodings(block_size + 2, n_emb),
            persistent=False,
        )
        self.block_size = block_size

        encoder_layer = nn.TransformerEncoderLayer(d_model=n_emb, nhead=n_heads, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)

        self.feed_forward = nn.Sequential(
            nn.Linear(n_emb, 4 * n_emb),
            nn.ReLU(),
            nn.Linear(4 * n_emb, n_emb)
        )

        self.lm_head = nn.Linear(n_emb, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, when ``targets`` is given, the loss.

        Args:
            idx: Long tensor of token ids, shape ``(B, T)``.
            targets: Optional long tensor of target ids with ``B * T`` elements.

        Returns:
            ``(logits, None)`` with logits of shape ``(B, T, vocab_size)`` when
            ``targets`` is ``None``; otherwise ``(logits, loss)`` with logits
            flattened to ``(B * T, vocab_size)`` and ``loss`` the mean
            cross-entropy over all positions.
        """
        B, T = idx.shape

        token_emb = self.token_embedding_table(idx)
        position_emb = self.position_embedding_table[:T, :]
        x = token_emb + position_emb

        # Build the mask on the device of the input rather than on a module-level global.
        mask = generate_square_subsequent_mask(T).to(idx.device)

        # The encoder layers were created with the default sequence-first layout, hence
        # the permutes around the call. permute() returns a new view, so no clone is needed.
        x_transform = self.transformer_encoder(x.permute(1, 0, 2), mask=mask).permute(1, 0, 2)
        x = x + x_transform

        x = self.feed_forward(x)
        logits = self.lm_head(x)

        if targets is None:
            return logits, None

        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        # NOTE(review): no ignore_index is passed, so padded target positions (if the
        # batches contain any) count toward the loss — confirm this is intended.
        loss = F.cross_entropy(logits, targets.view(B * T))
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, stop_token=False):
        """Sample up to ``max_new_tokens`` tokens after ``idx``.

        Runs without gradient tracking. Dropout is only disabled if the caller
        has put the module in eval mode.

        Args:
            idx: Long tensor of prompt ids, shape ``(B, T)``.
            max_new_tokens: Upper bound on the number of sampled tokens.
            temperature: Positive divisor applied to the logits before softmax.
            stop_token: When true, stop as soon as every sequence in the batch
                has sampled the module-level ``sep_id`` at least once. Rows that
                finished earlier keep receiving tokens until the last row is done.

        Returns:
            Long tensor of shape ``(B, T + n)`` with ``n <= max_new_tokens``.

        Raises:
            ValueError: If ``temperature`` is not positive.
        """
        if temperature <= 0:
            raise ValueError("temperature must be positive")

        finished = torch.zeros(idx.shape[0], dtype=torch.bool, device=idx.device)
        for _ in range(max_new_tokens):
            # Only the most recent block_size tokens are fed to the model.
            idx_cond = idx[:, -self.block_size:]
            logits, _unused_loss = self(idx_cond)

            # Scale logits by the temperature
            logits = logits[:, -1, :] / temperature

            probs = F.softmax(logits, dim=-1)
            idx_new = torch.multinomial(probs, num_samples=1)
            idx = torch.cat([idx, idx_new], dim=-1)
            if stop_token:
                # Per-row bookkeeping; the previous .item() call only worked for B == 1.
                finished |= (idx_new.squeeze(-1) == sep_id)
                if finished.all():
                    break
        return idx

# Create the model on the selected device and report its trainable parameter count.
model = LanguageModel(
    vocab_size=VOCAB_SIZE,
    block_size=BLOCK_SIZE,
    n_emb=N_EMB,
    n_layers=N_LAYERS,
    n_heads=N_HEADS,
    dropout=DROPOUT,
).to(device)

print(f'Number of parameters {sum(p.numel() for p in model.parameters() if p.requires_grad)}')



Number of parameters 117721810


In [11]:
# Replace the freshly built model with the checkpoint saved by the training / save cells.
model_path = os.path.join(data_path, "reddit_comment_6_layers.pth")
print(model_path)
# SECURITY: the checkpoint is a pickled module object and unpickling can execute arbitrary
# code — only load files you created yourself.
# NOTE(review): recent PyTorch releases changed the default of `weights_only`; a
# whole-module checkpoint like this one may need `weights_only=False` there — confirm
# against the installed version.
# map_location places the tensors on the notebook's device, so a checkpoint written on a
# GPU can also be opened on a CPU-only machine.
model = torch.load(model_path, map_location=device)

datasets/reddit_comments/may2015/reddit-comments-may-2015\reddit_comment_10_layers.pth


In [571]:
def estimate_loss(model, val_data, block_size, batch_size):
    """Return the loss of ``model`` on one batch drawn from ``val_data``.

    The batch comes from a fresh ``get_batch_generator`` over ``val_data`` (using
    the module-level ``MIN_LENGTH`` / ``MAX_LENGTH`` filters), so the shared
    training stream is not consumed. Because the generator is new on each call,
    every call starts from the first rows of ``val_data``; only the random
    window position inside each comment varies.

    Args:
        model: Module called as ``model(x, y)`` and returning ``(logits, loss)``.
        val_data: Table with a ``'body'`` column, as accepted by
            ``get_batch_generator``.
        block_size: Number of token ids per row of the batch.
        batch_size: Number of rows in the batch.

    Returns:
        The loss as a Python float.
    """
    val_batches = get_batch_generator(
        val_data,
        block_size=block_size,
        min_length=MIN_LENGTH,
        max_length=MAX_LENGTH,
        batch_size=batch_size,
    )

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            x, y = next(val_batches)
            x, y = x.to(device), y.to(device)
            _, loss = model(x, y)
    finally:
        # Put the model back in the mode it was in, even if evaluation failed.
        model.train(was_training)
    return loss.item()

# Training configuration.
# NOTE(review): EARLY_STOP, BATCH_SIZE, last_val_loss and early_stop are not referenced by
# the training loop in this cell; the batch size actually used is the one given when the
# `batch` generator was created.
EARLY_STOP = 50
# Number of optimizer steps (one batch each) — not passes over the dataset.
N_EPOCHS = 1000
BATCH_SIZE = 32
# A checkpoint is written every SAVE_FREQUENCY steps, starting with step 0.
SAVE_FREQUENCY = 500
LEARNING_RATE = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

last_val_loss = 1e9
early_stop = EARLY_STOP

# Training loop: each iteration performs one optimizer step on the next batch of the
# shared `batch` generator.
for steps in range(N_EPOCHS):
    model.train()

    # Move both halves of the batch to the compute device.
    xb, yb = (tensor.to(device) for tensor in next(batch))

    logits, loss = model(xb, yb)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Delete xb, yb and free GPU memory
    del xb, yb
    torch.cuda.empty_cache()

    # Progress report every 100 steps.
    if steps % 100 == 0:
        print('Step:', steps, 'Training Loss:', loss.item())

    # Periodic checkpoint of the whole module.
    if steps % SAVE_FREQUENCY == 0:
        model_path = os.path.join(data_path, "reddit_comment_6_layers.pth")
        torch.save(model, model_path)
        print('Model saved at ', model_path)



Step: 0 Training Loss: 4.288944244384766
Model saved at  datasets/reddit_comments/may2015/reddit-comments-may-2015\reddit_comment_6_layers.pth
Step: 100 Training Loss: 4.5208234786987305
Step: 200 Training Loss: 4.54716157913208


In [569]:
# Manual checkpoint: writes the whole module object (not just a state_dict) to the same
# file the training loop saves to, overwriting it.
model_path = os.path.join(data_path, "reddit_comment_6_layers.pth")
torch.save(model, model_path)

In [570]:
# Prompt for generation; an empty string leaves only the tokenizer's leading special token.
starting_tokens = ''
encoded_start = tokenizer.encode(starting_tokens.lower())
# Drop the last id added by encode() (presumably the closing [SEP]) so the model continues
# the sequence instead of seeing it as finished.
encoded_start.pop(-1)
len_starting_tokens = len(encoded_start)

idx = torch.tensor(encoded_start).reshape(1, len_starting_tokens).to(device)
model.eval()
N_SAMPLES = 10
# Sampling needs no gradients; disabling autograd avoids building a graph for each of the
# up-to-1000 forward passes per sample.
with torch.no_grad():
    # A named counter is used instead of `_`, which IPython reserves for the last output.
    for sample_idx in range(N_SAMPLES):
        generation = model.generate(idx, max_new_tokens=1000, temperature=0.8, stop_token=True)[0].tolist()
        story = tokenizer.decode(generation, skip_special_tokens=True)

        print('Comment ', sample_idx + 1, ':')
        print(story)
        print('\n')

Comment  1 :
i am glad you are dealing with my nose. it is a problem but it is easy to quit. i do not know anything you may not be a great person, but it is not your best to do it with you. i do not know it a girlfriend just saying it doe not make an idea of a good idea in a way that people have such a different way. i just do not know anyone who i am still with no problem. and i have a chance of this game where i really love new people!


Comment  2 :
gti am sorry people are just teenager. i cannot write this game but i got physically out of rank. that is not the important.. but i just got to realize that it sound like you are in a thread. it is why at least you are doing it. i do not know what to do with the movie but i do not know a fucking fan of the same thing. i just do not think it is cool that we are doing it in a random policy which is just a good thing.


Comment  3 :
you finish the place of their right and will not. it is not a big question, but it may be at the street. it i

In [74]:
# Draw another batch from the shared stream and decode the first input / target rows.
xb, yb = next(batch)
for first_row in (xb[0], yb[0]):
    print(tokenizer.decode(first_row.tolist(), skip_special_tokens=False))

