In [1]:
import os
import shutil
from typing import Any
import requests
import re
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from nltk.tokenize import word_tokenize
import nltk
from collections import Counter
import torch
import torch.nn as nn
from torch.nn import functional as F
import pickle
import tqdm
nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nguye\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [13]:
class Vocab:
    def __init__(self, tokens = [], min_freq = 0, reserved_tokens = []):
        self._build(tokens, min_freq, reserved_tokens)
    
    def _build(self, tokens, min_freq, reserved_tokens):
        print(f'building vocab from {len(tokens)} tokens')
        counter = Counter(tokens)
        self.token_freq = sorted(counter.items(), key = lambda x: x[1], reverse = True)

        self.idx_to_tokens = list(sorted(set(['<unk>'] + reserved_tokens \
        + [ token for token, freq in self.token_freq if freq > min_freq])))
    
        self.token_to_idx = {token: idx for idx, token in enumerate(self.idx_to_tokens)}

        print('built vocab object')

    def __len__(self):
        return len(self.idx_to_tokens)
    
    def __getitem__(self, tokens):
        #if not type list or tuple
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(t) for t in tokens]
    
    def to_tokens(self, idx):
        if not isinstance(idx, (list, tuple)):
            return self.idx_to_tokens[idx]
        return [self.to_tokens(i) for i in idx]
    
    @property
    def unk(self):
        return self.token_to_idx['<unk>']

class ProjectGutenbergDataset(Dataset):
    def _download(self, links_path: str = 'links.txt', output_dir: str = 'dataset') -> None:
        '''
        iterate through links in links.txt in Project Gutenberg to download books
        '''
        #read links from file
        if os.path.exists(output_dir) == False:
            print('Downloading books ... ')
            books = []
            try:
                with open(links_path, 'r') as file:
                    errors = []
                    for link in file.readlines():
                        link = link.rstrip()

                        res = requests.get(link)
                        if res.status_code != 200:
                            raise Exception(f"Failed to Fetch, Error code {res.status_code}")
                        books.append(res.text)
                        print(f"SUCCESS {link}")

                if os.path.exists(output_dir) == False:
                    os.mkdir(output_dir) 

                for id, book in enumerate(books):
                    output_path = os.path.join(output_dir, f'book{id}.txt')
                    with open(output_path, 'w') as file:
                        file.write(book)

            except Exception as e:
                print('Error while downloading books, error = ', e)
        else:
            print("PG dataset loaded")


    def _preprocess(self, text):
        #remove digits and anything but letters and space
        return re.sub('[^a-zA-Z\s]', '', text).lower()
    
    def _tokenize(self, tokenizer, text: str, save_to_file = False) -> list[str]:
        tokens = tokenizer(self._preprocess(text))
        # save tokens
        if save_to_file:
            with open('tokens.txt', 'w') as file:
                for t in tokens:
                    file.writelines(f'{t} \n')

        return tokens

    def _build(self, dataset_path):
        '''
        @param:
            dataset_path: str, path to PG dataset
        @return
            corpus: list[int] 
            vocab: Vocab object
        '''
        alltext = ''
        for file in os.listdir(dataset_path):
            filepath = os.path.join(dataset_path, file)
            with open(filepath, 'r') as file:
                alltext += file.read()
    
        #init tokenizer
        tokenizer = word_tokenize

        tokens = self._tokenize(tokenizer, alltext, save_to_file=True)
        
        vocab = Vocab(tokens, min_freq = 2)
        
        #build corpus, list of indices, [1, 2,100,44,33,...] 
        corpus = [vocab[token] for token in tokens]

        return corpus, vocab

    def __init__(self, dataset_path = './dataset/', num_steps = 100):
        self.num_steps = num_steps 

        self._download()
        corpus, vocab = self._build(dataset_path)

        #save the corpus and vocab for inspection later
        with open('corpus.pkl', 'wb') as file:
            pickle.dump(corpus, file)

        with open('corpus.txt', 'w') as file:
            for token in corpus: 
                file.write(str(token) + " ")
        
        with open('vocab_obj.pkl', 'wb') as file:
            pickle.dump(vocab, file)

        with open('vocab_obj.txt', 'w') as file:
            for token, idx in vocab.token_to_idx.items(): 
                file.writelines(f'{token}:{idx}\n')

        N = len(corpus)

        array = torch.tensor([corpus[i : i + num_steps + 1] for i in range(N - num_steps)])
        self.X = array[:,:-1] 
        self.Y = array[:,1:] 
        self.vocab = vocab
 
    def __len__(self) -> int:
        return len(self.X)

    def __getitem__(self, index: int) -> Any:
        return self.X[index], self.Y[index]

dataset = ProjectGutenbergDataset()

feature, label = dataset[0]

PG dataset loaded
building vocab from 772176 tokens
built vocab object


In [3]:
print(len(dataset))
print(feature)
print(len(feature))
print(label)
print(len(label))

772076
tensor([11772,  9081,  5269,  3670,  8036,  7519,  3145,  8122, 11772, 12952,
        11831,  3670,  6381,  4646, 11772, 12524,  8036,   505,   507,  5963,
        11772, 12399, 11137,   434,  7599,  8153,  8338,  8036, 11772, 13181,
          727,  7855,  2547,   434, 13113,   362,  7855,  9800, 12976, 13333,
         7283,  2507,  6398,  5003,  6398,   828,  8122,  9836,  6398, 12316,
        11772, 11735,  8036, 11772,  9081,  5269,  6847,  5997, 13113, 11831,
         3670,  8122,  8089,   727, 13247,  5828, 13333,   596,  7896,  6959,
         5963, 11772, 12399, 11137, 13333, 13069,  5415, 11944,  1822, 11772,
         6738,  8036, 11772,  2578, 12987, 13333,   596,  6959,  1009, 12532,
        11831,  3670, 11943,  7519,  3145,  8122, 11772, 12952,   790,     0])
100
tensor([ 9081,  5269,  3670,  8036,  7519,  3145,  8122, 11772, 12952, 11831,
         3670,  6381,  4646, 11772, 12524,  8036,   505,   507,  5963, 11772,
        12399, 11137,   434,  7599,  8153,  8338,  8

In [4]:
with open('vocab_obj.pkl', 'rb') as file:
    vocab = pickle.load(file)

print(vocab.to_tokens(feature.tolist()))
print(vocab.to_tokens(label.tolist()))

['the', 'project', 'gutenberg', 'ebook', 'of', 'moby', 'dick', 'or', 'the', 'whale', 'this', 'ebook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'in', 'the', 'united', 'states', 'and', 'most', 'other', 'parts', 'of', 'the', 'world', 'at', 'no', 'cost', 'and', 'with', 'almost', 'no', 'restrictions', 'whatsoever', 'you', 'may', 'copy', 'it', 'give', 'it', 'away', 'or', 'reuse', 'it', 'under', 'the', 'terms', 'of', 'the', 'project', 'gutenberg', 'license', 'included', 'with', 'this', 'ebook', 'or', 'online', 'at', 'wwwgutenbergorg', 'if', 'you', 'are', 'not', 'located', 'in', 'the', 'united', 'states', 'you', 'will', 'have', 'to', 'check', 'the', 'laws', 'of', 'the', 'country', 'where', 'you', 'are', 'located', 'before', 'using', 'this', 'ebook', 'title', 'moby', 'dick', 'or', 'the', 'whale', 'author', '<unk>']
['project', 'gutenberg', 'ebook', 'of', 'moby', 'dick', 'or', 'the', 'whale', 'this', 'ebook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'in', 'the', 'unit

### Try loading the corpus pkl

In [5]:
with open('corpus.pkl', 'rb')  as file:
    corpus = pickle.load(file)

print(type(corpus))
print(corpus[:10])
print(vocab.to_tokens(corpus[500:600]))

<class 'list'>
[11772, 9081, 5269, 3670, 8036, 7519, 3145, 8122, 11772, 12952]
['chapter', 'stowing', 'down', 'and', 'clearing', 'up', 'chapter', 'the', 'doubloon', 'chapter', 'leg', 'and', 'arm', 'chapter', 'the', 'decanter', 'chapter', 'a', 'bower', 'in', 'the', 'arsacides', 'chapter', 'measurement', 'of', 'the', 'whales', 'skeleton', 'chapter', 'the', 'fossil', 'whale', 'chapter', 'does', 'the', 'whales', 'magnitude', '<unk>', 'he', 'perish', 'chapter', 'ahabs', 'leg', 'chapter', 'the', 'carpenter', 'chapter', 'ahab', 'and', 'the', 'carpenter', 'chapter', 'ahab', 'and', 'starbuck', 'in', 'the', 'cabin', 'chapter', 'queequeg', 'in', 'his', 'coffin', 'chapter', 'the', 'pacific', 'chapter', 'the', 'blacksmith', 'chapter', 'the', 'forge', 'chapter', 'the', '<unk>', 'chapter', 'the', 'pequod', 'meets', 'the', 'bachelor', 'chapter', 'the', 'dying', 'whale', 'chapter', 'the', 'whale', 'watch', 'chapter', 'the', 'quadrant', 'chapter', 'the', 'candles', 'chapter', 'the', 'deck', 'towards', '

# Dataloader

# RNN and LM from d2l to establish baselines

In [6]:
DEVICE = 'cuda' if torch.cuda.device_count() else 'cpu'
print(f"DEVICE = {DEVICE}")

DEVICE = cuda


In [7]:
a = torch.tensor([[1,2,3,4], [2,3,4,5]])
print(a.shape)
b = torch.tensor([[9,8,7,6], [7,6,5,4]])
print(b.shape)
c = torch.stack((a,b), 0)
print(c.shape)
c = torch.stack((a,b), 1)
print(c.shape)
c = torch.stack((a,b), 2)
print(c)
print(c.shape)

torch.Size([2, 4])
torch.Size([2, 4])
torch.Size([2, 2, 4])
torch.Size([2, 2, 4])
tensor([[[1, 9],
         [2, 8],
         [3, 7],
         [4, 6]],

        [[2, 7],
         [3, 6],
         [4, 5],
         [5, 4]]])
torch.Size([2, 4, 2])


In [8]:
class RNN(nn.Module):
    def __init__(self, num_inputs, num_hiddens,sigma = 0.001):
        super().__init__()
        self.num_inputs = num_inputs
        self.num_hiddens = num_hiddens
        self.Wxh = nn.Parameter(torch.rand((num_inputs, num_hiddens), dtype = torch.float, device = DEVICE) * sigma)
        self.Whh = nn.Parameter(torch.rand((num_hiddens, num_hiddens), dtype = torch.float, device = DEVICE) * sigma)
        self.bh = nn.Parameter(torch.rand((1, num_hiddens), dtype = torch.float, device = DEVICE) * sigma)
    
    def forward(self, inputs, state = None):
        # N is num steps
        # n is batch size
        # d is num inputs
        N, n, d = inputs.shape
        if state == None:
            #state is not Parameter and will not be used for backprop, initalize state with 0s
            state = torch.zeros((n, self.num_hiddens), device = DEVICE)
        else:
            state, = state

        outputs = []

        for X in inputs:
            
            state = torch.tanh(X @ self.Wxh + state @ self.Whh + self.bh)
            outputs.append(state)
        
        outputs = torch.stack(outputs, 0)
        return outputs, state

#params
num_steps = 100
batch_size = 8
num_inputs = 16
num_hiddens = 32

sample_inputs = torch.rand((num_steps, batch_size, num_inputs), device = DEVICE)
print('sample input shape = ',sample_inputs.shape)
#Testing RNN correctness
rnn = RNN(num_inputs, num_hiddens)
outputs, state = rnn(sample_inputs)
print(outputs.shape)
print(state.shape)

    

sample input shape =  torch.Size([100, 8, 16])
torch.Size([100, 8, 32])
torch.Size([8, 32])


In [9]:
class LanguageModel(nn.Module):

    def __init__(self, rnn, vocab_size, sigma = 0.001):
        super().__init__()
        self.rnn = rnn
        self.vocab_size = vocab_size
        self.sigma = sigma

        #init params
        rnn_num_hiddens = self.rnn.num_hiddens
        self.Whq = nn.Parameter(torch.rand((rnn_num_hiddens, self.vocab_size), device= DEVICE) * sigma)
        self.bq = nn.Parameter(torch.rand((1, self.vocab_size), device= DEVICE) * sigma)
    
    def one_hot(self, X):
        # original X shape is (batch_size, num_steps)
        # we want to encode its shape to (num_steps, batch_size, vocab_size)
        # TODO: WHY WE TRANSPOSE LIKE THIS?
        # https://d2l.ai/chapter_recurrent-neural-networks/rnn-scratch.html#one-hot-encoding
        # We often transpose the input so that we will obtain an output of 
        # shape (number of time steps, batch size, vocabulary size). 
        # This will allow us to loop more conveniently through the 
        # outermost dimension for updating hidden states of a minibatch, time step by time step
        return F.one_hot(X.T, self.vocab_size).type(torch.float)

    def forward(self, inputs):
        print('inputs shape = ', inputs.shape)
        embedding = self.one_hot(inputs)
        print('embedding shaep = ', embedding.shape)

        rnn_outputs, state = self.rnn(embedding)
        print('rnn output shape = ', rnn_outputs.shape)
        print('state output shape = ', state.shape)

        return self.output_layer(rnn_outputs)
    
    def output_layer(self, rnn_outputs):
        outputs = torch.stack([H @ self.Whq + self.bq for H in rnn_outputs], dim = 1)
        print('LM output shape = ', outputs.shape)
        return outputs
    
    def train_step(self, input):
        pass

    def valid_step(self, input):
        pass

#Test correctness of LM

batch_size = 8
num_steps = 16
vocab_size = 1000
num_hiddens = 32

sample_inputs = torch.randint(0, vocab_size, (batch_size, num_steps), device = DEVICE)
rnn = RNN(vocab_size, num_hiddens)

lm  = LanguageModel(rnn, vocab_size)

outputs = lm(sample_inputs)
pred = torch.argmax(outputs[0], dim = 1)

print(pred)
print(pred.shape)

inputs shape =  torch.Size([8, 16])
embedding shaep =  torch.Size([16, 8, 1000])
rnn output shape =  torch.Size([16, 8, 32])
state output shape =  torch.Size([8, 32])
LM output shape =  torch.Size([8, 16, 1000])
tensor([256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256, 256,
        256, 256], device='cuda:0')
torch.Size([16])


# Trainer class

In [10]:
train_dataloader = DataLoader(dataset, batch_size = 32, shuffle= True)
print('total len of dataset = ', len(train_dataloader))

feature, label = next(iter(train_dataloader))
# print(feature)
print(feature.shape)
# print(label)
print(label.shape)

total len of dataset =  24128
torch.Size([32, 100])
torch.Size([32, 100])


In [11]:
class Trainer:
    def __init__(self, max_epochs = 100, lr = 0.001):
        self.max_epochs = max_epochs
        self.lr = lr
        self.device = 'cuda' if torch.cuda.device_count() else 'cpu'

    def fit(self, model, train_dataloader, valid_dataloader = None):
        optimizer = torch.optim.SGD(model.parameters(), lr = self.lr)

        for batch_id, (X, y) in enumerate(train_dataloader):
            X = X.to(device = self.device)
            y = y.to(device = self.device)
            print('batch id ', batch_id)
            print(f'X: {X.shape}, device = {X.device}')
            print(f'y: {y.shape}, device = {y.device}')

            output = model(X)

            break

trainer = Trainer()
print('dataset num steps = ', dataset.num_steps)

vocab_size = len(dataset.vocab)
print('vocab size ', vocab_size)
batch_size = 8 
num_hiddens = 16

train_dataloader = DataLoader(dataset, batch_size = batch_size, shuffle= True)

rnn = RNN(vocab_size, num_hiddens)
model = LanguageModel(rnn, vocab_size)

trainer.fit(lm, train_dataloader=train_dataloader)




dataset num steps =  100
vocab size  13361
batch id  0
X: torch.Size([8, 100]), device = cuda:0
y: torch.Size([8, 100]), device = cuda:0


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
