In [1]:
import os
import shutil
from typing import Any
import requests
import re
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from nltk.tokenize import word_tokenize
import nltk
from collections import Counter
import torch
import pickle
import tqdm
# Fetch the 'punkt' NLTK data package at import time.
# NOTE(review): presumably this is the tokenizer model used by word_tokenize
# further down; confirm the package name against the installed NLTK version.
nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nguye\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [16]:
class Vocab:
    """Two-way mapping between string tokens and integer indices.

    The vocabulary always contains '<unk>', plus every reserved token and
    every token whose count is strictly greater than ``min_freq``. Indices
    are assigned in sorted (lexicographic) order of the kept tokens.

    Attributes:
        token_freq: list of (token, count) pairs, most frequent first.
        idx_to_tokens: sorted list of kept tokens; position == index.
        token_to_idx: dict mapping each kept token to its index.
    """

    def __init__(self, tokens = None, min_freq = 0, reserved_tokens = None):
        """Build the vocabulary.

        @param:
            tokens: sized iterable of str, the flat token stream (None -> empty)
            min_freq: int, a token is kept only if its count is > min_freq
            reserved_tokens: iterable of str that are always kept (None -> none)
        """
        # None sentinels replace the former mutable [] defaults, which are
        # shared between calls in Python.
        self._build(
            [] if tokens is None else tokens,
            min_freq,
            [] if reserved_tokens is None else list(reserved_tokens),
        )

    def _build(self, tokens, min_freq, reserved_tokens):
        """Count tokens and populate token_freq, idx_to_tokens and token_to_idx."""
        print(f'building vocab from {len(tokens)} tokens')
        counter = Counter(tokens)
        # most_common() with no argument yields every (token, count) pair,
        # highest count first.
        self.token_freq = counter.most_common()

        # NOTE: the comparison is strict, so min_freq = 2 drops tokens that
        # occur exactly twice.
        kept = {token for token, freq in self.token_freq if freq > min_freq}
        kept.add('<unk>')
        kept.update(reserved_tokens)
        self.idx_to_tokens = sorted(kept)

        self.token_to_idx = {token: idx for idx, token in enumerate(self.idx_to_tokens)}

        print('built vocab object')

    def __len__(self):
        """Number of distinct tokens in the vocabulary (including '<unk>')."""
        return len(self.idx_to_tokens)

    def __getitem__(self, tokens):
        """Map a token, or a (nested) list/tuple of tokens, to indices.

        Tokens that are not in the vocabulary map to the index of '<unk>'.
        """
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(t) for t in tokens]

    def to_tokens(self, idx):
        """Map an index, or a (nested) list/tuple of indices, back to tokens."""
        if not isinstance(idx, (list, tuple)):
            return self.idx_to_tokens[idx]
        return [self.to_tokens(i) for i in idx]

    @property
    def unk(self):
        """Index of the '<unk>' token."""
        return self.token_to_idx['<unk>']

class ProjectGutenbergDataset(Dataset):
    '''
    Next-token-prediction dataset built from plain-text books.

    Each sample is a pair (X, Y) of index tensors of length num_steps, where
    Y is X shifted one token to the right.

    Side effects of construction (all in the current working directory):
    writes tokens.txt, corpus.pkl, corpus.txt and vocab_obj.pkl.
    '''

    def _download(self, links_path: str = 'links.txt', output_dir: str = 'dataset') -> None:
        '''
        iterate through links in links.txt in Project Gutenberg to download books

        Nothing is downloaded when output_dir already exists. Any failure is
        reported with a printed message and not re-raised (best effort).

        @param:
            links_path: str, text file with one URL per line
            output_dir: str, directory that receives book<N>.txt files
        '''
        if os.path.exists(output_dir):
            print("PG dataset loaded")
            return

        print('Downloading books ... ')
        books = []
        try:
            with open(links_path, 'r') as file:
                for link in file:
                    link = link.strip()
                    # A blank line would otherwise abort the whole download
                    # with an invalid-URL error.
                    if not link:
                        continue

                    # Timeout so a stalled connection cannot hang the notebook.
                    res = requests.get(link, timeout=60)
                    if res.status_code != 200:
                        raise Exception(f"Failed to Fetch, Error code {res.status_code}")
                    books.append(res.text)
                    print(f"SUCCESS {link}")

            # Created only after every download succeeded, so a failed fetch
            # does not leave a directory that looks like a finished dataset.
            os.makedirs(output_dir, exist_ok=True)

            for book_id, book in enumerate(books):
                output_path = os.path.join(output_dir, f'book{book_id}.txt')
                # Explicit UTF-8: the locale default cannot encode every
                # character a book may contain.
                with open(output_path, 'w', encoding='utf-8') as file:
                    file.write(book)

        except Exception as e:
            print('Error while downloading books, error = ', e)

    def _preprocess(self, text):
        '''Drop every character that is not an ASCII letter or whitespace, then lowercase.'''
        # Raw string: '\s' in a normal literal is an invalid escape sequence.
        return re.sub(r'[^a-zA-Z\s]', '', text).lower()

    def _tokenize(self, tokenizer, text: str, save_to_file = False) -> list[str]:
        '''
        @param:
            tokenizer: callable, str -> list[str]
            text: str, raw text; it is passed through _preprocess first
            save_to_file: bool, also write one token per line to tokens.txt
        @return
            tokens: list[str]
        '''
        tokens = tokenizer(self._preprocess(text))
        if save_to_file:
            with open('tokens.txt', 'w') as file:
                file.writelines(f'{t} \n' for t in tokens)

        return tokens

    def _build(self, dataset_path):
        '''
        @param:
            dataset_path: str, path to PG dataset
        @return
            corpus: list[int] 
            vocab: Vocab object
        '''
        texts = []
        # os.listdir order is unspecified; sorting makes the corpus reproducible.
        for name in sorted(os.listdir(dataset_path)):
            filepath = os.path.join(dataset_path, name)
            # errors='replace' keeps files written under another encoding
            # readable; non-ASCII characters are removed by _preprocess anyway.
            with open(filepath, 'r', encoding='utf-8', errors='replace') as file:
                texts.append(file.read())
        # Newline separator so the last word of one book cannot fuse with the
        # first word of the next.
        alltext = '\n'.join(texts)

        #init tokenizer
        tokenizer = word_tokenize

        tokens = self._tokenize(tokenizer, alltext, save_to_file=True)

        vocab = Vocab(tokens, min_freq = 2)

        #build corpus, list of indices, [1, 2,100,44,33,...] 
        corpus = [vocab[token] for token in tokens]

        return corpus, vocab

    @staticmethod
    def _make_windows(corpus, num_steps):
        '''
        Slice corpus into every contiguous window of num_steps + 1 indices.

        @param:
            corpus: list[int]
            num_steps: int, length of each input / target sequence
        @return
            X: int64 tensor (len(corpus) - num_steps, num_steps), window[:-1]
            Y: int64 tensor of the same shape, window[1:]
        @raise
            ValueError if num_steps < 1 or the corpus has at most num_steps tokens
        '''
        if num_steps < 1:
            raise ValueError(f'num_steps must be >= 1, got {num_steps}')
        if len(corpus) <= num_steps:
            raise ValueError(
                f'corpus has {len(corpus)} tokens; need more than num_steps = {num_steps}'
            )
        # unfold returns a strided view over the 1-D tensor, so the overlapping
        # windows are not materialised as a nested Python list.
        windows = torch.tensor(corpus, dtype=torch.long).unfold(0, num_steps + 1, 1)
        return windows[:, :-1], windows[:, 1:]

    def __init__(self, dataset_path = './dataset/', num_steps = 100):
        '''
        @param:
            dataset_path: str, directory holding (or receiving) the book files
            num_steps: int, sequence length of every sample
        @raise
            ValueError if the corpus is not longer than num_steps
        '''
        # Download into the directory that _build reads from; previously the
        # download step always used its own default directory.
        self._download(output_dir=dataset_path)
        corpus, vocab = self._build(dataset_path)
        # Kept on the instance so callers need not reload vocab_obj.pkl.
        self.vocab = vocab

        with open('corpus.pkl', 'wb') as file:
            pickle.dump(corpus, file)

        with open('corpus.txt', 'w') as file:
            file.writelines(f'{token} ' for token in corpus)

        with open('vocab_obj.pkl', 'wb') as file:
            pickle.dump(vocab, file)

        self.X, self.Y = self._make_windows(corpus, num_steps)

    def __len__(self) -> int:
        '''Number of (X, Y) samples.'''
        return len(self.X)

    def __getitem__(self, index: int) -> Any:
        '''Return the (input, target) index tensors of sample index.'''
        return self.X[index], self.Y[index]

# Build the dataset with its default arguments (dataset_path='./dataset/', num_steps=100).
dataset = ProjectGutenbergDataset()

# First (input, target) pair; the inspection cells below print it.
feature, label = dataset[0]

PG dataset loaded
building vocab from 772176 tokens
built vocab object


AttributeError: 'Vocab' object has no attribute 'items'

In [27]:
# Dataset size, then the first input / target tensors with their lengths, one item per line.
print(len(dataset), feature, len(feature), label, len(label), sep='\n')

tensor([7512, 5746, 3312, 2300, 5085, 4740, 1995, 5149, 7512, 8225, 7548, 2300,
        4007, 2923, 7512, 7962, 5085,  338,  340, 3769, 7512, 7892, 7081,  293,
        4788, 5169, 5291, 5085, 7512, 8390,  475, 4964, 1621,  293, 8351,  237,
        4964, 6199, 8248, 8466, 4598, 1596, 4020, 3149, 4020,  539, 5149, 6221,
        4020, 7851, 7512, 7490, 5085, 7512, 5746, 3312, 4315, 3788, 8351, 7548,
        2300, 5149, 5127,  475, 8431, 3697, 8466,  391, 4990, 4393, 3769, 7512,
        7892, 7081, 8466, 8319, 3407, 7622, 1175, 7512, 4239, 5085, 7512, 1639,
        8258, 8466,  391, 4393,  650, 7968, 7548, 2300, 7621, 4740, 1995, 5149,
        7512, 8225,  520,    0])
100
tensor([5746, 3312, 2300, 5085, 4740, 1995, 5149, 7512, 8225, 7548, 2300, 4007,
        2923, 7512, 7962, 5085,  338,  340, 3769, 7512, 7892, 7081,  293, 4788,
        5169, 5291, 5085, 7512, 8390,  475, 4964, 1621,  293, 8351,  237, 4964,
        6199, 8248, 8466, 4598, 1596, 4020, 3149, 4020,  539, 5149, 6221, 4020,
   

In [7]:
# Reload the vocabulary from 'vocab_obj.pkl', the file that
# ProjectGutenbergDataset.__init__ writes. Loading any other file risks
# decoding indices with a stale vocabulary.
# NOTE: pickle.load can execute arbitrary code; only load files this notebook produced.
with open('vocab_obj.pkl', 'rb') as file:
    vocab = pickle.load(file)

# Decode the first sample back to words to sanity-check the index mapping.
print(vocab.to_tokens(feature.tolist()))
print(vocab.to_tokens(label.tolist()))

['the', 'project', 'gutenberg', 'ebook', 'of', 'moby', 'dick', 'or', 'the', 'whale', 'this', 'ebook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'in', 'the', 'united', 'states', 'and', 'most', 'other', 'parts', 'of', 'the', 'world', 'at', 'no', 'cost', 'and', 'with', 'almost', 'no', 'restrictions', 'whatsoever', 'you', 'may', 'copy', 'it', 'give', 'it', 'away', 'or', 'reuse', 'it', 'under', 'the', 'terms', 'of', 'the', 'project', 'gutenberg', 'license', 'included', 'with', 'this', 'ebook', 'or', 'online', 'at', 'wwwgutenbergorg', 'if', 'you', 'are', 'not', 'located', 'in', 'the', 'united', 'states', 'you', 'will', 'have', 'to', 'check', 'the', 'laws', 'of', 'the', 'country', 'where', 'you', 'are', 'located', 'before', 'using', 'this', 'ebook', 'title', 'moby', 'dick', 'or', 'the', 'whale', 'author', '<unk>']
['project', 'gutenberg', 'ebook', 'of', 'moby', 'dick', 'or', 'the', 'whale', 'this', 'ebook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'in', 'the', 'unit

### Try loading the corpus pkl

In [15]:
# Reload the pickled corpus (list of token indices) written by the dataset.
with open('corpus.pkl', 'rb') as file:
    corpus = pickle.load(file)

# Container type, the first ten indices, and tokens 500-599 decoded back to words.
print(
    type(corpus),
    corpus[:10],
    vocab.to_tokens(corpus[500:600]),
    sep='\n',
)

<class 'list'>
[11772, 9081, 5269, 3670, 8036, 7519, 3145, 8122, 11772, 12952]
['chapter', 'stowing', 'down', 'and', 'clearing', 'up', 'chapter', 'the', 'doubloon', 'chapter', 'leg', 'and', 'arm', 'chapter', 'the', 'decanter', 'chapter', 'a', 'bower', 'in', 'the', 'arsacides', 'chapter', 'measurement', 'of', 'the', 'whales', 'skeleton', 'chapter', 'the', 'fossil', 'whale', 'chapter', 'does', 'the', 'whales', 'magnitude', '<unk>', 'he', 'perish', 'chapter', 'ahabs', 'leg', 'chapter', 'the', 'carpenter', 'chapter', 'ahab', 'and', 'the', 'carpenter', 'chapter', 'ahab', 'and', 'starbuck', 'in', 'the', 'cabin', 'chapter', 'queequeg', 'in', 'his', 'coffin', 'chapter', 'the', 'pacific', 'chapter', 'the', 'blacksmith', 'chapter', 'the', 'forge', 'chapter', 'the', '<unk>', 'chapter', 'the', 'pequod', 'meets', 'the', 'bachelor', 'chapter', 'the', 'dying', 'whale', 'chapter', 'the', 'whale', 'watch', 'chapter', 'the', 'quadrant', 'chapter', 'the', 'candles', 'chapter', 'the', 'deck', 'towards', '

# Dataloader

In [46]:
# Shuffled mini-batches of 32 samples; the printed value is the number of batches.
train_dataloader = DataLoader(
    dataset,
    shuffle=True,
    batch_size=32,
)
print(len(train_dataloader))

10802


In [48]:
# Pull one batch from the loader and show both tensors with their batch sizes.
feature, label = next(iter(train_dataloader))
print(feature, len(feature), label, len(label), sep='\n')

tensor([[4990, 3504, 2952,  ..., 4020, 5074, 7622],
        [6507, 7622,  614,  ..., 7510, 4598,    0],
        [6117, 7795, 7678,  ..., 6952, 5085, 4836],
        ...,
        [3415, 8157, 8033,  ..., 4990, 5196,  293],
        [6860,  975, 3677,  ...,  614, 8033,   32],
        [7042,  975, 3415,  ...,  293, 8358,    1]])
32
tensor([[3504, 2952, 7622,  ..., 5074, 7622, 2352],
        [7622,  614, 7963,  ..., 4598,    0, 8401],
        [7795, 7678, 7516,  ..., 5085, 4836, 7522],
        ...,
        [8157, 8033, 4376,  ..., 5196,  293, 7622],
        [ 975, 3677, 1771,  ..., 8033,   32, 5122],
        [ 975, 3415, 7377,  ..., 8358,    1,    0]])
32


# Trainer class

In [None]:
class Trainer:
    """Placeholder for the training loop; nothing is implemented yet."""

    def __init__(self):
        """Set up no state."""
        return None

    def train(self, model, train_dataloader, valid_dataloader):
        """Stub: accepts a model and two dataloaders, does nothing, returns None."""
        return None


# RNN and RNN language model from Dive into Deep Learning (d2l) to establish baselines