In [1]:
import os
import shutil
from typing import Any
import requests
import re
import pandas as pd
from torch.utils.data import Dataset
from nltk.tokenize import word_tokenize
import nltk
from collections import Counter
nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nguye\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [13]:
class ProjectGutenbergDataset(Dataset):
    '''
    Text corpus built from Project Gutenberg books.

    Construction downloads the books listed in a links file (skipped when the
    dataset directory already exists), merges them into one string, strips
    everything except ASCII letters and whitespace, lower-cases the text,
    tokenizes it with NLTK's ``word_tokenize`` and maps every token to its
    vocabulary index. The resulting index list is stored in ``self.corpus``.
    '''

    # Seconds to wait for the server before a download attempt is abandoned;
    # without a timeout a stalled connection would hang the cell forever.
    REQUEST_TIMEOUT_S = 30

    def _download(self, links_path: str = 'links.txt', output_dir: str = 'dataset') -> None:
        '''
        Download every book listed in ``links_path`` into ``output_dir``.

        The download is skipped entirely when ``output_dir`` already exists.
        Books are written as ``book<N>.txt`` only after all of them have been
        fetched successfully, so a failed fetch leaves no directory behind.
        Errors are reported on stdout rather than raised (best effort).
        '''
        if os.path.exists(output_dir):
            print("PG dataset loaded")
            return

        print('Downloading books ... ')
        books = []
        try:
            # One URL per line; blank lines are ignored instead of being
            # passed to requests as an empty URL.
            with open(links_path, 'r', encoding='utf-8') as links_file:
                links = [line.strip() for line in links_file if line.strip()]

            for link in links:
                res = requests.get(link, timeout=self.REQUEST_TIMEOUT_S)
                if res.status_code != 200:
                    raise Exception(f"Failed to Fetch, Error code {res.status_code}")
                books.append(res.text)
                print(f"SUCCESS {link}")

            os.makedirs(output_dir, exist_ok=True)

            for book_id, book in enumerate(books):
                output_path = os.path.join(output_dir, f'book{book_id}.txt')
                # Explicit UTF-8: the platform default (e.g. cp1252 on Windows)
                # cannot encode every character found in the books.
                with open(output_path, 'w', encoding='utf-8') as book_file:
                    book_file.write(book)

        except Exception as e:
            print('Error while downloading books, error = ', e)

    def _preprocess(self, text):
        '''Drop every character that is not an ASCII letter or whitespace, then lower-case.'''
        # Raw string: '\s' inside a normal string literal is an invalid escape sequence.
        return re.sub(r'[^a-zA-Z\s]', '', text).lower()

    def _tokenize(self, tokenizer, text: str, save_to_file = False) -> list[str]:
        '''
        Preprocess ``text`` and split it into tokens with ``tokenizer``.

        When ``save_to_file`` is true the tokens are also written to
        ``tokens.txt``, each followed by a single space.
        '''
        tokens = tokenizer(self._preprocess(text))
        if save_to_file:
            with open('tokens.txt', 'w', encoding='utf-8') as tokens_file:
                tokens_file.write(''.join(f'{t} ' for t in tokens))

        return tokens

    def _build(self, dataset_path):
        '''
        Concatenate the contents of every regular file in ``dataset_path``.

        Files are read in sorted name order so the merged text (and therefore
        the corpus) is the same on every run and platform; sub-directories are
        skipped.
        '''
        parts = []
        for name in sorted(os.listdir(dataset_path)):
            filepath = os.path.join(dataset_path, name)
            if not os.path.isfile(filepath):
                continue
            # errors='replace' keeps files written under a different encoding
            # readable; replaced characters are non-letters and are removed by
            # _preprocess anyway.
            with open(filepath, 'r', encoding='utf-8', errors='replace') as book_file:
                parts.append(book_file.read())
        return ''.join(parts)

    def __init__(self, dataset_path = './dataset/', num_steps = 100, batch_sie = 32):
        '''
        Download (if needed), tokenize and index the corpus.

        dataset_path: directory the books are downloaded to and read from.
        num_steps, batch_sie: stored on the instance; not consumed by any
            method of this class yet.
        '''
        self.num_steps = num_steps
        self.batch_size = batch_sie

        # Download into the same directory that is read below, so a custom
        # dataset_path does not end up reading an empty or missing directory.
        self._download(output_dir=dataset_path)
        alltext = self._build(dataset_path)

        #init tokenizer
        tokenizer = word_tokenize

        tokens = self._tokenize(tokenizer, alltext, save_to_file=True)

        self.vocab = Vocab(tokens, min_freq = 2)
        self.vocab.save_to_file()

        #build corpus, list of indices, [1, 2,100,44,33,...]
        self.corpus = [self.vocab[token] for token in tokens]

    def __len__(self) -> int:
        # Not implemented yet: fail loudly instead of returning None.
        raise NotImplementedError('ProjectGutenbergDataset.__len__ is not implemented yet')

    def __getitem__(self, index: int) -> Any:
        # Not implemented yet: fail loudly instead of handing None to a DataLoader.
        raise NotImplementedError('ProjectGutenbergDataset.__getitem__ is not implemented yet')

class Vocab:
    '''
    Bidirectional mapping between tokens and integer indices.

    The vocabulary always contains ``'<unk>'``, every reserved token, and
    every token whose count is strictly greater than ``min_freq``. Indices
    follow the sorted order of the tokens.
    '''

    def __init__(self, tokens = None, min_freq = 0, reserved_tokens = None):
        # None instead of [] as defaults: mutable defaults are shared between calls.
        self._build(
            [] if tokens is None else tokens,
            min_freq,
            [] if reserved_tokens is None else list(reserved_tokens),
        )

    def _build(self, tokens, min_freq, reserved_tokens):
        '''Count ``tokens`` and populate ``token_freq``, ``idx_to_tokens`` and ``token_to_idx``.'''
        print(f'building vocab from {len(tokens)} tokens')
        counter = Counter(tokens)
        # (token, count) pairs, most frequent first.
        self.token_freq = sorted(counter.items(), key = lambda x: x[1], reverse = True)

        # The threshold is exclusive: a token occurring exactly min_freq times is dropped.
        kept = {token for token, freq in self.token_freq if freq > min_freq}
        self.idx_to_tokens = sorted({'<unk>', *reserved_tokens} | kept)

        self.token_to_idx = {token: idx for idx, token in enumerate(self.idx_to_tokens)}

        print('built vocab object')

    def __len__(self):
        return len(self.idx_to_tokens)

    def __getitem__(self, tokens):
        '''Return the index of a token, or a list of indices for a list/tuple of tokens.

        Unknown tokens map to the index of ``'<unk>'``.
        '''
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(t) for t in tokens]

    def to_tokens(self, idx):
        '''Return the token for an index, or a list of tokens for a list/tuple of indices.

        Indices outside ``[0, len(self))`` map to ``'<unk>'``.
        '''
        if not isinstance(idx, (list, tuple)):
            # idx_to_tokens is a list, so it has no .get(); bounds-check explicitly.
            # Negative indices are rejected too, so they do not wrap around.
            if 0 <= idx < len(self.idx_to_tokens):
                return self.idx_to_tokens[idx]
            return '<unk>'
        return [self.to_tokens(i) for i in idx]

    def save_to_file(self, path = 'vocab.csv'):
        '''Write the vocabulary to ``path`` as ``token,index`` lines under a ``tokens,idx`` header.'''
        print(f'saving vocab object to {path}')
        with open(path, 'w', encoding='utf-8') as file:
            file.write('tokens,idx\n')
            for token, index in self.token_to_idx.items():
                file.write(f'{token},{index}\n')

    # TODO: add load_from_file(path) to restore a vocabulary written by save_to_file.

    @property
    def unk(self):
        '''Index of the ``'<unk>'`` token.'''
        return self.token_to_idx['<unk>']

# Build the dataset with default arguments. Construction prints progress and,
# as side effects, writes tokens.txt and vocab.csv to the working directory.
dataset = ProjectGutenbergDataset()

PG dataset loaded
building vocab from 772176 tokens
built vocab object
saving vocab object to vocab.csv
[11772, 9081, 5269, 3670, 8036, 7519, 3145, 8122, 11772, 12952]


In [26]:
# Fetch a single book as a quick check of the download path and save it to sample.txt.
link = 'https://www.gutenberg.org/ebooks/2701.txt.utf-8'
# A timeout keeps a stalled connection from hanging the cell indefinitely.
res = requests.get(link, timeout=30)
# Fail here on an HTTP error instead of saving an error page as the sample.
res.raise_for_status()
print(link)
print(res.text[:100])
# Explicit UTF-8: the platform default encoding (e.g. cp1252 on Windows) may
# not be able to represent every character in the book.
with open('sample.txt', 'w', encoding='utf-8') as file:
    file.write(res.text)

https://www.gutenberg.org/ebooks/2701.txt.utf-8
﻿The Project Gutenberg eBook of Moby Dick; Or, The Whale
    
This ebook is for the use of anyone 
