In [1]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import pandas as pd
import numpy as np
import torch
import os
import re

In [2]:
# One seed shared by every random number generator used in this notebook,
# so it only has to be changed in a single place.
SEED = 256

torch.manual_seed(SEED)       # PyTorch default generator
torch.cuda.manual_seed(SEED)  # PyTorch CUDA generator
np.random.seed(SEED)          # NumPy global (legacy) generator

## 2.1 Word embedding

1. Read txt files and tokenize them to obtain train/validation/test lists of words.

In [3]:
# Shared torchtext tokenizer ("basic_english" preset); used as the default
# `tokenizer` argument of `tokenize` and `yield_tokens`.
TOKENIZER = get_tokenizer("basic_english")


def read_txt_files(datapath):
    """Read every ``.txt`` file located directly in ``datapath``.

    Args:
        datapath: Directory to scan (not recursive). It may be given with or
            without a trailing path separator.

    Returns:
        list[str]: The lines of all matching files, file after file in
        alphabetical order of file name, with line endings kept.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the
    # resulting line (and therefore word) order reproducible across machines.
    txt_names = sorted(name for name in os.listdir(datapath) if name.endswith(".txt"))

    lines = []
    for name in txt_names:
        # os.path.join is correct whether or not `datapath` ends with a separator.
        # NOTE(review): the files are opened with the platform default encoding,
        # as before -- confirm the corpus encoding and pass it explicitly.
        with open(os.path.join(datapath, name)) as f:
            lines.extend(f.readlines())
    return lines


def tokenize(lines, tokenizer=TOKENIZER):
    """Tokenize every line and flatten the result into a single token list.

    Args:
        lines: Iterable of text lines.
        tokenizer: Callable mapping one line to an iterable of tokens.

    Returns:
        list: All tokens of all lines, in reading order.
    """
    return [token for line in lines for token in tokenizer(line)]


def yield_tokens(lines, tokenizer=TOKENIZER):
    """Yield one list of tokens per input line, after cleaning the line.

    Cleaning steps, applied in this order before tokenizing:
      1. every word containing a digit is replaced by a space;
      2. every word containing an uppercase ASCII letter is replaced by a
         space (meant to drop names; it also drops any other capitalized
         word, such as the pronoun "I" or the first word of a sentence);
      3. runs of whitespace are collapsed into a single space.

    Args:
        lines: Iterable of text lines.
        tokenizer: Callable mapping one cleaned line to a list of tokens.

    Yields:
        The tokenizer output for each cleaned line.
    """
    # Raw strings: in a normal literal "\w" and "\s" are invalid escape
    # sequences, which recent Python versions report as a SyntaxWarning.
    # The patterns are compiled once instead of being looked up for every line.
    no_digits = re.compile(r"\w*[0-9]+\w*")  # words containing numbers
    no_names = re.compile(r"\w*[A-Z]+\w*")  # words with capital letters (names)
    no_spaces = re.compile(r"\s+")  # sequences of whitespace

    for line in lines:
        line = no_digits.sub(" ", line)
        line = no_names.sub(" ", line)
        line = no_spaces.sub(" ", line)
        yield tokenizer(line)

In [4]:
GENERATED_PATH = "./generated/"  # Path where generated data files are stored

# torch.save does not create missing directories, so make sure the cache
# folder exists before anything is written to it.
os.makedirs(GENERATED_PATH, exist_ok=True)

# Use the cache only when all three word lists are present: checking the
# training file alone would crash on a partially written cache.
if all(
    os.path.isfile(GENERATED_PATH + f"words_{split}.pt")
    for split in ("train", "val", "test")
):
    # Load preprocessed training, validation, and test word lists from .pt files
    words_train = torch.load(GENERATED_PATH + "words_train.pt")
    words_val = torch.load(GENERATED_PATH + "words_val.pt")
    words_test = torch.load(GENERATED_PATH + "words_test.pt")
else:
    # If preprocessed data does not exist, read text files
    lines_books_train = read_txt_files("data/data_train/")
    lines_books_val = read_txt_files("data/data_val/")
    lines_books_test = read_txt_files("data/data_test/")

    # Tokenize the lines from train, validation, and test datasets
    words_train = tokenize(lines_books_train)
    words_val = tokenize(lines_books_val)
    words_test = tokenize(lines_books_test)

    # Save the tokenized word lists to .pt files
    torch.save(words_train, GENERATED_PATH + "words_train.pt")
    torch.save(words_val, GENERATED_PATH + "words_val.pt")
    torch.save(words_test, GENERATED_PATH + "words_test.pt")

2. Define a vocabulary based on the training dataset. To keep the vocabulary from becoming too large, one solution is to keep only the words that appear at least 100 times in the training dataset. Report the total number of words in the training dataset, the number of distinct words in the training dataset, and the size of the defined vocabulary. Comment on your results.

In [5]:
MIN_FREQ = 100  # Minimum number of occurrences for a token to enter the vocabulary


def create_vocabulary(lines, min_freq=MIN_FREQ):
    """Build the vocabulary from raw text lines.

    The lines are cleaned and tokenized by `yield_tokens`; tokens seen fewer
    than `min_freq` times are left out. "<unk>" is registered as a special
    token and used as the default index for unknown words.

    Args:
        lines: Iterable of raw text lines.
        min_freq: Minimum token frequency required to be part of the vocabulary.

    Returns:
        The vocabulary object built by `build_vocab_from_iterator`.
    """
    # Building vocabulary from an iterator of tokenized lines, filtering out infrequent tokens
    vocab = build_vocab_from_iterator(yield_tokens(lines), min_freq=min_freq, specials=["<unk>"])
    # `yield_tokens` removes every word containing an uppercase letter, which
    # also removes the pronoun "I"; add its lower-cased form back by hand.
    # `append_token` raises if the token is already present, so only add it
    # when it did not make it into the vocabulary by itself.
    if "i" not in vocab:
        vocab.append_token("i")
    # Setting default index for unknown words
    vocab.set_default_index(vocab["<unk>"])
    return vocab

In [6]:
VOCAB_FILENAME = "vocabulary.pt"

# Check if the vocabulary file already exists in the generated path
if os.path.isfile(GENERATED_PATH + VOCAB_FILENAME):
    # Load the vocabulary from a file if it already exists
    vocab = torch.load(GENERATED_PATH + VOCAB_FILENAME)
else:
    # Read the training lines here instead of using `lines_books_train`: that
    # variable is only defined when the word lists had to be rebuilt, so
    # relying on it raises NameError whenever the word lists are cached but
    # the vocabulary file is missing.
    vocab = create_vocabulary(read_txt_files("data/data_train/"), min_freq=MIN_FREQ)
    # torch.save does not create missing directories.
    os.makedirs(GENERATED_PATH, exist_ok=True)
    # Save the newly created vocabulary to a file
    torch.save(vocab, GENERATED_PATH + VOCAB_FILENAME)

VOCAB_SIZE = len(vocab)

In [7]:
# Corpus statistics requested by the exercise, printed one per line.
dataset_summary = [
    f"Total number of words in the training dataset: {len(words_train):,}",
    f"Total number of words in the validation dataset: {len(words_val):,}",
    f"Total number of words in the test dataset: {len(words_test):,}",
    f"Number of distinct words in the training dataset: {len(set(words_train)):,}",
    f"Size of the defined vocabulary: {VOCAB_SIZE:,}",
]
print("\n".join(dataset_summary))

Total number of words in the training dataset: 2,684,706
Total number of words in the validation dataset: 49,526
Total number of words in the test dataset: 124,152
Number of distinct words in the training dataset: 52,105
Size of the defined vocabulary: 1,880


In [8]:
def count_occurrences(words, vocab):
    """Count how many times each vocabulary entry occurs in `words`.

    Args:
        words: Iterable of tokens.
        vocab: Mapping from token to integer index supporting `vocab[token]`
            and `len(vocab)`.

    Returns:
        torch.Tensor: 1-D `torch.int` tensor of length `len(vocab)`;
        position `k` holds the number of tokens mapped to index `k`.
    """
    # Map every token to its index once, then count in a single vectorized
    # call. Incrementing a tensor element per token from a Python loop gives
    # the same counts but is far slower on a corpus of millions of tokens.
    # Indices are expected to lie in [0, len(vocab)).
    indices = torch.tensor([vocab[w] for w in words], dtype=torch.long)
    # minlength keeps one slot per vocabulary entry even if the highest
    # indices never occur; cast back to the original int32 dtype.
    return torch.bincount(indices, minlength=len(vocab)).to(torch.int)

In [9]:
# One row per vocabulary entry: the token and its number of occurrences
# in the training words.
word_counts_df = pd.DataFrame(
    {
        "Word": vocab.lookup_tokens(range(len(vocab))),
        "Occurrences": count_occurrences(words_train, vocab).numpy(),
    }
)

# Most frequent tokens first, renumbered from 0 ...
sorted_word_counts = (
    word_counts_df
    .sort_values(by="Occurrences", ascending=False)
    .reset_index(drop=True)
)
# ... then shifted so that the index reads as a 1-based rank.
sorted_word_counts.index = sorted_word_counts.index + 1

In [10]:
# Show the frequency table, most frequent token first.
sorted_word_counts

Unnamed: 0,Word,Occurrences
1,<unk>,433907
2,",",182537
3,the,151278
4,.,123727
5,and,82289
...,...,...
1876,pistol,100
1877,slipped,100
1878,station-master,100
1879,wounds,100
