In [None]:
# Import libraries
from transformers import AutoTokenizer
from collections import defaultdict
import os

# Function to read corpus.txt from data folder
def load_corpus_from_folder(folder_path):
    corpus = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):  # Ensure only text files are read
            with open(os.path.join(folder_path, filename), "r", encoding="utf-8") as f:
                corpus.append(f.read())  # Read entire file as a single string
    return corpus

# Load text files from data folder into corpus
# This assumes that the folder contains a subfolder "data" with the legal and literary documents in .txt format
corpus_folder = "data/" 
corpus = load_corpus_from_folder(corpus_folder)

# Pre-tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [27]:
# Compute frequencies of each word in corpus as we do pre-tokenization

word_freqs = defaultdict(int)
for text in corpus:
    words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
    new_words = [word for word, offset in words_with_offsets]
    for word in new_words:
        word_freqs[word] += 1

# Display frequencies of the first 10 words
print(dict(list(word_freqs.items())[:10]))  

{'United': 50, 'States': 50, 'Court': 51, 'of': 1672, 'Appeals': 3, 'FOR': 4, 'THE': 10, 'DISTRICT': 1, 'OF': 14, 'COLUMBIA': 1}


In [28]:
# Generate alphabet
alphabet = []
for word in word_freqs.keys():
    if word[0] not in alphabet:
        alphabet.append(word[0])
    for letter in word[1:]:
        if f"##{letter}" not in alphabet:
            alphabet.append(f"##{letter}")

alphabet.sort()
alphabet

['!',
 '"',
 '#',
 '##0',
 '##1',
 '##2',
 '##3',
 '##4',
 '##5',
 '##6',
 '##7',
 '##8',
 '##9',
 '##A',
 '##B',
 '##C',
 '##D',
 '##E',
 '##F',
 '##G',
 '##H',
 '##I',
 '##J',
 '##K',
 '##L',
 '##M',
 '##N',
 '##O',
 '##P',
 '##Q',
 '##R',
 '##S',
 '##T',
 '##U',
 '##V',
 '##W',
 '##X',
 '##Y',
 '##Z',
 '##a',
 '##b',
 '##c',
 '##d',
 '##e',
 '##f',
 '##g',
 '##h',
 '##i',
 '##j',
 '##k',
 '##l',
 '##m',
 '##n',
 '##o',
 '##p',
 '##q',
 '##r',
 '##s',
 '##t',
 '##u',
 '##v',
 '##w',
 '##x',
 '##y',
 '##z',
 '##™',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 ',',
 '-',
 '.',
 '/',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '<',
 '>',
 '?',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 '[',
 ']',
 '_',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'y',
 'z'

In [29]:
# Special tokens used by the model (BERT)
vocab = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"] + alphabet.copy()

# Split each word not prefixed by '##'
splits = {
    word: [c if i == 0 else f"##{c}" for i, c in enumerate(word)]
    for word in word_freqs.keys()
}

# Function to computes the score of each pair
def compute_pair_scores(splits):
    letter_freqs = defaultdict(int)
    pair_freqs = defaultdict(int)
    for word, freq in word_freqs.items():
        split = splits[word]
        if len(split) == 1:
            letter_freqs[split[0]] += freq
            continue
        for i in range(len(split) - 1):
            pair = (split[i], split[i + 1])
            letter_freqs[split[i]] += freq
            pair_freqs[pair] += freq
        letter_freqs[split[-1]] += freq

    scores = {
        pair: freq / (letter_freqs[pair[0]] * letter_freqs[pair[1]])
        for pair, freq in pair_freqs.items()
    }
    return scores

# Compute pair scores on our corpus
pair_scores = compute_pair_scores(splits)

# Display the top 5 pair scores
for i, key in enumerate(pair_scores.keys()):
    print(f"{key}: {pair_scores[key]}")
    if i >= 5:
        break

('U', '##n'): 2.6814356894214924e-05
('##n', '##i'): 1.7108658390789986e-06
('##i', '##t'): 8.559704109335822e-06
('##t', '##e'): 3.877434238834249e-06
('##e', '##d'): 8.24734434073905e-06
('S', '##t'): 5.827623951027689e-06


In [30]:
# Function to merge a pair of tokens
def merge_pair(a, b, splits):
    for word in word_freqs:
        split = splits[word]
        if len(split) == 1:
            continue
        i = 0
        while i < len(split) - 1:
            if split[i] == a and split[i + 1] == b:
                merge = a + b[2:] if b.startswith("##") else a + b
                split = split[:i] + [merge] + split[i + 2 :]
            else:
                i += 1
        splits[word] = split
    return splits

# Merge to a fixed vocabulary size (70)
vocab_size = 70
while len(vocab) < vocab_size:
    scores = compute_pair_scores(splits)
    best_pair, max_score = "", None
    for pair, score in scores.items():
        if max_score is None or max_score < score:
            best_pair = pair
            max_score = score
    splits = merge_pair(*best_pair, splits)
    new_token = (
        best_pair[0] + best_pair[1][2:]
        if best_pair[1].startswith("##")
        else best_pair[0] + best_pair[1]
    )
    vocab.append(new_token)

# Display the final vocabulary
print(vocab)

['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]', '!', '"', '#', '##0', '##1', '##2', '##3', '##4', '##5', '##6', '##7', '##8', '##9', '##A', '##B', '##C', '##D', '##E', '##F', '##G', '##H', '##I', '##J', '##K', '##L', '##M', '##N', '##O', '##P', '##Q', '##R', '##S', '##T', '##U', '##V', '##W', '##X', '##Y', '##Z', '##a', '##b', '##c', '##d', '##e', '##f', '##g', '##h', '##i', '##j', '##k', '##l', '##m', '##n', '##o', '##p', '##q', '##r', '##s', '##t', '##u', '##v', '##w', '##x', '##y', '##z', '##™', '$', '%', '&', "'", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '>', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'y', 'z', '—', '‘', '’', '“', '”', '•', '\ufeff']


In [31]:
# Function to encode a word into tokens from vocabulary
def encode_word(word):
    tokens = []
    while len(word) > 0:
        i = len(word)
        while i > 0 and word[:i] not in vocab:
            i -= 1
        if i == 0:
            return ["[UNK]"]
        tokens.append(word[:i])
        word = word[i:]
        if len(word) > 0:
            word = f"##{word}"
    return tokens

# Function to tokenize text
def tokenize(text):
    pre_tokenize_result = tokenizer._tokenizer.pre_tokenizer.pre_tokenize_str(text)
    pre_tokenized_text = [word for word, offset in pre_tokenize_result]
    encoded_words = [encode_word(word) for word in pre_tokenized_text]
    return sum(encoded_words, [])

In [32]:
# Tokenize legal document
with open("data/legal_document.txt", "r", encoding="utf-8") as f:
    legal_text = f.read()

legal_tokens = tokenize(legal_text)

# Tokenize literary work
with open("data/lit_work.txt", "r", encoding="utf-8") as f:
    lit_work = f.read()

lit_tokens = tokenize(lit_work)



In [33]:
# Compute the number of unique tokens in each document
print(f"Number of unique tokens in the legal document: {len(set(legal_tokens))}")
print(f"Number of unique tokens in the literary document: {len(set(lit_tokens))}")

Number of unique tokens in the legal document: 131
Number of unique tokens in the literary document: 148


In [34]:
# Convert lists to sets for faster computation
legal_tokens = set(legal_tokens)
lit_tokens = set(lit_tokens)

# Compute the number of unique tokens in each document
print(f"Number of unique tokens in the legal document: {len(legal_tokens)}")
print(f"Number of unique tokens in the literary document: {len(lit_tokens)}")

# Compute intersection (common tokens)
common_tokens = set(legal_tokens) & set(lit_tokens)
print(f"Number of common tokens: {len(common_tokens)}")
print(f"Common tokens: {common_tokens}")

# Compute the number of tokens unique to each document
unique_legal_tokens = legal_tokens - lit_tokens
unique_lit_tokens = lit_tokens - legal_tokens
print(f"Number of unique tokens in the legal document: {len(unique_legal_tokens)}")
print(f"Number of unique tokens in the literary document: {len(unique_lit_tokens)}")

Number of unique tokens in the legal document: 131
Number of unique tokens in the literary document: 148
Number of common tokens: 126
Common tokens: {'##L', '##z', 'B', 'm', '7', 's', 'j', 'O', '##g', 'C', 'u', '##S', 'J', '-', 'l', '&', 'T', '.', '##k', '##C', '##a', '##J', 'N', '3', '(', 'z', '8', '##7', '##D', 'e', '##R', 'b', '##8', '##1', '##2', 'E', 'D', 'f', '##y', '##E', '##v', '##B', '##l', '##p', 'P', '##o', '6', ',', '##j', '##x', 'k', '##H', '##3', 'y', '##A', '?', 'i', 'U', '##4', '##I', 'L', '##t', '##u', '0', 'I', 'V', 'w', 'q', 'o', '##U', ')', '##r', '##b', '4', '[', '##e', 'Y', 'M', 'R', 'H', '##w', ';', 'c', 'h', '##0', 'g', 't', 'r', '##c', '##5', '##6', 'p', '##F', '##h', 'W', 'd', '##d', '1', '##i', '2', '9', '##m', 'a', '##O', 'Q', '##V', '##9', '##M', '/', ':', '##P', '##T', 'F', 'G', 'K', 'n', '5', '##s', '##q', '##n', ']', '##f', '*', 'v', 'S', 'A'}
Number of unique tokens in the legal document: 5
Number of unique tokens in the literary document: 22
