In [12]:
import torch
names_file = open("./data/names.txt", "r")
G = torch.Generator().manual_seed(42)
names_all = names_file.read().split("\n")

names_train, names_dev, names_test = torch.utils.data.random_split(names_all, [0.8, 0.1, 0.1], generator=G)

In [13]:
FIRST_CHAR = '<S>'
LAST_CHAR = '<E>'

all_letters = set()
for name in names_train:
    for char in name:
        all_letters.add(char)

letter_to_int32 = {letter: index for index, letter in enumerate(all_letters)}
letter_to_int32[FIRST_CHAR] = len(letter_to_int32)
letter_to_int32[LAST_CHAR] = len(letter_to_int32)

In [14]:
int32_to_letter = {value: key for key,value in letter_to_int32.items()}

In [15]:
letters_count = len(letter_to_int32)
N = torch.zeros((letters_count, letters_count, letters_count))

for name in names_train:
    normalized_name = [FIRST_CHAR, FIRST_CHAR] + list(name) + [LAST_CHAR]
    for ch1, ch2, ch3 in zip(normalized_name, normalized_name[1:], normalized_name[2:]):
        int_ch1 = letter_to_int32[ch1]
        int_ch2 = letter_to_int32[ch2]
        int_ch3 = letter_to_int32[ch3]
        N[int_ch1,int_ch2, int_ch3] += 1

# maximum likelihood mul(P[x][i]) => log sum(log(P[x][i])) => negative -sum(log(P[x][i]))
# regularize N to avoid situation where we have 0 prob that some bigram occurs after another one
N = N + 0.2
# normalize to probability row wise
P = N / N.sum(dim=2, keepdim=True)

def loss(set):
    loss = 0
    n = 0
    for name in set:
        normalized_name = [FIRST_CHAR] + list(name) + [LAST_CHAR]
        for ch1, ch2, ch3 in zip(normalized_name, normalized_name[1:], normalized_name[2:]):
            int_ch1 = letter_to_int32[ch1]
            int_ch2 = letter_to_int32[ch2]
            int_ch3 = letter_to_int32[ch3]

            loss += torch.log(P[int_ch1, int_ch2, int_ch3])
            n += 1

    return -loss/n

print(f'{loss(names_dev)=}')
print(f'{loss(names_train)=}')

print(f'{loss(names_test)=}')

# 0.1
#loss(names_dev)=tensor(2.1169)
#loss(names_train)=tensor(2.0617)

# 5
#loss(names_dev)=tensor(2.2166)
#loss(names_train)=tensor(2.1894)

loss(names_dev)=tensor(2.1169)
loss(names_train)=tensor(2.0666)
loss(names_test)=tensor(2.1176)
