In [50]:
import numpy as np
import random

# Load data

In [51]:
with open("../data/dog_names.txt", "r") as f:
    names = list(set([i for i in f.read().splitlines()]))

In [52]:
names_train = random.sample(names, k=int(0.95*(len(names))))
names_test = list(set(names) - set(names_train))

# Process data

In [53]:
def ngram_list(list_object, n):
    ngrams = []
    for element in list_object:
        if len(element)>=n:
            ngrams.append(("<>" + element[0:n-2], element[n-2]))
            for idx in range(len(element) - n + 1):
                ngrams.append((element[idx:idx+n-1], element[idx+n-1]))
            ngrams.append((element[len(element)-n+1:], "<>"))
    return ngrams

def ngram_list_updated(list_object, n):
    ngrams = []
    for n_of_grams in range(2, n+1):
        for element in list_object:
            element = ["<>"] + list(element) + ["<>"]
            if len(element)>=(n_of_grams-2):
                for idx in range(len(element) - n_of_grams + 1):
                    ngrams.append(("".join(element[idx:idx+n_of_grams-1]), "".join(element[idx+n_of_grams-1])))
    return ngrams

def ngram_count(list_ngrams):
    ngrams_counts = {}
    for ngram in list_ngrams:
        if ngram in ngrams_counts:
            ngrams_counts[ngram] += 1
        else:
            ngrams_counts[ngram] = 1
    return ngrams_counts

def calculate_conditional_probabilities(ngram_counts):
    firsts = sorted(list(set([i[0] for i in ngram_counts.keys()])))
    nexts = sorted(list(set([i[1] for i in ngram_counts.keys()])))
    probabilities = np.zeros((len(firsts), len(nexts)))
    for idx_f, f in enumerate(firsts):
        for idx_n, n in enumerate(nexts):
            probabilities[(idx_f, idx_n)] = counts.get((f, n), 0)
    probabilities = probabilities / probabilities.sum(axis=1, keepdims=True)
    return firsts, nexts, probabilities

def generate_word(firsts, nexts, probabilites, n):
    first_char = str(np.random.choice(nexts, size=1, replace=True, p=probabilities[firsts.index("<>")])[0])
    word = first_char
    while True:
        prev_ngram = word[-(n-1):] if len(word)>=(n-1) else '<>'+word
        next_char = str(np.random.choice(nexts, size=1, replace=True, p=probabilities[firsts.index(prev_ngram)])[0])
        if next_char == "<>":
            break
        word += next_char
    return word

def generate_words(n_words, firsts, nexts, probabilities, n):
    words = []
    for i in range(n_words):
        words.append(generate_word(firsts, nexts, probabilities, n))
    return words

def calculate_perplexity(word, probabilities, n):
    word = ["<>"] + list(word) + ["<>"]
    predictor_grams = []
    for idx_char, char in enumerate(word[:-1]):
        predictor_grams.append("".join(word[max(0, idx_char - (n-1) + 1):idx_char+1]))
    perplexity = 1
    for predictor, test in zip(predictor_grams, word[1:]):
        try:
            probability = float(probabilities[firsts.index(predictor)][nexts.index(test)])
            probability = probability if probability>0 else 0.001
        except:
            probability = 1
        perplexity *= (probability)**(-1/len(predictor_grams))
    return perplexity

def calculate_test_set_perplexities(test_list, probabilities, n):
    perplexities = []
    for element in test_list:
        perplexities.append(calculate_perplexity(element, probabilities, n))
    return perplexities

# Test set perplexity

In [108]:
# Test set perplexity
n = 6
ngrams = ngram_list_updated(names_train, n)
counts= ngram_count(ngrams)
firsts, nexts, probabilities = calculate_conditional_probabilities(counts)
perplexities = calculate_test_set_perplexities(names_test, probabilities, n)
print(f"Total perplexity: {sum(perplexities)}")
print(f"Mean perplexity: {sum(perplexities)/len(perplexities)}")

Total perplexity: 6098.503465321157
Mean perplexity: 12.838954663834015


# Word generation

In [109]:
words = generate_words(1000, firsts, nexts, probabilities, n)
print(f"% of words generated that are also in train set: {len(set(words).intersection(set(names_train)))/len(set(words))}")
print("")

% of words generated that are also in train set: 0.9307036247334755



In [110]:
new_names = ", ".join(list(set(words) - set(names_train))[0:10])
print(f"Examples of new words: {new_names}")

Examples of new words: charella, clementin, lionhearty, chambar, harastro, yolandi, agnolia, raubautzi, ottavian, carnell


In [124]:
[i for i in names if "agnolia" in i]

['magnolia']