## Importing modules

In [None]:
import os
import time
import math
import string
import random

In [None]:
import collections
import numpy as np
import matplotlib.pyplot as plt

In [None]:
import torch
from torch import nn
from torch import optim
from torch.functional import F
from torch.utils.data import DataLoader

In [None]:
from utils import evaluate
from utils import training

In [None]:
from importlib import reload

## Preprocessing data

In [None]:
def split_to_names(fname):
    """
    Read a file of names (one per line) and split each name into characters.

    The text is lower-cased, surrounding whitespace is stripped from every
    line, and blank lines as well as names containing non-ASCII characters
    are skipped.

    Input:
        fname: Path to data file.

    Output:
        data: List of names, where each name is a list of its characters
              followed by the End of Sentence token "<EOS>".
    """
    EOS = "<EOS>"

    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(fname, encoding="utf-8") as file:
        text = file.read().lower()

    data = []
    for line in text.splitlines():
        name = line.strip()

        # A blank line would otherwise turn into a "name" made of only <EOS>.
        if not name:
            continue

        # Ignore names containing non-ascii characters
        if not name.isascii():
            continue

        # Split the name into chars and append the End of Sentence (EOS) token
        data.append(list(name) + [EOS])
    return data

In [None]:
# Load the dinosaur names file as lists of characters (each ending in "<EOS>")
# and report how many names were kept.
all_names = split_to_names("./data/dinos.txt")
print("Number of names:", len(all_names))

## Analysing data

In [None]:
# Flatten the per-name token lists into one long list of tokens so that
# frequencies can be counted over the whole dataset.
all_names_onelist = []
for _name in all_names:
    all_names_onelist += _name
print(len(all_names_onelist))

In [None]:
# Count how often each token occurs in the flattened token list.
_counter = collections.Counter(all_names_onelist)

In [None]:
# Show the distinct tokens that occur in the data.
print(_counter.keys())

In [None]:
# Tokens ordered from most to least frequent.
_counter.most_common()  # append [::-1] to list the rarest tokens first

In [None]:
# Print the counts of a few selected tokens on a single line.
# NOTE(review): the "" entry can never equal a token in the flattened list
# (tokens are single characters or "<EOS>"), so it always prints 0 --
# possibly a different token was intended here; confirm.
for _ch in ["a", "e", "i", "o", "u", "", "q", "s"]:
    print("{}:{}".format(_ch, _counter[_ch]), end=", ")

In [None]:
# Number of distinct tokens seen in the data.
print(len(_counter))

In [None]:
# Tally and print every token that is not purely alphabetic,
# together with its frequency.
_non_count = 0
for _key in _counter:
    if not _key.isalpha():
        _non_count += 1
        print("{", _key, ":", _counter[_key], "}", end=" ")

In [None]:
# Number of distinct tokens that are not purely alphabetic.
print(_non_count)

In [None]:
# Peek at the first two processed names.
print(all_names[0:2])

In [None]:
# Character vocabulary: the end-of-sequence token first, then the lowercase
# ASCII letters in alphabetical order, then the padding token last.
char_vocab = ["<EOS>", *string.ascii_lowercase, "<PAD>"]

## Saving Processed data

In [None]:
# Bundle the character-split names and the character vocabulary
# into a single dict.
data_dict = {"data_in_char": all_names,
             "char_vocab": char_vocab}

In [None]:
# Make sure the target directory exists first: torch.save does not create
# missing parent directories and fails if "./saves/data" is absent.
os.makedirs("./saves/data", exist_ok=True)
torch.save(data_dict, "./saves/data/clean_names.pt")