In [None]:
import random
import numpy as np
from tensorflow.keras.utils import to_categorical

In [None]:
# Imagine there are 10^15 names in this file, so we can no longer load all the examples into a list!
# examples = [a.strip() for a in open('names-train.txt') if a.strip()]

In [None]:
# We still need the vocabulary, but we can no longer compute it from an in-memory
# list of examples. Instead, stream the file line by line and collect every
# character that occurs in a (whitespace-stripped) name.
vocab = set(['<PAD>'])
# `with` releases the file handle as soon as the scan is finished.
with open('names-train.txt') as names_file:
    for line in names_file:
        # A blank line strips to '' and therefore contributes no characters.
        vocab.update(line.strip())
# The iteration order of a set of strings can differ between interpreter runs
# (string hashing is randomized), so assign indices from a sorted order to get
# the same encoding after every kernel restart. '<PAD>' is pinned to index 0.
vocab_order = ['<PAD>'] + sorted(vocab - {'<PAD>'})
encoder = dict((c,i) for i,c in enumerate(vocab_order))
decoder = dict((i,c) for i,c in enumerate(vocab_order))
# Number of preceding characters used as context for each prediction.
inputlen = 10

In [None]:
def _iter_training_pairs(filenm, total_examples, enc, inlen):
    """Yield ``(window, target)`` pairs forever, cycling over the names in a file.

    For every non-blank, whitespace-stripped line the window starts as ``inlen``
    copies of ``enc['<PAD>']``; one pair is yielded per character (the window
    then slides by one), followed by a final pair whose target is
    ``enc['<PAD>']`` to mark the end of the name.
    """
    pad = enc['<PAD>']
    # None or a non-positive count means "use every name in the file".
    limited = total_examples is not None and total_examples > 0
    while True:
        used = 0
        # Re-open on each pass so `with` always releases the handle, including
        # when the generator is closed part-way through a pass.
        with open(filenm, 'r') as ex_file:
            for line in ex_file:
                name = line.strip()
                if not name:
                    continue
                window = [pad] * inlen
                for ch in name:
                    code = enc[ch]
                    yield window.copy(), code
                    window.pop(0)
                    window.append(code)
                # End-of-name marker: the model learns to predict <PAD> here.
                yield window.copy(), pad
                used += 1
                if limited and used >= total_examples:
                    break
        if used == 0:
            # Without this guard an empty file would make the loop spin forever.
            raise ValueError('no non-blank examples found in %r' % (filenm,))


def example_generator_file(batchsize, filenm, total_examples, enc, inlen):
    """Endlessly yield training batches read lazily from a text file of names.

    The file is streamed line by line and never loaded into memory as a whole.
    Names may span batch boundaries; every batch holds exactly ``batchsize``
    (context window, next character) pairs.

    Args:
        batchsize: number of pairs per yielded batch (must be >= 1).
        filenm: path of a text file with one name per line; blank lines are
            skipped and surrounding whitespace is stripped.
        total_examples: number of non-blank names to use per pass before
            starting over at the top of the file. ``None`` or a value <= 0
            uses the whole file.
        enc: mapping from character to integer index; must contain ``'<PAD>'``.
        inlen: length of the context window.

    Yields:
        ``(X, y)`` where ``X`` is an integer array of shape ``(batchsize, inlen)``
        and ``y`` is the one-hot encoding (``len(enc)`` classes) of the targets.

    Raises:
        ValueError: if ``batchsize`` is smaller than 1 or the file contains no
            usable names (raised when the first batch is requested).
        KeyError: if a name contains a character that is missing from ``enc``.
    """
    if batchsize < 1:
        raise ValueError('batchsize must be at least 1')
    pairs = _iter_training_pairs(filenm, total_examples, enc, inlen)
    try:
        X, y = [], []
        for window, target in pairs:
            X.append(window)
            y.append(target)
            if len(y) >= batchsize:
                yield (np.array(X), to_categorical(y, num_classes=len(enc)))
                X, y = [], []
    finally:
        # Closing the inner generator closes the file it holds open.
        pairs.close()

In [None]:
# Below we train on the full dataset. To hold out development data, ideally put
# it in a second file and call example_generator_file on that file.
#from sklearn.model_selection import train_test_split
#X_train, X_dev, y_train, y_dev = train_test_split(X, y, test_size=0.1, random_state = 42)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten
# Architecture: embed each of the `inputlen` context characters, flatten the
# embeddings into one vector, pass it through a 40-unit dense layer and predict
# the next character with a softmax over the vocabulary.
emb_dim = 10
n_symbols = len(vocab)
model = Sequential([
    Embedding(input_dim=n_symbols, output_dim=emb_dim, input_length=inputlen),
    Flatten(),
    Dense(40),  # no activation is passed here (Keras default: linear)
    Dense(n_symbols, activation='softmax'),
])
# One-hot targets from the generator pair with categorical cross-entropy.
model.compile(optimizer='adam', loss='categorical_crossentropy')
model.summary()

In [None]:
batch=1024      # (context window, next character) pairs per gradient step
totalnum=7574   # passed to the generator as total_examples (names per pass)
# Model.fit accepts Python generators directly; Model.fit_generator is
# deprecated in tf.keras and absent from recent releases.
# NOTE: each step consumes `batch` character-level pairs, not `batch` names, so
# totalnum//batch steps cover fewer than `totalnum` names per "epoch". The
# generator never ends, so later epochs simply continue where the last stopped.
model.fit(example_generator_file(batch, 'names-train.txt', totalnum, encoder, inputlen), steps_per_epoch=totalnum//batch, epochs=200, verbose=1)

In [None]:
def sample(preds, temperature=1.0):
    """Draw one class index from a probability vector after temperature scaling.

    Args:
        preds: 1-D sequence of class probabilities (e.g. a softmax output).
        temperature: values below 1 sharpen the distribution, values above 1
            flatten it. ``0`` selects the most probable class deterministically.

    Returns:
        The sampled class index (as returned by ``np.argmax``).
    """
    preds = np.asarray(preds).astype('float64')
    if temperature == 0:
        # Greedy decoding is the limit of temperature -> 0; handling it here
        # avoids a division by zero below.
        return np.argmax(preds)
    with np.errstate(divide='ignore'):
        # log(0) = -inf is intended: a zero-probability class stays at zero.
        logits = np.log(preds) / temperature
    # Shift so the largest logit is 0. The softmax is unchanged by the shift,
    # but at low temperatures this keeps every exponential from underflowing
    # to 0, which would otherwise produce NaN probabilities.
    logits = logits - np.max(logits)
    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [None]:
def generate(temperature=1.0, max_len=100):
    """Generate one name by sampling from the model one character at a time.

    Starts from a context window of ``inputlen`` padding indices, repeatedly
    asks ``model`` for the next-character distribution, samples from it with
    ``sample`` and slides the window. Generation stops when the padding symbol
    is drawn or when ``max_len`` characters have been produced.

    Args:
        temperature: forwarded to ``sample``.
        max_len: upper bound on the length of the returned string, as a guard
            against a model that never emits the end marker.

    Returns:
        The generated string, at most ``max_len`` characters long.
    """
    answer = ''
    Xout = [encoder['<PAD>']]*inputlen
    while len(answer) < max_len:
        ydist = model.predict(np.array([Xout]), verbose=0)
        nextchar_index = sample(ydist[0],temperature)
        nextchar = decoder[nextchar_index]
        if nextchar == '<PAD>':
            # The padding symbol doubles as the end-of-name marker.
            break
        answer += nextchar
        # Slide the context window: newest character in, oldest out.
        Xout.append(nextchar_index)
        Xout.pop(0)
    return answer
    

In [None]:
generate(0.5)