In [1]:
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras import layers
from keras.optimizers import Adam
import sys

Using TensorFlow backend.


In [2]:
# Hyperparameters shared by the preprocessing, model and generation cells.
MAXLEN = 2            # number of characters in each input window
STEP = 1              # stride between the starts of consecutive windows
LEARNING_RATE = 1e-3  # learning rate handed to the Adam optimizer
NUM_EPOCHS = 10       # number of passes over the training data
BATCH_SIZE = 5096     # samples per gradient update

In [3]:
def load_data(filename):
    """Read the ``Name`` column of a CSV file.

    Args:
        filename: Path (or any other source accepted by ``pandas.read_csv``)
            of a CSV file that contains a ``Name`` column.

    Returns:
        A NumPy array of shape ``(n_names, 1)`` holding the non-missing
        entries of the ``Name`` column, in file order.

    Raises:
        KeyError: If the file has no ``Name`` column.
    """
    df = pd.read_csv(filename)

    # DataFrame.filter silently ignores unknown labels and would return a
    # frame with zero columns, so check for the column explicitly instead.
    if 'Name' not in df.columns:
        raise KeyError("column 'Name' not found in {!r}".format(filename))

    # Empty cells are read as NaN; drop them so they are not stringified
    # into the literal name "nan" further down the pipeline.
    names = df[['Name']].dropna()
    return np.array(names)

In [4]:
def preprocess_data(df, maxlen=None, step=None):
    """Turn raw names into one-hot encoded (window, next character) pairs.

    Every name is cut into windows of ``maxlen`` consecutive characters whose
    starts are ``step`` characters apart. Each window is one input sample and
    the character that follows it is the target. Names with at most
    ``maxlen`` characters yield no samples but still contribute to the
    vocabulary.

    Args:
        df: Iterable of names. Each element is squeezed with ``np.squeeze``
            and converted with ``str`` (e.g. an ``(n, 1)`` array of strings).
        maxlen: Window length; defaults to the notebook-level ``MAXLEN``.
        step: Stride between window starts; defaults to the notebook-level
            ``STEP``.

    Returns:
        Tuple ``(X, y, chars, char_indices, n_chars)`` where
        ``X`` is a boolean array of shape ``(n_samples, maxlen, n_chars)``,
        ``y`` is a boolean array of shape ``(n_samples, n_chars)``,
        ``chars`` is the sorted list of distinct characters,
        ``char_indices`` maps each character to its position in ``chars`` and
        ``n_chars`` equals ``len(chars)``.

    Raises:
        ValueError: If ``maxlen`` or ``step`` is smaller than 1.
    """
    if maxlen is None:
        maxlen = MAXLEN
    if step is None:
        step = STEP
    if maxlen < 1 or step < 1:
        raise ValueError("maxlen and step must be positive integers")

    inputs = []
    targets = []

    # collect the vocabulary in a set rather than concatenating every name
    # into one ever-growing string
    seen_chars = set()

    for item in df:
        item = str(np.squeeze(item))
        seen_chars.update(item)

        # create tokens from items
        for i in range(0, len(item) - maxlen, step):
            inputs.append(item[i : i + maxlen])
            targets.append(item[i + maxlen])

    # get list of unique characters to generate from
    chars = sorted(seen_chars)
    print("Unique chars: " + str(len(chars)))
    char_indices = {char: index for index, char in enumerate(chars)}

    # the builtin bool replaces the np.bool alias, which no longer exists in
    # current NumPy releases
    X = np.zeros((len(inputs), maxlen, len(chars)), dtype=bool)
    y = np.zeros((len(inputs), len(chars)), dtype=bool)

    # one-hot encode selections
    for i, indiv_input in enumerate(inputs):
        for t, indiv_char in enumerate(indiv_input):
            X[i, t, char_indices[indiv_char]] = True
        y[i, char_indices[targets[i]]] = True

    return X, y, chars, char_indices, len(chars)

In [5]:
def build_model(charLength):
    """Assemble and compile the next-character prediction network.

    The network is a 128-unit LSTM that reads one-hot windows of shape
    ``(MAXLEN, charLength)``, followed by a softmax layer with one output per
    character. It is compiled with categorical cross-entropy and an Adam
    optimizer using ``LEARNING_RATE``.

    Args:
        charLength: Number of distinct characters, i.e. the width of each
            one-hot vector and of the output layer.

    Returns:
        The compiled ``Sequential`` model.
    """
    recurrent = layers.LSTM(128, input_shape=(MAXLEN, charLength))
    # softmax yields a single distribution over the characters: one label per
    # sample (multi-class, not multi-label)
    classifier = layers.Dense(charLength, activation='softmax')

    model = Sequential([recurrent, classifier])

    optimizer = Adam(lr=LEARNING_RATE)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer)
    return model

In [6]:
def sample(preds, temperature=1.0):
    """Draw an index from a probability vector after temperature scaling.

    The probabilities are raised to the power ``1 / temperature`` and
    renormalised, then a single index is drawn from the result.

    Args:
        preds: 1-D array-like of probabilities (e.g. a softmax output).
        temperature: Values below 1 sharpen the distribution, values above 1
            flatten it. A temperature of 0 or less selects the most probable
            index deterministically.

    Returns:
        The sampled index (as returned by ``np.argmax``).
    """
    # float64 keeps the renormalised probabilities precise enough for the
    # sum check performed by np.random.multinomial
    preds = np.asarray(preds).astype('float64')

    # dividing by a zero temperature is undefined; its limit is greedy choice
    if temperature <= 0:
        return np.argmax(preds)

    # log(0) == -inf is intended for zero-probability entries (they stay at
    # probability 0), so silence the divide-by-zero warning
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature

    # shift so the largest logit is 0: otherwise every exp() can underflow to
    # 0 at low temperatures, leaving NaN probabilities after normalisation
    logits = logits - np.max(logits)
    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)

    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [14]:
def generate_with_seed(seed, length, chars, char_indices, char_length, temperature=0.5):
    """Generate a name of ``length`` characters that starts with ``seed``.

    The seed is truncated to its first ``MAXLEN`` characters. The remaining
    characters are produced one at a time: the current window is one-hot
    encoded, passed to the model, and the next character is drawn from the
    predicted distribution with ``sample``.

    Relies on the notebook-level ``model``, ``MAXLEN`` and ``sample``.

    Args:
        seed: Starting text; only its first ``MAXLEN`` characters are used.
        length: Total length of the returned name, seed included.
        chars: Sequence mapping a class index to its character.
        char_indices: Mapping from character to class index; looking up a
            seed character it does not contain fails (``KeyError`` for a dict).
        char_length: Number of distinct characters (one-hot width).
        temperature: Sampling temperature forwarded to ``sample``.

    Returns:
        The generated name. If ``length`` does not exceed the length of the
        truncated seed, the truncated seed is returned unchanged.
    """
    seed = seed[:MAXLEN]
    window = seed
    name = seed
    for _ in range(length - len(seed)):
        # one-hot encode the window; positions past its end stay all-zero
        sampled = np.zeros((1, MAXLEN, char_length))
        for t, char in enumerate(window):
            sampled[0, t, char_indices[char]] = 1.

        preds = model.predict(sampled, verbose=0)[0]
        next_index = sample(preds, temperature)
        next_char = chars[next_index]

        # Slide the window only once it is full. Dropping the first character
        # unconditionally kept a window seeded with fewer than MAXLEN
        # characters permanently short, so the model never saw a full input.
        window += next_char
        if len(window) > MAXLEN:
            window = window[1:]

        name += next_char

    return name

In [8]:
# Read the names from disk and encode them as one-hot training arrays.
data = load_data("names.csv")
# chars / char_indices map between characters and class indices;
# char_length is the number of distinct characters found in the names.
X, y, chars, char_indices, char_length = preprocess_data(data)
# Show the array shapes as a quick sanity check of the encoding.
print("X.shape == " + str(X.shape))
print("y.shape == " + str(y.shape))

Unique chars: 52
X.shape == (7619532, 2, 52)
y.shape == (7619532, 52)


In [9]:
# Build a network sized to the vocabulary and fit it on the encoded names.
model = build_model(charLength=char_length)
model.fit(x=X, y=y, batch_size=BATCH_SIZE, epochs=NUM_EPOCHS)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fe9fe3b8f28>

In [18]:
# Generate a 5-character name that starts with "Ab".
generate_with_seed(seed="Ab", length=5, chars=chars,
                   char_indices=char_indices, char_length=char_length)

'Abbin'

In [16]:
# Generate a 6-character name that starts with "Ja".
generate_with_seed(seed="Ja", length=6, chars=chars,
                   char_indices=char_indices, char_length=char_length)

'Janiah'

In [17]:
# Generate a 3-character name from a seed that is shorter than MAXLEN, so the
# first prediction is made from a one-character window.
generate_with_seed(seed="S", length=3, chars=chars,
                   char_indices=char_indices, char_length=char_length)

'Sal'