In [1]:
import tensorflow as tf

# Read the whole names file into a single string.
# The encoding is stated explicitly so decoding does not depend on the
# platform's default locale encoding (assumes the file is UTF-8/ASCII text).
with open('superheroes.txt', encoding='utf-8') as file:
    data = file.read()

In [2]:
# Tokenizer that maps tokens to integer indices (see `word_index` below).
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    # Punctuation to strip. '\t' is deliberately not listed here, so it stays
    # in the vocabulary (decoded names end with '\t').
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~',
    # Separator used when splitting a text into tokens.
    split='\n',
)

# `data` is one long string, not a list of texts. The per-character indices
# produced by later cells indicate the string is consumed one character at a
# time, which yields a character-level vocabulary.
# NOTE(review): confirm this implicit behaviour is intended.
tokenizer.fit_on_texts(data)

In [3]:
# Vocabulary learned by the tokenizer, plus the inverse lookup used for decoding.
char_to_index = tokenizer.word_index
index_to_char = {index: char for char, index in char_to_index.items()}

# One entry per line of the input file.
names = data.splitlines()

# Sanity check: encode the first name (shown as the cell output).
tokenizer.texts_to_sequences(names[0])

[[25], [16], [12], [20], [2], [1]]

In [4]:
#function to convert name to sequence
def name_to_seq(name):
  """Encode a name as a flat list of integer character indices.

  Args:
    name: string to encode.

  Returns:
    A list with one index per character the tokenizer knows. Characters the
    tokenizer has no index for (e.g. filtered punctuation) are skipped
    instead of raising IndexError.
  """
  # Passing the string itself makes the tokenizer encode it character by
  # character in a single call: one inner list per character, empty when the
  # character is not in the vocabulary. Flattening drops the empty ones.
  encoded_chars = tokenizer.texts_to_sequences(name)
  return [index for encoded in encoded_chars for index in encoded]

name_to_seq(names[0])

[25, 16, 12, 20, 2, 1]

In [5]:
#function to convert sequence to name
def seq_to_name(seq):
  """Decode a sequence of character indices back into a string.

  Index 0 (the padding value) is ignored; every other index is looked up in
  `index_to_char`, so an index missing from that mapping raises KeyError.
  """
  chars = []
  for index in seq:
    if index == 0:
      continue
    chars.append(index_to_char[index])
  return ''.join(chars)

seq_to_name([25, 16, 12, 20, 2, 1])

'jumpa\t'

In [6]:
#build every prefix (length 2 and up) of each encoded name; the last element of
#a prefix is the character to predict from the ones before it
sequences = []

for name in names:
  seq = name_to_seq(name)
  # range(2, ...) is empty for names with fewer than two encoded characters,
  # so those names contribute nothing.
  sequences.extend(seq[:end] for end in range(2, len(seq) + 1))

In [7]:
#length of the longest training sequence; used as the padded width
max_len = max(map(len, sequences))

In [8]:
#left-pad every sequence with 0's up to max_len, producing a 2D integer array
padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(
    sequences,
    maxlen = max_len,
    padding = 'pre',
)

#the final column is the character to predict; the columns before it are the input
x = padded_sequences[:, :-1]
y = padded_sequences[:, -1]

In [9]:
from sklearn.model_selection import train_test_split

# Hold out part of the data for validation. A fixed random_state makes the
# split, and therefore the reported validation metrics, reproducible.
# NOTE: the rows are prefixes of the same names, so prefixes of one name can
# end up in both the training and the test set.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 42)

In [10]:
# Vocabulary size plus one extra slot for index 0, which is the padding value
# rather than a character.
num_chars = len(char_to_index) + 1

In [11]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPool1D, LSTM, Bidirectional, Dense

# Next-character classifier, assembled layer by layer.
model = Sequential()
# Map each character index to an 8-dimensional vector.
model.add(Embedding(num_chars, 8, input_length = max_len - 1))
# 64 filters of width 5; causal padding keeps the sequence length unchanged.
model.add(Conv1D(64, 5, strides = 1, activation = 'tanh', padding = 'causal'))
# Halve the sequence length.
model.add(MaxPool1D(2))
# Summarise the sequence into a single 32-dimensional vector.
model.add(LSTM(32))
# One probability per possible character index.
model.add(Dense(num_chars, activation = 'softmax'))

# Targets are integer indices, hence the sparse variant of cross-entropy.
model.compile(
    optimizer = 'adam',
    loss = 'sparse_categorical_crossentropy',
    metrics = ['accuracy'],
)

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 32, 8)             232       
                                                                 
 conv1d (Conv1D)             (None, 32, 64)            2624      
                                                                 
 max_pooling1d (MaxPooling1D  (None, 16, 64)           0         
 )                                                               
                                                                 
 lstm (LSTM)                 (None, 32)                12416     
                                                                 
 dense (Dense)               (None, 29)                957       
                                                                 
Total params: 16,229
Trainable params: 16,229
Non-trainable params: 0
____________________________________________________

In [12]:
# Train for at most 50 epochs, stopping once validation accuracy has failed to
# improve for 3 consecutive epochs. With restore_best_weights the model is
# rolled back to the best epoch when early stopping triggers, instead of
# keeping the weights from the later, non-improving epochs.
h = model.fit(
    x_train, y_train,
    validation_data = (x_test, y_test),
    epochs = 50, verbose = 2,
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor = 'val_accuracy',
            patience = 3,
            restore_best_weights = True,
        )
    ]
)

Epoch 1/50
2070/2070 - 13s - loss: 2.7427 - accuracy: 0.1892 - val_loss: 2.5788 - val_accuracy: 0.2308 - 13s/epoch - 6ms/step
Epoch 2/50
2070/2070 - 11s - loss: 2.5358 - accuracy: 0.2384 - val_loss: 2.5085 - val_accuracy: 0.2457 - 11s/epoch - 5ms/step
Epoch 3/50
2070/2070 - 12s - loss: 2.4737 - accuracy: 0.2526 - val_loss: 2.4597 - val_accuracy: 0.2527 - 12s/epoch - 6ms/step
Epoch 4/50
2070/2070 - 11s - loss: 2.4278 - accuracy: 0.2631 - val_loss: 2.4227 - val_accuracy: 0.2657 - 11s/epoch - 6ms/step
Epoch 5/50
2070/2070 - 12s - loss: 2.3919 - accuracy: 0.2745 - val_loss: 2.3958 - val_accuracy: 0.2720 - 12s/epoch - 6ms/step
Epoch 6/50
2070/2070 - 12s - loss: 2.3630 - accuracy: 0.2824 - val_loss: 2.3723 - val_accuracy: 0.2821 - 12s/epoch - 6ms/step
Epoch 7/50
2070/2070 - 12s - loss: 2.3378 - accuracy: 0.2899 - val_loss: 2.3558 - val_accuracy: 0.2884 - 12s/epoch - 6ms/step
Epoch 8/50
2070/2070 - 12s - loss: 2.3145 - accuracy: 0.2979 - val_loss: 2.3409 - val_accuracy: 0.2927 - 12s/epoch - 6

In [13]:
#function that generates a superhero name given a seed
def generate_names(seed, max_chars = 40):
  """Greedily extend `seed` one character at a time and print the result.

  At every step the current text is encoded, padded to the model's input
  width, and the most probable next character is appended. Generation stops
  at the end-of-name character '\t', after `max_chars` characters, or when
  the model predicts the padding index.

  Args:
    seed: starting text of the name.
    max_chars: maximum number of characters to append (default 40).

  Returns:
    None. The generated name is printed.
  """
  for _ in range(max_chars):
    seq = name_to_seq(seed)
    # Keep only the most recent max_len - 1 characters, left-padded with 0's,
    # to match the input width the model was built with.
    padded = tf.keras.preprocessing.sequence.pad_sequences([seq], padding = 'pre', maxlen = max_len - 1, truncating = 'pre')

    #predict the next character; verbose = 0 avoids progress output on every step
    pred = model.predict(padded, verbose = 0)[0]
    pred_index = int(tf.argmax(pred).numpy())

    # Index 0 is the padding value and has no entry in index_to_char; stop
    # cleanly instead of failing with a KeyError.
    pred_char = index_to_char.get(pred_index)
    if pred_char is None:
      break

    #append the predicted character to seed
    seed += pred_char

    #stop once the predicted character is end of name i.e. '\t'
    if pred_char == '\t':
      break

  #print the value of seed as the predicted name
  print(seed)

In [17]:
# Generate a name starting from the seed 'a'; the function prints the result.
generate_names('a')

alleming saracher	


In [18]:
# Generate a name starting from the seed 'b'; the function prints the result.
generate_names('b')

black broeder	
