In [42]:
import requests
from urllib.request import urlopen

from bs4 import BeautifulSoup

import numpy as np
import pandas as pd

url = 'https://en.wikipedia.org/wiki/List_of_dinosaur_genera'
html = urlopen(url) 
soup = BeautifulSoup(html, 'html.parser')

In [54]:
listitems = soup.find_all('li')[6:-110]

In [96]:
dinonames = []

# returns the lowercased text from the first link in every list item
for match in listitems:
    if match.a is not None:
        dinonames.append(match.a.get_text().lower())

In [97]:
len(dinonames)

1648

In [98]:
import random

for i in range(10):
    print(random.choice(dinonames))

sinornithoides
tianchisaurus
wannanosaurus
australovenator
paleosaurus
megacervixosaurus
australodocus
magyarosaurus
blasisaurus
lohuecotitan


In [108]:
text = '\n'.join(dinonames)

vocab = sorted(set(text))
print ('{} unique characters'.format(len(vocab)))

# Creating a mapping from unique characters to indices
char2idx = {u:i for i, u in enumerate(vocab)}
idx2char = np.array(vocab)

text_as_int = np.array([char2idx[c] for c in text])

27 unique characters


In [113]:
max_char = len(max(dinonames, key = len))
name_dim = len(dinonames)
char_dim = len(vocab)

# turn training set into tensors: 
X = np.zeros((name_dim, max_char, char_dim))
Y = np.zeros((name_dim, max_char, char_dim))

for i in range(name_dim):
    name = list(dinonames[i])
    for j in range(len(name)):
        X[i, j, char2idx[name[j]]] = 1
        if j < len(name)-1:
            Y[i, j, char2idx[name[j+1]]] = 1

In [115]:
from keras.models import Sequential
from keras.layers import LSTM, Dense
from keras.callbacks import LambdaCallback

model = Sequential()
model.add(LSTM(128, input_shape=(max_char, char_dim), return_sequences=True))
model.add(Dense(char_dim, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam')

In [119]:
def generate_name(model):
    name = []
    x = np.zeros((1, max_char, char_dim))
    end = False
    i = 0
    
    while end == False:
        probs = list(model.predict(x)[0,i])
        norm_probs = probs / np.sum(probs)
        index = np.random.choice(range(char_dim), p=norm_probs)
        if i == max_char-2:
            char = '\n'
            end = True
        else:
            char = idx2char[index]
        name.append(char)
        x[0, i+1, index] = 1
        i += 1
        if char == '\n':
            end = True
    
    print(''.join(name))

In [122]:
def generate_name_loop(epoch, _):
    if epoch % 10 == 0:
        
        print('Names generated after epoch %d:' % epoch)

        for i in range(3):
            generate_name(model)
        
        print()
      
name_generator = LambdaCallback(on_epoch_end = generate_name_loop)

model.fit(X, Y, batch_size=128, epochs=100, callbacks=[name_generator], verbose=0)

Names generated after epoch 0:
racodinusaurusussusuu

epedonaxiaurussauruss

ycarmsuususaurussussu


Names generated after epoch 10:
inaceratossusucussuru

ustrotikusursusaurusv

ampasaurussussucuussu


Names generated after epoch 20:
ongnisauruscussusussu

utrovenatoraxiusausau

emcoravipantiaaiausau


Names generated after epoch 30:
thyanousaurusuusaurus

inesaurussurussurussu

inosaurussuusaurussuc


Names generated after epoch 40:
amperiasaurussuusuusu

striosaurussuchusauru

achiosauruscisssurusu


Names generated after epoch 50:
yuangosaurusuchusauru

yeminaptoryxeusaurusu

pinotosaurusucussauru


Names generated after epoch 60:
alcovenaaurussuusauru

eucerotopnisaudiannuc

utelosaurusueusaurusa


Names generated after epoch 70:
aplasaurussusucsuscuu

riharrasedioddotester

iabinitosaurussuusaur


Names generated after epoch 80:
equenosaurusseusasaur

ongdiasousaurusauruss

uteclenadoneptosaurus


Names generated after epoch 90:
ratilimussauruscussuu

inotrrasaurussuusauru

ugeus

<tensorflow.python.keras.callbacks.History at 0x2018d845b48>