In [5]:
import requests
from urllib.request import urlopen

from bs4 import BeautifulSoup

import numpy as np
import pandas as pd
import random

url = 'https://en.wikipedia.org/wiki/List_of_dinosaur_genera'

# Fetch and parse inside a `with` block so the HTTP response (and the socket
# behind it) is closed as soon as the page has been parsed.
with urlopen(url) as html:
    soup = BeautifulSoup(html, 'html.parser')

# Keep only a window of the page's <li> elements.
# NOTE(review): the 6 / -110 offsets presumably trim list items that are not
# genus entries; they are tied to the page layout at the time of writing and
# should be re-checked if the scraped names look wrong.
listitems = soup.find_all('li')[6:-110]

# Lowercased text of the first link in every list item that has one.
# Each name is newline-terminated; the newline doubles as the end-of-name
# character in the corpus.
dinonames = [
    item.a.get_text().lower() + '\n'
    for item in listitems
    if item.a is not None
]

# the whole corpus as a single string
text = ''.join(dinonames)

# Character vocabulary: every distinct character of the corpus, sorted.
vocab = list(set(text))
vocab.sort()
print('{} unique characters'.format(len(vocab)))

# Lookup tables in both directions: character -> integer id, and
# integer id -> character (as an array so it can be indexed by id).
char2idx = dict(zip(vocab, range(len(vocab))))
idx2char = np.array(vocab)

# The full corpus re-expressed as an array of integer ids.
text_as_int = np.array(list(map(char2idx.__getitem__, text)))

# Tensor dimensions: number of names, longest name (newline included),
# and vocabulary size.
name_dim = len(dinonames)
char_dim = len(vocab)
max_char = max(map(len, dinonames))

# One-hot encode the training set.
# X[n, t] is the character at position t of name n;
# Y[n, t] is the character that follows it (position t + 1).
# Positions past the end of a name, and the target of a name's last
# character, are left as all-zero rows.
X = np.zeros((name_dim, max_char, char_dim))
Y = np.zeros((name_dim, max_char, char_dim))
for row, dino in enumerate(dinonames):
    codes = [char2idx[ch] for ch in dino]
    for step, code in enumerate(codes):
        X[row, step, code] = 1
    # targets are the same sequence shifted left by one step
    for step, code in enumerate(codes[1:]):
        Y[row, step, code] = 1

27 unique characters


In [None]:
import tensorflow as tf
from keras.models import Sequential
from keras.layers import LSTM, Dense
from keras.optimizers import RMSprop
from keras.callbacks import LambdaCallback

# Recurrent neural network with parameters often used for text generation.
# A lot of inspiration comes from https://www.tensorflow.org/tutorials/text/text_generation
# although a number of parameters are changed, since this program generates
# single words whereas most text-generation networks target novel-length texts.

# Two layers: an LSTM (128 units) that emits an output at every timestep,
# followed by a dense softmax over the character vocabulary.
model = Sequential([
    LSTM(128, input_shape=(max_char, char_dim), return_sequences=True),
    Dense(char_dim, activation='softmax'),
])

# Loss and optimiser that seem to work well for other textgen applications.
opt = RMSprop(learning_rate=0.01)
model.compile(optimizer=opt, loss='categorical_crossentropy')

In [None]:
# Name generation function, used in its simplest form by the training
# callback below. It can generate a name with or without a starting
# character, and `temp` controls how predictable the sampled text is.
def generate_name(model, start_char=None, temp=1.0):
    """Sample one name from the model, one character at a time.

    Parameters
    ----------
    model : object with ``predict(x, verbose=0)``
        Must return an array indexable as ``[0, t]`` giving a probability
        vector of length ``char_dim`` for timestep ``t``.
    start_char : str or None
        Optional first character of the name (case-insensitive). It must be
        a key of ``char2idx``.
    temp : float
        Sampling temperature, must be > 0. ``1.0`` samples from the model's
        distribution unchanged; lower values sharpen it (more predictable),
        higher values flatten it (more random).

    Returns
    -------
    str
        The generated name, always terminated by a newline character.

    Raises
    ------
    ValueError
        If ``temp`` is not positive.
    KeyError
        If ``start_char`` is not in the vocabulary.
    """
    if temp <= 0:
        raise ValueError('temp must be positive, got {}'.format(temp))

    name = []
    x = np.zeros((1, max_char, char_dim))
    # `i` is the timestep whose output is read next. It stays at 0 even when
    # a start character is given: the training targets make output[t] the
    # prediction for the character that FOLLOWS input[t], so the successor of
    # the seed at x[0, 0] is read from timestep 0.
    i = 0

    if start_char is not None:
        start_char = start_char.lower()
        x[0, 0, char2idx[start_char]] = 1
        name.append(start_char)

    while True:
        # verbose=0 keeps Keras from printing a progress bar per character
        probs = np.asarray(model.predict(x, verbose=0)[0, i], dtype=np.float64)

        if temp != 1.0:
            # Temperature scaling in log space; zero probabilities map to
            # -inf and come back as exactly zero after the exp.
            with np.errstate(divide='ignore'):
                logits = np.log(probs) / temp
            probs = np.exp(logits - np.max(logits))

        # renormalise so the vector sums to 1 as np.random.choice requires
        norm_probs = probs / np.sum(probs)
        index = np.random.choice(char_dim, p=norm_probs)

        if i == max_char - 2:
            # out of room in the input tensor: force the end-of-name marker
            char = '\n'
        else:
            char = idx2char[index]
        name.append(char)

        if char == '\n':
            break

        # feed the sampled character back in as the next timestep's input
        x[0, i + 1, index] = 1
        i += 1

    return ''.join(name)

# Progress hook for training: together with the callback object built from
# it, this prints sample names while the network trains.
def name_print_loop(epoch, _):
    """Print three freshly generated names on every 20th epoch.

    `epoch` is the epoch index; the second argument is ignored. Epochs that
    are not a multiple of 20 produce no output.
    """
    if epoch % 20 != 0:
        return
    print('Names generated after epoch %d:' % epoch)
    shown = 0
    while shown < 3:
        # generated names already end in a newline, so suppress print's own
        print(generate_name(model).capitalize(), end='')
        shown += 1

# Keras callback that runs name_print_loop at the end of each epoch.
name_generator = LambdaCallback(
    on_epoch_end=name_print_loop,
)

In [None]:
# Model training: 101 epochs so that the sampling callback (which fires on
# multiples of 20) also reports after epoch 100. verbose=0 silences Keras'
# own progress output, leaving only the sampled names.
model.fit(
    X,
    Y,
    batch_size=128,
    epochs=101,
    callbacks=[name_generator],
    verbose=0,
)

Names generated after epoch 0:
Zttl
Mekhuis
Xwaituedouaus
Names generated after epoch 20:
Alaboristhides
Righanotonyxus
Inkongosaurus
Names generated after epoch 40:
Arkharodon
Olycondorus
Manospchus
Names generated after epoch 60:
Aasholongosaurus
Archaeodontosaurus
Ignanosaurus
Names generated after epoch 80:
Alvisprnes
Yamanasaurus
Tenchisaurus
Names generated after epoch 100:
Unbaeollin
Orkatitan
Arshanosaurus


In [None]:
# Fake dinosaur names sampled from the fully trained model.
# Empty results (a bare newline) are skipped, and any sample that happens to
# match a scraped name is flagged as real.
for attempt in range(20):
    candidate = generate_name(model)
    if candidate == '\n':
        continue
    label = "this one is real: " if candidate.lower() in dinonames else ''
    # the name carries its own trailing newline
    print(label + candidate.capitalize(), end='')

Atacamasitaptor
Harbyngvenator
this one is real: Rachitrema
Amangltha
Arstariasaurus
Otherunosaurus
this one is real: Orinosaurus
Veteruposaurus
Othanosaurus
this one is real: Tomodon
Amyodon
Isanocaudia
Ichariosaurus
this one is real: Wyleyia
Angunosaurus
Ambosauriscus
Yorosaurus
Ulancolimn
this one is real: Omosaurus
Ulancolimn


In [None]:
# Real dinosaur names for comparison: 20 random picks from the scraped list
# (each entry already ends with a newline).
shown = 0
while shown < 20:
    print(random.choice(dinonames).capitalize(), end='')
    shown += 1

Paronychodon
Palaeosaurus
Plateosauravus
Sulaimanisaurus
Lufengosaurus
Adeopapposaurus
Trachodon
Galesaurus
Texacephale
Epicampodon
Clarencea
Lusovenator
Sektensaurus
Huanghetitan
Albertavenator
Masiakasaurus
Megaraptor
Piveteausaurus
Yamanasaurus
Rahona


In [None]:
!pip install pyyaml h5py
model.save('dino_model.h5')
# if in google colab, click file icon in left sidebar to find and download.



In [6]:
# Persist the scraped names (one per row, no index column) next to the model.
# Each entry keeps its trailing newline exactly as stored in `dinonames`.
df = pd.DataFrame(data=dinonames)
df.to_csv(path_or_buf='dinonames.csv', index=False)