<a href="https://colab.research.google.com/gist/hkalexling/6bfe2f3d23cbb5ceffadcaecb739b7f8/trumpspeech.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
import keras.utils as ku
import numpy as np

In [None]:
# Weights file used by the training cell (checkpoint callback + resume) and by
# the generation cell. The path is relative to the working directory.
CHECKPOINT = "drive/My Drive/Colab/trump-speech/1.h5"

In [None]:
import requests
import re

# Download and pre-process the dataset

SPEECHES_URL = 'https://raw.githubusercontent.com/ryanmcdermott/trump-speeches/master/speeches.txt'

# Fail fast on a hung connection or an HTTP error instead of silently
# training on whatever body (e.g. an error page) came back.
response = requests.get(SPEECHES_URL, timeout=30)
response.raise_for_status()

data = response.text
# Drop the "SPEECH <n>" separators, carriage returns and the byte-order mark.
data = re.sub('SPEECH [0-9]+|\r|\ufeff', '', data)
# Join the lines and squeeze runs of spaces so sentences split cleanly on '. '.
data = re.sub('\n', ' ', data)
data = re.sub('[ ]+', ' ', data)

# One entry per sentence; fragments that are empty after stripping carry no
# words and are skipped.
texts = [t.strip() for t in data.split('. ') if t.strip()]

# Show a small sample rather than flooding the output with the whole corpus.
print(texts[:5])
print(len(texts))

In [None]:
# Fit a tokenizer on the sentence list
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

# One more than the number of distinct words found by the tokenizer
# (the extra slot is presumably for the padding value 0 - confirm).
word_count = 1 + len(tokenizer.word_index)
print(word_count)

In [None]:
# Generate n-grams as training set:
# every prefix of length >= 2 of every tokenized sentence, in sentence order.
seqs = tokenizer.texts_to_sequences(texts)
n_grams = [
    seq[:end]
    for seq in seqs
    for end in range(2, len(seq) + 1)
]

# Left-pad ('pre') every prefix to the length of the longest one so the
# result is a rectangular array.
max_len = max(len(gram) for gram in n_grams)
n_grams = np.array(pad_sequences(n_grams, maxlen=max_len, padding='pre'))

print(n_grams.shape)

In [None]:
# Inputs: every column except the last. Target: the last column of each row.
X = n_grams[:, :-1]
y = n_grams[:, -1]

In [None]:
# Build the model: embedding -> LSTM -> dropout -> softmax over the vocabulary
model = Sequential([
    # Input rows have max_len-1 tokens (the last column of n_grams is the target)
    Embedding(word_count, 10, input_length=max_len-1),
    LSTM(256),
    Dropout(0.2),
    # One output unit per vocabulary slot
    Dense(word_count, activation='softmax'),
])

# Targets are integer indices, hence the sparse variant of the loss
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

model.summary()

In [None]:
import tensorflow as tf

# Write the model weights to CHECKPOINT while training so a later run can resume.
cb = tf.keras.callbacks.ModelCheckpoint(filepath=CHECKPOINT, save_weights_only=True, verbose=True)

# Best-effort resume from an earlier run. A missing or incompatible checkpoint
# is not fatal, but the reason is reported instead of being swallowed, and
# KeyboardInterrupt/SystemExit are no longer caught.
try:
  model.load_weights(CHECKPOINT)
  print('Resumed from checkpoint:', CHECKPOINT)
except Exception as exc:
  print('Could not load checkpoint, training from scratch:', exc)

model.fit(X, y, batch_size=256, epochs=200, verbose=True, callbacks=[cb])

In [None]:
# Test the model: load the trained weights and sample 20 words after a seed.
model.load_weights(CHECKPOINT)

txt = "Make"

for _ in range(20):
    # Encode the text generated so far and left-pad it to the model's input length.
    tokens = tokenizer.texts_to_sequences([txt])[0]
    tokens = pad_sequences([tokens], maxlen=max_len-1, padding='pre')

    # verbose=0 keeps predict() from printing a progress bar on every step.
    probs = model.predict(tokens, verbose=0).reshape(word_count).astype('float64')

    # Slot 0 is the padding value and has no entry in tokenizer.index_word, so
    # drawing it would raise KeyError. Exclude it and renormalise (in float64)
    # so the probabilities sum to 1 again.
    probs[0] = 0.0
    probs /= probs.sum()
    index = np.random.choice(word_count, p=probs)

    txt += " " + tokenizer.index_word[index]

print(txt)

In [None]:
# Save the model

# Write the whole model (not just the weights) to a path relative to the
# working directory.
# NOTE(review): presumably this produces a TensorFlow SavedModel directory,
# since saved_model_cli is pointed at the same path below - confirm for the
# installed Keras version.
model.save('drive/My Drive/Colab/trump-speech/model')
# IPython shell escape: show the contents of what was just saved. The --dir
# path must stay in sync with the path passed to model.save above.
!saved_model_cli show --dir 'drive/My Drive/Colab/trump-speech/model' --all

In [None]:
 # Export the word-index mapping
 #
 # Writes both lookup directions of the tokenizer as JSON files in the
 # working directory.
 import json

 for filename, mapping in (
     ('word_index.json', tokenizer.word_index),
     ('index_word.json', tokenizer.index_word),
 ):
     with open(filename, 'w+') as out_file:
         json.dump(mapping, out_file)