Sites to check out:
https://keras.rstudio.com/articles/examples/lstm_seq2seq.html

In [0]:
pip install html2text

Collecting html2text
  Downloading https://files.pythonhosted.org/packages/49/21/eb38d335ab15fc13564a5e971c1403707fb3a037292f246fa82e17208794/html2text-2019.9.26-py3-none-any.whl
Installing collected packages: html2text
Successfully installed html2text-2019.9.26


In [0]:
import numpy as np
import pandas as pd
from pandas import read_csv
from __future__ import print_function
import sys
import io
import random

# NLP and text
import html2text
from html2text import html2text
import re
import string
import nltk
from nltk.data import find
import gensim
from gensim.models import Word2Vec
from nltk.tokenize.treebank import TreebankWordDetokenizer

# Machine learning
from keras.callbacks import LambdaCallback
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import LSTM, CuDNNLSTM
from keras.optimizers import RMSprop , Adam
from keras.utils.data_utils import get_file


Using TensorFlow backend.


In [0]:
# Mount Google Drive at /content/drive; the dataset path used later lives under
# this mount point. Running this cell triggers an interactive authorization prompt.
from google.colab import drive
drive.mount('/content/drive')
# Alternative kept for reference: same mount call with force_remount=True.
#drive.mount("/content/drive", force_remount=True) #Lia

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# Load the tweet archive (CSV with a header row) from the mounted Drive.
filename = "/content/drive/My Drive/Unsupervised Project/all_djt_tweets.csv"
# low_memory=False makes pandas infer each column's dtype from the whole file
# in one pass instead of chunk by chunk, which avoids the mixed-type
# DtypeWarning that chunked inference emits on this file.
df = pd.read_csv(filename, header=0, low_memory=False)
df['text']

  interactivity=interactivity, compiler=compiler, result=result)


0         Over 90% approval rating for your all time fav...
1         “Mainstream Media tries to rewrite history to ...
2         Fantastic numbers on consumer spending release...
3         ...And it will get, as I have always said, muc...
4         RT @realDonaldTrump: Social Media Giants are s...
                                ...                        
328048    "My persona will never be that of a wallflower...
328049    New Blog Post: Celebrity Apprentice Finale and...
328050    Donald Trump reads Top Ten Financial Tips on L...
328051    Donald Trump will be appearing on The View tom...
328052    Be sure to tune in and watch Donald Trump on L...
Name: text, Length: 328053, dtype: object

In [0]:
# Make every tweet a real string. Missing values are replaced with '' first:
# astype(str) on its own would turn NaN into the literal text 'nan', which
# would then be cleaned and trained on as if it were a tweet.
df = df.assign(text=df['text'].fillna('').astype(str))
# Peek at the first tweet to sanity-check the raw text.
df['text'].iloc[0]

'Over 90% approval rating for your all time favorite (I hope) President within the Republican Party and 52% overall. This despite all of the made up stories by the Fake News Media trying endlessly to make me look as bad and evil as possible. Look at the real villains please!'

Since this data comes from tweets, we want to strip out things like URLs and special characters before training.

Src: https://www.kaggle.com/davidg089/all-djtrum-tweets

In [0]:
# cleanup
alphabet = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c'

# Set form of `alphabet` for fast per-character membership tests.
_ALPHABET_CHARS = frozenset(alphabet)

# Patterns are raw strings (so `\w`, `\d`, `\s` are not invalid string escapes)
# and are compiled once rather than on every call.
# DOTALL lets `.*` run across newlines so a retweet / reply is dropped in full
# even when the text spans several lines.
_RETWEET_RE = re.compile(r"^RT @.*", re.DOTALL)
_REPLY_RE = re.compile(r"^@.*", re.DOTALL)
_TRAILING_HASHTAG_RE = re.compile(r"#\w+$")
_URL_RE = re.compile(
    r"https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}"
    r"|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}"
    r"|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}"
    r"|www\.[a-zA-Z0-9]+\.[^\s]{2,}"
)
_SPECIAL_CHARS_RE = re.compile(r'[…“"”&+,:;=?#$|<>.^*()!–_]')
_HANDLE_RE = re.compile(r"@[a-zA-Z0-9]+")
_PERCENTAGE_RE = re.compile(r"(\d+%)\s+|\s+(\d+%)")
_NUMBER_RE = re.compile(r"(\d+)\s+|\s+(\d+)")
_WHITESPACE_RE = re.compile(r"\s+")


def cleanup(sentence):
    """Normalize one raw tweet into a training string.

    Steps, in order: run the text through html2text, drop the tweet entirely
    if it starts with "RT @" or "@", lowercase, remove a hashtag at the very
    end, remove URLs, remove a fixed set of punctuation, turn "-" into a
    space, drop characters outside `alphabet`, replace @handles, percentages
    and numbers with the placeholders "<@twitter_handle>", "<percentage>" and
    "<number>", and collapse whitespace.

    Parameters
    ----------
    sentence : str
        Raw tweet text. Any non-string value (e.g. a float NaN from pandas,
        or None) is treated as a missing tweet.

    Returns
    -------
    str
        The cleaned tweet followed by " <eot>", or "" when nothing is left.
    """
    # Missing cells arrive as float NaN (or None); there is nothing to clean.
    if not isinstance(sentence, str):
        return ''

    # Strip surrounding whitespace right away so the `^...` and `...$` anchors
    # below see the actual first and last characters of the tweet.
    # NOTE(review): html2text appears to hard-wrap long lines and to end its
    # output with blank lines by default, which is why the strip and the
    # DOTALL flags above matter; confirm against the installed version.
    output = html2text(sentence).strip()

    # remove retweets and mentions (the whole tweet is discarded)
    output = _RETWEET_RE.sub("", output)
    output = _REPLY_RE.sub("", output)

    output = output.lower()
    # remove a hashtag that ends the tweet
    output = _TRAILING_HASHTAG_RE.sub("", output)
    # remove urls
    output = _URL_RE.sub("", output)

    # remove special characters
    output = _SPECIAL_CHARS_RE.sub("", output)

    output = output.replace("-", " ")
    output = ''.join(ch for ch in output if ch in _ALPHABET_CHARS)

    # replace twitter handles with a placeholder token
    output = _HANDLE_RE.sub(" <@twitter_handle> ", output)

    # replace percentages and numbers with placeholder tokens
    # (percentages first, otherwise the digits would be consumed as numbers)
    output = _PERCENTAGE_RE.sub(" <percentage> ", output)
    output = _NUMBER_RE.sub(" <number> ", output)

    # remove extra spaces
    output = _WHITESPACE_RE.sub(" ", output).strip()

    # Mark the end of each non-empty tweet so the model can learn boundaries.
    return output + ' <eot>' if len(output) > 0 else ''

# Clean every tweet element-wise and keep the result both as a new column and
# as the standalone `tweets` series used by the following cells.
df['text_clean'] = df['text'].map(cleanup)
tweets = df['text_clean']

In [0]:
# Join the cleaned tweets into one corpus, separated by '.\n'.
# Tweets that cleanup() reduced to '' (retweets, replies, missing rows) are
# skipped; joining them would leave runs of bare '.\n' separators in the
# training text.
all_text = '.\n'.join(tweet for tweet in tweets.tolist() if tweet)
# Only the first million characters are used, to keep training tractable.
n_first_charactrers = 1000000
text = all_text[0:n_first_charactrers]

In [0]:
# Preview only the start of the corpus; echoing the full text would dump up to
# a million characters into the notebook output.
text[:1000]



In [0]:
# https://keras.io/examples/lstm_text_generation/
# Character vocabulary plus the two lookup tables (char -> index, index -> char).
chars = sorted(set(text))
print('total chars:', len(chars))
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = {i: c for i, c in enumerate(chars)}

# cut the text in semi-redundant sequences of maxlen characters
maxlen = 40
step = 3
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
print('nb sequences:', len(sentences))
print('Vectorization...')
# One-hot encode: x is (sequence, position, char), y is (sequence, next char).
# The builtin `bool` is used as dtype because the `np.bool` alias was
# deprecated in NumPy 1.20 and removed in 1.24; the resulting dtype is the same.
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

total chars: 50
nb sequences: 333320
Vectorization...


In [0]:
# build the model: a single LSTM
# One 128-unit CuDNNLSTM layer reads a (maxlen, len(chars)) one-hot window,
# followed by dropout and a softmax over the character vocabulary.
print('Build model...')
model = Sequential([
    CuDNNLSTM(128, input_shape=(maxlen, len(chars))),
    Dropout(0.2),
    Dense(len(chars), activation='softmax'),
])

# Adam is the optimizer in use; RMSprop(lr=0.001, decay=1e-5) was the
# alternative tried previously.
optimizer = Adam(lr=0.001, decay=1e-5)
model.compile(optimizer=optimizer, loss='categorical_crossentropy')


def sample(preds, temperature=1.0):
    """Sample an index from a probability array, reweighted by temperature.

    Parameters
    ----------
    preds : array-like of float
        Probabilities over the candidate indices (e.g. a softmax output).
    temperature : float, default 1.0
        Values below 1 sharpen the distribution, values above 1 flatten it.
        A temperature <= 0 selects the most probable index deterministically.

    Returns
    -------
    int
        The sampled index into `preds`.
    """
    # float64 keeps the renormalized probabilities within the tolerance that
    # np.random.multinomial enforces on their sum.
    preds = np.asarray(preds).astype('float64')

    # Greedy decoding is the limit as temperature goes to 0; handling it here
    # avoids dividing by zero below.
    if temperature <= 0:
        return np.argmax(preds)

    # log(0) = -inf is fine here (it maps back to probability 0), so silence
    # the divide-by-zero warning instead of letting it flood the output.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature

    # Shift by the max before exponentiating: mathematically a no-op for the
    # softmax, but it stops every term underflowing to 0 (and the result
    # becoming NaN) when the temperature is very small.
    logits = logits - np.max(logits)
    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)

    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)


def on_epoch_end(epoch, _):
    """Print sample text generated by the model after a training epoch.

    A random `maxlen`-character window of `text` is used as the seed. For each
    diversity (sampling temperature) in 0.2, 0.5, 1.0 and 1.2, 400 characters
    are generated one at a time and streamed to stdout.

    Parameters
    ----------
    epoch : int
        Zero-based index of the epoch that just finished.
    _ : Any
        Second callback argument; unused.
    """
    print()
    print('----- Generating text after Epoch: %d' % epoch)

    # The same seed window is reused for every diversity level.
    seed_start = random.randint(0, len(text) - maxlen - 1)
    seed = text[seed_start: seed_start + maxlen]

    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print('----- diversity:', diversity)
        print('----- Generating with seed: "' + seed + '"')
        sys.stdout.write(seed)

        # Sliding window holding the most recent `maxlen` characters.
        window = seed
        for _step in range(400):
            # One-hot encode the current window as a batch of one.
            x_pred = np.zeros((1, maxlen, len(chars)))
            for pos, ch in enumerate(window):
                x_pred[0, pos, char_indices[ch]] = 1.

            preds = model.predict(x_pred, verbose=0)[0]
            next_char = indices_char[sample(preds, diversity)]

            # Drop the oldest character and append the new one.
            window = window[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()

# Run on_epoch_end after every epoch so sample text is printed during training.
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

# Train on the one-hot windows; `history` keeps the per-epoch loss.
history = model.fit(
    x, y,
    batch_size=128,
    epochs=120,
    callbacks=[print_callback],
)

Build model...




Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



Epoch 1/120






----- Generating text after Epoch: 0
----- diversity: 0.2
----- Generating with seed: "ou <eot>.
<@twitter_handle> is much more"
ou <eot>.
<@twitter_handle> is much more to and and and and in <eot>.
the for the and on the will we will the deant and and and <eot>.
the all in seand and and and and and and and and and and and and and and and and and and on the and and and and and the and <eot>.
will the man in the hand the and and and and and and and and and and and and and and and and and and and and and and the will the pores and and and and and and and and and an
----- diversity: 0.5
----- Generating with seed: "ou <eot>.
<@twitter_handle> is much more"
ou <eot>.
<@twitter_handle> is much more for in fis country in <eot>.
and on mant bake 

In [0]:
# Show which quantities fit() recorded per epoch (the keys of history.history).
print(history.history.keys())


In [0]:
# Render the model's layer graph inline: model_to_dot builds the graph, `dot`
# lays it out as SVG, and IPython's SVG wrapper displays it in the notebook.
from IPython.display import SVG
from keras.utils import model_to_dot

SVG(model_to_dot(model, dpi=65).create(prog='dot', format='svg'))

In [0]:
import matplotlib.pyplot as plt

# Only the training loss is plotted: the model was compiled without metrics and
# fit() was given no validation data, so there is no accuracy or validation
# curve in `history` to draw.
fig, ax = plt.subplots()
ax.plot(history.history['loss'])
ax.set_title('Model loss')
ax.set_ylabel('Loss')
ax.set_xlabel('Epoch')
plt.show()