<a href="https://colab.research.google.com/github/harshildarji/ML-Practise/blob/master/ML-Practise/Miscellaneous/tweet_generation_rnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Text generation using an RNN (LSTM) trained on tweets

Based on: [Beginners Guide to Text Generation using LSTMs](https://www.kaggle.com/shivamb/beginners-guide-to-text-generation-using-lstms)

In [1]:
%tensorflow_version 2.x

TensorFlow 2.x selected.


In [0]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
import tensorflow.keras.utils as ku

In [0]:
from tensorflow.random import set_seed
from numpy.random import seed
# Fix the TensorFlow and NumPy global random seeds up front; the intent is to make
# weight initialisation and training as repeatable as possible between runs.
set_seed(2)
seed(1)

In [0]:
import pandas as pd
import numpy as np
import string, os

In [0]:
import warnings
# Blanket filter: ignore every warning category for the rest of the session.
# NOTE(review): this also hides deprecation notices from the libraries used below.
warnings.filterwarnings('ignore')
# Redundant with the blanket filter above (FutureWarning is a Warning subclass);
# kept so FutureWarnings stay silenced even if the blanket filter is removed.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [0]:
import io, requests

#### Load the dataset

In [0]:
# Download the training CSV and parse it into a DataFrame.
url = 'https://raw.githubusercontent.com/harshildarji/DataScienceLab/master/data/train_data.csv'
# A timeout keeps the cell from hanging forever on a stalled connection.
_response = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx instead of feeding an HTML error page to read_csv.
_response.raise_for_status()
_data = _response.content
data = pd.read_csv(io.StringIO(_data.decode('utf-8')))

In [0]:
# Keep only the 'tweet' column: `pop` removes it from the DataFrame and returns it.
# NOTE(review): `data` is rebound from the DataFrame to that column, so re-running
# this cell on its own (without reloading the CSV) will most likely raise a KeyError.
data = data.pop('tweet')

In [0]:
# Discard missing tweets, then renumber the index from 0 so it has no gaps.
data = data.dropna().reset_index(drop=True)

In [10]:
# Preview the first rows of the cleaned tweet column.
data.head()

0    mariotti did your ancestors have to apply thro...
1    rule of law all the liberals including her wan...
2    rt japanmissionun thank you unicef unhcr nyoff...
3    know a good law firm in stirling immigration c...
4    wall street ally with a skepticism of immigrat...
Name: tweet, dtype: object

#### Generating n-gram tokens

In [0]:
# Notebook-level tokenizer: fitted inside get_sequence_of_token and read again by
# generate_text, so both must see this same instance.
tokenizer = Tokenizer()

In [0]:
def get_sequence_of_token(corpus):
    """Fit the notebook-level ``tokenizer`` on ``corpus`` and build n-gram prefixes.

    Every text is converted to its list of token ids, and each prefix of that
    list with at least two tokens becomes one training sequence (the final
    token of a prefix is later used as the label).

    Args:
        corpus: Iterable of text strings (e.g. a pandas Series of tweets).

    Returns:
        A tuple ``(input_sequences, total_words)`` where ``input_sequences`` is
        a list of token-id lists and ``total_words`` is the vocabulary size
        plus one (index 0 is left free for padding).

    Note:
        This mutates the module-level ``tokenizer``; calling it again refits
        that same instance.
    """
    # Materialise once so a one-shot iterable is not exhausted by fitting
    # before it can be encoded.
    corpus = list(corpus)
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1

    # Encode the whole corpus in a single call rather than one call per line.
    input_sequences = [
        token_list[:end]
        for token_list in tokenizer.texts_to_sequences(corpus)
        for end in range(2, len(token_list) + 1)
    ]

    return input_sequences, total_words

In [0]:
# Fit the tokenizer on the tweets and collect every n-gram prefix plus the vocabulary size.
input_sequences, total_words = get_sequence_of_token(data)

#### Padding

In [0]:
def generate_pad_sequences(input_sequences, num_classes=None):
    """Left-pad n-gram sequences and split them into predictors and one-hot labels.

    Args:
        input_sequences: Non-empty list of token-id lists.
        num_classes: Width of the one-hot label vectors. When omitted, the
            notebook-level ``total_words`` is used (the original behaviour).

    Returns:
        A tuple ``(predictors, label, max_sequence_len)``: ``predictors`` holds
        every padded sequence without its last token, ``label`` is the one-hot
        encoding of that last token, and ``max_sequence_len`` is the length of
        the longest input sequence.

    Raises:
        ValueError: If ``input_sequences`` is empty.
    """
    if len(input_sequences) == 0:
        raise ValueError('input_sequences is empty; there is nothing to pad')
    if num_classes is None:
        # Backward-compatible fallback to the vocabulary size computed by
        # get_sequence_of_token at notebook level.
        num_classes = total_words

    max_sequence_len = max(len(s) for s in input_sequences)
    # 'pre' padding keeps the real tokens (and the label) at the right edge.
    padded = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
    predictors, label = padded[:, :-1], padded[:, -1]
    label = ku.to_categorical(label, num_classes=num_classes)
    return predictors, label, max_sequence_len

In [0]:
# Build the training arrays; max_sequence_len is reused for the model input size and at generation time.
predictors, label, max_sequence_len = generate_pad_sequences(input_sequences)

#### Define the model

In [0]:
def build_model(max_sequence_len, total_words, embedding_dim=10, lstm_units=100, dropout_rate=0.1):
    """Build and compile the next-word prediction network.

    Architecture: Embedding -> LSTM -> Dropout -> softmax Dense over the vocabulary.

    Args:
        max_sequence_len: Length of the longest padded n-gram sequence,
            including the label token.
        total_words: Vocabulary size (number of embedding rows and output classes).
        embedding_dim: Size of each word embedding vector.
        lstm_units: Number of units in the LSTM layer.
        dropout_rate: Fraction of LSTM outputs dropped during training.

    Returns:
        A compiled Keras ``Sequential`` model using the Adam optimizer and
        categorical cross-entropy loss (labels must be one-hot encoded).
    """
    # The last token of every sequence is the label, so the network sees one fewer.
    input_len = max_sequence_len - 1
    model = Sequential([
        Embedding(total_words, embedding_dim, input_length=input_len),
        LSTM(lstm_units),
        Dropout(dropout_rate),
        Dense(total_words, activation='softmax')
    ])

    model.compile(optimizer='adam',
                  loss='categorical_crossentropy')

    return model

In [0]:
# Instantiate the network sized to the padded sequence length and vocabulary.
model = build_model(max_sequence_len, total_words)

In [18]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 63, 10)            102360    
_________________________________________________________________
lstm (LSTM)                  (None, 100)               44400     
_________________________________________________________________
dropout (Dropout)            (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 10236)             1033836   
Total params: 1,180,596
Trainable params: 1,180,596
Non-trainable params: 0
_________________________________________________________________


In [19]:
# Train on the full set of padded n-grams for a fixed 50 epochs.
# NOTE(review): no validation data or callbacks are passed, so the imported
# EarlyStopping is unused and every epoch always runs.
model.fit(predictors, label, epochs=50)

Train on 85819 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7fdfa02ffc50>

In [0]:
# Persist the trained network to 'model.h5' in the current working directory.
# NOTE(review): the fitted tokenizer is not saved in the cells shown here, and
# generate_text needs it, so this file alone is not enough to generate text later.
model.save('model.h5')

#### Generate text

In [0]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    """Greedily extend ``seed_text`` by ``next_words`` predicted words.

    At each step the current text is tokenized with the notebook-level
    ``tokenizer``, left-padded to the model's input length, and the word with
    the highest predicted probability is appended.

    Args:
        seed_text: Starting text to continue.
        next_words: Number of words to append.
        model: Trained Keras model returning a probability per vocabulary index.
        max_sequence_len: The sequence length used in training (label included);
            the model input length is ``max_sequence_len - 1``.

    Returns:
        The seed text followed by the generated words, in title case.
    """
    # Reverse vocabulary (index -> word) maintained by the fitted tokenizer;
    # replaces a linear scan over word_index for every generated word.
    index_to_word = tokenizer.index_word

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        # `predict_classes` is deprecated/removed in newer TF releases; the
        # documented replacement is argmax over the predicted probabilities.
        probabilities = model.predict(token_list, verbose=0)
        predicted = int(np.argmax(probabilities, axis=-1)[0])

        # Index 0 is the padding slot and has no word; fall back to ''.
        output_word = index_to_word.get(predicted, '')

        seed_text += ' ' + output_word

    return seed_text.title()

In [22]:
# Example: continue the seed phrase with 12 generated words.
generate_text('migrants from', 12, model, max_sequence_len)

'Migrants From Hungary Police Lines Hundreds Of Migrants Break Through Police Lines In Border'