## Import Package

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf 
import tensorflow.keras as keras
import matplotlib.pyplot as plt

## Load & Explore Dataset

In [2]:
dataset_path = "../input/nyt-comments/CommentsApril2017.csv"

In [3]:
dataset = pd.read_csv(dataset_path)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
dataset.shape

(243832, 34)

In [5]:
# to prevent run out of memory, I only select part of dataset
dataset = dataset[:1000]

In [6]:
dataset.head(10)

Unnamed: 0,approveDate,commentBody,commentID,commentSequence,commentTitle,commentType,createDate,depth,editorsSelection,parentID,...,userLocation,userTitle,userURL,inReplyTo,articleID,sectionName,newDesk,articleWordCount,printPage,typeOfMaterial
0,1491245186,This project makes me happy to be a 30+ year T...,22022598.0,22022598,<br/>,comment,1491237000.0,1,False,0.0,...,"Riverside, CA",,,0,58def1347c459f24986d7c80,Unknown,Insider,716.0,2,News
1,1491188619,Stunning photos and reportage. Infuriating tha...,22017350.0,22017350,,comment,1491180000.0,1,False,0.0,...,<br/>,,,0,58def1347c459f24986d7c80,Unknown,Insider,716.0,2,News
2,1491188617,Brilliant work from conception to execution. I...,22017334.0,22017334,<br/>,comment,1491179000.0,1,False,0.0,...,Raleigh NC,,,0,58def1347c459f24986d7c80,Unknown,Insider,716.0,2,News
3,1491167820,NYT reporters should provide a contributor's l...,22015913.0,22015913,<br/>,comment,1491150000.0,1,False,0.0,...,"Missouri, USA",,,0,58def1347c459f24986d7c80,Unknown,Insider,716.0,2,News
4,1491167815,Could only have been done in print. Stunning.,22015466.0,22015466,<br/>,comment,1491147000.0,1,False,0.0,...,"Tucson, Arizona",,,0,58def1347c459f24986d7c80,Unknown,Insider,716.0,2,News
5,1491142576,Thank you New York Times. People should be sup...,22012085.0,22012085,<br/>,comment,1491129000.0,1,False,0.0,...,new york,,,0,58def1347c459f24986d7c80,Unknown,Insider,716.0,2,News
6,1491060909,Proof that photojournalism is alive and well. ...,22003784.0,22003784,<br/>,comment,1491056000.0,1,False,0.0,...,"East Northport, NY",,,0,58def1347c459f24986d7c80,Unknown,Insider,716.0,2,News
7,1491252241,"The OASIS Initiative, which I started with Pro...",22024897.0,22024897,<br/>,userReply,1491248000.0,2,False,22015913.0,...,"Tahoma, CA",,,22015913,58def1347c459f24986d7c80,Unknown,Insider,716.0,2,News
8,1491668599,I agree. I've just spent 30 minutes trying to ...,22082978.0,22082978,,userReply,1491665000.0,2,False,22015913.0,...,Ann Arbor,,,22015913,58def1347c459f24986d7c80,Unknown,Insider,716.0,2,News
9,1491064414,How about Katrina Pierson? Back to Palookavill...,22004930.0,22004930,<br/>,comment,1491061000.0,1,False,0.0,...,California,,,0,58def3237c459f24986d7c84,Unknown,OpEd,823.0,23,Op-Ed


In [7]:
sentences = dataset["commentBody"].values

In [8]:
sentences[0]

'This project makes me happy to be a 30+ year Times subscriber... continue to innovate across all platforms, please.'

In [9]:
sentences[1]

"Stunning photos and reportage. Infuriating that the Trump admistration's draconian reinstatement of the global gag order will prevent men and women from receiving appropriate family planning advice, so obviously desperately   needed."

## Prepare Dataset for Training

In [10]:
# convert all words to lowercase
for idx, sentence in enumerate(sentences):
    sentences[idx] = sentence.lower()

In [11]:
# fit all sentences on tokenizer
tokenizer = keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(sentences)

In [None]:
# word index of tokenizer
tokenizer.word_index

In [13]:
# number of total words
total_word = len(tokenizer.word_index)+1
print("Total number of word: ", total_word)

Total number of word:  8966


In [14]:
# convert sentences to sequences
sequences = tokenizer.texts_to_sequences(sentences)

In [15]:
# prepare training sequences
training_sequences = []

for seq in sequences:
    for i in range(2, len(seq)):
        training_sequences.append(seq[:i])
        
training_sequences = np.array(training_sequences)

In [16]:
print("Length of training_sequences: ", len(training_sequences))

Length of training_sequences:  61909


In [17]:
# take a look on training_sequences
print("The first sequence in training sequences: ", training_sequences[0])
print("The second sequence in training sequences: ", training_sequences[1])

The first sequence in training sequences:  [17, 1975]
The second sequence in training sequences:  [17, 1975, 356]


In [18]:
# pad all sequences to make them same length
longest_len = max([len(l) for l in training_sequences])
training_sequences = keras.preprocessing.sequence.pad_sequences(sequences=training_sequences,
                                           maxlen=longest_len,
                                           padding="pre")

In [19]:
# prepare x_train and y_train
x_train = training_sequences[:, :-1]
y_train = training_sequences[:, -1]

In [20]:
y_train = keras.utils.to_categorical(y=y_train, num_classes=total_word)

In [21]:
print("Shape of training_sequences: ", training_sequences.shape)
print("Shape of x_train: ", x_train.shape)
print("Shape of y_train: ", y_train.shape)

Shape of training_sequences:  (61909, 283)
Shape of x_train:  (61909, 282)
Shape of y_train:  (61909, 8966)


## Define Model

In [22]:
# model architechture
model = keras.models.Sequential()
model.add(keras.layers.Embedding(input_dim=total_word,
                                 output_dim=64,
                                 input_length=longest_len))
model.add(keras.layers.Bidirectional(keras.layers.LSTM(256, return_sequences=True)))
model.add(keras.layers.Bidirectional(keras.layers.LSTM(128)))
model.add(keras.layers.Dense(units=64, activation="relu"))
model.add(keras.layers.Dense(units=total_word, activation="softmax"))

In [23]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 283, 64)           573824    
_________________________________________________________________
bidirectional (Bidirectional (None, 283, 512)          657408    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 256)               656384    
_________________________________________________________________
dense (Dense)                (None, 64)                16448     
_________________________________________________________________
dense_1 (Dense)              (None, 8966)              582790    
Total params: 2,486,854
Trainable params: 2,486,854
Non-trainable params: 0
_________________________________________________________________


In [27]:
# load model weight: model was trained to get the accuracy of 0.95
try:
    model.load_weights("../input/model-weight-generate-text-with-rnn/best_model_weight.h5")
except:
    print("ERROR")

In [24]:
# compile model
model.compile(loss="categorical_crossentropy", optimizer=keras.optimizers.Adam(learning_rate=0.0001), metrics=["acc"])

In [26]:
# define custom callback for training
class CustomCallback(keras.callbacks.Callback):
    
    def on_epoch_end(self, epoch, logs):
        if(logs["acc"] >= 0.95):
            self.model.stop_training = True

custome_callback = CustomCallback()
checkpoint = keras.callbacks.ModelCheckpoint(filepath="best_model.h5",
                                             monitor="acc",
                                             verbose=1,
                                             save_best_only=True,
                                             save_weights_only=True,
                                             mode="auto",
                                             save_freq="epoch")

In [None]:
# train model
history = model.fit(x=x_train,
                    y=y_train,
                    batch_size=32,
                    epochs=500,
                    callbacks=[custome_callback, checkpoint])

## Create Text with Trained Model

In [126]:
first_word = "You"

generated_sentence = [first_word]
num_word_to_generate = 25

In [127]:
generated_sentence

['You']

In [128]:
tokenizer.texts_to_sequences(generated_sentence)

[[24]]

In [None]:
# create a dict to map idx to word
idx2word = {idx:word for word, idx in tokenizer.word_index.items()}
idx2word

In [130]:
for i in range(num_word_to_generate):
    
    x = tokenizer.texts_to_sequences(generated_sentence)
    
    if len(x[0]) > longest_len:
        x[0] = x[0][-1 * longest_len:]
    else:
        x = keras.preprocessing.sequence.pad_sequences(sequences=x,
                                                   maxlen=longest_len,
                                                   padding="pre")
    x = np.array(x)
    y = model.predict(x)[0]
    idx = np.argmax(y)
    
    generated_word = idx2word[idx]
    
    generated_sentence[0] += " " + generated_word

In [131]:
generated_sentence

['You forgot kellyanne conway who will play the female henchman when the mel black reputation if they are enough for the military fake production with russia']