In [0]:
import pandas as pd
import numpy as np
import tensorflow as tf
import re
from nltk.corpus import stopwords
import time
from tensorflow.python.layers.core import Dense
from tensorflow.python.ops.rnn_cell_impl import _zero_state_tensors
print('TensorFlow Version: {}'.format(tf.__version__))

TensorFlow Version: 1.13.1


In [0]:
from google.colab import drive 
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


## Insepcting the Data

In [0]:
movie_data = pd.read_csv("gdrive/My Drive/movies_text.csv")

In [0]:
# Remove null values and unneeded features
movie_data.shape
movie_data = movie_data.dropna()
movie_data = movie_data.reset_index(drop=True)
movie_data.shape

(20404, 2)

In [0]:
movie_data.shape
movie_data=movie_data.rename(columns = {"Summary": "tagLine","Text":"movie_plot"})
movie_data = pd.DataFrame(movie_data, columns=["movie_plot","tagLine"])

In [0]:
print("Actual Data :",movie_data.shape)
print("Train data:",movie_data.iloc[:int(movie_data.shape[0]*0.6)].shape)
print("Validation Data:",movie_data.iloc[int(movie_data.shape[0]*0.6):int(movie_data.shape[0]*0.8)].shape)
print("Test Data:",movie_data.iloc[int(movie_data.shape[0]*0.85):].shape)


Actual Data : (20404, 2)
Train data: (12242, 2)
Validation Data: (4081, 2)
Test Data: (3061, 2)


In [0]:
movie_training_data=movie_data.iloc[:int(movie_data.shape[0]*0.6)].reset_index(drop=True)
movie_val_data=movie_data.iloc[int(movie_data.shape[0]*0.6):int(movie_data.shape[0]*0.8)].reset_index(drop=True)
movie_test_data=movie_data.iloc[int(movie_data.shape[0]*0.85):].reset_index(drop=True)
   

movie_training_data = movie_training_data.sample(frac=1).reset_index(drop=True)
movie_val_data = movie_val_data.sample(frac=1).reset_index(drop=True)
movie_test_data = movie_test_data.sample(frac=1).reset_index(drop=True)

print(movie_training_data.shape)
print(movie_val_data.shape)
print(movie_test_data.shape)

movie_data.head()


(12242, 2)
(4081, 2)
(3061, 2)


Unnamed: 0,movie_plot,tagLine
0,When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!
1,A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...
2,"Cheated on, mistreated and stepped on, the wom...",Friends are the people who let you be yourself...
3,Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...
4,"Obsessive master thief, Neil McCauley leads a ...",A Los Angeles Crime Saga


In [0]:

for i in range(5):
    print("Plot #",i+1)
    print(movie_training_data.movie_plot[i])
    print(movie_training_data.tagLine[i])
    
    print()

Plot # 1
A murderer is brought to court and only Miss Marple is unconvinced of his innocence. Once again she begins her own investigation. The third Miss Marple film starring Margaret Rutherford as the quirky amateur detective.
New misdeeds are afoot afoot the footlights!

Plot # 2
A beautiful vampire turns a crime lord into a creature of the night.
The movie that goes straight for the jugular.

Plot # 3
Theseus is a mortal man chosen by Zeus to lead the fight against the ruthless King Hyperion, who is on a rampage across Greece to obtain a weapon that can destroy humanity.
The Gods need a hero.

Plot # 4
As another round of preliminary tests approach for Keitaro, so does Christmas. And as the first Christmas of the millennium, there is a rumor that if one confesses his love on this special eve it will come true. Keitaro has decided what he needs to do on this Christmas eve. But being a re-taker, can he afford taking attention away from the tests on the same day?
If you make a wish on 

## Preparing the Data

In [0]:
# Refernce for http://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he's": "he is",
"how'd": "how did",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'll": "i will",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'll": "it will",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"must've": "must have",
"mustn't": "must not",
"needn't": "need not",
"oughtn't": "ought not",
"shan't": "shall not",
"sha'n't": "shall not",
"she'd": "she would",
"she'll": "she will",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"that'd": "that would",
"that's": "that is",
"there'd": "there had",
"there's": "there is",
"they'd": "they would",
"they'll": "they will",
"they're": "they are",
"they've": "they have",
"wasn't": "was not",
"we'd": "we would",
"we'll": "we will",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"where'd": "where did",
"where's": "where is",
"who'll": "who will",
"who's": "who is",
"won't": "will not",
"wouldn't": "would not",
"you'd": "you would",
"you'll": "you will",
"you're": "you are"
}

In [0]:
def clean_text(text, remove_stopwords = True):
   
    if True:
        text = text.split()
        new_text = []
        for word in text:
            if word in contractions:
                new_text.append(contractions[word])
            else:
                new_text.append(word)
        text = " ".join(new_text)
    
   
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&amp;', '', text) 
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'\'', ' ', text)
    
    
    if remove_stopwords:
        text = text.split()
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
        text = " ".join(text)

    return text

In [0]:
import nltk
#nltk.download()
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [0]:
# Clean the Training Taglines and Movie Plots 
clean_training_tag = []
for tagline_train in movie_training_data.tagLine:
    clean_training_tag.append(clean_text(tagline_train, remove_stopwords=False))
print("Training Taglines are complete.")



clean_training_plot = []
for movieplot_train in movie_training_data.movie_plot:
    clean_training_plot.append(clean_text(movieplot_train))
print("Training movie plots are complete.")

# Clean the Validation Taglines and Movie Plots 
clean_val_tag = []
for tagline_val in movie_val_data.tagLine:
    clean_val_tag.append(clean_text(tagline_val, remove_stopwords=False))
print("Validation Taglines are complete.")

clean_val_plot = []
for movieplot_val in movie_val_data.movie_plot:
    clean_val_plot.append(clean_text(movieplot_val))
print("Validation movie plots are complete.")


# Clean the Testing Taglines and Movie Plots 
clean_test_tag = []
for tagline_test in movie_test_data.tagLine:
    clean_test_tag.append(clean_text(tagline_test, remove_stopwords=False))
print("Testing Taglines are complete.")

clean_test_plot = []
for movieplot_test in movie_test_data.movie_plot:
    clean_test_plot.append(clean_text(movieplot_test))
print("Testing movie plots are complete.")



Training Taglines are complete.
Training movie plots are complete.
Validation Taglines are complete.
Validation movie plots are complete.
Testing Taglines are complete.
Testing movie plots are complete.


In [0]:
# Inspect the cleaned summaries and texts to ensure they have been cleaned well
for i in range(5):
    print("Clean Plot #",i+1)
    print(clean_training_plot[i])
    print(clean_training_tag[i])
    
    print()

Clean Plot # 1
murderer brought court miss marple unconvinced innocence begins investigation third miss marple film starring margaret rutherford quirky amateur detective
new misdeeds are afoot afoot the footlights 

Clean Plot # 2
beautiful vampire turns crime lord creature night
the movie that goes straight for the jugular 

Clean Plot # 3
theseus mortal man chosen zeus lead fight ruthless king hyperion rampage across greece obtain weapon destroy humanity
the gods need a hero 

Clean Plot # 4
another round preliminary tests approach keitaro christmas first christmas millennium rumor one confesses love special eve come true keitaro decided needs christmas eve taker afford taking attention away tests day
if you make a wish on christmas eve  will it come true 

Clean Plot # 5
approaching forty ferro unsatisfied life construction worker part time boxing instructor los angeles ca successful bout young pro boxer ferro decides gloves one last time movie recounts unlikely quest olympic gold
s

In [0]:
def count_words(count_dict, text):
    
    for sentence in text:
        for word in sentence.split():
            if word not in count_dict:
                count_dict[word] = 1
            else:
                count_dict[word] += 1

In [0]:

word_counts = {}

count_words(word_counts, clean_training_tag)
count_words(word_counts, clean_training_plot)
            
print("Size of Vocabulary:", len(word_counts))

Size of Vocabulary: 37183


In [0]:

embeddings_index = {}
with open('gdrive/My Drive/numberbatch-en.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split(' ')
        word = values[0]
        embedding = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = embedding

print('Word embeddings:', len(embeddings_index))

Word embeddings: 417195


In [0]:

missing_words = 0
threshold = 2

for word, count in word_counts.items():
    if count > threshold:
        if word not in embeddings_index:
            missing_words += 1
            
missing_ratio = round((1.0*missing_words/len(word_counts))*100,4)

print("Number of words missing from CN:", missing_words)
print("Percent of words that are missing from vocabulary: {}%".format(missing_ratio))

Number of words missing from CN: 464
Percent of words that are missing from vocabulary: 1.2479%


In [0]:

vocab_to_int = {} 

value = 0
for word, count in word_counts.items():
    if count >= threshold or word in embeddings_index:
        vocab_to_int[word] = value
        value += 1

# Special tokens that will be added to our vocab
codes = ["<UNK>","<PAD>","<EOS>","<GO>"]   

# Add codes to vocab
for code in codes:
    vocab_to_int[code] = len(vocab_to_int)

# Dictionary to convert integers to words
int_to_vocab = {}
for word, value in vocab_to_int.items():
    int_to_vocab[value] = word

usage_ratio = round(len(vocab_to_int) / len(word_counts),4)*100

print("Total number of unique words:", len(word_counts))
print("Number of words we will use:", len(vocab_to_int))
print("Percent of words we will use: {}%".format(usage_ratio))

Total number of unique words: 37183
Number of words we will use: 33113
Percent of words we will use: 89.05%


In [0]:

embedding_dim = 300
nb_words = len(vocab_to_int)


word_embedding_matrix = np.zeros((nb_words, embedding_dim), dtype=np.float32)
for word, i in vocab_to_int.items():
    if word in embeddings_index:
        word_embedding_matrix[i] = embeddings_index[word]
    else:
        
        new_embedding = np.array(np.random.uniform(-1.0, 1.0, embedding_dim))
        embeddings_index[word] = new_embedding
        word_embedding_matrix[i] = new_embedding


print(len(word_embedding_matrix))

33113


In [0]:
def convert_to_ints(text, word_count, unk_count, eos=False):
    '''Convert words in text to an integer.
       If word is not in vocab_to_int, use UNK's integer.
       Total the number of words and UNKs.
       Add EOS token to the end of texts'''
    ints = []
    for sentence in text:
        sentence_ints = []
        for word in sentence.split():
            word_count += 1
            if word in vocab_to_int:
                sentence_ints.append(vocab_to_int[word])
            else:
                sentence_ints.append(vocab_to_int["<UNK>"])
                unk_count += 1
        if eos:
            sentence_ints.append(vocab_to_int["<EOS>"])
        ints.append(sentence_ints)
    return ints, word_count, unk_count

In [0]:

word_count = 0
unk_count = 0

int_summaries, word_count, unk_count = convert_to_ints(clean_training_tag, word_count, unk_count)
int_texts, word_count, unk_count = convert_to_ints(clean_training_plot, word_count, unk_count, eos=True)

unk_percent = round(unk_count/word_count,4)*100

print("Total number of words in Tagline:", word_count)
print("Total number of UNKs in Tagline:", unk_count)
print("Percent of words that are UNK: {}%".format(unk_percent))

Total number of words in Tagline: 495597
Total number of UNKs in Tagline: 4074
Percent of words that are UNK: 0.8200000000000001%


In [0]:
def create_lengths(text):
    '
    lengths = []
    for sentence in text:
        lengths.append(len(sentence))
    return pd.DataFrame(lengths, columns=['counts'])

In [0]:
lengths_summaries = create_lengths(int_summaries)
lengths_texts = create_lengths(int_texts)

#print("Summaries:")
#print(lengths_summaries.describe())
#print()
#print("Texts:")
#print(lengths_texts.describe())

In [0]:
def unk_counter(sentence):
    
    unk_count = 0
    for word in sentence:
        if word == vocab_to_int["<UNK>"]:
            unk_count += 1
    return unk_count

In [0]:


sorted_summaries = []
sorted_texts = []
max_text_length = 200
max_summary_length = 50
min_length = 2
unk_text_limit = 1
unk_summary_limit = 0

for length in range(min(lengths_texts.counts), max_text_length): 
    for count, words in enumerate(int_summaries):
        if (len(int_summaries[count]) >= min_length and
            len(int_summaries[count]) <= max_summary_length and
            len(int_texts[count]) >= min_length and
            unk_counter(int_summaries[count]) <= unk_summary_limit and
            unk_counter(int_texts[count]) <= unk_text_limit and
            length == len(int_texts[count])
           ):
            sorted_summaries.append(int_summaries[count])
            sorted_texts.append(int_texts[count])
        
# Compare lengths to ensure they match
print(len(sorted_summaries))
print(len(sorted_texts))
print(type(sorted_summaries))
print(type(sorted_texts))
print(sorted_summaries[:10])
print(sorted_texts[:10])




11205
11205
<class 'list'>
<class 'list'>
[[83, 1440, 146, 2967, 2968], [69, 486, 69], [211, 1528, 14, 1089, 10, 32, 83, 125, 348], [216, 56, 1058, 154, 644], [1351, 1352, 4, 102, 4, 207, 573], [4, 6512, 94, 4, 6512, 3593, 4, 6512, 347], [4, 80, 56, 4, 135, 56, 2556], [24, 77, 4, 1840, 56, 424, 1824, 125, 1294, 64, 2630, 2631], [362, 861, 948, 853, 4, 2940, 15, 37, 179, 59, 572, 362, 5569, 5570, 37, 1784, 594], [4, 393, 5803, 583, 130, 4, 393, 5804, 56, 574, 86]]
[[16036, 134, 33111], [16036, 134, 33111], [123, 83, 1829, 33111], [1058, 154, 644, 33111], [2897, 9354, 473, 33111], [514, 6512, 2149, 33111], [80, 135, 2556, 33111], [167, 181, 7320, 15065, 33111], [473, 1873, 4327, 10669, 33111], [8040, 3761, 2398, 187, 33111]]


## Building the Model

In [0]:
def model_inputs():
    
    
    input_data = tf.placeholder(tf.int32, [None, None], name='input')
    targets = tf.placeholder(tf.int32, [None, None], name='targets')
    lr = tf.placeholder(tf.float32, name='learning_rate')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    summary_length = tf.placeholder(tf.int32, (None,), name='summary_length')
    max_summary_length = tf.reduce_max(summary_length, name='max_dec_len')
    text_length = tf.placeholder(tf.int32, (None,), name='text_length')

    return input_data, targets, lr, keep_prob, summary_length, max_summary_length, text_length

In [0]:
def process_encoding_input(target_data, vocab_to_int, batch_size):
    
    ending = tf.strided_slice(target_data, [0, 0], [batch_size, -1], [1, 1])
    dec_input = tf.concat([tf.fill([batch_size, 1], vocab_to_int['<GO>']), ending], 1)

    return dec_input

In [0]:
def encoding_layer(rnn_size, sequence_length, num_layers, rnn_inputs, keep_prob):
    
    
    for layer in range(num_layers):
        with tf.variable_scope('encoder_{}'.format(layer)):
            cell_fw = tf.contrib.rnn.LSTMCell(rnn_size,
                                              initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
            cell_fw = tf.contrib.rnn.DropoutWrapper(cell_fw, 
                                                    input_keep_prob = keep_prob)

            cell_bw = tf.contrib.rnn.LSTMCell(rnn_size,
                                              initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
            cell_bw = tf.contrib.rnn.DropoutWrapper(cell_bw, 
                                                    input_keep_prob = keep_prob)

            enc_output, enc_state = tf.nn.bidirectional_dynamic_rnn(cell_fw, 
                                                                    cell_bw, 
                                                                    rnn_inputs,
                                                                    sequence_length,
                                                                    dtype=tf.float32)
   
    enc_output = tf.concat(enc_output,2)
    
    return enc_output, enc_state

In [0]:
def training_decoding_layer(dec_embed_input, summary_length, dec_cell, initial_state, output_layer, 
                            vocab_size, max_summary_length):
    
    
    training_helper = tf.contrib.seq2seq.TrainingHelper(inputs=dec_embed_input,
                                                        sequence_length=summary_length,
                                                        time_major=False)

    training_decoder = tf.contrib.seq2seq.BasicDecoder(dec_cell,
                                                       training_helper,
                                                       initial_state,
                                                       output_layer) 

    training_logits, _ , _ = tf.contrib.seq2seq.dynamic_decode(training_decoder,
                                                           output_time_major=False,
                                                           impute_finished=True,
                                                           maximum_iterations=max_summary_length)
    return training_decoder

In [0]:
def inference_decoding_layer(embeddings, start_token, end_token, dec_cell, initial_state, output_layer,
                             max_summary_length, batch_size):
    
    start_tokens = tf.tile(tf.constant([start_token], dtype=tf.int32), [batch_size], name='start_tokens')
    
    inference_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(embeddings,
                                                                start_tokens,
                                                                end_token)
                
    inference_decoder = tf.contrib.seq2seq.BasicDecoder(dec_cell,
                                                        inference_helper,
                                                        initial_state,
                                                        output_layer)
                
    inference_logits, _ , _ = tf.contrib.seq2seq.dynamic_decode(inference_decoder,
                                                            output_time_major=False,
                                                            impute_finished=True,
                                                            maximum_iterations=max_summary_length)
    
    return inference_decoder

In [0]:
def decoding_layer(dec_embed_input, embeddings, enc_output, enc_state, vocab_size, text_length, summary_length, 
                   max_summary_length, rnn_size, vocab_to_int, keep_prob, batch_size, num_layers):
    
    for layer in range(num_layers):
        with tf.variable_scope('decoder_{}'.format(layer)):
            lstm = tf.contrib.rnn.LSTMCell(rnn_size,
                                           initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
            dec_cell = tf.contrib.rnn.DropoutWrapper(lstm, 
                                                     input_keep_prob = keep_prob)
    
    output_layer = Dense(vocab_size,
                         kernel_initializer = tf.truncated_normal_initializer(mean = 0.0, stddev=0.1))
    
    attn_mech = tf.contrib.seq2seq.BahdanauAttention(rnn_size,
                                                  enc_output,
                                                  text_length,
                                                  normalize=False,
                                                  name='BahdanauAttention')

    dec_cell = tf.contrib.seq2seq.AttentionWrapper(dec_cell,
                                                          attn_mech,
                                                          rnn_size)
            
    
    initial_state = dec_cell.zero_state(batch_size=batch_size,dtype=tf.float32).clone(cell_state=enc_state[0])

    with tf.variable_scope("decode"):
        training_decoder = training_decoding_layer(dec_embed_input, 
                                                  summary_length, 
                                                  dec_cell, 
                                                  initial_state,
                                                  output_layer,
                                                  vocab_size, 
                                                  max_summary_length)
        
        training_logits,_ ,_ = tf.contrib.seq2seq.dynamic_decode(training_decoder,
                                  output_time_major=False,
                                  impute_finished=True,
                                  maximum_iterations=max_summary_length)
    with tf.variable_scope("decode", reuse=True):
        inference_decoder = inference_decoding_layer(embeddings,  
                                                    vocab_to_int['<GO>'], 
                                                    vocab_to_int['<EOS>'],
                                                    dec_cell, 
                                                    initial_state, 
                                                    output_layer,
                                                    max_summary_length,
                                                    batch_size)
        
        inference_logits,_ ,_ = tf.contrib.seq2seq.dynamic_decode(inference_decoder,
                                  output_time_major=False,
                                  impute_finished=True,
                                  maximum_iterations=max_summary_length)

    return training_logits, inference_logits

In [0]:
def seq2seq_model(input_data, target_data, keep_prob, text_length, summary_length, max_summary_length, 
                  vocab_size, rnn_size, num_layers, vocab_to_int, batch_size):
   
    embeddings = word_embedding_matrix
    
    enc_embed_input = tf.nn.embedding_lookup(embeddings, input_data)
    enc_output, enc_state = encoding_layer(rnn_size, text_length, num_layers, enc_embed_input, keep_prob)
    
    dec_input = process_encoding_input(target_data, vocab_to_int, batch_size)
    dec_embed_input = tf.nn.embedding_lookup(embeddings, dec_input)
    
    training_logits, inference_logits  = decoding_layer(dec_embed_input, 
                                                        embeddings,
                                                        enc_output,
                                                        enc_state, 
                                                        vocab_size, 
                                                        text_length, 
                                                        summary_length, 
                                                        max_summary_length,
                                                        rnn_size, 
                                                        vocab_to_int, 
                                                        keep_prob, 
                                                        batch_size,
                                                        num_layers)
    
    return training_logits, inference_logits

In [0]:
def pad_sentence_batch(sentence_batch):
    """Pad sentences with <PAD> so that each sentence of a batch has the same length"""
    max_sentence = max([len(sentence) for sentence in sentence_batch])
    return [sentence + [vocab_to_int['<PAD>']] * (max_sentence - len(sentence)) for sentence in sentence_batch]

In [0]:
def get_batches(summaries, texts, batch_size):
    """Batch summaries, texts, and the lengths of their sentences together"""
    for batch_i in range(0, len(texts)//batch_size):
        start_i = batch_i * batch_size
        summaries_batch = summaries[start_i:start_i + batch_size]
        texts_batch = texts[start_i:start_i + batch_size]
        pad_summaries_batch = np.array(pad_sentence_batch(summaries_batch))
        pad_texts_batch = np.array(pad_sentence_batch(texts_batch))
        
        # Need the lengths for the _lengths parameters
        pad_summaries_lengths = []
        for summary in pad_summaries_batch:
            pad_summaries_lengths.append(len(summary))
        
        pad_texts_lengths = []
        for text in pad_texts_batch:
            pad_texts_lengths.append(len(text))
        
        yield pad_summaries_batch, pad_texts_batch, pad_summaries_lengths, pad_texts_lengths

In [0]:
# Hyperparameters
epochs = 30
batch_size = 64
rnn_size = 256
num_layers = 2
learning_rate = 0.002
keep_probability = 0.75

In [0]:
import warnings
warnings.filterwarnings("ignore")


In [0]:
import warnings
warnings.filterwarnings("ignore")

train_graph = tf.Graph()

with train_graph.as_default():
    
     
    input_data, targets, lr, keep_prob, summary_length, max_summary_length, text_length = model_inputs()

    
    training_logits, inference_logits = seq2seq_model(tf.reverse(input_data, [-1]),
                                                      targets, 
                                                      keep_prob,   
                                                      text_length,
                                                      summary_length,
                                                      max_summary_length,
                                                      len(vocab_to_int)+1,
                                                      rnn_size, 
                                                      num_layers, 
                                                      vocab_to_int,
                                                      batch_size)
    
   
    training_logits = tf.identity(training_logits.rnn_output, 'logits')
    inference_logits = tf.identity(inference_logits.sample_id, name='predictions')
    
    
    masks = tf.sequence_mask(summary_length, max_summary_length, dtype=tf.float32, name='masks')

    with tf.name_scope("optimization"):
        
        cost = tf.contrib.seq2seq.sequence_loss(
            training_logits,
            targets,
            masks)

        
        optimizer = tf.train.AdamOptimizer(learning_rate)

        
        gradients = optimizer.compute_gradients(cost)
        capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gradients if grad is not None]
        train_op = optimizer.apply_gradients(capped_gradients)
print("Graph is built.")

Graph is built.


## Training the Model

In [0]:

start =0
end = start + 5000
sorted_summaries_short = sorted_summaries[start:end]
sorted_texts_short = sorted_texts[start:end]
print("The shortest text length:", len(sorted_texts_short[0]))
print("The longest text length:",len(sorted_texts_short[-1]))

The shortest text length: 3
The longest text length: 27


In [0]:

learning_rate_decay = 0.95
min_learning_rate = 0.0005
display_step = 20 
stop_early = 0 
stop = 3 
per_epoch = 3 
update_check = (len(sorted_texts_short)//batch_size//per_epoch)-1

update_loss = 0 
batch_loss = 0
summary_update_loss = [] 

checkpoint = "best_model.ckpt" 
with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())
    
    
    for epoch_i in range(1, epochs+1):
        update_loss = 0
        batch_loss = 0
        for batch_i, (summaries_batch, texts_batch, summaries_lengths, texts_lengths) in enumerate(
                get_batches(sorted_summaries_short, sorted_texts_short, batch_size)):
            start_time = time.time()
            _, loss = sess.run(
                [train_op, cost],
                {input_data: texts_batch,
                 targets: summaries_batch,
                 lr: learning_rate,
                 summary_length: summaries_lengths,
                 text_length: texts_lengths,
                 keep_prob: keep_probability})

            batch_loss += loss
            update_loss += loss
            end_time = time.time()
            batch_time = end_time - start_time

            if batch_i % display_step == 0 and batch_i > 0:
                print('Epoch {:>3}/{} Batch {:>4}/{} - Loss: {:>6.3f}, Seconds: {:>4.2f}'
                      .format(epoch_i,
                              epochs, 
                              batch_i, 
                              len(sorted_texts_short) // batch_size, 
                              batch_loss / display_step, 
                              batch_time*display_step))
                batch_loss = 0

            if batch_i % update_check == 0 and batch_i > 0:
                print("Average loss for this update:", round(update_loss/update_check,3))
                summary_update_loss.append(update_loss)
                
                
                if update_loss <= min(summary_update_loss):
                    print('New Record!') 
                    stop_early = 0
                    saver = tf.train.Saver() 
                    saver.save(sess, checkpoint)

                else:
                    print("No Improvement.")
                    stop_early += 1
                    if stop_early == stop:
                        break
                update_loss = 0
            
                    
        
        learning_rate *= learning_rate_decay
        if learning_rate < min_learning_rate:
            learning_rate = min_learning_rate
        
        if stop_early == stop:
            print("Stopping Training.")
            break

Epoch   1/30 Batch   20/78 - Loss:  3.493, Seconds: 3.42
Average loss for this update: 3.394
New Record!
Epoch   1/30 Batch   40/78 - Loss:  2.676, Seconds: 3.45
Average loss for this update: 2.441
New Record!
Epoch   1/30 Batch   60/78 - Loss:  2.354, Seconds: 3.13
Average loss for this update: 2.454
No Improvement.
Epoch   2/30 Batch   20/78 - Loss:  2.255, Seconds: 2.52
Average loss for this update: 2.353
New Record!
Epoch   2/30 Batch   40/78 - Loss:  2.486, Seconds: 2.38
Average loss for this update: 2.291
New Record!
Epoch   2/30 Batch   60/78 - Loss:  2.234, Seconds: 2.57
Average loss for this update: 2.337
No Improvement.
Epoch   3/30 Batch   20/78 - Loss:  2.210, Seconds: 2.61
Average loss for this update: 2.303
No Improvement.
Epoch   3/30 Batch   40/78 - Loss:  2.412, Seconds: 2.43
Average loss for this update: 2.217
New Record!
Epoch   3/30 Batch   60/78 - Loss:  2.159, Seconds: 2.62
Average loss for this update: 2.257
No Improvement.
Epoch   4/30 Batch   20/78 - Loss:  2.1

In [0]:
clean_train_plot_top10 = clean_training_plot[0:10]
clean_train_tag_top10 = clean_training_tag[0:10]


In [0]:
def text_to_seq(text):
    text = clean_text(text)
    return [vocab_to_int.get(word, vocab_to_int['<UNK>']) for word in text.split()]

In [0]:
output_summaries = []
checkpoint = "./best_model.ckpt"
pad = vocab_to_int["<PAD>"] 
loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    
    loader = tf.train.import_meta_graph(checkpoint + '.meta')
    loader.restore(sess, checkpoint)

    input_data = loaded_graph.get_tensor_by_name('input:0')
    logits = loaded_graph.get_tensor_by_name('predictions:0')
    text_length = loaded_graph.get_tensor_by_name('text_length:0')
    summary_length = loaded_graph.get_tensor_by_name('summary_length:0')
    keep_prob = loaded_graph.get_tensor_by_name('keep_prob:0')
    
    
    for i in range(0, len(clean_train_tag_top10)):
        text = text_to_seq(clean_train_plot_top10[i])
        answer_logits = sess.run(logits, {input_data: [text]*batch_size, 
                                      summary_length: [np.random.randint(10,20)], 
                                      text_length: [len(text)]*batch_size,
                                      keep_prob: 1.0})[0] 
        temp = " ".join([int_to_vocab[i] for i in answer_logits if i != pad])
        output_summaries.append(temp)
       

INFO:tensorflow:Restoring parameters from ./best_model.ckpt


In [0]:
!pip install rouge



In [0]:
import json
from rouge import Rouge

In [0]:
rouge = Rouge()
scores = rouge.get_scores(output_summaries, clean_train_tag_top10, avg = True)
print("Training data scores:",scores)

Training data scores: {'rouge-1': {'f': 0.187341266984638, 'p': 0.21761904761904766, 'r': 0.1692063492063492}, 'rouge-2': {'f': 0.10389610291044023, 'p': 0.12, 'r': 0.09166666666666666}, 'rouge-l': {'f': 0.1696042044174822, 'p': 0.20095238095238094, 'r': 0.1608730158730159}}


**Tagline Predictions**

In [0]:
# Create your own review or use one from the dataset
#input_sentence = "Robbie, is set in 1998 and centres on a little girl, Gloria, who loves her nursemaid robot, Robbie. Her mother comes to believe that robots are unsafe, however, and Robbie is returned to the factory. Gloria is heartbroken. In an effort to show her that robots are machines, not people, her parents take her to see robots being assembled at a factory. One of the assembling robots is Robbie. Gloria endangers her life running to Robbie, and Robbie rescues Gloria, persuading Gloria's mother that robots can be trusted"
title = "A Revenge Story"
input_sentence="When Bhallaladeva conspires against his brother to become the king of Mahishmati, he has him killed by Katappa and imprisons his wife. Years later, his brother's son returns to avenge his father's death."
text = text_to_seq(input_sentence)
#random = np.random.randint(0,len(clean_training_plot))
#input_sentence = clean_training_plot[random]
#text = text_to_seq(clean_training_plot[random])

checkpoint = "./best_model.ckpt"

loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    
    loader = tf.train.import_meta_graph(checkpoint + '.meta')
    loader.restore(sess, checkpoint)

    input_data = loaded_graph.get_tensor_by_name('input:0')
    logits = loaded_graph.get_tensor_by_name('predictions:0')
    text_length = loaded_graph.get_tensor_by_name('text_length:0')
    summary_length = loaded_graph.get_tensor_by_name('summary_length:0')
    keep_prob = loaded_graph.get_tensor_by_name('keep_prob:0')
    
    
    answer_logits = sess.run(logits, {input_data: [text]*batch_size, 
                                      summary_length: [np.random.randint(5,8)], 
                                      text_length: [len(text)]*batch_size,
                                      keep_prob: 1.0})[0] 


pad = vocab_to_int["<PAD>"] 

print('Original Title:', title)

print('\nText')
print('  Word Ids:    {}'.format([i for i in text]))
print('  Input Words: {}'.format(" ".join([int_to_vocab[i] for i in text])))

print('\nSummary')
print('  Word Ids:       {}'.format([i for i in answer_logits if i != pad]))
print('  Response Words: {}'.format(" ".join([int_to_vocab[i] for i in answer_logits if i != pad])))

INFO:tensorflow:Restoring parameters from ./best_model.ckpt
Original Title: A Revenge Story

Text
  Word Ids:    [33109, 17558, 1372, 140, 135, 33109, 1925, 33109, 12079, 1178, 411, 1165, 1372, 483, 1809, 1767, 482, 445]
  Input Words: <UNK> conspires brother become king <UNK> killed <UNK> imprisons wife years later brother son returns avenge father death

Summary
  Word Ids:       [87, 3869, 55, 3550, 1030]
  Response Words: an bold tale unlike betrayal


In [0]:

input_sentence = "Hi This is Surekha from Hyderabad.I love irobot movie and the plot is as follows.A robot may not injure a human being or, through inaction, allow a human being to come to harm"
text = text_to_seq(input_sentence)

checkpoint = "./best_model.ckpt"

loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    
    loader = tf.train.import_meta_graph(checkpoint + '.meta')
    loader.restore(sess, checkpoint)

    input_data = loaded_graph.get_tensor_by_name('input:0')
    logits = loaded_graph.get_tensor_by_name('predictions:0')
    text_length = loaded_graph.get_tensor_by_name('text_length:0')
    summary_length = loaded_graph.get_tensor_by_name('summary_length:0')
    keep_prob = loaded_graph.get_tensor_by_name('keep_prob:0')
    
    
    answer_logits = sess.run(logits, {input_data: [text]*batch_size, 
                                      summary_length: [np.random.randint(5,8)], 
                                      text_length: [len(text)]*batch_size,
                                      keep_prob: 1.0})[0] 


pad = vocab_to_int["<PAD>"] 

#print('Original Text:', movie_training_data.movie_plot[random])
#print('Original summary:', movie_training_data.tagLine[random])#clean_summaries[random]

print('\nText')
print('  Word Ids:    {}'.format([i for i in text]))
print('  Input Words: {}'.format(" ".join([int_to_vocab[i] for i in text])))

print('\nSummary')
print('  Word Ids:       {}'.format([i for i in answer_logits if i != pad]))
print('  Response Words: {}'.format(" ".join([int_to_vocab[i] for i in answer_logits if i != pad])))

INFO:tensorflow:Restoring parameters from ./best_model.ckpt

Text
  Word Ids:    [2841, 33109, 33109, 83, 33109, 6, 1782, 6343, 7046, 747, 33109, 65, 33109, 1908, 65, 25, 1665]
  Input Words: hi <UNK> <UNK> love <UNK> movie plot follows robot may <UNK> human <UNK> allow human come harm

Summary
  Word Ids:       [1283, 2810, 2811, 192, 20, 1002]
  Response Words: super stuntman supermodel feel on inside


Mistakes:1)Names are becoming unownk for local names and places
                  2)Repetation of words 
                  3)Not exact prediction because restriction on Tagline length and Plot
                 

**Translation** **to** **Indian** **Languages**

In [0]:
output=format(" ".join([int_to_vocab[i] for i in answer_logits]))

In [0]:
from IPython import get_ipython
ipython = get_ipython()

ipython.magic("sx pip install googletrans")




In [0]:
from googletrans import Translator
def Translate_To_hindi(output):
    '''
    This function takes in the below parameters 
    summary = Movie summary/plot
    To translate the summary to English language
    '''
    translator = Translator()
    translation=translator.translate(output,dest='hindi') 
    return translation.text
def Translate_To_telugu(output):
    '''
    This function takes in the below parameters 
    summary = Movie summary/plot
    To translate the summary to English language
    '''
    translator = Translator()
    translation=translator.translate(output,dest='telugu') 
    return translation.text
def Translate_To_tamil(output):
    '''
    This function takes in the below parameters 
    summary = Movie summary/plot
    To translate the summary to English language
    '''
    translator = Translator()
    translation=translator.translate(output,dest='tamil') 
    return translation.text


In [0]:
summ=Translate_To_hindi(output)
#summ=Translate_To_hindi(summ_in_english1)
print("Hindi Translation:",summ)
summ=Translate_To_telugu(output)
print("Telugu Translation:",summ)
summ=Translate_To_tamil(output)
print("Tamil Translation:",summ)


Hindi Translation: सुपर स्टंटमैन सुपर मॉडल अंदर महसूस करता है
Telugu Translation: లోపల సూపర్ స్టంట్మ్యాన్ సూపర్మోడల్ అనుభూతి
Tamil Translation: சூப்பர் ஸ்டண்ட்மேன் சூப்பர்மாடல் உள்ளே உணர்கிறேன்
