In [1]:
from keras.models import Model
from keras.models import load_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import keras.backend as K
import pandas as pd
import numpy as np
import rouge

Using TensorFlow backend.


In [2]:
reviews = pd.read_csv('reviews2.csv')
summaries = pd.read_csv('summaries2.csv')
reviews = reviews.values
summaries = summaries.values


r_train, r_test, s_train, s_test = train_test_split(reviews, summaries, test_size=0.3, random_state=2)


summary_decoder_input = '<sos> ' + s_train
summary_decoder_target = s_train + ' <eos>'
summary_decoder_input = summary_decoder_input.flatten().tolist()
summary_decoder_target = summary_decoder_target.flatten().tolist()


r_train = r_train.flatten().tolist()
r_test = r_test.flatten().tolist()
s_train = s_train.flatten().tolist()
s_test = s_test.flatten().tolist()

reviews = reviews.flatten().tolist()
summaries = summaries.flatten().tolist()
print('number of examples: ', len(reviews))
print('number of training examples: ', len(r_train))
print('number of testing examples: ', len(r_test))
print('decoder input: ', summary_decoder_input[0])
print('decoder target: ', summary_decoder_target[0])

number of examples:  10076
number of training examples:  7053
number of testing examples:  3023
decoder input:  <sos> New features are really nice. It shows which since of the road you'll ...
decoder target:  New features are really nice. It shows which since of the road you'll ... <eos>


In [3]:
MAX_NUM_WORDS = 20000

In [4]:
tokenizer_r = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer_r.fit_on_texts(reviews)
review_sequences = tokenizer_r.texts_to_sequences(reviews)

max_review_len = max(len(s) for s in review_sequences)

tokenizer_s = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer_s.fit_on_texts(summaries)
summary_sequences = tokenizer_s.texts_to_sequences(summaries)


max_summary_len = max(len(s) for s in summary_sequences) + 1

print('maximum review length: ', max_review_len)
print('maximum summary length: ', max_summary_len)

maximum review length:  4832
maximum summary length:  31


In [5]:
tokenizer_r_train = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer_r_train.fit_on_texts(r_train)
r_train_sequences = tokenizer_r_train.texts_to_sequences(r_train)

word2idx_inputs = tokenizer_r_train.word_index

tokenizer_s_train = Tokenizer(num_words=MAX_NUM_WORDS, filters='')
tokenizer_s_train.fit_on_texts(summary_decoder_input + summary_decoder_target)
decoder_input_sequences = tokenizer_s_train.texts_to_sequences(summary_decoder_input)
decoder_target_sequences = tokenizer_s_train.texts_to_sequences(summary_decoder_target)

word2idx_summaries = tokenizer_s_train.word_index

print('Review vocabulary size: ', len(word2idx_inputs))
print('Summary vocabulary size: ', len(word2idx_summaries))

Review vocabulary size:  12686
Summary vocabulary size:  5842


In [6]:
tokenizer_r_test = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer_r_test.fit_on_texts(r_test)
r_test_sequences = tokenizer_r_test.texts_to_sequences(r_test)

test_sequences = pad_sequences(r_test_sequences, maxlen=max_review_len)

In [7]:
encoder_input = pad_sequences(r_train_sequences, maxlen=max_review_len)

print('encoder input shape: ', encoder_input.shape)

decoder_input = pad_sequences(decoder_input_sequences, maxlen=max_summary_len, padding='post')

print('decoder input shape:', decoder_input.shape)

decoder_target = pad_sequences(decoder_target_sequences, maxlen=max_summary_len, padding='post')

print('decoder target shape:', decoder_target.shape)

num_words_summaries = len(word2idx_summaries) + 1
num_words_reviews = len(word2idx_inputs) + 1

decoder_one_hot_targets = np.zeros((len(r_train), max_summary_len, num_words_summaries), dtype='uint8')

for i, d in enumerate(decoder_target):
    for t, word in enumerate(d):
        decoder_one_hot_targets[i, t, word] = 1
print('decoder one hot targets shape: ', decoder_one_hot_targets.shape)
print('encoder input[0]: ', encoder_input[0])
print('decoder input[0]: ', decoder_input[0])
print('decoder target[0]:', decoder_target[0])

encoder input shape:  (7053, 4832)
decoder input shape: (7053, 31)
decoder target shape: (7053, 31)
decoder one hot targets shape:  (7053, 31, 5843)
encoder input[0]:  [   0    0    0 ...    4   67 5519]
decoder input[0]:  [  1 121 706  54  43 299   9 645 197 435  20   3 607 943   4   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0]
decoder target[0]: [121 706  54  43 299   9 645 197 435  20   3 607 943   4   2   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0]


In [8]:
encoder_model = load_model('enc_model3.h5')
decoder_model = load_model('dec_model3.h5')













In [9]:
encoder_model.summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 4832)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 4832, 100)         1268700   
_________________________________________________________________
cu_dnnlstm_1 (CuDNNLSTM)     [(None, 256), (None, 256) 366592    
Total params: 1,635,292
Trainable params: 1,635,292
Non-trainable params: 0
_________________________________________________________________


In [10]:
decoder_model.summary()

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, 1)            0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 1, 256)       1495808     input_5[0][0]                    
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 256)          0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            (None, 256)          0                                            
____________________________________________________________________________________________

In [11]:
idx2word_review = {v:k for k, v in word2idx_inputs.items()}
idx2word_summary = {v:k for k, v in word2idx_summaries.items()}
idx2word_summary

{1: '<sos>',
 2: '<eos>',
 3: 'the',
 4: '...',
 5: 'and',
 6: 'i',
 7: 'great',
 8: 'a',
 9: 'it',
 10: 'for',
 11: 'case',
 12: 'this',
 13: 'is',
 14: 'to',
 15: 'my',
 16: 'phone',
 17: 'good',
 18: 'but',
 19: 'not',
 20: 'of',
 21: 'with',
 22: 'very',
 23: 'in',
 24: 'like',
 25: 'was',
 26: 'love',
 27: 'on',
 28: 'product',
 29: 'works',
 30: 'that',
 31: 'nice',
 32: 'have',
 33: 'as',
 34: 'so',
 35: 'quality',
 36: 'screen',
 37: 'price',
 38: 'one',
 39: 'you',
 40: "it's",
 41: 'best',
 42: 'easy',
 43: 'really',
 44: 'fit',
 45: 'just',
 46: 'your',
 47: 'perfect',
 48: 'iphone',
 49: 'looks',
 50: 'use',
 51: 'well',
 52: 'at',
 53: 'battery',
 54: 'are',
 55: 'fits',
 56: '-',
 57: 'only',
 58: 'has',
 59: 'excellent',
 60: 'charger',
 61: 'than',
 62: 'be',
 63: 'would',
 64: 'work',
 65: 'no',
 66: 'when',
 67: 'all',
 68: 'had',
 69: 'these',
 70: 'better',
 71: 'they',
 72: 'does',
 73: 'out',
 74: 'if',
 75: 'after',
 76: 'protector',
 77: 'an',
 78: 'great.',
 79

In [12]:
def decode_sequence(input_seq):
  
    states_value = encoder_model.predict(input_seq)


    target_seq = np.zeros((1, 1))


    target_seq[0, 0] = word2idx_summaries['<sos>']


    eos = word2idx_summaries['<eos>']


    output_sentence = []
    for _ in range(max_summary_len):
        output_tokens, h, c = decoder_model.predict(
          [target_seq] + states_value
        )
   
        idx = np.argmax(output_tokens[0, 0, :])

        if eos == idx:
            break

        word = ''
        if idx > 0:
            word = idx2word_summary[idx]
            output_sentence.append(word)


        target_seq[0, 0] = idx

        states_value = [h, c]

    return ' '.join(output_sentence)

In [13]:
s_pred = []
for i in range(len(test_sequences)):
    s_pred.append(decode_sequence(test_sequences[i:i+1]))

In [30]:
r_test[0]

"I ordered one and it's really pretty I love the decoration but within one day of having it, the corner was cracked and I didn't even drop my phone. So I ordered the free replacement and I had it for about a week an it's already chipped out of the bottom corner and again, I haven't dropped my phone."

In [28]:
s_test[0]

"I ordered one and it's really pretty I love the decoration but within one day of ..."

In [29]:
s_pred[0]

'i love the idea of this phone'

In [15]:
s_pred_train = []
for i in range(len(encoder_input)):
    s_pred_train.append(decode_sequence(encoder_input[i:i+1]))

In [16]:
def prepare_results(p, r, f):
    return '\t{}:\t{}: {:5.2f}\t{}: {:5.2f}\t{}: {:5.2f}'.format(metric, 'P', 100.0 * p, 'R', 100.0 * r, 'F1', 100.0 * f)

In [17]:
print('Evaluation with {}'.format('Training Set'))
evaluator = rouge.Rouge(metrics=['rouge-n'],
                       max_n=4,
                       limit_length=False,
                       length_limit=100,
                       length_limit_type='words',
                       apply_avg='Avg',
                       apply_best=False,
                       alpha=0.5, # Default F1_score
                       weight_factor=1.2,
                       stemming=True)

scores = evaluator.get_scores(s_pred_train, s_train)
for metric, results in sorted(scores.items(), key=lambda x: x[0]):
    print(prepare_results(results['p'], results['r'], results['f']))

Evaluation with Training Set
	rouge-1:	P: 33.22	R: 28.12	F1: 28.35
	rouge-2:	P: 13.67	R: 11.22	F1: 11.42
	rouge-3:	P:  5.58	R:  4.87	F1:  4.95
	rouge-4:	P:  2.73	R:  2.39	F1:  2.45


In [18]:
print('Evaluation with {}'.format('Testing Set'))
scores = evaluator.get_scores(s_pred, s_test)
for metric, results in sorted(scores.items(), key=lambda x: x[0]):
    print(prepare_results(results['p'], results['r'], results['f']))

Evaluation with Testing Set
	rouge-1:	P: 16.37	R: 13.84	F1: 13.65
	rouge-2:	P:  2.40	R:  2.18	F1:  2.05
	rouge-3:	P:  0.75	R:  0.70	F1:  0.68
	rouge-4:	P:  0.34	R:  0.33	F1:  0.33


In [32]:
print('---------------Summaries using test set------------------')
print()
while(True):
    i = np.random.randint(0, 3022)
    print('Review:', r_test[i])
    print()
    print('Original Summary: ', s_test[i])
    print()
    print('Generated Summary: ', s_pred[i])
    print()
    ans = input("Continue? [Y/n]")
    print('-------------------------------------------------')
    if ans and ans.lower().startswith('n'):
        break

---------------Summaries using test set------------------

Review: Probably the best bit of travel gear I've ever bought. When my wife and I travel, between us we have at least 4 gizmos that need charging. Finding even two plug points is often a challenge. With this baby I need just one. Grab it.

Original Summary:  Great travel accessory - indispensable.

Generated Summary:  the phone are really nice and i can need my iphone 6 from this ...

Continue? [Y/n]y
-------------------------------------------------
Review: This is a great case.  I love that it is fully enclosed with a built-in screen protector.  No problems with tactile sensitivity or anything else on the screen after almost 2 months of using it.  Protects really well and is a very nice looking case.

Original Summary:  This is a great case. I love that it is fully enclosed ...

Generated Summary:  this is a great case for the money

Continue? [Y/n]y
-------------------------------------------------
Review: Hands down, the be

In [25]:
print('---------------Summaries using training set------------------')
print()
while(True):
    i = np.random.randint(0, len(r_train))
    print('Review:', r_train[i])
    print()
    print('Original Summary: ', s_train[i])
    print()
    print('Generated Summary: ', s_pred_train[i])
    print()
    ans = input("Continue? [Y/n]")
    print('-------------------------------------------------')
    if ans and ans.lower().startswith('n'):
        break

---------------Summaries using training set------------------

Review: Great basic speaker - love that I can use as both a light and the blu-tooth speaker. LED color is a very cool white (blue) so used more for speaker! Teen actually stole mine - this has been a hit for all ages. I received this at a deeply discounted price for my honest opinion.

Original Summary:  Great basic speaker - love that I can use as ...

Generated Summary:  great charger

Continue? [Y/n]y
-------------------------------------------------
Review: Good case but takes awhile to charge back up and makes a whining sound while charging the battery pack back up.

Original Summary:  Good case but takes awhile to charge back up and ...

Generated Summary:  good case but the back is not very useful

Continue? [Y/n]y
-------------------------------------------------
Review: didnt work out for me display was scrambled purple and white you might have much better luck than me, if you have a problem do contact seller

Orig