In [178]:
import re
import nltk
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup 
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [12]:
data = pd.read_csv('data/Reviews.csv', nrows=100000)

In [18]:
data = data[['Text', 'Summary']]
data.sample(10)

Unnamed: 0,Text,Summary
11178,If you have serious food allergies or gluten i...,"Yummy, little crunchy cookies"
22128,I bought this for my 6 month old lab puppy. He...,Not for a large puppy or chewer
76138,These natural chewy candies have a wonderful u...,Ginger Chews
13687,Our black lab devoured this thing in 10 minute...,just an expensive dog treat
82931,I've been a San Francisco Bay coffee fan for y...,WONDERFUL!
798,"Of course, we all know how delicious Ghirardel...",Great chocolate...
37574,I love the strawberry flavor of these. I boug...,Love these!
44014,My boys love love love this brand and this par...,BOYS LOVEEE
4787,This food is a vast improvement over the store...,Blows away store-bought
12963,You will become addicted to it once you eat it...,The best kind of crackers on earth


In [25]:
data['Text'].nunique()
data['Summary'].nunique()

72348

In [35]:
data = data.drop_duplicates(subset=['Text']).dropna(axis=0)
data

Unnamed: 0,Text,Summary
0,I have bought several of the Vitality canned d...,Good Quality Dog Food
1,Product arrived labeled as Jumbo Salted Peanut...,Not as Advertised
2,This is a confection that has been around a fe...,"""Delight"" says it all"
3,If you are looking for the secret ingredient i...,Cough Medicine
4,Great taffy at a great price. There was a wid...,Great taffy
...,...,...
99995,I just love it and will buy another box when I...,yummy!
99996,My late father in law used to have a rating sy...,Tastes like More!
99997,This is my favorite brand of Korean ramen. It ...,Great ramen
99998,"I do like these noodles although, to say they ...",Spicy!!


In [36]:
contractions = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is", "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are", "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have", "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have"}
stop_words = stopwords.words('english')

In [111]:
def preprocess_sentence(sentence: str, remove_stopwords: bool = True):
    sentence = sentence.lower()
    sentence = BeautifulSoup(sentence, 'lxml').text
    sentence = re.sub(r'\([^)]*\)', '', sentence)
    sentence = re.sub('"', '', sentence)
    sentence = ' '.join([contractions[t] if t in contractions else t for t in sentence.split(' ')])
    sentence = re.sub(r"'s\b", '', sentence)
    sentence = re.sub(r"'[^a-zA-Z]", '', sentence)
    sentence = re.sub(r'[m]{2,}', 'mm', sentence)
    # sentence.split()
    if remove_stopwords:
        tokens = ' '.join(word for word in sentence.split() if not word in stop_words if len(word) > 1)
    else:
        tokens = ' '.join(word for word in sentence.split() if len(word) > 1)
    return tokens

In [113]:
clean_text = []
for s in data['Text']:
    clean_text.append(preprocess_sentence(s))

In [114]:
clean_summary = []
for s in data['Summary']:
    clean_summary.append(preprocess_sentence(s, False))



In [138]:
data['Text'] = clean_text
data['Summary'] = clean_summary
data = data.replace('', np.nan)
data = data.dropna(axis=0)

In [139]:
data = data.loc[data['Text'].apply(lambda x: len(x.split()) <= 50)]
data = data.loc[data['Summary'].apply(lambda x: len(x.split()) <= 8)]

In [157]:
data['decoder_input'] = data['Summary'].apply(lambda x: 'sostoken ' + x)
data['decoder_target'] = data['Summary'].apply(lambda x: x + ' eostoken')

In [158]:
encoder_input = np.array(data['Text'])
decoder_input = np.array(data['decoder_input'])
decoder_target = np.array(data['decoder_target'])

In [179]:
indices = np.arange(encoder_input.shape[0])
np.random.shuffle(indices)
encoder_input = encoder_input[indices]
decoder_input = decoder_input[indices]
decoder_target = decoder_target[indices]
n_of_val = int(len(encoder_input)*0.2)

encoder_input_train = encoder_input[:-n_of_val]
decoder_input_train = decoder_input[:-n_of_val]
decoder_target_train = decoder_target[:-n_of_val]

encoder_input_test = encoder_input[-n_of_val:]
decoder_input_test = decoder_input[-n_of_val:]
decoder_target_test = decoder_target[-n_of_val:]

In [187]:
src_tokenizer = Tokenizer(num_words=8000)
src_tokenizer.fit_on_texts(encoder_input_train)

encoder_input_int_train = src_tokenizer.texts_to_sequences(encoder_input_train)
encoder_input_int_test = src_tokenizer.texts_to_sequences(encoder_input_test)

In [191]:
tar_tokenizer = Tokenizer(num_words=2000)
tar_tokenizer.fit_on_texts(decoder_input_train)
tar_tokenizer.fit_on_texts(decoder_target_train)

decoder_input_int_train = tar_tokenizer.texts_to_sequences(decoder_input_train)
decoder_target_int_train = tar_tokenizer.texts_to_sequences(decoder_target_train)
decoder_input_int_test = tar_tokenizer.texts_to_sequences(decoder_input_test)
decoder_target_int_test = tar_tokenizer.texts_to_sequences(decoder_target_test)

In [195]:
drop_train = [index for index, sentence in enumerate(decoder_input_int_train) if len(sentence)==1]
drop_test = [index for index, sentence in enumerate(decoder_input_int_test) if len(sentence)==1]

encoder_input_int_droped_train = np.delete(encoder_input_int_train, drop_train, axis=0)
decoder_input_int_droped_train = np.delete(decoder_input_int_train, drop_train, axis=0)
decoder_target_int_droped_train = np.delete(decoder_target_int_train, drop_train, axis=0)

encoder_input_int_droped_test = np.delete(encoder_input_int_test, drop_test, axis=0)
decoder_input_int_droped_test = np.delete(decoder_input_int_test, drop_test, axis=0)
decoder_target_int_droped_test = np.delete(decoder_target_int_test, drop_test, axis=0)

  return array(a, dtype, copy=False, order=order)


In [202]:
encoder_input_train = pad_sequences(encoder_input_int_droped_train, maxlen=50, padding='post')
encoder_input_test = pad_sequences(encoder_input_int_droped_test, maxlen=50, padding='post')
decoder_input_train = pad_sequences(decoder_input_int_droped_train, maxlen=8, padding='post')
decoder_input_test = pad_sequences(encoder_input_int_droped_test, maxlen=8, padding='post')
decoder_target_train = pad_sequences(decoder_target_int_droped_train, maxlen=8, padding='post')
decoder_target_test = pad_sequences(decoder_target_int_droped_test, maxlen=8, padding='post')

In [203]:
decoder_target_test

array([[   3,  118,    2, ...,    0,    0,    0],
       [ 250,  198,    2, ...,    0,    0,    0],
       [  81,    6,  568, ...,    0,    0,    0],
       ...,
       [ 314,   52,  631, ...,    0,    0,    0],
       [   6,   40,    2, ...,    0,    0,    0],
       [ 179,   46, 1274, ...,    2,    0,    0]])