# Pipelining and Deploying

## Select a smaller dataset to work with

In [2]:
import pandas as pd

df = pd.read_json('./../../raw_data/IMDB_reviews.json', lines=True)

Unnamed: 0,is_spoiler,movie_id,rating,review_date,review_summary,review_text,user_id
0,True,tt0111161,10,10 February 2006,A classic piece of unforgettable film-making.,"In its Oscar year, Shawshank Redemption (writt...",ur1898687
1,True,tt0111161,10,6 September 2000,Simply amazing. The best film of the 90's.,The Shawshank Redemption is without a doubt on...,ur0842118
2,True,tt0111161,8,3 August 2001,The best story ever told on film,I believe that this film is the best story eve...,ur1285640
3,True,tt0111161,10,1 September 2002,Busy dying or busy living?,"**Yes, there are SPOILERS here**This film has ...",ur1003471
4,True,tt0111161,8,20 May 2004,"Great story, wondrously told and acted",At the heart of this extraordinary movie is a ...,ur0226855


In [3]:
df.shape

(573913, 7)

In [78]:
t_df = df[df['is_spoiler']==True].iloc[:500]
f_df = df[df['is_spoiler']==False].iloc[:500]
small_df = pd.concat([t_df, f_df], ignore_index=True)

True     500
False    500
Name: is_spoiler, dtype: int64

In [79]:
small_df['review'] = small_df['review_summary'] + ' ' + small_df['review_text']
small_df = small_df[['is_spoiler', 'review']]
small_df.head()

Unnamed: 0,is_spoiler,review
0,True,A classic piece of unforgettable film-making. ...
1,True,Simply amazing. The best film of the 90's. The...
2,True,The best story ever told on film I believe tha...
3,True,"Busy dying or busy living? **Yes, there are SP..."
4,True,"Great story, wondrously told and acted At the ..."


# Split into Train and Test

In [80]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(small_df['review'], small_df['is_spoiler'])

In [81]:
X_train

369    You must to see this film A story of friendshi...
424    Awesome This is simply one of the best films e...
934    hope....is greatest thing i watched this movie...
422    An amazing movie everyone will love So, I firs...
316    the most downtrodden film Oscar 's story? The ...
755    Best movie ever One of the best movies that i ...
739    - Some things Are Best left Unsaid - When you ...
681    A true story of friendship and hard times Shaw...
826    Cinematic masterpiece Have seen this film a co...
813    Time Flies One interesting thing about this mo...
969    Plainly Superb! Let me start out by saying tha...
741    an alright movie to watch It is not uncommon t...
960    A Truly Amazing Experience This title has to b...
82     good reviews of best movie of all time The Sha...
566    prison escape This film manages to redeem Holl...
444    The best movie ever made. The Shawshank Redemp...
554    I didn't think films could be this good.... Th...
68     Why the ending is so mov

# Clean Reviews

### Clean reviews text

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
import string
import unidecode


def clean (text):
    
    for punctuation in string.punctuation:
        text = text.replace(punctuation, ' ') # Remove Punctuation
        
    lowercased = text.lower() # Lower Case
    
    unaccented_string = unidecode.unidecode(lowercased) # remove accents
    
    tokenized = word_tokenize(unaccented_string) # Tokenize
    
    words_only = [word for word in tokenized if word.isalpha()] # Remove numbers
    
    stop_words = set(stopwords.words('portuguese')) # Make stopword list
    
    without_stopwords = [word for word in words_only if not word in stop_words] # Remove Stop Words
    
    return " ".join(without_stopwords)

df['clean_text'] = df['title_comment'].apply(clean)

df.head()

In [82]:
# Convert to list of list of words
def convert_sentences(X):
    return [sentence.split(' ') for sentence in X]

X_train = convert_sentences(X_train)
X_test = convert_sentences(X_test)

In [83]:
X_train[0]

['You',
 'must',
 'to',
 'see',
 'this',
 'film',
 'A',
 'story',
 'of',
 'friendship',
 'and',
 'endurance',
 'set',
 'in',
 'the',
 'reality',
 'of',
 'prison',
 'life,',
 'The',
 'Shawshank',
 'Redemption',
 'is',
 'best',
 'known',
 'as',
 'an',
 'Oscar-nominated',
 'film.',
 'However,',
 'this',
 'theatre',
 'production',
 'is',
 'based',
 'instead',
 'on',
 'the',
 'original',
 'novella',
 'written',
 'by',
 'Stephen',
 'King',
 'relatively',
 'early',
 'in',
 'his',
 'career.The',
 'direction',
 'in',
 'this',
 'piece',
 'is',
 'stylised,',
 'with',
 'a',
 'deliberate',
 'slow',
 'rhythm',
 'and',
 'pace,',
 'particularly',
 'in',
 'the',
 'first',
 'act,',
 'greatly',
 'reducing',
 'any',
 'sense',
 'of',
 'menace',
 'that',
 'you',
 'would',
 'expect',
 'from',
 'this',
 'so-called',
 'tough',
 'prison.',
 'It',
 'is',
 'almost',
 'Brechtian',
 'in',
 'style.',
 'This',
 'becomes',
 'frustrating',
 'as',
 'it',
 'takes',
 'much',
 'of',
 'the',
 'emotion',
 'out',
 'of',
 'the

In [101]:
def data_cleaning(X):
    # LET IT AS IT IS
    return X

X_clean_train = data_cleaning(X_train)
X_clean_test = data_cleaning(X_test)

## Tokenization

In [87]:
word_to_id = {}
iter_ = 1
for sentence in X_clean_train:
    for word in sentence:
        if word in word_to_id:
            continue
        word_to_id[word] = iter_
        iter_ += 1

In [88]:
print(f'There are {len(word_to_id)} different words in the train sentences')

There are 18247 different words in the train sentences


In [89]:
id_to_word = {v:k for k, v in word_to_id.items()}

In [90]:
def tokenize(sentences, word_to_id):
    return [[word_to_id[_] for _ in s if _ in word_to_id] for s in sentences]

X_token_train = tokenize(X_clean_train, word_to_id)
X_token_test = tokenize(X_clean_test, word_to_id)

In [99]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

X_train_pad = pad_sequences(X_token_train, dtype='float32', padding='post')
X_test_pad = pad_sequences(X_token_test, dtype='float32', padding='post')

In [92]:
X_train_pad.shape

(750, 998)

### Tokenize labels

In [93]:
import numpy as np
y_train = np.array([1 if x else 0 for x in y_train])
y_test = np.array([1 if x else 0 for x in y_test])

# The model 

In [94]:
from tensorflow.keras import Sequential
from tensorflow.keras import layers 

def init_model(vocab_size):
    model = Sequential()
    model.add(layers.Embedding(input_dim=vocab_size+1, output_dim=30, mask_zero=True))
    model.add(layers.LSTM(10))
    model.add(layers.Dense(5))
    model.add(layers.Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    
    return model

In [95]:
##############
### Answer ###
##############

from tensorflow.keras.callbacks import EarlyStopping

model = init_model(len(word_to_id))

es = EarlyStopping(patience=5, restore_best_weights=True)

model.fit(X_train_pad, y_train, 
          epochs=10, 
          batch_size=32,
          validation_split=0.3,
          callbacks=[es]
         )

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10


<tensorflow.python.keras.callbacks.History at 0x125be0990>

In [96]:
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, None, 30)          547440    
_________________________________________________________________
lstm_4 (LSTM)                (None, 10)                1640      
_________________________________________________________________
dense_8 (Dense)              (None, 5)                 55        
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 6         
Total params: 549,141
Trainable params: 549,141
Non-trainable params: 0
_________________________________________________________________


# Make a prediction

In [97]:
input_review = "I loved the movie, it was fantastic and thrilling, you should really see it. I recommend it"
# Convert Sentences
inpute_review = convert_sentences(input_review)
# Tokenize
input_review = tokenize(input_review, word_to_id)
# Pad
input_review = pad_sequences(input_review, dtype='float32', padding='post')

In [98]:
res = model.predict(input_review).mean()
print(f'Your input has a {round(res*100, 1)}% chance of being a spoiler')

Your input has a 48.3% chance of being a spoiler
