In [16]:
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
import string
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split

In [17]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [18]:
df = pd.read_json('/content/drive/My Drive/Colab Notebooks/IMDB_reviews.json', lines=True)
df.head()

Unnamed: 0,review_date,movie_id,user_id,is_spoiler,review_text,rating,review_summary
0,10 February 2006,tt0111161,ur1898687,True,"In its Oscar year, Shawshank Redemption (writt...",10,A classic piece of unforgettable film-making.
1,6 September 2000,tt0111161,ur0842118,True,The Shawshank Redemption is without a doubt on...,10,Simply amazing. The best film of the 90's.
2,3 August 2001,tt0111161,ur1285640,True,I believe that this film is the best story eve...,8,The best story ever told on film
3,1 September 2002,tt0111161,ur1003471,True,"**Yes, there are SPOILERS here**This film has ...",10,Busy dying or busy living?
4,20 May 2004,tt0111161,ur0226855,True,At the heart of this extraordinary movie is a ...,8,"Great story, wondrously told and acted"


In [19]:
df = df.loc[:200000, :].copy()

In [20]:
df = df[['is_spoiler', 'review_summary', 'review_text']]
df.head()

Unnamed: 0,is_spoiler,review_summary,review_text
0,True,A classic piece of unforgettable film-making.,"In its Oscar year, Shawshank Redemption (writt..."
1,True,Simply amazing. The best film of the 90's.,The Shawshank Redemption is without a doubt on...
2,True,The best story ever told on film,I believe that this film is the best story eve...
3,True,Busy dying or busy living?,"**Yes, there are SPOILERS here**This film has ..."
4,True,"Great story, wondrously told and acted",At the heart of this extraordinary movie is a ...


In [21]:
df['review'] = df['review_summary'] + ' ' + df['review_text']

In [22]:
df = df.drop(columns=['review_summary', 'review_text'])

In [23]:
df['is_spoiler'] = df['is_spoiler'].map({True:1, False:0})

In [24]:
df.head()

Unnamed: 0,is_spoiler,review
0,1,A classic piece of unforgettable film-making. ...
1,1,Simply amazing. The best film of the 90's. The...
2,1,The best story ever told on film I believe tha...
3,1,"Busy dying or busy living? **Yes, there are SP..."
4,1,"Great story, wondrously told and acted At the ..."


In [25]:
df.shape

(200001, 2)

In [26]:
df.nunique()

is_spoiler         2
review        199949
dtype: int64

In [27]:
df['is_spoiler'].value_counts()

0    142041
1     57960
Name: is_spoiler, dtype: int64

In [28]:
df.isna().sum()

is_spoiler    0
review        0
dtype: int64

In [29]:
def clean (text):
    
    for punctuation in string.punctuation:
        text = text.replace(punctuation, ' ') # Remove Punctuation
        
    lowercased = text.lower() # Lower Case
        
    tokenized = word_tokenize(lowercased) # Tokenize
    
    words_only = [word for word in tokenized if word.isalpha()] # Remove numbers
    
    stop_words = set(stopwords.words('english')) # Make stopword list
    
    without_stopwords = [word for word in words_only if not word in stop_words] # Remove Stop Words
    
    return " ".join(without_stopwords)

In [30]:
nltk.download('stopwords')

df['clean_reviews'] = df['review'].apply(clean)

df.head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,is_spoiler,review,clean_reviews
0,1,A classic piece of unforgettable film-making. ...,classic piece unforgettable film making oscar ...
1,1,Simply amazing. The best film of the 90's. The...,simply amazing best film shawshank redemption ...
2,1,The best story ever told on film I believe tha...,best story ever told film believe film best st...
3,1,"Busy dying or busy living? **Yes, there are SP...",busy dying busy living yes spoilers film emoti...
4,1,"Great story, wondrously told and acted At the ...",great story wondrously told acted heart extrao...


In [31]:
df.drop(columns='review', inplace=True)

In [32]:
df.head()

Unnamed: 0,is_spoiler,clean_reviews
0,1,classic piece unforgettable film making oscar ...
1,1,simply amazing best film shawshank redemption ...
2,1,best story ever told film believe film best st...
3,1,busy dying busy living yes spoilers film emoti...
4,1,great story wondrously told acted heart extrao...


In [33]:
nltk.download('wordnet')

def stemmwords(row):
    l= [lemmatizer.lemmatize(word) for word in row]
    return "".join(l)

lemmatizer = WordNetLemmatizer()

df['clean_reviews'] = df['clean_reviews'].apply(stemmwords)
df.head()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,is_spoiler,clean_reviews
0,1,classic piece unforgettable film making oscar ...
1,1,simply amazing best film shawshank redemption ...
2,1,best story ever told film believe film best st...
3,1,busy dying busy living yes spoilers film emoti...
4,1,great story wondrously told acted heart extrao...


In [34]:
X = df[['clean_reviews']]
y = df['is_spoiler']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3)

In [35]:
X_train.head()

Unnamed: 0,clean_reviews
134466,alice chains maybe getting cgi phantasmagorica...
184134,one scariest movies ever made movie scared cra...
47147,movie definitely one best movie upon others ba...
147964,devils black sheep really bad eggs waste movie...
34225,thrill minute watching movie dvd fascinating r...


In [36]:
def convert_sentences(X):
    return [sentence.split(' ') for sentence in X]

X_train = X_train.apply(convert_sentences)
X_test = X_test.apply(convert_sentences)

In [37]:
X_train.head()

Unnamed: 0,clean_reviews
134466,"[alice, chains, maybe, getting, cgi, phantasma..."
184134,"[one, scariest, movies, ever, made, movie, sca..."
47147,"[movie, definitely, one, best, movie, upon, ot..."
147964,"[devils, black, sheep, really, bad, eggs, wast..."
34225,"[thrill, minute, watching, movie, dvd, fascina..."


In [38]:
word_to_id = {}
iter_ = 1
for sentence in X_train['clean_reviews']:
    for word in sentence:
        if word in word_to_id:
            continue
        word_to_id[word] = iter_
        iter_ += 1

In [39]:
print(f'There are {len(word_to_id)} different words in the train sentences')

There are 132286 different words in the train sentences


In [40]:
id_to_word = {v:k for k, v in word_to_id.items()}

In [41]:
def tokenize(sentences, word_to_id):
    return [[word_to_id[_] for _ in s if _ in word_to_id] for s in sentences]

X_token_train = tokenize(X_train['clean_reviews'], word_to_id)
X_token_test = tokenize(X_test['clean_reviews'], word_to_id)

In [42]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

X_train_maxlen = pad_sequences(X_token_train, maxlen=150, dtype='float32', padding='post')
X_test_maxlen = pad_sequences(X_token_test, maxlen=150, dtype='float32', padding='post')

In [54]:
X_train_maxlen[1]

array([ 12., 191.,   7., 192., 107., 193., 194., 195.,  97., 196., 197.,
       198., 199., 200., 201., 202., 203., 204.,  12., 205., 206., 207.,
       208., 193., 101., 193., 208.,  20., 209., 210., 211., 212., 213.,
       214., 215., 216., 217.,  97., 218., 219., 220., 221., 219., 222.,
       223., 193., 193., 224., 225., 183., 226., 219., 227., 193., 228.,
        20., 208., 193., 193., 229., 230., 208., 193., 107., 231., 232.,
       233.,  71., 234., 235., 236., 237., 238., 239., 240., 241., 242.,
       125., 235., 236., 243., 244.,  37., 245.,  28., 246.,  53., 203.,
       204.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
         0.,   0.,   0.,   0.,   0.,   0.,   0.], d

In [43]:
from tensorflow.keras import Sequential
from tensorflow.keras import layers 

def init_model(vocab_size):
    model = Sequential()
    model.add(layers.Embedding(input_dim=vocab_size+1, output_dim=30, mask_zero=True))
    model.add(layers.LSTM(10))
    model.add(layers.Dense(5))
    model.add(layers.Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    
    return model

In [44]:
from tensorflow.keras.callbacks import EarlyStopping

model = init_model(len(word_to_id))

es = EarlyStopping(patience=5, restore_best_weights=True)

model.fit(X_train_maxlen, y_train, 
          epochs=100, 
          batch_size=32,
          validation_split=0.3,
          callbacks=[es]
         )

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100


<tensorflow.python.keras.callbacks.History at 0x7f76e779b7f0>

In [45]:
res = model.evaluate(X_test_maxlen, y_test)
print('Test accuracy:', res[1])

Test accuracy: 0.7518874406814575


In [52]:
review_to_predict = ['spoiler spoiler spoiler alert this is a spoiler, darth vader is lukes father. im making a spoiler']

sentence_converted = convert_sentences(review_to_predict)

#print(sentence_converted)

prediction_token = tokenize(sentence_converted, word_to_id)

prediction_pad = pad_sequences(prediction_token, maxlen=150, dtype='float32', padding='post')

model.predict(prediction_pad)

array([[0.67379004]], dtype=float32)

In [55]:
model.predict_proba(prediction_pad)

Instructions for updating:
Please use `model.predict()` instead.


array([[0.67379004]], dtype=float32)