In [2]:
# REMEMBER TO TURN GPU ON !!!

import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.metrics import Recall
from tensorflow.keras import Sequential
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping

In [3]:
# LOCALLY

#data = pd.read_csv('../../Raw_Data/data_cleaned.csv')
#data.head()

In [4]:
# FROM GOOGLE COLAB

from google.colab import drive
drive.mount("/content/drive")
data = pd.read_csv('/content/drive/My Drive/data_cleaned.csv')
data.head()

Mounted at /content/drive


Unnamed: 0,is_spoiler,clean_reviews
0,1,classic piec unforgett film make oscar year sh...
1,1,simpli amaz best film shawshank redempt withou...
2,1,best stori ever told film believ film best sto...
3,1,busi die busi live ye spoiler film emot impact...
4,1,great stori wondrous told act heart extraordin...


In [5]:
data.shape

(573913, 2)

In [6]:
df_shuffle = data.sample(frac=1).copy()

df_shuffle.reset_index(inplace =True)
df_shuffle.drop(columns='index', inplace= True)
df_shuffle

Unnamed: 0,is_spoiler,clean_reviews
0,1,lightheart romp whitewash genocid whole famili...
1,0,live everi posit review head late show tonight...
2,0,well done open rang pretti good job show life ...
3,1,love husband inde middl class coupl quietli ho...
4,0,joyou julio tenoch teenag friend member secret...
...,...,...
573908,1,great movi long spoiler realli like movi ron p...
573909,1,one best literari adapt seen year mild spoiler...
573910,1,straight review siddharth sai amaz spider man ...
573911,0,mean would die guy first ever review imdb disg...


In [7]:
df_shuffle_test = df_shuffle.loc[:199_999]
df_shuffle_train = df_shuffle.loc[200_000:]

In [8]:
df_shuffle_test.shape

(200000, 2)

In [9]:
df_shuffle_test['is_spoiler'].value_counts(normalize=True)

0    0.738095
1    0.261905
Name: is_spoiler, dtype: float64

In [10]:
 g = df_shuffle_train.groupby('is_spoiler')
 g = g.apply(lambda x: x.sample(g.size().min()).reset_index(drop=True))
 g = g.set_index('is_spoiler')
 g = g.reset_index()
 g.head()

Unnamed: 0,is_spoiler,clean_reviews
0,0,ryder joli anchor genuin interest movi month m...
1,0,lawrenc arabia david lean see old film often s...
2,0,cheesi comedi movi live bill laugh hard gave f...
3,0,movi one mish mash nicol kidman full makeup al...
4,0,pretti freak bad tantamount blasphemi saw dvd ...


In [11]:
df_shuffle_train = g

In [12]:
df_shuffle_train['is_spoiler'].value_counts(normalize=True)

1    0.5
0    0.5
Name: is_spoiler, dtype: float64

In [13]:
df_sample_train = df_shuffle_train.sample(n=100_000)

In [15]:
df_sample_train['is_spoiler'].value_counts(normalize=True)

1    0.5006
0    0.4994
Name: is_spoiler, dtype: float64

In [16]:
X = df_sample_train[['clean_reviews']]
y = df_sample_train['is_spoiler']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

In [17]:
def convert_sentences(X):
    return [sentence.split(' ') for sentence in X]

X_train = X_train.apply(convert_sentences)
X_test = X_test.apply(convert_sentences)

In [18]:
word_to_id = {}
iter_ = 1
for sentence in X_train['clean_reviews']:
    for word in sentence:
        if word in word_to_id:
            continue
        word_to_id[word] = iter_
        iter_ += 1

In [19]:
# SAVING WORD_TO_ID INTO A JSON FILE

import json

with open('word_to_id.json', 'w') as fp:
    json.dump(word_to_id, fp)

In [20]:
!cp word_to_id.json "drive/My Drive/Colab Notebooks"

In [21]:
print(f'There are {len(word_to_id)} different words in the train sentences')

There are 78370 different words in the train sentences


In [22]:
def tokenize(sentences, word_to_id):
    return [[word_to_id[_] for _ in s if _ in word_to_id] for s in sentences]

X_token_train = tokenize(X_train['clean_reviews'], word_to_id)
X_token_test = tokenize(X_test['clean_reviews'], word_to_id)

In [23]:
X_train_maxlen = pad_sequences(X_token_train, maxlen=200, dtype='float32', padding='post')
X_test_maxlen = pad_sequences(X_token_test, maxlen=200, dtype='float32', padding='post')

In [24]:
recall = Recall(name='recall')

def init_model(vocab_size):
    model = Sequential()
    model.add(layers.Embedding(input_dim=vocab_size+1, output_dim=30, mask_zero=True))
    model.add(layers.GRU(units=128 , recurrent_dropout = 0.1 , dropout = 0.1))
    #model.add(layers.GRU(units=128 , recurrent_dropout = 0.1 , dropout = 0.1, return_sequences=True, input_shape=(X_train_maxlen.shape[1],200)))
    #model.add(layers.GRU(units=64, return_sequences=True, input_shape=(X_train_maxlen.shape[1],150)))
    #model.add(layers.GRU(units=32))
    #model.add(layers.Dense(64, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=recall)
    
    return model

In [25]:
model = init_model(len(word_to_id))
es = EarlyStopping(patience=5, restore_best_weights=True)
model.fit(X_train_maxlen, y_train, epochs=10, batch_size=32, validation_split=0.2, callbacks=[es] )

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10


<tensorflow.python.keras.callbacks.History at 0x7f0cd03ad828>

In [26]:
model.evaluate(X_test_maxlen, y_test)



[0.571760356426239, 0.7584953904151917]

In [32]:
X_shuffle_test = df_shuffle_test[['clean_reviews']]
y_shuffle_test = df_shuffle_test['is_spoiler']

X_shuffle_test_converted = X_shuffle_test.apply(convert_sentences)
X_shuffle_test_tokenized = tokenize(X_shuffle_test_converted['clean_reviews'], word_to_id)
X_shuffle_test_maxlen = pad_sequences(X_shuffle_test_tokenized, maxlen=200, dtype='float32', padding='post')


In [33]:
model.evaluate(X_shuffle_test_maxlen, y_shuffle_test)



[0.6030338406562805, 0.7638838291168213]

In [34]:
y_predict = model.predict(X_shuffle_test_maxlen)

In [35]:
len(y_predict)

200000

In [36]:
y_predict[0:10]

array([[0.8235458 ],
       [0.35441515],
       [0.22897525],
       [0.8257432 ],
       [0.5712198 ],
       [0.9715403 ],
       [0.6104355 ],
       [0.12103421],
       [0.11966778],
       [0.08729726]], dtype=float32)

In [48]:
y_predict_df = pd.DataFrame(y_predict)
#y_predict_df.head()
y_predict_labeled = y_predict_df[0].apply(lambda x: 1 if x>0.5 else 0)

In [52]:
from sklearn.metrics import classification_report

print(classification_report(y_shuffle_test, y_predict_labeled))

              precision    recall  f1-score   support

           0       0.88      0.62      0.73    147619
           1       0.42      0.76      0.54     52381

    accuracy                           0.66    200000
   macro avg       0.65      0.69      0.63    200000
weighted avg       0.76      0.66      0.68    200000



In [27]:
model.save('model_baseline')

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: model_baseline/assets


In [28]:
!cp -r model_baseline "drive/My Drive/Colab Notebooks"