In [39]:
# REMEMBER TO TURN GPU ON !!!

import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.metrics import Recall
from tensorflow.keras import Sequential
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping

In [40]:
# LOCALLY

#data = pd.read_csv('../../Raw_Data/data_cleaned.csv')
#data.head()

In [41]:
# FROM GOOGLE COLAB

from google.colab import drive
drive.mount("/content/drive")
data = pd.read_csv('/content/drive/My Drive/data_cleaned.csv')
data.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,is_spoiler,clean_reviews
0,1,classic piec unforgett film make oscar year sh...
1,1,simpli amaz best film shawshank redempt withou...
2,1,best stori ever told film believ film best sto...
3,1,busi die busi live ye spoiler film emot impact...
4,1,great stori wondrous told act heart extraordin...


In [42]:
data.shape

(573913, 2)

In [43]:
df = data.loc[:100000].copy()

In [44]:
df['is_spoiler'].value_counts()

0    74684
1    25317
Name: is_spoiler, dtype: int64

In [45]:
X = df[['clean_reviews']]
y = df['is_spoiler']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

In [46]:
def convert_sentences(X):
    return [sentence.split(' ') for sentence in X]

X_train = X_train.apply(convert_sentences)
X_test = X_test.apply(convert_sentences)

In [47]:
word_to_id = {}
iter_ = 1
for sentence in X_train['clean_reviews']:
    for word in sentence:
        if word in word_to_id:
            continue
        word_to_id[word] = iter_
        iter_ += 1

In [48]:
# SAVING WORD_TO_ID INTO A JSON FILE

import json

with open('word_to_id.json', 'w') as fp:
    json.dump(word_to_id, fp)

In [49]:
!cp word_to_id.json "drive/My Drive/Colab Notebooks"

In [50]:
print(f'There are {len(word_to_id)} different words in the train sentences')

There are 67500 different words in the train sentences


In [51]:
def tokenize(sentences, word_to_id):
    return [[word_to_id[_] for _ in s if _ in word_to_id] for s in sentences]

X_token_train = tokenize(X_train['clean_reviews'], word_to_id)
X_token_test = tokenize(X_test['clean_reviews'], word_to_id)

In [52]:
X_train_maxlen = pad_sequences(X_token_train, maxlen=150, dtype='float32', padding='post')
X_test_maxlen = pad_sequences(X_token_test, maxlen=150, dtype='float32', padding='post')

In [53]:
recall = Recall(name='recall')

def init_model(vocab_size):
    model = Sequential()
    model.add(layers.Embedding(input_dim=vocab_size+1, output_dim=30, mask_zero=True))
    model.add(layers.GRU(units=128 , recurrent_dropout = 0.1 , dropout = 0.1, return_sequences=True, input_shape=(X_train_maxlen.shape[1],150)))
    model.add(layers.GRU(units=64, return_sequences=True, input_shape=(X_train_maxlen.shape[1],150)))
    model.add(layers.GRU(units=32))
    #model.add(layers.Dense(64, activation='relu'))
    model.add(layers.Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=recall)
    
    return model

In [54]:
model = init_model(len(word_to_id))
es = EarlyStopping(patience=5, restore_best_weights=True)
model.fit(X_train_maxlen, y_train, epochs=10, batch_size=32, validation_split=0.2, callbacks=[es] )

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10


<tensorflow.python.keras.callbacks.History at 0x7f8748105b00>

In [55]:
model.evaluate(X_test_maxlen, y_test)



[0.4839889407157898, 0.25760287046432495]

In [56]:
model.save('model_baseline')

INFO:tensorflow:Assets written to: model_baseline/assets


In [57]:
!cp -r model_baseline "drive/My Drive/Colab Notebooks"