In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Sequential
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.metrics import Recall

In [2]:
# Google Colab alternative (disabled): mount Google Drive and read the same CSV from there.

#from google.colab import drive
#drive.mount("/content/drive")
#dataframe = pd.read_csv('/content/drive/My Drive/data_cleaned.csv')
#dataframe.head()

In [3]:
# Local run: read the cleaned reviews from a relative path.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a
# saved index read back as data -- confirm whether it is needed.
DATA_PATH = '../../Raw_Data/data_cleaned.csv'

dataframe = pd.read_csv(DATA_PATH)
dataframe.head()

Unnamed: 0,is_spoiler,clean_reviews
0,1,classic piece unforgettable film making oscar ...
1,1,simply amazing best film shawshank redemption ...
2,1,best story ever told film believe film best st...
3,1,busy dying busy living yes spoiler film emotio...
4,1,great story wondrously told acted heart extrao...


In [4]:
# Work on a small portion of the data.
# .iloc is positional and end-exclusive, so this keeps exactly N_ROWS rows
# whatever the index labels are (a label-based .loc[:1000] is end-inclusive and
# returns 1001 rows on a default RangeIndex).
# NOTE(review): the preview of `dataframe` shows only is_spoiler == 1 in the
# first rows; if the file is ordered by label, a head slice may hold a single
# class -- confirm, and consider a seeded random sample instead.
N_ROWS = 1000

df = dataframe.iloc[:N_ROWS].copy()

In [5]:
# Features stay a one-column DataFrame (double brackets) because later cells
# index the splits with ['clean_reviews']; the target is the is_spoiler column.
X = df[['clean_reviews']]
y = df['is_spoiler']

# Hold out 20% of the rows for testing. random_state pins the shuffle so that
# Restart & Run All reproduces the same split (and the same vocabulary).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

In [6]:
def convert_sentences(X):
    """Split every review in ``X`` into a list of word tokens.

    Parameters
    ----------
    X : iterable of str
        Reviews as plain strings (for example a list or a pandas Series).

    Returns
    -------
    list of list of str
        One token list per item of ``X``, in the same order.

    Notes
    -----
    Tokens are separated on any run of whitespace, so repeated, leading or
    trailing spaces never produce empty-string tokens, and an empty review
    yields an empty token list. Non-string items still raise
    ``AttributeError``.
    """
    # str.split() with no argument collapses whitespace runs; split(' ') would
    # emit '' tokens that later end up in the vocabulary.
    return [sentence.split() for sentence in X]

# Replace the raw review strings of both splits with their token lists.
# NOTE: this overwrites X_train / X_test, so re-running the cell on the
# already-split frames fails (lists have no .split) -- re-run the split cell first.
X_train, X_test = (
    frame.apply(convert_sentences) for frame in (X_train, X_test)
)

In [7]:
# Build the vocabulary from the training reviews only: each new word gets the
# next integer id in order of first appearance. Ids start at 1.
word_to_id = {}
for sentence in X_train['clean_reviews']:
    for word in sentence:
        # setdefault leaves already-known words untouched; for a new word the
        # next free id is the current vocabulary size plus one.
        word_to_id.setdefault(word, len(word_to_id) + 1)

# Next unused id (same value the counter-based loop would end with).
iter_ = len(word_to_id) + 1

In [8]:
# Report how many distinct words the training vocabulary holds.
print(
    f'There are {len(word_to_id)} different words in the train sentences'
)

There are 8740 different words in the train sentences


In [9]:
def tokenize(sentences, word_to_id):
    """Map token lists to lists of integer ids.

    Parameters
    ----------
    sentences : iterable of iterable of str
        One sequence of word tokens per review.
    word_to_id : dict
        Vocabulary mapping each known word to its id.

    Returns
    -------
    list of list
        For each input sentence, the ids of its known words in their original
        order. Words missing from ``word_to_id`` are dropped.
    """
    encoded = []
    for tokens in sentences:
        ids = []
        for token in tokens:
            # Out-of-vocabulary words are skipped rather than mapped to a
            # placeholder id.
            if token in word_to_id:
                ids.append(word_to_id[token])
        encoded.append(ids)
    return encoded

# Encode both splits with the vocabulary learned on the training set; test
# words that never appeared in training are dropped by tokenize.
X_token_train, X_token_test = (
    tokenize(frame['clean_reviews'], word_to_id) for frame in (X_train, X_test)
)

In [11]:
# Bring every id sequence to a fixed length of 150. padding='post' appends
# zeros at the end; 0 is free to act as the padding value because word ids
# start at 1. Longer sequences are cut using Keras' default truncating='pre',
# i.e. the leading tokens are dropped.
# Ids are stored as int32: they are lookup indices for the Embedding layer, and
# float32 cannot represent every integer above 2**24.
X_train_maxlen = pad_sequences(X_token_train, maxlen=150, dtype='int32', padding='post')
X_test_maxlen = pad_sequences(X_token_test, maxlen=150, dtype='int32', padding='post')

In [12]:
# Recall metric reported during training and evaluation.
recall = Recall(name='recall')

def init_model(vocab_size):
    """Build and compile the LSTM classifier used for spoiler detection.

    Parameters
    ----------
    vocab_size : int
        Number of distinct word ids. The embedding table gets
        ``vocab_size + 1`` rows so that index 0 stays available for padding
        (assumes ids run from 1 to ``vocab_size``).

    Returns
    -------
    tensorflow.keras.Sequential
        A compiled model: Embedding -> LSTM -> Dense(5, relu) ->
        Dense(1, sigmoid), trained with binary cross-entropy and RMSprop and
        reporting the module-level ``recall`` metric.
    """
    model = Sequential()
    # mask_zero=True tells the following layers to ignore padded (0) positions.
    model.add(layers.Embedding(input_dim=vocab_size + 1, output_dim=30, mask_zero=True))
    model.add(layers.LSTM(units=128, recurrent_dropout=0.5, dropout=0.5))
    model.add(layers.Dense(5, activation='relu'))
    # Single sigmoid unit: the output is a probability for the positive class.
    model.add(layers.Dense(1, activation='sigmoid'))

    # `metrics` is documented as a list; a bare metric object is rejected by
    # newer Keras versions.
    model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=[recall])

    return model

In [13]:
# Train on the padded training ids. Keras holds out the last 20% of the
# training rows for validation; early stopping (Keras default monitor:
# val_loss) halts after 5 epochs without improvement and restores the best
# weights seen.
model = init_model(len(word_to_id))
es = EarlyStopping(patience=5, restore_best_weights=True)
model.fit(
    X_train_maxlen,
    y_train,
    validation_split=0.2,
    epochs=10,
    batch_size=32,
    callbacks=[es],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10


<tensorflow.python.keras.callbacks.History at 0x144bd6890>

In [14]:
# evaluate returns [loss, recall] for this model (one loss, one metric), so
# index 1 is the recall on the held-out test set.
res = model.evaluate(X_test_maxlen, y_test)
test_recall = res[1]
print('Test recall:', test_recall)

Test recall: 1.0


In [15]:
# Score one free-text review with the trained model, reusing the same
# split -> tokenize -> pad steps applied to the training data.
# NOTE: tokenize drops words missing from word_to_id, so a review made only of
# unknown words (as this placeholder string presumably is) becomes an
# all-padding sequence and the score carries no information about the text.
review_to_predict = ['kjkhkjhkh']

sentence_converted = convert_sentences(review_to_predict)

prediction_token = tokenize(sentence_converted, word_to_id)

# Ids are integer lookup indices for the Embedding layer, hence int32.
prediction_pad = pad_sequences(prediction_token, maxlen=150, dtype='int32', padding='post')

model.predict(prediction_pad)

array([[0.504695]], dtype=float32)