In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
import random
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, GRU, Dense, Dropout
from tensorflow.keras.layers import SpatialDropout1D
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import load_model

In [2]:
import io

In [3]:
train = pd.read_csv('train.csv')
cv = pd.read_csv('cv.csv')
train = pd.concat([train, cv], ignore_index=True)
test = pd.read_csv('test.csv')

train_text = (train['book_title'].map(str) + ' ~~~ ' + train['sentence'].map(str)).to_numpy()
train_labels = train['sent_spoil'].to_numpy().astype(np.int32)

test_text = (test['book_title'].map(str) + ' ~~~ ' + test['sentence'].map(str)).to_numpy()
test_labels = test['sent_spoil'].to_numpy().astype(np.int32)

In [5]:
train = pd.read_pickle('../../../data/goodreads_sent_spoil_titles_balanced.pkl')
train_text = train.sentence.to_numpy()
train_labels = train.has_spoiler.to_numpy()

In [4]:
train.shape, cv.shape, test.shape

((285002, 4), (15000, 4), (15001, 4))

In [6]:
reviewMaxLen = 700

tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_text)
sequences = tokenizer.texts_to_sequences(train_text)
padded = pad_sequences(sequences, maxlen=reviewMaxLen)

In [12]:
len(tokenizer.word_counts)

64724

In [24]:
model = Sequential()
model.add(Embedding(len(tokenizer.word_index) + 1, 32, input_length=reviewMaxLen) )
model.add(SpatialDropout1D(0.25))
model.add(LSTM(32, dropout=0.1, return_sequences=True))
model.add(LSTM(32, dropout=0.2))
model.add(Dropout(0.4))
model.add(Dense(1, activation='sigmoid'))
opt = keras.optimizers.Adam(learning_rate=0.003)
model.compile(loss='binary_crossentropy',optimizer=opt, metrics=[tf.keras.metrics.AUC()])

In [25]:
history = model.fit(padded, train_labels, validation_split=0.0527, epochs=5, batch_size=64, verbose=1)
model.save('model_big_vocab')

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5




INFO:tensorflow:Assets written to: model_big_vocab\assets


INFO:tensorflow:Assets written to: model_big_vocab\assets


In [26]:
model = load_model('model_big_vocab')

In [27]:
import matplotlib.pyplot as plt
plt.plot(history.history['auc'])
plt.plot(history.history['val_auc'])
plt.title('model AUC')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

KeyError: 'auc'

In [28]:
predictions = []
i = 0
for text in test_text:
    if i % 1000 == 0:
        print(i)
    i += 1
    seq = tokenizer.texts_to_sequences([text])
    pad = pad_sequences(seq, maxlen=reviewMaxLen)
    prediction = model.predict(pad).item()
    predictions.append(prediction)
predictions = np.array(predictions)

from sklearn.metrics import roc_curve
from sklearn.metrics import auc
fpr_keras, tpr_keras, thresholds_keras = roc_curve(test_labels, predictions)
auc_keras = auc(fpr_keras, tpr_keras)

print(auc_keras)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
0.9140868795037902


In [13]:
train.head()

Unnamed: 0.1,Unnamed: 0,book_title,sent_spoil,sentence
0,266148,"Wildest Dreams (Fantasyland, #1)",0,To make matter worse she didn't even do anythi...
1,198268,The Goldfinch,0,At least Henry from The Secret History was act...
2,110639,"Draw Me Close (Hearts and Crafts, #3)",0,When Derek didn't respond Mike just shook his ...
3,138778,"The Program (The Program, #1)",0,The plot of The Program is the love story with...
4,29604,The Ritual,0,"And... instead of sending him back, or changin..."


In [16]:
sent = []
spoil = []
for i in range(len(train)):
    sent.append(train.iloc[i].book_title + ' [SEP] ' + train.iloc[i].sentence)   
    spoil.append(train.iloc[i].sent_spoil)

df = pd.DataFrame(list(zip(sent, spoil)), 
               columns =['sentence', 'has_spoiler']) 

In [18]:
df.to_pickle('../../../data/lstm_train.pkl')

In [19]:
sent = []
spoil = []
for i in range(len(test)):
    sent.append(test.iloc[i].book_title + ' [SEP] ' + test.iloc[i].sentence)   
    spoil.append(test.iloc[i].sent_spoil)

df = pd.DataFrame(list(zip(sent, spoil)), 
               columns =['sentence', 'has_spoiler'])
df.to_pickle('../../../data/lstm_test.pkl')

In [21]:
sent = []
spoil = []
for i in range(len(cv)):
    sent.append(cv.iloc[i].book_title + ' [SEP] ' + cv.iloc[i].sentence)   
    spoil.append(cv.iloc[i].sent_spoil)

df = pd.DataFrame(list(zip(sent, spoil)), 
               columns =['sentence', 'has_spoiler'])
df.to_pickle('../../../data/lstm_cv.pkl')

In [16]:
pad

array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0