In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

The initial model is taken from the TensorFlow RNN guide (https://www.tensorflow.org/guide/keras/rnn); layers were then added and the model was tuned further to obtain better results.

In [7]:
# Load the dataset from a JSON file into a DataFrame (path is relative to the notebook's working directory).
data = pd.read_json("data/all_lemmas.json")

In [9]:
# Cast the label column to integers; the preview below shows it as 0/1.
data["joke"] = data["joke"].astype(int)

In [10]:
# Preview the first rows to confirm the columns (id, text, joke) loaded as expected.
data.head()

Unnamed: 0,id,text,joke
0,articles_0,the new york times reported thursday trump bac...,0
1,articles_1,entitled passions autobiography document sarko...,0
2,articles_2,so-called tiered deposit rate would mean bank ...,0
3,articles_3,air force brigade general miguel sisco mora wa...,0
4,articles_4,much hudson bay value locked real estate were ...,0


In [12]:
from sklearn.model_selection import train_test_split

In [59]:
# Hold out 10% of the rows for evaluation, stratified on the label so both
# classes keep the same proportion in the train and test splits.
# random_state pins the shuffle so the same rows land in each split on every
# run; without it the split (and anything measured on it) changes per run.
X_train, X_test, y_train, y_test = train_test_split(
    data['text'],
    data['joke'],
    test_size=0.1,
    stratify=data['joke'],
    random_state=42,
)

In [60]:
# Class counts in the held-out split (the split above is stratified on this label).
y_test.value_counts()

1    10000
0    10000
Name: joke, dtype: int64

In [95]:
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

In [104]:
# Vocabulary cap for the tokenizer and the fixed sequence length fed to the network.
num_words, maxlen = 15000, 500

# Fit the word index on the training texts only.
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(X_train)

# Training split: texts -> integer sequences -> fixed-length arrays of width maxlen.
tokenized_X_train = tokenizer.texts_to_sequences(X_train)
X_train_pad = pad_sequences(tokenized_X_train, maxlen=maxlen)

# Test split goes through the same fitted tokenizer.
tokenized_X_test = tokenizer.texts_to_sequences(X_test)
X_test_pad = pad_sequences(tokenized_X_test, maxlen=maxlen)

In [105]:
# One-hot encode the integer labels of both splits.
y_train_vec, y_test_vec = (to_categorical(labels) for labels in (y_train, y_test))

In [106]:
# Shape check on the padded training matrix: (number of samples, maxlen).
X_train_pad.shape

(180000, 500)

In [107]:
# Shape check on the padded test matrix: (number of samples, maxlen).
X_test_pad.shape

(20000, 500)

In [108]:
# Embedding -> LSTM -> 2-way softmax classifier, declared as a single layer stack.
model = keras.Sequential([
    # Map each of the num_words token ids to a 64-dimensional vector.
    layers.Embedding(input_dim=num_words, output_dim=64, input_length=maxlen),
    # Recurrent layer with 35 units reading the embedded sequence.
    layers.LSTM(35),
    # One output unit per class.
    layers.Dense(2, activation='softmax'),
])

model.summary()

Model: "sequential_15"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_14 (Embedding)     (None, 500, 64)           960000    
_________________________________________________________________
lstm_14 (LSTM)               (None, 35)                14000     
_________________________________________________________________
dense_14 (Dense)             (None, 2)                 72        
Total params: 974,072
Trainable params: 974,072
Non-trainable params: 0
_________________________________________________________________


In [109]:
# The network ends in a 2-unit softmax and is trained on one-hot labels, so the
# matching loss is categorical cross-entropy. binary_crossentropy treats the two
# outputs as independent binary targets; it happens to give the same value for
# exactly two classes, but it makes Keras resolve 'accuracy' to binary accuracy
# and stops being correct as soon as a third class is added.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [110]:
# Train for 5 epochs in mini-batches of 1024. The held-out split is passed as
# validation data, so the per-epoch validation metrics are measured on the test set.
model.fit(
    x=X_train_pad,
    y=y_train_vec,
    batch_size=1024,
    epochs=5,
    verbose=1,
    validation_data=(X_test_pad, y_test_vec),
)

Train on 180000 samples, validate on 20000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x1b01b203548>

In [187]:
# Persist the trained model to an HDF5 file so it survives a machine restart.
# Kept only for follow-up checks; this model is not used later on.
model.save("test_model.h5")

In [189]:
# Load the model back from the file written by model.save; `uus` is the reloaded copy.
uus = keras.models.load_model("test_model.h5")

In [196]:
# Class probabilities from the reloaded model for the first test sample (round-trip sanity check).
uus.predict(X_test_pad[:1])

array([[2.293536e-04, 9.997707e-01]], dtype=float32)

In [198]:
# Reference prediction from the in-memory model for the same first test sample,
# to compare against the reloaded model's output.
# Computed directly instead of reading `predictions_probs`, which is only
# assigned in a later cell and would raise NameError when the notebook is run
# top to bottom on a fresh kernel.
model.predict(X_test_pad[:1])[0]

array([2.2935403e-04, 9.9977070e-01], dtype=float32)

In [123]:
# Scratch check: argmax returns the index of the largest entry (index 1 here).
np.argmax([0.1,0.9])

1

In [124]:
# Per-class probabilities for every test sample, one row per sample.
predictions_probs = model.predict(X_test_pad)
# Predicted class = column index of the highest probability in each row;
# kept as a plain list of numpy integers.
predictions = list(np.argmax(predictions_probs, axis=1))

In [125]:
# Number of predicted labels; should equal the number of test samples.
len(predictions)

20000

In [129]:
# Test accuracy: fraction of positions where the predicted label equals the true label.
(predictions == y_test).mean()

0.98585

In [154]:
# Collect every misclassified test sample as a (text, true label) pair.
# The splits are walked positionally via .values so predictions[i] lines up
# with the i-th test row regardless of the Series index.
wrong = [
    (text, label)
    for pred, text, label in zip(predictions, X_test.values, y_test.values)
    if pred != label
]

In [155]:
# How many test samples were misclassified.
len(wrong)

283

In [172]:
# Inspect one misclassified sample as (text, true label).
wrong[13]

("shinee 4th concert seoul shinee world iv '' shinee house `` `` sherlock `` `` shine `` `` stranger `` `` picasso korean ver '' `` love like oxygen `` `` your name `` `` your number korean ver '' `` love should go on `` `` close door `` `` alarm clock `` `` excuse me miss `` `` one minute back `` `` colorful `` `` jojo `` `` love sick `` `` one `` `` better off `` `` an ode you `` `` wowowow `` `` woof woof `` `` dream girl `` `` runaway `` `` ready not `` `` beautiful `` `` 321 korean ver '' `` nightmare `` `` dynamite `` `` everybody `` `` view `` `` replay `` `` lucky star `` `` an encore ``",
 0)

In [173]:
# Inspect another misclassified sample as (text, true label).
wrong[14]

('taught son today play marco polo we opened cabinet found china', 1)

In [186]:
# Inspect a further misclassified sample as (text, true label).
wrong[77]

('with runner second sixth inning pence hit high fly ball inside right field foul pole boston brock holt lunged crowd effort catch ball fell glove',
 0)

In [163]:
# Texts of all rows labelled as jokes (label value 1), selected in one .loc step.
data.loc[data['joke'] == 1, 'text']

100000                       seafood diet see food fish eat
100001    the shoe store an al bundy one-liner fat woman...
100002    what say veteran prostitute night together tha...
100003    confused man see psychiatrist man go see psych...
100004    teabags fought back humans enslaved civilizati...
                                ...                        
199995            what difference snowman snowoman snowball
199996        how octopus tickle man ten time with tentacle
199997             used cough hide fart now fart hide cough
199998                why hellen keller bad driver she dead
199999               what call mayan masturbation ball game
Name: text, Length: 100000, dtype: object