In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [16]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('stopwords')
import re

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Birgit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Read the three pre-lemmatized corpora in one pass. Each file is expected to
# provide a 'lemmatize' column (it is read from every frame further down).
qa, jokes, articles = (
    pd.read_json(corpus_path)
    for corpus_path in (
        "data/lemmatize_google_qa.json.xz",
        "data/lemmatize_jokes.json.xz",
        "data/lemmatize_news.json.xz",
    )
)

In [41]:
# NLTK's English stop words, extended with tokenizer/lemmatizer artefacts
# (contraction fragments, punctuation, over-stemmed forms such as "wa"/"ha").
stop_words = stopwords.words('english')
stop_words.extend(["...", "'s", "wo", "n't", "'m", "ca", "'ll", "'re", "'ve", "'d", "ha", "´´", "´", "´´´", ",", "!", "'", ":", ";", '"', "\\", "``"])
stop_words.extend(["wa", "''", "the", "he", "my", "it"])

# Membership is tested once per token over every document, so use a hashed
# container instead of scanning the list each time. `stop_words` itself is
# kept as a list for anything that still expects the original type.
stop_word_set = frozenset(stop_words)

# Cuts everything from the first "reporting " to the end of an article.
# NOTE(review): presumably meant to drop agency bylines ("reporting by ...");
# it also truncates articles that use the word mid-text - confirm intended.
REPORTING_TAIL = re.compile("reporting .*")


def clean_lemmas(lemmas, stop_set=stop_word_set):
    """Return one document's lemmas as a single space-joined string.

    Parameters
    ----------
    lemmas : iterable of str
        Lemmatized tokens of one document.
    stop_set : container of str, optional
        Tokens to discard; defaults to the extended stop-word set above.

    Returns
    -------
    str
        The tokens that are not stop words and are purely alphabetic,
        in their original order, joined by single spaces.
    """
    return " ".join(
        word for word in lemmas if word not in stop_set and word.isalpha()
    )


# One record per document: cleaned text, binary joke flag, and source
# category (0 = jokes, 1 = Q&A, 2 = news articles).
data_for_df = []

jokes_lemmas = jokes['lemmatize']
data_for_df.extend(
    {'text': clean_lemmas(joke), 'joke': 1, 'category': 0}
    for joke in jokes_lemmas
)

qa_lemmas = qa['lemmatize']
data_for_df.extend(
    {'text': clean_lemmas(q), 'joke': 0, 'category': 1}
    for q in qa_lemmas
)

articles_lemmas = articles['lemmatize']
data_for_df.extend(
    {'text': REPORTING_TAIL.sub("", clean_lemmas(article)), 'joke': 0, 'category': 2}
    for article in articles_lemmas
)

In [42]:
# Turn the collected records into one frame (columns: text, joke, category)
# and show its size before de-duplication.
df = pd.DataFrame(data_for_df)
df.shape

(1380351, 3)

In [43]:
# Remove rows whose cleaned text repeats an earlier row, so identical
# documents cannot end up on both sides of the later train/test split.
# The row labels are not renumbered here.
df = df.drop_duplicates('text')
df.shape

(1283149, 3)

In [44]:
# Spot-check the cleaned text and its labels.
df.head()

Unnamed: 0,text,joke,category
0,seafood diet see food fish eat,1,0
1,shoe store al bundy fat woman came shoe store ...,1,0
2,stalin said dark humor like food everyone get,1,0
3,really hate dementia remember,1,0
4,hey guy got nice joke ya u,1,0


### Preparing data for Neural Network

In [22]:
from sklearn.model_selection import train_test_split

In [45]:
def _first_rows_of_category(frame, category_id, limit):
    """Return the first `limit` rows of `frame` whose 'category' equals `category_id`."""
    return frame[frame['category'] == category_id].head(limit)


# 50k jokes against 25k Q&A + 25k articles gives a balanced joke / non-joke set.
# Rows are taken in frame order, not sampled at random.
jokes_subdf = _first_rows_of_category(df, 0, 50000)
qa_subdf = _first_rows_of_category(df, 1, 25000)
article_subdf = _first_rows_of_category(df, 2, 25000)

smaller_df = pd.concat([jokes_subdf, qa_subdf, article_subdf])

In [49]:
# Hold out 10% of the data as the test set, stratified on the joke label.
# The fixed random_state makes the split (and therefore every accuracy
# reported below) reproducible across kernel restarts.
X, X_test, y, y_test = train_test_split(
    smaller_df['text'],
    smaller_df['joke'],
    test_size=0.1,
    stratify=smaller_df['joke'],
    random_state=42,
)

In [50]:
# Carve a validation set (10% of the remaining data) out of the training
# portion, again stratified on the label. The fixed random_state keeps the
# train/validation membership identical between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.1,
    stratify=y,
    random_state=42,
)

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

Using TensorFlow backend.


In [54]:
num_words = 15000  # vocabulary cap handed to the Tokenizer and the Embedding layer
maxlen = 500       # every sequence is padded / truncated to this many tokens

# The vocabulary is learned from the training texts only, so nothing from the
# validation or test sets leaks into the word index.
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(X_train)

# Texts -> lists of integer word ids, one list per document.
tokenized_X_train, tokenized_X_val, tokenized_X_test = (
    tokenizer.texts_to_sequences(texts)
    for texts in (X_train, X_val, X_test)
)

# Lists of ids -> fixed-width integer matrices of shape (n_documents, maxlen).
X_train_pad, X_val_pad, X_test_pad = (
    pad_sequences(sequences, maxlen=maxlen)
    for sequences in (tokenized_X_train, tokenized_X_val, tokenized_X_test)
)

In [55]:
# One-hot encode the 0/1 joke labels to match the two-unit output layer.
y_train_vec, y_val_vec, y_test_vec = map(
    to_categorical, (y_train, y_val, y_test)
)

In [56]:
# Sanity check: (number of training documents, maxlen).
X_train_pad.shape

(81000, 500)

In [57]:
# Sanity check: (number of test documents, maxlen).
X_test_pad.shape

(10000, 500)

In [58]:
# Sanity check: (number of validation documents, maxlen).
X_val_pad.shape

(9000, 500)

### Neural Network training and testing

The model is taken from the TensorFlow guide: https://www.tensorflow.org/guide/keras/rnn. The plan was to add layers and tune it further, but the results indicated that this wasn't necessary; instead, we put more work into preprocessing and analysing the data.

#### Model with Embedding layer

In [59]:
# Word ids -> 64-d learned embeddings -> single LSTM -> 2-way softmax
# (index 0 = not a joke, index 1 = joke, matching the one-hot targets).
model = keras.Sequential()
model.add(layers.Embedding(input_dim=num_words, input_length=maxlen, output_dim=64))
model.add(layers.LSTM(35))
model.add(layers.Dense(2, activation='softmax'))
model.summary()
# The targets are one-hot vectors and the head is a softmax, so the matching
# loss is categorical cross-entropy. With this loss Keras also resolves
# 'accuracy' to arg-max (categorical) accuracy, which is the same quantity
# the notebook computes by hand on the test set.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 64)           960000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 35)                14000     
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 72        
Total params: 974,072
Trainable params: 974,072
Non-trainable params: 0
_________________________________________________________________


In [60]:
# Train the embedding model; validation metrics are reported after each epoch.
model.fit(
    X_train_pad,
    y_train_vec,
    batch_size=256,
    epochs=3,
    validation_data=(X_val_pad, y_val_vec),
    verbose=1,
)

Train on 81000 samples, validate on 9000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x270b7d5cdc8>

In [61]:
# Per-class probabilities for every test document, then the index of the
# most probable class as the hard prediction (one entry per document).
predictions_probs = model.predict(X_test_pad)
predictions = list(np.argmax(predictions_probs, axis=1))

In [115]:
# Test accuracy in percent: share of documents whose predicted class equals
# the true joke label.
n_correct = sum(predictions == y_test)
round(n_correct / len(predictions) * 100, 2)

96.8

#### Model without Embedding layer

In [109]:
# Baseline without an embedding layer: the LSTM reads each padded sequence as
# `maxlen` time steps with a single numeric feature (the raw word id).
model2 = keras.Sequential()
# input_shape=(time steps, features) is the documented way to declare the
# input of a first recurrent layer; it replaces the legacy
# input_length / input_dim keyword pair and describes the same shape.
model2.add(layers.LSTM(35, input_shape=(maxlen, 1)))
model2.add(layers.Dense(2, activation='softmax'))
model2.summary()
# One-hot targets with a softmax head call for categorical cross-entropy;
# 'accuracy' then means arg-max accuracy, as computed by hand on the test set.
model2.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

Model: "sequential_13"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_11 (LSTM)               (None, 35)                5180      
_________________________________________________________________
dense_11 (Dense)             (None, 2)                 72        
Total params: 5,252
Trainable params: 5,252
Non-trainable params: 0
_________________________________________________________________


In [110]:
# Append a trailing feature axis: (documents, maxlen) -> (documents, maxlen, 1),
# the 3-D layout the LSTM-only model expects.
X_train_reshaped = X_train_pad[:, :, np.newaxis]
X_val_reshaped = X_val_pad[:, :, np.newaxis]
X_test_reshaped = X_test_pad[:, :, np.newaxis]

In [111]:
# Sanity check: (number of training documents, maxlen, 1).
X_train_reshaped.shape

(81000, 500, 1)

In [112]:
# Train the no-embedding baseline with the same schedule as the first model
# so the two test accuracies are comparable.
model2.fit(
    X_train_reshaped,
    y_train_vec,
    batch_size=256,
    epochs=3,
    validation_data=(X_val_reshaped, y_val_vec),
    verbose=1,
)

Train on 81000 samples, validate on 9000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x2713b2ab208>

In [113]:
# Per-class probabilities from the baseline model, reduced to the index of
# the most probable class for each test document.
predictions_probs2 = model2.predict(X_test_reshaped)
predictions2 = list(np.argmax(predictions_probs2, axis=1))

In [114]:
# Test accuracy in percent for the baseline model.
n_correct2 = sum(predictions2 == y_test)
round(n_correct2 / len(predictions2) * 100, 2)

84.32

#### Save models

In [116]:
# Persist both trained networks to the working directory.
# NOTE(review): the fitted tokenizer is not saved in this section; without it
# the saved models cannot be applied to new raw text - confirm it is stored elsewhere.
model.save("model_with_embedding_layer.h5")
model2.save("model_without_embedding_layer.h5")