In [2]:
import pandas as pd
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Flatten, Dense, Conv1D
from tensorflow.keras.models import Sequential

In [3]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
import re

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ingma\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Load the three xz-compressed JSON corpora in one pass. Judging by the file
# names these are Q&A pairs, jokes and news articles; each frame is expected to
# carry a 'lemmatize' column of token lists (used by the preprocessing cell).
qa, jokes, articles = (
    pd.read_json(corpus_path)
    for corpus_path in (
        "data/lemmatize_google_qa.json.xz",
        "data/lemmatize_jokes.json.xz",
        "data/lemmatize_news.json.xz",
    )
)

In [5]:
# Stop-word list: NLTK's English list plus contraction fragments, punctuation
# and lemmatizer artefacts ("wa", "ha") seen in the corpora.
stop_words = stopwords.words('english')
stop_words.extend(["...", "'s", "wo", "n't", "'m", "ca", "'ll", "'re", "'ve", "'d", "ha", "´´", "´", "´´´", ",", "!", "'", ":", ";", '"', "\\", "``"])
stop_words.extend(["wa", "''", "the", "he", "my", "it"])

# Frozen-set copy for O(1) membership tests; `stop_words` itself stays a list
# so any later cell that relies on it keeps working.
stop_word_set = frozenset(stop_words)

# Strips everything from the first "reporting " onwards in an article.
# NOTE(review): presumably removes a news-agency credit line -- confirm.
reporting_tail = re.compile("reporting .*")


def clean_lemmas(lemmas):
    """Return the purely alphabetic, non-stop-word tokens of one document joined by spaces.

    Parameters
    ----------
    lemmas : iterable of str
        Tokens of a single document.

    Returns
    -------
    str
        Space-separated surviving tokens (empty string if none survive).
    """
    return " ".join(
        word for word in lemmas
        if word not in stop_word_set and word.isalpha()
    )


# One record per document: cleaned text, binary joke flag, and a 3-way
# category id (0 = joke, 1 = Q&A, 2 = news article).
data_for_df = []

jokes_lemmas = jokes['lemmatize']
data_for_df.extend(
    {'text': clean_lemmas(joke), 'joke': 1, 'category': 0}
    for joke in jokes_lemmas
)

qa_lemmas = qa['lemmatize']
data_for_df.extend(
    {'text': clean_lemmas(q), 'joke': 0, 'category': 1}
    for q in qa_lemmas
)

articles_lemmas = articles['lemmatize']
data_for_df.extend(
    {'text': reporting_tail.sub("", clean_lemmas(article)), 'joke': 0, 'category': 2}
    for article in articles_lemmas
)

In [7]:
# Turn the collected record dicts into a single frame and show its
# (rows, columns) shape.
df = pd.DataFrame(data=data_for_df)
df.shape

(1380351, 3)

In [8]:
# Collapse rows whose cleaned text is identical (the first occurrence is kept),
# then show the new shape.
df = df.drop_duplicates(subset='text')
df.shape

(1283149, 3)

In [12]:
# Draw a 4% sample from each class separately so the joke / non-joke ratio of
# the full frame is preserved. `df_` keeps a handle on the frame as it was
# before sampling; `df` becomes the working subset used by later cells.
# A fixed random_state makes the subset reproducible across kernel restarts.
# NOTE: re-running this cell on its own samples from the already-sampled `df`;
# re-run the preceding cells first to start from the full frame.
df_ = df
df = pd.concat([
    df_[df_.joke == 0].sample(frac=0.04, random_state=42),
    df_[df_.joke == 1].sample(frac=0.04, random_state=42),
])

### Preparing data for the neural network

In [13]:
from sklearn.model_selection import train_test_split

In [24]:
# Hold out 10% of the sampled data as the test set, stratified on the joke
# label. The fixed random_state keeps the split identical between runs so the
# reported test metrics are comparable.
X, X_test, y, y_test = train_test_split(
    df['text'], df['joke'], test_size=0.1, stratify=df['joke'], random_state=42
)

In [25]:
# Carve a validation set (10% of the remaining data) out of the non-test
# portion, again stratified on the label and seeded for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.1, stratify=y, random_state=42
)

In [26]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [27]:
# Vocabulary cap for the tokenizer and fixed sequence length for the network.
num_words = 15000
maxlen = 500

# The vocabulary is learned from the training split only, so no information
# from the validation or test texts leaks into the word index.
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(X_train)

# Texts -> lists of integer word indices, one list per document.
tokenized_X_train, tokenized_X_val, tokenized_X_test = (
    tokenizer.texts_to_sequences(texts)
    for texts in (X_train, X_val, X_test)
)

# Pad / truncate every sequence to exactly `maxlen` entries.
X_train_pad, X_val_pad, X_test_pad = (
    pad_sequences(sequences, maxlen=maxlen)
    for sequences in (tokenized_X_train, tokenized_X_val, tokenized_X_test)
)

In [28]:
# One-hot encoded copies of the label vectors for each split.
# NOTE(review): the training cell visible in this notebook fits on the raw
# 0/1 labels (y_train / y_val / y_test), not on these -- confirm whether the
# *_vec arrays are still needed.
y_train_vec, y_val_vec, y_test_vec = map(
    to_categorical, (y_train, y_val, y_test)
)

In [29]:
# Sanity check: (number of training documents, maxlen).
X_train_pad.shape

(41573, 500)

In [30]:
# Sanity check: (number of test documents, maxlen).
X_test_pad.shape

(5133, 500)

In [31]:
# Sanity check: (number of validation documents, maxlen).
X_val_pad.shape

(4620, 500)

### Neural network training and testing

In [None]:
# Binary joke / non-joke classifier:
# embedding -> 1-D convolution -> flatten -> small dense layer -> sigmoid output.
model = Sequential()

# The tokenizer was created with num_words=num_words, so texts_to_sequences
# only emits indices below `num_words` (0 is the padding value). `num_words`
# rows are therefore exactly enough. len(tokenizer.word_index) counts every
# word seen during fitting: it allocates rows that are never looked up when the
# vocabulary is larger than `num_words`, and is one row short (indices start
# at 1) when the vocabulary is smaller.
model.add(Embedding(num_words, 32, input_length=maxlen))
model.add(Conv1D(256, 10, activation='relu'))
model.add(Flatten())
# NOTE(review): softmax as a hidden-layer activation is unusual (relu is the
# common choice). Left unchanged so the architecture matches the reported
# results -- confirm whether it is intentional.
model.add(Dense(10, activation='softmax'))
# The second Flatten that used to follow here was a no-op (the Dense output is
# already 2-D) and has been removed.
model.add(Dense(1, activation='sigmoid'))




In [44]:
# Train with binary cross-entropy on the raw 0/1 joke labels, monitoring the
# validation split each epoch, then report [loss, accuracy] on the test split.
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])

# `workers` / `use_multiprocessing` were dropped: they only apply to generator
# or keras.utils.Sequence inputs (these are in-memory NumPy arrays), and newer
# Keras releases no longer accept them in fit().
model.fit(
    X_train_pad, y_train,
    validation_data=(X_val_pad, y_val),
    epochs=10,
    batch_size=256,
    verbose=1,
)
model.evaluate(X_test_pad, y_test)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


[0.1776331067085266, 0.9622053503990173]

In [45]:
# Persist the trained network to the working directory.
# NOTE(review): the file name is misspelled ("convulational"); kept as-is so
# any code that loads this path keeps working.
model_path = "convulational_100k_embedding.h5"
model.save(model_path)

65.499