In [15]:
import string
import pandas as pd
import json
import numpy as np
from tqdm import tqdm
import regex as re
import io

In [3]:
articles = pd.read_json('data/articles_lemmas.json')
google_qa = pd.read_json('data/google_qa_lemmas.json')
jokes = pd.read_json('data/jokes_lemmas.json')

In [4]:
jokes_and_not = {'text': [], 'funny': []}

counter = 0
for line in articles.text[:10000]:
    jokes_and_not['text'].append(line)
    jokes_and_not['funny'].append(0)

for line in google_qa.text[:10000]:
    jokes_and_not['text'].append(line)
    jokes_and_not['funny'].append(0)

for line in jokes.text[:20000]:
    jokes_and_not['text'].append(line)
    jokes_and_not['funny'].append(1)

In [5]:
df = pd.DataFrame(jokes_and_not)

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df['text'], df['funny'], test_size = 0.1)

In [7]:
import tensorflow as tf

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Dense, Conv1D
from tensorflow.keras.preprocessing.text import Tokenizer

from tensorflow.keras import preprocessing
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization


In [8]:
tokenizer = Tokenizer(num_words=4096)

tokenizer.fit_on_texts(X_train)

tokenized_X_train = tokenizer.texts_to_sequences(X_train)
tokenized_X_test = tokenizer.texts_to_sequences(X_test)

maxlen=20

x_train_pad = preprocessing.sequence.pad_sequences(tokenized_X_train, maxlen=maxlen)
x_test_pad = preprocessing.sequence.pad_sequences(tokenized_X_test, maxlen=maxlen)

In [9]:

model = Sequential()
model.add(Embedding(len(tokenizer.word_index), 32, input_length=maxlen))

model.add(Conv1D(256, 10, activation='relu'))
model.add(Flatten())
model.add(Dense(10, activation='softmax'))

model.add(Flatten())

model.add(Dense(1, activation='sigmoid'))

In [10]:
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
model.summary()

history = model.fit(x_train_pad, y_train,
                    epochs=10,
                    batch_size=32,
                    validation_split=0.2)

model.evaluate(x_test_pad, y_test)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 20, 32)            3266400   
_________________________________________________________________
conv1d (Conv1D)              (None, 11, 256)           82176     
_________________________________________________________________
flatten (Flatten)            (None, 2816)              0         
_________________________________________________________________
dense (Dense)                (None, 10)                28170     
_________________________________________________________________
flatten_1 (Flatten)          (None, 10)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 11        
Total params: 3,376,757
Trainable params: 3,376,757
Non-trainable params: 0
______________________________________________

[0.21132397651672363, 0.9547500014305115]

In [11]:
def predict(nali):
    nali2 = tokenizer.texts_to_sequences(nali)
    print(nali2)
    nali3 = preprocessing.sequence.pad_sequences(nali2, maxlen=maxlen)
    s = model.predict(nali3)
    return s


predict(["What do you call poop?"])



[[22, 438, 70, 92]]


array([[0.9977927]], dtype=float32)

In [14]:
weights = model.get_layer('embedding').get_weights()[0]
vocab = list(tokenizer.word_index)


In [16]:
out_v = io.open('vectors.tsv', 'w', encoding='utf-8')
out_m = io.open('metadata.tsv', 'w', encoding='utf-8')

for index, word in enumerate(vocab):
  if  index == 0: continue # skip 0, it's padding.
  vec = weights[index]
  out_v.write('\t'.join([str(x) for x in vec]) + "\n")
  out_m.write(word + "\n")
out_v.close()
out_m.close()