In [None]:
#!pip install keras
#!pip install tensorflow
#!pip install torch==1.4.0
#!pip install sentencepiece
!pip install --upgrade transformers==3.0.2 # the authors probably used version 3.0.2
# !pip install contractions
# !pip install unidecode
# !pip install contractions

In [9]:
# the basics
import pandas as pd
import numpy as np
import io
import os
import logging
# import random

# data cleaning
# import re
# import contractions as ct
# import string
# import unidecode 

# function for computing tokenized inputs from our own module
from bertembeddings import compute_input_arrays

# math + machine learning
from scipy.stats import spearmanr
from math import floor, ceil
from tqdm import tqdm # for nice progress meters
import sklearn
from sklearn.model_selection import GroupKFold
from sklearn.metrics import f1_score
import nltk 
nltk.download('punkt')
import torch
# import tensorflow_hub as hub
import tensorflow as tf
# import bert_tokenization as tokenization
import tensorflow.keras.backend as K
from tensorflow import keras 
# import keras model and layers
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, Dropout, Concatenate
# import tensorflow.keras.utils.Sequence
# from transformers import *
import transformers
from transformers import TFBertModel, BertTokenizer
from transformers import BertTokenizer, TFBertModel
# from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# viz
import seaborn as sns
import matplotlib.pyplot as plt

# print floats in fixed-point notation rather than scientific notation
np.set_printoptions(suppress=True)

# show the library versions this notebook was run with (TensorFlow first, then transformers)
for lib in (tf, transformers):
    print(lib.__version__)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\meerw\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


2.8.0
3.0.2


# Read and split the dataset

In [10]:
# load the cleaned dataset from the working directory; later cells read its 'text' and 'humor' columns
data = pd.read_csv('clean_dataset.csv')

In [11]:
# sanity check: count how many rows are jokes (humor == True) vs. non-jokes (humor == False)
data['humor'].value_counts()

False    100000
True     100000
Name: humor, dtype: int64

In [12]:
# split into train and test data
# NOTE(review): the split is purely positional (first TRAIN_SIZE rows -> train, the rest -> test)
# and nothing is shuffled here — confirm that clean_dataset.csv is already in random order.
TRAIN_SIZE = 160000  # number of leading rows used for training

# double brackets select a one-column DataFrame directly, so no Series -> DataFrame cast is needed
x_train, x_test = data[['text']].iloc[:TRAIN_SIZE], data[['text']].iloc[TRAIN_SIZE:]
y_train, y_test = data[['humor']].iloc[:TRAIN_SIZE], data[['humor']].iloc[TRAIN_SIZE:]

# Tokenize inputs to the model

In [5]:
def save_npz(filename, arr):
    """
    Save a list of arrays into a single compressed .npz archive.

    filename: str or os.PathLike; '.npz' is appended unless the name already ends with it
    arr: list of 2D arrays; the i-th array is stored under the key str(i) ('0', '1', ...)
    """
    # accept pathlib.Path as well as str (a substring test on a Path raises TypeError)
    filename = os.fspath(filename)
    # test the suffix, not mere containment, so the name matches what load_npz looks for
    if not filename.endswith('.npz'):
        filename += '.npz'
    arr_dict = {str(position): array for position, array in enumerate(arr)}
    np.savez_compressed(filename, **arr_dict)
    
def load_npz(filename):
    """
    Open an .npz archive and return the result of np.load.

    filename: str or os.PathLike; '.npz' is appended unless the name already ends with it
    Returns the object produced by np.load (arrays are looked up by their string key).
    """
    # accept pathlib.Path as well as str
    filename = os.fspath(filename)
    # np.savez_compressed appends '.npz' whenever the name does not END with it, so the
    # suffix must be tested with endswith (a name such as 'run.npz.bak' is stored as
    # 'run.npz.bak.npz' and a substring test would look for the wrong file)
    if not filename.endswith('.npz'):
        filename += '.npz'
    # SECURITY NOTE(review): allow_pickle=True can execute arbitrary code when loading an
    # untrusted file; only use this on archives produced by this notebook.
    return np.load(filename, allow_pickle=True)

In [6]:
# Trial run: tokenize just the first BATCH_SIZE rows of the train and test text,
# and write each resulting list of arrays to a compressed .npz file.
BATCH_SIZE = 50
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# training subset
train_subset = x_train[:BATCH_SIZE]
inputs = compute_input_arrays(train_subset, ['text'], tokenizer)
save_npz("testrun_model_train_inputs.npz", inputs)

# test subset
test_subset = x_test[:BATCH_SIZE]
test_inputs = compute_input_arrays(test_subset, ['text'], tokenizer)
save_npz("testrun_model_test_inputs.npz", test_inputs)

0it [00:00, ?it/s]Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pa

Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
4it [00:32,  8.28s/it]Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encod

(5, 20)





# ColBERT Model

In [95]:
# model architecture
SENT_INPUT_LEN = 20   # sequence length of every sentence-level input
DOC_INPUT_LEN = 100   # sequence length of the document-level input
DROPOUT_RATE = 0.5    # TODO: rate still to be tuned; used by every Dropout layer below


def _bert_input_triple(tag, seq_len):
    """Create the three int32 Keras inputs of one path.

    tag: suffix used in the layer names (e.g. 'sent1', 'doc')
    seq_len: length of each input sequence
    Returns a tuple (input IDs, attention mask, token type IDs) whose layers are named
    'input_ii_<tag>', 'input_am_<tag>' and 'input_tti_<tag>'.
    """
    return tuple(
        Input(shape=(seq_len,), dtype=tf.int32, name=f'input_{kind}_{tag}')
        for kind in ('ii', 'am', 'tti')
    )


# 18 inputs, 3 for each parallel path (5 sentence-level paths & 1 document-level path);
# the individual names are kept because the model-building cell lists them explicitly
input_sent1_1, input_sent1_2, input_sent1_3 = _bert_input_triple('sent1', SENT_INPUT_LEN)
input_sent2_1, input_sent2_2, input_sent2_3 = _bert_input_triple('sent2', SENT_INPUT_LEN)
input_sent3_1, input_sent3_2, input_sent3_3 = _bert_input_triple('sent3', SENT_INPUT_LEN)
input_sent4_1, input_sent4_2, input_sent4_3 = _bert_input_triple('sent4', SENT_INPUT_LEN)
input_sent5_1, input_sent5_2, input_sent5_3 = _bert_input_triple('sent5', SENT_INPUT_LEN)
input_doc_1, input_doc_2, input_doc_3 = _bert_input_triple('doc', DOC_INPUT_LEN)

# one row per parallel path:
# (layer-name tag, input triple, units of 1st dense layer, units of 2nd dense layer)
paths = [
    ('sent1', (input_sent1_1, input_sent1_2, input_sent1_3), 32, 8),
    ('sent2', (input_sent2_1, input_sent2_2, input_sent2_3), 32, 8),
    ('sent3', (input_sent3_1, input_sent3_2, input_sent3_3), 32, 8),
    ('sent4', (input_sent4_1, input_sent4_2, input_sent4_3), 32, 8),
    ('sent5', (input_sent5_1, input_sent5_2, input_sent5_3), 32, 8),
    ('doc', (input_doc_1, input_doc_2, input_doc_3), 256, 64),
]

# embedding layer for sentences and documents: a single BERT model shared by all six paths
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

# get pooled vectors of the BERT embeddings (element 1 of the model output);
# can also do GlobalAveragePooling1D()
pooled = [
    bert_model(ids, attention_mask=mask, token_type_ids=type_ids)[1]
    for _, (ids, mask, type_ids), _, _ in paths
]

# The layers are built stage by stage (not path by path) so that they are created in the
# same order as before the refactoring.

# fully connected layer w/ dropout
hidden1 = [
    Dense(units1, activation='relu', name=f'hidden1_{tag}')(vector)
    for (tag, _, units1, _), vector in zip(paths, pooled)
]
hidden1_dropout = [
    Dropout(DROPOUT_RATE, name=f'h1_dropout_{tag}')(hidden)
    for (tag, _, _, _), hidden in zip(paths, hidden1)
]

# fully connected layer
hidden2 = [
    Dense(units2, activation='relu', name=f'hidden2_{tag}')(dropped)
    for (tag, _, _, units2), dropped in zip(paths, hidden1_dropout)
]

# concatenate outputs of all 6 parallel layers
xx = Concatenate()(hidden2)

# fully connected layer w/ dropout for concatenated inputs
h3 = Dense(512, activation='relu', name="hidden3")(xx)
h3_dropout = Dropout(DROPOUT_RATE)(h3)

# fully connected layer
h4 = Dense(256, activation='relu', name="hidden4")(h3_dropout)

# final output layer: a single sigmoid unit
yhat = Dense(1, activation='sigmoid', name="output")(h4)

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [96]:
# Build the functional model from the 18 inputs and the sigmoid output, then compile it.
# Inputs are grouped per path as (input IDs, attention mask, token type IDs) and flattened
# in path order: sentence 1..5 first, document last.
input_triples = (
    (input_sent1_1, input_sent1_2, input_sent1_3),
    (input_sent2_1, input_sent2_2, input_sent2_3),
    (input_sent3_1, input_sent3_2, input_sent3_3),
    (input_sent4_1, input_sent4_2, input_sent4_3),
    (input_sent5_1, input_sent5_2, input_sent5_3),
    (input_doc_1, input_doc_2, input_doc_3),
)
model_inputs = [tensor for triple in input_triples for tensor in triple]

# TODO: the original author flagged this construction for help/review
model = Model(inputs=model_inputs, outputs=[yhat], name="keras_func_model")

# TODO: optimizer / loss / metrics are marked as "to be changed"
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [41]:
# train model on the tokenized training subset
# NOTE: the dropout rate in effect is whatever DROPOUT_RATE was when the architecture cell
# was last run (the original label of this cell was "0.2 dropout").

# map every named model input to its array; `inputs` is ordered path by path
# (sent1..sent5, doc), each path contributing input IDs, attention mask, token type IDs
input_names = [
    f'input_{kind}_{tag}'
    for tag in ('sent1', 'sent2', 'sent3', 'sent4', 'sent5', 'doc')
    for kind in ('ii', 'am', 'tti')
]
input_dict = {name: inputs[position] for position, name in enumerate(input_names)}

# labels for exactly the rows that were tokenized: `inputs` was built from
# x_train[:BATCH_SIZE], so the labels are the first BATCH_SIZE rows of y_train
# (the previously used `df_train` is not defined anywhere in this notebook)
train_labels = y_train['humor'].iloc[:BATCH_SIZE]

history = model.fit(input_dict, train_labels, epochs=5, batch_size=6)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [97]:
# train model using 0.5 dropout
# NOTE(review): `model` is not rebuilt in this cell, so fitting continues from whatever
# weights it already has; re-run the architecture and compile cells with the desired
# DROPOUT_RATE first to get an independent run.
# Labels are the first BATCH_SIZE rows of y_train, matching the tokenized training subset
# (`df_train` is not defined anywhere in this notebook).
history2 = model.fit(input_dict, y_train['humor'].iloc[:BATCH_SIZE], epochs=5, batch_size=6)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [78]:
# train model using 0.7 dropout
# NOTE(review): this label only holds if the architecture and compile cells were re-run
# with DROPOUT_RATE = 0.7 beforehand; `model` is not rebuilt here, so otherwise fitting
# simply continues from the existing weights.
# Labels are the first BATCH_SIZE rows of y_train, matching the tokenized training subset
# (`df_train` is not defined anywhere in this notebook).
history3 = model.fit(input_dict, y_train['humor'].iloc[:BATCH_SIZE], epochs=5, batch_size=6)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [81]:
# train model using 0.9 dropout
# NOTE(review): this label only holds if the architecture and compile cells were re-run
# with DROPOUT_RATE = 0.9 beforehand; `model` is not rebuilt here, so otherwise fitting
# simply continues from the existing weights.
# Labels are the first BATCH_SIZE rows of y_train, matching the tokenized training subset
# (`df_train` is not defined anywhere in this notebook).
history4 = model.fit(input_dict, y_train['humor'].iloc[:BATCH_SIZE], epochs=5, batch_size=6)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


# Evaluate model

In [74]:
# test accuracy w/ 0.2 dropout
# NOTE(review): this scores whatever `model` currently holds; the 0.2 label is only valid
# if the model was built and trained with DROPOUT_RATE = 0.2 right before this cell.
test_preds = model.predict(test_inputs)
# test_inputs was tokenized from x_test[:BATCH_SIZE], so compare against the same rows of
# y_test; predictions above 0.5 count as humor
sklearn.metrics.accuracy_score(y_test.iloc[:BATCH_SIZE, 0], (test_preds > 0.5).flatten())

0.52

In [98]:
# test accuracy w/ 0.5 dropout
# NOTE(review): this scores whatever `model` currently holds; the 0.5 label is only valid
# if the model was built and trained with DROPOUT_RATE = 0.5 right before this cell.
test_preds2 = model.predict(test_inputs)
# test_inputs was tokenized from x_test[:BATCH_SIZE], so compare against the same rows of
# y_test; predictions above 0.5 count as humor
sklearn.metrics.accuracy_score(y_test.iloc[:BATCH_SIZE, 0], (test_preds2 > 0.5).flatten())

0.52

In [79]:
# test accuracy w/ 0.7 dropout
# NOTE(review): this scores whatever `model` currently holds; the 0.7 label is only valid
# if the model was built and trained with DROPOUT_RATE = 0.7 right before this cell.
test_preds3 = model.predict(test_inputs)
# test_inputs was tokenized from x_test[:BATCH_SIZE], so compare against the same rows of
# y_test; predictions above 0.5 count as humor
sklearn.metrics.accuracy_score(y_test.iloc[:BATCH_SIZE, 0], (test_preds3 > 0.5).flatten())

0.48

In [82]:
# test accuracy w/ 0.9 dropout
# NOTE(review): this scores whatever `model` currently holds; the 0.9 label is only valid
# if the model was built and trained with DROPOUT_RATE = 0.9 right before this cell.
test_preds4 = model.predict(test_inputs)
# test_inputs was tokenized from x_test[:BATCH_SIZE], so compare against the same rows of
# y_test; predictions above 0.5 count as humor
sklearn.metrics.accuracy_score(y_test.iloc[:BATCH_SIZE, 0], (test_preds4 > 0.5).flatten())

0.52