In [24]:
import pandas as pd
import numpy as np
import re

### Check if data has the same distribution as their final dataset

In [25]:
data = pd.read_csv('dataset.csv')

In [26]:
# has the right amount of jokes & non-jokes
data.humor.value_counts()

False    100000
True     100000
Name: humor, dtype: int64

In [27]:
# split into train and test data
x_train, x_test = data['text'][:160000], data['text'][160000:]
y_train, y_test = data['humor'][:160000], data['humor'][160000:]

# cast back into dataframes 
x_train = x_train.to_frame('text')
x_test = x_test.to_frame('text')
y_train = y_train.to_frame('humor')
y_test = y_test.to_frame('humor')

In [28]:
# haven't done processing for special words (eg isn't --> is not)
data[data['text'].str.contains("isn't")].set_index('text')

Unnamed: 0_level_0,humor
text,Unnamed: 1_level_1
Me: my cat isn't overweight; she's just big-boned vet: this is a dog,True
"Did you hear, john wayne bobbit got his penis cut off again? isn't that redickless?",True
"Jesus walks into a bar no he didn't, because he isn't real.",True
Video breaks down why machismo isn't synonymous with latino men,False
George clooney's ex-anchorman dad isn't having it with sinclair,False
...,...
Miss russia 'haters' say elmira abdrazakova isn't russian enough (photos),False
It's so cool how math isn't real now that i'm a grown up.,True
There's a shockingly high chance the seafood you're eating isn't legit,False
This is what happens 'when mama isn't home',False


In [6]:
all_text = ' '.join(data['text'])
all_puncs = re.findall(r'[^\w\s]', all_text)

In [7]:
all_puncs = []
for t in data['text']:
    num_puncs = re.findall(r'[^\w\s]', t)
    all_puncs.append(num_puncs)

In [8]:
len(all_puncs)

200000

In [9]:
# if we need to expand contractions ourselves

#!pip install contractions
import contractions as ct
ct.fix("Hi I'm isn't")

ModuleNotFoundError: No module named 'contractions'

## ColBERT Model

In [30]:
#!pip install keras
#!pip install tensorflow
!pip install transformers
# the authors probably used version 3.0.2
!pip install --upgrade --user transformers==3.0.2 
#!pip install sentencepiece



In [31]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import GroupKFold
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

# import tensorflow_hub as hub
import tensorflow as tf
# import bert_tokenization as tokenization
import tensorflow.keras.backend as K
from tensorflow import keras 

import os
from scipy.stats import spearmanr
from math import floor, ceil
# from transformers import *
import transformers
from transformers import TFBertModel, BertTokenizer

import seaborn as sns
import string
import re    #for regex

# import keras model layers
from tensorflow.keras.layers import Input, Embedding, Dense, Dropout, Concatenate

np.set_printoptions(suppress=True)
print(tf.__version__)
print(transformers.__version__)

2.8.0
4.17.0


In [95]:
SENT_INPUT_LEN = 20
DOC_INPUT_LEN = 100
DROPOUT_RATE = 0.5

# 18 inputs, 3 for each parallel path (5 sentence-level paths & 1 document-level path)
input_sent1_1 = Input(shape=(SENT_INPUT_LEN,), dtype=tf.int32, name='input_ii_sent1') # input IDs
input_sent1_2 = Input(shape=(SENT_INPUT_LEN,), dtype=tf.int32, name='input_am_sent1') # attention masks
input_sent1_3 = Input(shape=(SENT_INPUT_LEN,), dtype=tf.int32, name='input_tti_sent1') # token type IDs

input_sent2_1 = Input(shape=(SENT_INPUT_LEN,), dtype=tf.int32, name='input_ii_sent2')
input_sent2_2 = Input(shape=(SENT_INPUT_LEN,), dtype=tf.int32, name='input_am_sent2')
input_sent2_3 = Input(shape=(SENT_INPUT_LEN,), dtype=tf.int32, name='input_tti_sent2')

input_sent3_1 = Input(shape=(SENT_INPUT_LEN,), dtype=tf.int32, name='input_ii_sent3')
input_sent3_2 = Input(shape=(SENT_INPUT_LEN,), dtype=tf.int32, name='input_am_sent3')
input_sent3_3 = Input(shape=(SENT_INPUT_LEN,), dtype=tf.int32, name='input_tti_sent3')

input_sent4_1 = Input(shape=(SENT_INPUT_LEN,), dtype=tf.int32, name='input_ii_sent4')
input_sent4_2 = Input(shape=(SENT_INPUT_LEN,), dtype=tf.int32, name='input_am_sent4')
input_sent4_3 = Input(shape=(SENT_INPUT_LEN,), dtype=tf.int32, name='input_tti_sent4')

input_sent5_1 = Input(shape=(SENT_INPUT_LEN,), dtype=tf.int32, name='input_ii_sent5')
input_sent5_2 = Input(shape=(SENT_INPUT_LEN,), dtype=tf.int32, name='input_am_sent5')
input_sent5_3 = Input(shape=(SENT_INPUT_LEN,), dtype=tf.int32, name='input_tti_sent5')

input_doc_1 = Input(shape=(DOC_INPUT_LEN,), dtype=tf.int32, name='input_ii_doc')
input_doc_2 = Input(shape=(DOC_INPUT_LEN,), dtype=tf.int32, name='input_am_doc')
input_doc_3 = Input(shape=(DOC_INPUT_LEN,), dtype=tf.int32, name='input_tti_doc')

# embedding layer for sentences and documents
#bert_embeddings = Embedding(num_tokens,embedding_dim,embeddings_initializer=keras.initializers.Constant(embedding_matrix),trainable=False)
bert_model = TFBertModel.from_pretrained('bert-base-uncased')
#bert_embeddings = bert_model(input_ids=input_sent1_1, attention_mask=input_sent1_2, token_type_ids=input_sent1_3) ########### HELP ###########
bert_embeddings1 = bert_model(input_sent1_1, attention_mask=input_sent1_2, token_type_ids=input_sent1_3)
bert_embeddings2 = bert_model(input_sent2_1, attention_mask=input_sent2_2, token_type_ids=input_sent2_3)
bert_embeddings3 = bert_model(input_sent3_1, attention_mask=input_sent3_2, token_type_ids=input_sent3_3)
bert_embeddings4 = bert_model(input_sent4_1, attention_mask=input_sent4_2, token_type_ids=input_sent4_3)
bert_embeddings5 = bert_model(input_sent5_1, attention_mask=input_sent5_2, token_type_ids=input_sent5_3)
bert_embeddings6 = bert_model(input_doc_1, attention_mask=input_doc_2, token_type_ids=input_doc_3)

# get pooled vectors of BERT sentence embeddings
#x1 = bert_embeddings[0]['pooled_vector'] # can also do GlobalAveragePooling1D()
#x2 = bert_embeddings[1]['pooled_vector']
#x3 = bert_embeddings[2]['pooled_vector']
#x4 = bert_embeddings[3]['pooled_vector']
#x5 = bert_embeddings[4]['pooled_vector']
#x6 = bert_embeddings[5]['pooled_vector']
x1 = bert_embeddings1[1] 
x2 = bert_embeddings2[1] 
x3 = bert_embeddings3[1]
x4 = bert_embeddings4[1]
x5 = bert_embeddings5[1]
x6 = bert_embeddings6[1] 

# fully connected layer w/ dropout
h1_1 = Dense(32, activation='relu', name="hidden1_sent1")(x1)
h1_2 = Dense(32, activation='relu', name="hidden1_sent2")(x2)
h1_3 = Dense(32, activation='relu', name="hidden1_sent3")(x3)
h1_4 = Dense(32, activation='relu', name="hidden1_sent4")(x4)
h1_5 = Dense(32, activation='relu', name="hidden1_sent5")(x5)
h1_6 = Dense(256, activation='relu', name="hidden1_doc")(x6)

h1_dropout1 = Dropout(DROPOUT_RATE, name="h1_dropout_sent1")(h1_1) ####################################################
h1_dropout2 = Dropout(DROPOUT_RATE, name="h1_dropout_sent2")(h1_2) ####################################################
h1_dropout3 = Dropout(DROPOUT_RATE, name="h1_dropout_sent3")(h1_3) #                rate TO BE CHANGED                # 
h1_dropout4 = Dropout(DROPOUT_RATE, name="h1_dropout_sent4")(h1_4) #                                                  #
h1_dropout5 = Dropout(DROPOUT_RATE, name="h1_dropout_sent5")(h1_5) ####################################################
h1_dropout6 = Dropout(DROPOUT_RATE, name="h1_dropout_doc")(h1_6)   ####################################################

# fully connected layer
h2_1 = Dense(8, activation='relu', name="hidden2_sent1")(h1_dropout1)
h2_2 = Dense(8, activation='relu', name="hidden2_sent2")(h1_dropout2)
h2_3 = Dense(8, activation='relu', name="hidden2_sent3")(h1_dropout3)
h2_4 = Dense(8, activation='relu', name="hidden2_sent4")(h1_dropout4)
h2_5 = Dense(8, activation='relu', name="hidden2_sent5")(h1_dropout5)
h2_6 = Dense(64, activation='relu', name="hidden2_doc")(h1_dropout6)

# concatenate outputs of all 6 parallel layers
xx = Concatenate()([h2_1, h2_2, h2_3, h2_4, h2_5, h2_6])

# fully connected layer w/ dropout for concatenated inputs
h3 = Dense(512, activation='relu', name="hidden3")(xx)
h3_dropout = Dropout(DROPOUT_RATE)(h3) ################ rate TO BE CHANGED ################

# fully connected layer
h4 = Dense(256, activation='relu', name="hidden4")(h3_dropout)

# final output layer
yhat = Dense(1, activation='sigmoid', name="output")(h4) # need to figure out dropout rate

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [35]:
#df = pd.read_csv('/kaggle/input/200k-short-texts-for-humor-detection/dataset.csv')
df = pd.read_csv('dataset.csv')

#df_train = pd.read_csv('/kaggle/input/200k-short-texts-for-humor-detection/train.csv')
df_train = pd.read_csv('train.csv')
display(df_train.head(3))
df_train = df_train[:50]

#df_test = pd.read_csv('/kaggle/input/200k-short-texts-for-humor-detection/dev.csv')
df_test = pd.read_csv('dev.csv')
display(df_test.head(3))
df_test = df_test[:50]

Unnamed: 0,text,humor
0,"Joe biden rules out 2020 bid: 'guys, i'm not r...",False
1,Watch: darvish gave hitter whiplash with slow ...,False
2,What do you call a turtle without its shell? d...,True


Unnamed: 0,text,humor
0,What kind of cat should you take into the des...,True
1,Remember when people used to have to be in sha...,True
2,Pizza is always good. - everyone we'll see abo...,True


In [36]:
from bertembeddings import compute_input_arrays

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
inputs      = compute_input_arrays(df_train, ['text'], tokenizer)
test_inputs = compute_input_arrays(df_test, ['text'], tokenizer)


0it [00:00, ?it/s][A
1it [00:31, 31.07s/it][A
2it [01:01, 30.92s/it][A
3it [01:32, 30.95s/it][A
4it [02:03, 30.98s/it][A
5it [02:35, 31.03s/it][A
6it [03:06, 31.08s/it][A
7it [03:37, 31.01s/it][A
8it [04:08, 31.01s/it][A
9it [04:38, 30.90s/it][A
10it [05:09, 30.81s/it][A
11it [05:40, 30.92s/it][A
12it [06:11, 30.88s/it][A
13it [06:43, 31.19s/it][A
14it [07:14, 31.09s/it][A
15it [07:44, 30.96s/it][A
16it [08:15, 30.91s/it][A
17it [08:46, 30.91s/it][A
18it [09:17, 31.03s/it][A
19it [09:49, 31.20s/it][A
20it [10:20, 31.08s/it][A
21it [10:51, 31.08s/it][A
22it [11:21, 30.97s/it][A
23it [11:53, 31.02s/it][A
24it [12:23, 30.98s/it][A
25it [12:55, 31.12s/it][A
26it [13:26, 31.03s/it][A
27it [13:57, 31.03s/it][A
28it [14:28, 30.99s/it][A
29it [14:59, 31.12s/it][A
30it [15:30, 31.03s/it][A
31it [16:01, 30.94s/it][A
32it [16:32, 30.98s/it][A
33it [17:03, 31.15s/it][A
34it [17:34, 31.14s/it][A
35it [18:07, 31.50s/it][A
36it [18:38, 31.41s/it][A
37it [19:09, 3

(50, 20)



1it [00:31, 31.05s/it][A
2it [01:02, 31.55s/it][A
3it [01:34, 31.37s/it][A
4it [02:05, 31.27s/it][A
5it [02:36, 31.16s/it][A
6it [03:07, 31.07s/it][A
7it [03:38, 31.18s/it][A
8it [04:09, 31.26s/it][A
9it [04:42, 31.53s/it][A
10it [05:13, 31.44s/it][A
11it [05:43, 31.21s/it][A
12it [06:15, 31.27s/it][A
13it [06:46, 31.13s/it][A
14it [07:17, 31.25s/it][A
15it [07:48, 31.13s/it][A
16it [08:19, 31.11s/it][A
17it [08:50, 31.05s/it][A
18it [09:21, 30.93s/it][A
19it [09:51, 30.89s/it][A
20it [10:22, 30.86s/it][A
21it [10:54, 31.12s/it][A
22it [11:25, 31.08s/it][A
23it [11:56, 30.94s/it][A
24it [12:26, 30.76s/it][A
25it [12:57, 30.74s/it][A
26it [13:27, 30.75s/it][A
27it [13:58, 30.77s/it][A
28it [14:29, 30.75s/it][A
29it [15:00, 30.76s/it][A
30it [15:31, 30.81s/it][A
31it [16:01, 30.75s/it][A
32it [16:32, 30.71s/it][A
33it [17:03, 30.73s/it][A
34it [17:34, 30.80s/it][A
35it [18:04, 30.80s/it][A
36it [18:35, 30.80s/it][A
37it [19:06, 30.81s/it][A
38it [19:

(50, 20)





In [None]:
from bertembeddings import compute_input_arrays

#tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
#inputs      = compute_input_arrays(x_train, ['text'], tokenizer)
#test_inputs = compute_input_arrays(x_test, ['text'], tokenizer)

4178it [30:50:28,  7.29s/it]   

### each sentence-level path is differentiated sentence position in the document (maximum 5 sentences in a document)
input_sent1_1 = inputs[0] # input ids

input_sent1_2 = inputs[1] # attention masks

input_sent1_3 = inputs[2] # token type ids (of the first sentences)

input_sent2_1 = inputs[3] # input ids

input_sent2_2 = inputs[4] # attention masks

input_sent2_3 = inputs[5] # token type ids (of the 2nd sentences)

input_doc_1 = inputs[15]

input_doc_2 = inputs[16]

input_doc_3 = inputs[17] # token type ids of documents

notes
- dropout rate is not specified (also should different dropout layers have different dropout rates?)

In [38]:
from tensorflow.keras.models import Model

In [96]:
model_inputs = [input_sent1_1, input_sent1_2, input_sent1_3,
                input_sent2_1, input_sent2_2, input_sent2_3,
                input_sent3_1, input_sent3_2, input_sent3_3,
                input_sent4_1, input_sent4_2, input_sent4_3,
                input_sent5_1, input_sent5_2, input_sent5_3,
                input_doc_1, input_doc_2, input_doc_3]
model = Model(inputs=model_inputs, outputs=[yhat], name="keras_func_model") ########### HELP ###########
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics = ['accuracy']) # TO BE CHANGED

In [41]:
input_dict = {'input_ii_sent1': inputs[0], 'input_am_sent1': inputs[1], 'input_tti_sent1': inputs[2],
              'input_ii_sent2': inputs[3], 'input_am_sent2': inputs[4], 'input_tti_sent2': inputs[5],
              'input_ii_sent3': inputs[6], 'input_am_sent3': inputs[7], 'input_tti_sent3': inputs[8],
              'input_ii_sent4': inputs[9], 'input_am_sent4': inputs[10], 'input_tti_sent4': inputs[11],
              'input_ii_sent5': inputs[12], 'input_am_sent5': inputs[13], 'input_tti_sent5': inputs[14],
              'input_ii_doc': inputs[15], 'input_am_doc': inputs[16], 'input_tti_doc': inputs[17],
}
history = model.fit(input_dict, df_train['humor'], epochs=5, batch_size=6) # dropout 0.2

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
history1 = model.fit(input_dict, df_train['humor'], epochs=5, batch_size=6) # dropout 0.3

In [97]:
history2 = model.fit(input_dict, df_train['humor'], epochs=5, batch_size=6) # dropout 0.5

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [78]:
history3 = model.fit(input_dict, df_train['humor'], epochs=5, batch_size=6) # dropout 0.7

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [81]:
history4 = model.fit(input_dict, df_train['humor'], epochs=5, batch_size=6) # dropout 0.9

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [42]:
test_preds = model.predict(test_inputs)

In [74]:
# test accuracy w/ 0.2 dropout
sklearn.metrics.accuracy_score(y_test.iloc[:50,0], (test_preds>0.5).flatten())

0.52

In [98]:
# test accuracy w/ 0.5 dropout
test_preds2 = model.predict(test_inputs)
sklearn.metrics.accuracy_score(y_test.iloc[:50,0], (test_preds2>0.5).flatten())

0.52

In [79]:
# test accuracy w/ 0.7 dropout
test_preds3 = model.predict(test_inputs)
sklearn.metrics.accuracy_score(y_test.iloc[:50,0], (test_preds3>0.5).flatten())

0.48

In [82]:
# test accuracy w/ 0.9 dropout
test_preds4 = model.predict(test_inputs)
sklearn.metrics.accuracy_score(y_test.iloc[:50,0], (test_preds4>0.5).flatten())

0.52

In [101]:
###### THEIR CODE ######
# Evaluation Metrics
import sklearn
def print_evaluation_metrics(y_true, y_pred, label='', is_regression=True, label2=''):
    print('==================', label2)
    ### For regression
    if is_regression:
        print('mean_absolute_error',label,':', sklearn.metrics.mean_absolute_error(y_true, y_pred))
        print('mean_squared_error',label,':', sklearn.metrics.mean_squared_error(y_true, y_pred))
        print('r2 score',label,':', sklearn.metrics.r2_score(y_true, y_pred))
        #     print('max_error',label,':', sklearn.metrics.max_error(y_true, y_pred))
        return sklearn.metrics.mean_squared_error(y_true, y_pred)
    else:
        ### FOR Classification
#         print('balanced_accuracy_score',label,':', sklearn.metrics.balanced_accuracy_score(y_true, y_pred))
#         print('average_precision_score',label,':', sklearn.metrics.average_precision_score(y_true, y_pred))
#         print('balanced_accuracy_score',label,':', sklearn.metrics.balanced_accuracy_score(y_true, y_pred))
#         print('accuracy_score',label,':', sklearn.metrics.accuracy_score(y_true, y_pred))
        print('f1_score',label,':', sklearn.metrics.f1_score(y_true, y_pred))
        
        matrix = sklearn.metrics.confusion_matrix(y_true, y_pred)
        print(matrix)
        TP,TN,FP,FN = matrix[1][1],matrix[0][0],matrix[0][1],matrix[1][0]
        Accuracy = (TP+TN)/(TP+FP+FN+TN)
        Precision = TP/(TP+FP)
        Recall = TP/(TP+FN)
        F1 = 2*(Recall * Precision) / (Recall + Precision)
        print('Acc', Accuracy, 'Prec', Precision, 'Rec', Recall, 'F1',F1)
        return sklearn.metrics.accuracy_score(y_true, y_pred)

print_evaluation_metrics([1,0], [0.9,0.1], '', True)
print_evaluation_metrics([1,0], [1,1], '', False)

mean_absolute_error  : 0.09999999999999999
mean_squared_error  : 0.009999999999999998
r2 score  : 0.96
f1_score  : 0.6666666666666666
[[0 1]
 [0 1]]
Acc 0.5 Prec 0.5 Rec 1.0 F1 0.6666666666666666


0.5

In [107]:
print_evaluation_metrics(y_test.iloc[:50,0], (test_preds2>0.5).flatten(), '', False)

f1_score  : 0.0
[[26  0]
 [24  0]]
Acc 0.52 Prec nan Rec 0.0 F1 nan




0.52