## We will first train a ColBERT sentence embedding.

In [1]:
# Machine details
from tensorflow import keras
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

Using TensorFlow backend.


[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 4122018096975150724
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 11153683264
locality {
  bus_id: 1
  links {
  }
}
incarnation: 6393585071727212234
physical_device_desc: "device: 0, name: Tesla K80, pci bus id: 0000:00:04.0, compute capability: 3.7"
]


In [2]:
# Dependencies
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import GroupKFold
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

# import tensorflow_hub as hub
import tensorflow as tf
# import bert_tokenization as tokenization
import tensorflow.keras.backend as K
# from tensorflow import keras

import os
from scipy.stats import spearmanr
from math import floor, ceil
from transformers import *

import seaborn as sns
import string
import re    #for regex

np.set_printoptions(suppress=True)
print(tf.__version__)

2.4.1


In [3]:
# Parameters for data
training_sample_count = 1000 # 4000
test_count = 1000

MAX_SENTENCE_LENGTH = 20
MAX_SENTENCES = 5
MAX_LENGTH = 100

## Read training and testing data

In [4]:
os.chdir("joke-gen/ColBERT humor")

In [5]:
df = pd.read_csv('Data/dataset.csv')

df_train = pd.read_csv('Data/train.csv')
display(df_train.head(3))
df_train = df_train[:training_sample_count]

df_test = pd.read_csv('Data/dev.csv')
display(df_test.head(3))
df_test = df_test[:test_count]

Unnamed: 0,text,humor
0,"Joe biden rules out 2020 bid: 'guys, i'm not r...",False
1,Watch: darvish gave hitter whiplash with slow ...,False
2,What do you call a turtle without its shell? d...,True


Unnamed: 0,text,humor
0,What kind of cat should you take into the des...,True
1,Remember when people used to have to be in sha...,True
2,Pizza is always good. - everyone we'll see abo...,True


In [6]:
test_df_y = df_test.copy()
del df_test['humor']

df_sub = test_df_y.copy()

print(len(df),len(df_train),len(df_test))
display(df_train.head())
display(df_test.head())

200000 1000 1000


Unnamed: 0,text,humor
0,"Joe biden rules out 2020 bid: 'guys, i'm not r...",False
1,Watch: darvish gave hitter whiplash with slow ...,False
2,What do you call a turtle without its shell? d...,True
3,5 reasons the 2016 election feels so personal,False
4,"Pasco police shot mexican migrant from behind,...",False


Unnamed: 0,text
0,What kind of cat should you take into the des...
1,Remember when people used to have to be in sha...
2,Pizza is always good. - everyone we'll see abo...
3,"What's 6 inches long hard, bent, and in my pan..."
4,Black teen's response to violence in his commu...


In [7]:
output_categories = list(df_train.columns[[1]])
input_categories = list(df_train.columns[[0]])

TARGET_COUNT = len(output_categories)

print('\ninput categories:\n\t', input_categories)
print('\noutput TARGET_COUNT:\n\t', TARGET_COUNT)
print('\noutput categories:\n\t', output_categories)


input categories:
	 ['text']

output TARGET_COUNT:
	 1

output categories:
	 ['humor']


## Preprocess the data

In [8]:
from tensorflow.keras import layers
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizer, TFBertModel, BertConfig

MODEL_TYPE = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(MODEL_TYPE)

In [9]:
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [10]:
def return_id(str1, str2, truncation_strategy, length):

    inputs = tokenizer.encode_plus(str1, str2,
        add_special_tokens=True,
        max_length=length,
        truncation_strategy=truncation_strategy)

    input_ids =  inputs["input_ids"]
    input_masks = [1] * len(input_ids)
    input_segments = inputs["token_type_ids"]
    padding_length = length - len(input_ids)
    padding_id = tokenizer.pad_token_id
    input_ids = input_ids + ([padding_id] * padding_length)
    input_masks = input_masks + ([0] * padding_length)
    input_segments = input_segments + ([0] * padding_length)

    return [input_ids, input_masks, input_segments]


def compute_input_arrays(df, columns, tokenizer):
    model_input = []
    for xx in range((MAX_SENTENCES*3)+3):
        model_input.append([])
    
    for _, row in tqdm(df[columns].iterrows()):
        i = 0
        
        # sent
        sentences = sent_tokenize(row.text)
        for xx in range(MAX_SENTENCES):
            s = sentences[xx] if xx<len(sentences) else ''
            ids_q, masks_q, segments_q = return_id(s, None, 'longest_first', MAX_SENTENCE_LENGTH)
            model_input[i].append(ids_q)
            i+=1
            model_input[i].append(masks_q)
            i+=1
            model_input[i].append(segments_q)
            i+=1
        
        # full row
        ids_q, masks_q, segments_q = return_id(row.text, None, 'longest_first', MAX_LENGTH)
        model_input[i].append(ids_q)
        i+=1
        model_input[i].append(masks_q)
        i+=1
        model_input[i].append(segments_q)
        
    for xx in range((MAX_SENTENCES*3)+3):
        model_input[xx] = np.asarray(model_input[xx], dtype=np.int32)
        
    print(model_input[0].shape)
    return model_input

In [11]:
inputs      = compute_input_arrays(df_train, input_categories, tokenizer)
test_inputs = compute_input_arrays(df_test, input_categories, tokenizer)

0it [00:00, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


(1000, 20)


0it [00:00, ?it/s]

(1000, 20)


In [12]:
print(len(inputs), len(inputs[0]), len(inputs[0][0]))

# check out input for 7th row
xx = 7
print(df_train.iloc[xx,0])
print(sent_tokenize(df_train.iloc[xx,0]))
inputs[0][xx], inputs[3][xx], inputs[6][xx], inputs[15][xx]

18 1000 20
Why do native americans hate it when it rains in april? because it brings mayflowers.
['Why do native americans hate it when it rains in april?', 'because it brings mayflowers.']


(array([  101,  2339,  2079,  3128,  4841,  5223,  2009,  2043,  2009,
        15811,  1999,  2258,  1029,   102,     0,     0,     0,     0,
            0,     0], dtype=int32),
 array([  101,  2138,  2009,  7545,  2089, 14156,  2015,  1012,   102,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0], dtype=int32),
 array([101, 102,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0], dtype=int32),
 array([  101,  2339,  2079,  3128,  4841,  5223,  2009,  2043,  2009,
        15811,  1999,  2258,  1029,  2138,  2009,  7545,  2089, 14156,
         2015,  1012,   102,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,

In [13]:
def compute_output_arrays(df, columns):
    return np.asarray(df[columns])

outputs = compute_output_arrays(df_train, output_categories)
outputs[:3]

array([[False],
       [False],
       [ True]])

## Load untrained Model

In [20]:
# MAX_SENTENCE_LENGTH = 20
import json
with open("modelConf.json") as file:
    config = json.load(file)

model = keras.models.model_from_json(json.dumps(config))
model.summary()

Model: "functional_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_19 (InputLayer)           [(None, 20)]         0                                            
__________________________________________________________________________________________________
input_20 (InputLayer)           [(None, 20)]         0                                            
__________________________________________________________________________________________________
input_21 (InputLayer)           [(None, 20)]         0                                            
__________________________________________________________________________________________________
input_22 (InputLayer)           [(None, 20)]         0                                            
_______________________________________________________________________________________

Since it's a regression with outcome between 0 and 1, it's reasonable to use binary_crossentropy optimizer and MAE loss function

In [28]:
model.compile(optimizer="Adam",loss="binary_crossentropy",metrics=["mae","accuracy"])

In [29]:
model.fit(inputs,outputs,epochs=70)

Epoch 1/70
Epoch 2/70
Epoch 3/70
Epoch 4/70
Epoch 5/70
Epoch 6/70
Epoch 7/70
Epoch 8/70
Epoch 9/70
Epoch 10/70
Epoch 11/70
Epoch 12/70
Epoch 13/70
Epoch 14/70
Epoch 15/70
Epoch 16/70
Epoch 17/70
Epoch 18/70
Epoch 19/70
Epoch 20/70
Epoch 21/70
Epoch 22/70
Epoch 23/70
Epoch 24/70
Epoch 25/70
Epoch 26/70
Epoch 27/70
Epoch 28/70
Epoch 29/70
Epoch 30/70
Epoch 31/70
Epoch 32/70
Epoch 33/70
Epoch 34/70
Epoch 35/70
Epoch 36/70
Epoch 37/70
Epoch 38/70
Epoch 39/70
Epoch 40/70
Epoch 41/70
Epoch 42/70
Epoch 43/70
Epoch 44/70
Epoch 45/70
Epoch 46/70
Epoch 47/70
Epoch 48/70
Epoch 49/70
Epoch 50/70
Epoch 51/70
Epoch 52/70
Epoch 53/70
Epoch 54/70
Epoch 55/70
Epoch 56/70
Epoch 57/70
Epoch 58/70
Epoch 59/70
Epoch 60/70
Epoch 61/70
Epoch 62/70
Epoch 63/70
Epoch 64/70
Epoch 65/70
Epoch 66/70
Epoch 67/70
Epoch 68/70
Epoch 69/70
Epoch 70/70


<tensorflow.python.keras.callbacks.History at 0x7f4365517350>

## Evaluation

In [30]:
# Evaluation Metrics
import sklearn
def print_evaluation_metrics(y_true, y_pred, label='', is_regression=True, label2=''):
    print('==================', label2)
    ### For regression
    if is_regression:
        print('mean_absolute_error',label,':', sklearn.metrics.mean_absolute_error(y_true, y_pred))
        print('mean_squared_error',label,':', sklearn.metrics.mean_squared_error(y_true, y_pred))
        print('r2 score',label,':', sklearn.metrics.r2_score(y_true, y_pred))
        #     print('max_error',label,':', sklearn.metrics.max_error(y_true, y_pred))
        return sklearn.metrics.mean_squared_error(y_true, y_pred)
    else:
        ### FOR Classification
#         print('balanced_accuracy_score',label,':', sklearn.metrics.balanced_accuracy_score(y_true, y_pred))
#         print('average_precision_score',label,':', sklearn.metrics.average_precision_score(y_true, y_pred))
#         print('balanced_accuracy_score',label,':', sklearn.metrics.balanced_accuracy_score(y_true, y_pred))
#         print('accuracy_score',label,':', sklearn.metrics.accuracy_score(y_true, y_pred))
        print('f1_score',label,':', sklearn.metrics.f1_score(y_true, y_pred))
        
        matrix = sklearn.metrics.confusion_matrix(y_true, y_pred)
        print(matrix)
        TP,TN,FP,FN = matrix[1][1],matrix[0][0],matrix[0][1],matrix[1][0]
        Accuracy = (TP+TN)/(TP+FP+FN+TN)
        Precision = TP/(TP+FP)
        Recall = TP/(TP+FN)
        F1 = 2*(Recall * Precision) / (Recall + Precision)
        print('Acc', Accuracy, 'Prec', Precision, 'Rec', Recall, 'F1',F1)
        return sklearn.metrics.accuracy_score(y_true, y_pred)

In [33]:
preds = model.predict(inputs)
print_evaluation_metrics(np.array(outputs), np.array(preds), '')

mean_absolute_error  : 0.49990338
mean_squared_error  : 0.24995096
r2 score  : -5.9040197397663974e-08


0.24995096

## Predict on test sets and produce binary submission

In [34]:
test_preds = model.predict(test_inputs)

In [35]:
for split in np.arange(0.1, 0.99, 0.1).tolist():
    df_sub['pred_bi'] = (test_preds > split)

    print_evaluation_metrics(df_sub['humor'], df_sub['pred_bi'], '', False, 'SPLIT on '+str(split))

    df_sub.to_csv('sub3.csv', index=False)
    df_sub.head()

f1_score  : 0.6807387862796834
[[  0 484]
 [  0 516]]
Acc 0.516 Prec 0.516 Rec 1.0 F1 0.6807387862796834
f1_score  : 0.6807387862796834
[[  0 484]
 [  0 516]]
Acc 0.516 Prec 0.516 Rec 1.0 F1 0.6807387862796834
f1_score  : 0.6807387862796834
[[  0 484]
 [  0 516]]
Acc 0.516 Prec 0.516 Rec 1.0 F1 0.6807387862796834
f1_score  : 0.6807387862796834
[[  0 484]
 [  0 516]]
Acc 0.516 Prec 0.516 Rec 1.0 F1 0.6807387862796834
f1_score  : 0.6807387862796834
[[  0 484]
 [  0 516]]
Acc 0.516 Prec 0.516 Rec 1.0 F1 0.6807387862796834
f1_score  : 0.0
[[484   0]
 [516   0]]
Acc 0.484 Prec nan Rec 0.0 F1 nan
f1_score  : 0.0
[[484   0]
 [516   0]]
Acc 0.484 Prec nan Rec 0.0 F1 nan
f1_score  : 0.0
[[484   0]
 [516   0]]
Acc 0.484 Prec nan Rec 0.0 F1 nan
f1_score  : 0.0
[[484   0]
 [516   0]]
Acc 0.484 Prec nan Rec 0.0 F1 nan




In [36]:
df_sub['pred_bi'] = (test_preds > 0.5)

print_evaluation_metrics(df_sub['humor'], df_sub['pred_bi'], '', False, 'SPLIT on '+str(split))

df_sub.to_csv('sub.csv', index=False)
df_sub.head()

f1_score  : 0.6807387862796834
[[  0 484]
 [  0 516]]
Acc 0.516 Prec 0.516 Rec 1.0 F1 0.6807387862796834


Unnamed: 0,text,humor,pred_bi
0,What kind of cat should you take into the des...,True,True
1,Remember when people used to have to be in sha...,True,True
2,Pizza is always good. - everyone we'll see abo...,True,True
3,"What's 6 inches long hard, bent, and in my pan...",True,True
4,Black teen's response to violence in his commu...,False,True


In [38]:
print('Texts that the model correctly predicts:')
df_sub[df_sub['pred_bi']==df_sub['humor']]

Texts that the model correctly predicts:


Unnamed: 0,text,humor,pred_bi
0,What kind of cat should you take into the des...,True,True
1,Remember when people used to have to be in sha...,True,True
2,Pizza is always good. - everyone we'll see abo...,True,True
3,"What's 6 inches long hard, bent, and in my pan...",True,True
6,Do infants have as much fun in infancy as adul...,True,True
...,...,...,...
994,Look on the bright side would be horrible advi...,True,True
996,Why are giraffes slow to apologize? it takes t...,True,True
997,I think some drugs should be legalized... but ...,True,True
998,What did miss muffet and saddam hussein have i...,True,True


In [37]:
print('Texts that the model failed to correctly predict:')
df_sub[df_sub['pred_bi']!=df_sub['humor']]

Texts that the model failed to correctly predict:


Unnamed: 0,text,humor,pred_bi
4,Black teen's response to violence in his commu...,False,True
5,"'make me a sandwich' is making us hungry, deli...",False,True
7,Funded kickstarters: food products we can't wa...,False,True
10,China says it wants smooth military ties with ...,False,True
13,Lance armstrong used rugs: not the headline we...,False,True
...,...,...,...
987,These are our relationships as depicted by foo...,False,True
988,Dazzling photos show northern lights shimmerin...,False,True
989,What an april fools’ day prank says about the ...,False,True
991,Beauty cheat sheet: products that make beauty ...,False,True
