In [1]:
import os, re, random
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

import tensorflow_hub as hub
import tensorflow_text as text

from transformers import BertTokenizer, TFDistilBertModel

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [2]:
train = pd.read_csv('/kaggle/input/contradictory-my-dear-watson/train.csv')
test = pd.read_csv('/kaggle/input/contradictory-my-dear-watson/test.csv')

We are going to create a single text column where we concatenate the premise and the hypothesis, separated by " [SEP] ". We are going to use the **sequence_output** of the BERT layer so that the classifier uses both word meaning and word order to determine if the premise and hypothesis are contradictory or not. 

In [3]:
train['text'] = train.premise + " [SEP] " + train.hypothesis

In [4]:
train_df, val_df = np.split(train.sample(frac = 1), [int(0.8 * len(train))])

In [13]:
train_dataset = tf.data.Dataset.from_tensor_slices((train_df.text, train_df.label)).shuffle(len(train_df)).batch(64).prefetch(tf.data.AUTOTUNE)
val_dataset = tf.data.Dataset.from_tensor_slices((val_df.text, val_df.label)).shuffle(len(val_df)).batch(64).prefetch(tf.data.AUTOTUNE)

In [5]:
print("Number of observations in training data set: {}".format(len(train_df)))
print("Number of observations in validation data set: {}".format(len(val_df)))

Number of observations in training data set: 9696
Number of observations in validation data set: 2424


## Build Model

As the data set that we are trying to classify contain texts that are written in other languages, we are going to use the **bert_multi_cased_L-12_H-768_A-12** model. To find out more about the model, refer to the documentation [here](https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/4). 

In [6]:
preprocess_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3")
encoder_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/4")

In [15]:
input_layer = tf.keras.layers.Input(shape = (), dtype = tf.string)
bert_input = preprocess_layer(input_layer)
bert_output = encoder_layer(bert_input)

output = tf.keras.layers.Dropout(0.3)(bert_output['sequence_output'])
output = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32))(output)
output = tf.keras.layers.Dense(3, activation = 'softmax')(output)

model = tf.keras.Model(inputs = [input_layer], outputs = output)
model.compile(optimizer = tf.keras.optimizers.AdamW(),
              loss = tf.keras.losses.SparseCategoricalCrossentropy(),
              metrics = ['accuracy'])

In [12]:
early_stopping = tf.keras.callbacks.EarlyStopping(monitor = 'val_loss', 
                                                  patience = 2)

model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath = 'model/best_performed_model.ckpt',
    save_weights_only = True,
    save_best_only = True,
    monitor = 'val_loss',
    verbose = 1
)

In [16]:
history = model.fit(train_dataset, 
                    validation_data = val_dataset,
                    epochs = 30, 
                    callbacks = [early_stopping, model_checkpoint_callback])

Epoch 1/30
Epoch 1: val_loss improved from inf to 1.04090, saving model to model/best_performed_model.ckpt
Epoch 2/30
Epoch 2: val_loss improved from 1.04090 to 1.03031, saving model to model/best_performed_model.ckpt
Epoch 3/30
Epoch 3: val_loss improved from 1.03031 to 1.02134, saving model to model/best_performed_model.ckpt
Epoch 4/30
Epoch 4: val_loss did not improve from 1.02134
Epoch 5/30
Epoch 5: val_loss did not improve from 1.02134
