In [1]:
!pip install mapply --quiet
!pip install tensorflow --quiet
!pip install deep_translator --quiet
!pip install tensorflow-hub --quiet
!pip install tensorflow-text --quiet
!pip install tensorflow-datasets --quiet

In [34]:
import os, mapply
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

from deep_translator import GoogleTranslator
from transformers import AutoTokenizer, TFBertModel
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')
tf.get_logger().setLevel('ERROR')
os.environ["TFHUB_MODEL_LOAD_FORMAT"]="UNCOMPRESSED"

In [3]:
mapply.init(
    n_workers=-1,
    chunk_size=100,
    max_chunks_per_worker=8,
    progressbar=True
)

## Import and Inspect Data

In [4]:
train = pd.read_csv('/kaggle/input/contradictory-my-dear-watson/train.csv')
test = pd.read_csv('/kaggle/input/contradictory-my-dear-watson/test.csv')

In [5]:
train.head(5)

Unnamed: 0,id,premise,hypothesis,lang_abv,language,label
0,5130fd2cb5,and these comments were considered in formulat...,The rules developed in the interim were put to...,en,English,0
1,5b72532a0b,These are issues that we wrestle with in pract...,Practice groups are not permitted to work on t...,en,English,2
2,3931fbe82a,Des petites choses comme celles-là font une di...,J'essayais d'accomplir quelque chose.,fr,French,0
3,5622f0c60b,you know they can't really defend themselves l...,They can't defend themselves because of their ...,en,English,0
4,86aaa48b45,ในการเล่นบทบาทสมมุติก็เช่นกัน โอกาสที่จะได้แสด...,เด็กสามารถเห็นได้ว่าชาติพันธุ์แตกต่างกันอย่างไร,th,Thai,1


The train dataset has six columns: 'id', 'premise', 'hypothesis', 'lang_abv', 'language', and 'label'. The 'premise' and 'hypothesis' columns are two sentences that we are trying to classify, and n'lang_abv' and 'language' columns correspond to the language in which the sentences are written. Let's check if there are any NA values inside the dataset:

In [6]:
train.isna().any()

id            False
premise       False
hypothesis    False
lang_abv      False
language      False
label         False
dtype: bool

## Preprocessing: Translate Non-English Observations

We've confirmed that there are no NA values inside the dataset. Next, as we are going to use a BERT-based classifier, we are going to first translate the non-English written text into English before training our model. We are going to do this using **GoogleTranslator** module in the [**deep_translator**](https://pypi.org/project/deep-translator/) package. Before we use it on the training dataset, let's look at an example sentence to see if it works: 

In [7]:
GoogleTranslator(source='ko', target='en').translate("저는 한국 사람입니다.") 

"I'm Korean."

We are going to set the 'lang_abv' column as the source language and set English ('en') as the target language. Then we are going to translate both the 'premise' and 'hypothesis' columns. First, let's survey the list of language supported by **GoogleTranslator**. If the language is supported by the translator, we are going to translate and use the observation to train our model, but if the language is not supported by the translator, we are going to remove those observations. 

In [8]:
langs_list = GoogleTranslator().get_supported_languages()

In [9]:
train.language = train.language.apply(lambda x: x.lower())
print('Number of observations prior to filtering: {}'.format(train.shape[0]))

train_filtered = train[train['language'].isin(langs_list)]
print('Number of observations after filtering: {}'.format(train_filtered.shape[0]))

Number of observations prior to filtering: 12120
Number of observations after filtering: 11709


In [11]:
translated_premise = train_filtered.mapply(lambda x: GoogleTranslator(source = str.lower(x['language']), 
                                                                     target = 'en').translate(x['premise']) if str.lower(x['language']) != 'english' else x['premise'], axis = 1)
translated_hypothesis = train_filtered.mapply(lambda x: GoogleTranslator(source = str.lower(x['language']), 
                                                                        target = 'en').translate(x['hypothesis']) if str.lower(x['language']) != 'english' else x['hypothesis'], axis = 1)

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

In [14]:
train_filtered['translated_premise'] = translated_premise
train_filtered['translated_hypothesis'] = translated_hypothesis

Let's check if the line of code above worked as intended. First, we are going to look at those observations whose language is not English. Then, we are going to look at those observations whose language is English. We would expect the 'translated_premise' and 'translated_hypothesis' columns of the prior observations to have been translated into English and those columns of the latter observations to be the same as 'premise' and 'hypothesis'.

In [15]:
train_filtered[train_filtered.language == 'english'].head(5)

Unnamed: 0,id,premise,hypothesis,lang_abv,language,label,translated_premise,translated_hypothesis
0,5130fd2cb5,and these comments were considered in formulat...,The rules developed in the interim were put to...,en,english,0,and these comments were considered in formulat...,The rules developed in the interim were put to...
1,5b72532a0b,These are issues that we wrestle with in pract...,Practice groups are not permitted to work on t...,en,english,2,These are issues that we wrestle with in pract...,Practice groups are not permitted to work on t...
3,5622f0c60b,you know they can't really defend themselves l...,They can't defend themselves because of their ...,en,english,0,you know they can't really defend themselves l...,They can't defend themselves because of their ...
7,fdcd1bd867,From Cockpit Country to St. Ann's Bay,From St. Ann's Bay to Cockpit Country.,en,english,2,From Cockpit Country to St. Ann's Bay,From St. Ann's Bay to Cockpit Country.
8,7cfb3d272c,"Look, it's your skin, but you're going to be i...",The boss will fire you if he sees you slacking...,en,english,1,"Look, it's your skin, but you're going to be i...",The boss will fire you if he sees you slacking...


## Preprocessing: Balance Observations in Training Data

Now that we've completed preprocessing the text that we will feed in our classifier model, we are going to see if the labels of the observations in the training data are balanced. This is so that we can prevent the model from being biased towards one class (label).

In [16]:
train_filtered.label.value_counts(normalize = True)

0    0.344692
2    0.336408
1    0.318900
Name: label, dtype: float64

The classes of observations are pretty balanaced, so we are not going to take extra steps to further balance the classes. Now, we are going to split the filtered training dataset into training and validation datasets. We should note that we are going to be using two features - **translated_premise** and **translated_hypothesis** - as our input and **label** as our output.

In [30]:
x_train, x_val, y_test, y_val = train_test_split(train_filtered[['translated_premise', 'translated_hypothesis']],
                                                 train_filtered.label, 
                                                 train_size = 0.80,
                                                 random_state = 1048596)

In [32]:
print("The number of observations in the training dataset is: {}".format(len(x_train)))
print("The number of observations in the validation dataset is {}".format(len(x_val)))

The number of observations in the training dataset is: 9367
The number of observations in the validation dataset is 2342


In [37]:
roberta_base_path = 'cardiffnlp/twitter-roberta-base-sentiment'
roberta_tokenizer = AutoTokenizer.from_pretrained(roberta_base_path, 
                                                  do_lower_case = True)

Downloading (…)lve/main/config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

In [41]:
premise_train = roberta_tokenizer(x_train['translated_premise'].tolist(), padding = "max_length", truncation = True).data
hypothesis_train = roberta_tokenizer(x_train['translated_hypothesis'].tolist(), padding = "max_length", truncation = True).data

premise_val = roberta_tokenizer(x_val['translated_premise'].tolist(), padding = "max_length", truncation = True).data
hypothesis_val = roberta_tokenizer(x_val['translated_hypothesis'].tolist(), padding = "max_length", truncation = True).data

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:
def extract_features(tokens, labels, batch_size):
    features = {x: tokens[x] for x in tokenizer.model_input_names}
    features = tf.data.Dataset.from_tensor_slices((features, labels))
    return features.shuffle(len(labels)).batch(batch_size).prefetch(tf.data.AUTOTUNE)

train_premise = extract_features(premise_train, y_train, 32)
train_hypothesis = extract_features(hypothesis_train, y_val, 32)

## Define Model

Our model is going to take the tokenized **premise** and **hypothesis** features as the input. The input(s) will go through a Dense layer with the Softmax activation function as this is a classifier with 2+ labels (0, 1, or 2) associated with them. 

In [None]:
bert_model = TFRobertaModel.from_pretrained('../input/huggingface-roberta/roberta-base')

input_id_1 = tf.keras.Input(shape = (128,), dtype = 'int32')
input_id_2 = tf.keras.Input(shape = (128,), dtype = 'int32')

output = bert_model([input_id_1, input_id_2])[1]
output = tf.keras.layers.Dense(1, activation = 'sigmoid')(output)

model = tf.keras.models.Model(inputs = [input_id_1, input_id_2], outputs = output)

In [None]:
model.compile(optimizer = tf.keras.optimizers.AdamW(weight_decay = 1e-6, lr = 1e-5),
              loss = tf.keras.losses.BinaryCrossentropy(),
              metrics = ['accuracy'])