In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, name) for name in names)
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import tensorflow as tf
# Choose a distribution strategy: use the TPU when one can be resolved,
# otherwise fall back to TensorFlow's default strategy.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except ValueError:
    # A ValueError from the TPU setup is treated here as "no TPU available".
    strategy = tf.distribute.get_strategy()  # for CPU and single GPU

# Report the replica count on every path, not only in the fallback branch.
print('Number of replicas:', strategy.num_replicas_in_sync)

In [None]:
# Load the competition's training and test splits.
# The training split is read from "train.csv" (the earlier "tran.csv" path was a typo).
train_data = pd.read_csv("../input/contradictory-my-dear-watson/train.csv")
test_data = pd.read_csv("../input/contradictory-my-dear-watson/test.csv")

In [None]:
# In case data already translated, load it directly
# train_data = pd.read_csv("../input/translatedcsv/train_translated.csv")
# test_data = pd.read_csv("../input/translatedcsv/test_translated.csv")

In [None]:
!pip install googletrans==3.1.0a0
from googletrans import Translator
import tensorflow as tf
from transformers import TFAlbertModel, AlbertTokenizer, TFAutoModel, AutoTokenizer

In [None]:
# Number of training examples per value of the 'language' column.
train_data['language'].value_counts()

In [None]:
# Load the tokenizer for the "albert-base-v2" checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="albert-base-v2"
)

## Load external datasets

In [None]:
!pip install datasets
from datasets import load_dataset

In [None]:
# Keep only the columns the rest of the notebook uses.
# 'lang_abv' must be retained: the translation cells filter rows on it, and
# dropping it here makes those cells fail with an AttributeError.
# .copy() yields an independent frame so later in-place assignments do not
# operate on a slice of the originally loaded data.
train_data = train_data[['premise', 'hypothesis', 'lang_abv', 'label']].copy()

In [None]:
def append_external(dataset_name, base=None):
    """Append the 'train' split of an external dataset to a training frame.

    Parameters
    ----------
    dataset_name : str
        Name passed to ``load_dataset``. Every record of the returned
        ``'train'`` split must provide the keys ``'premise'``,
        ``'hypothesis'`` and ``'label'``.
    base : pandas.DataFrame, optional
        Frame to extend. When omitted, the notebook-level ``train_data``
        is used.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the rows of ``base`` followed by the external
        rows, with a fresh 0..n-1 index. ``base`` is not modified. Columns
        present in ``base`` but not in the external rows are filled with
        NaN for the appended rows.
    """
    if base is None:
        # Only *read* the global here. Assigning to the name `train_data`
        # inside this function would make it local and raise
        # UnboundLocalError on the first read.
        base = train_data

    dataset = load_dataset(dataset_name)
    # NOTE(review): external datasets may mark unlabeled examples with a
    # sentinel label value -- confirm and filter before training.
    rows = [
        (record['premise'], record['hypothesis'], record['label'])
        for record in dataset['train']
    ]
    train_external = pd.DataFrame(rows, columns=['premise', 'hypothesis', 'label'])

    # ignore_index avoids duplicate index labels in the combined frame.
    return pd.concat([base, train_external], ignore_index=True)

In [None]:
# train_data = append_external('xnli')
# train_data = append_external('multi_nli')
# train_data = append_external('snli')

## Translate using the Google Translate API

Since ALBERT is pretrained only on English-language text, we translate all non-English data into English.

In [None]:
# Training rows whose language code is anything other than English ('en').
train_not_eng = train_data.loc[train_data['lang_abv'] != 'en']

In [None]:
# Training rows whose language code is English ('en').
train_eng = train_data.loc[train_data['lang_abv'] == 'en']

In [None]:
def _to_en(text, translator=None):
    """Translate ``text`` into English.

    Parameters
    ----------
    text : str
        Text to translate. Non-string values (e.g. missing values) and
        blank strings are returned unchanged without calling the translator.
    translator : object, optional
        Object exposing ``translate(text, dest=...)`` whose result has a
        ``.text`` attribute. When omitted, a new ``Translator`` is created
        for this call; pass one in to reuse it across many calls.

    Returns
    -------
    str
        The ``.text`` of the translation result, or ``text`` itself when
        there was nothing to translate.
    """
    # Nothing to send to the translation service for missing or blank input.
    if not isinstance(text, str) or not text.strip():
        return text

    if translator is None:
        translator = Translator()
    return translator.translate(text, dest='en').text


# Alias so the helper is also available under the name `to_english`.
to_english = _to_en

In [None]:
# Independent copies of the non-English rows of each split; these copies are
# what gets translated, leaving the source frames untouched until write-back.
train_is_foreign = train_data['lang_abv'] != 'en'
test_is_foreign = test_data['lang_abv'] != 'en'
train_data_not_eng = train_data.loc[train_is_foreign].copy()
test_data_not_eng = test_data.loc[test_is_foreign].copy()

In [None]:
import time
start_time = time.time()

# Translate the premise and hypothesis columns of the non-English rows with
# `_to_en`, the translation helper defined earlier in this notebook.
# Order: train premise, train hypothesis, test premise, test hypothesis.
for frame in (train_data_not_eng, test_data_not_eng):
    for column in ('premise', 'hypothesis'):
        frame[column] = frame[column].apply(_to_en)

# print time taken for translation
print(time.time()-start_time)

Write the translated rows back into the train and test datasets.

In [None]:
# Overwrite the non-English rows of each frame with their translated copies.
# The boolean mask on the left selects the rows where lang_abv is not 'en';
# the frame on the right holds the translated versions of those rows.
train_data[train_data.lang_abv != 'en'] = train_data_not_eng
test_data[test_data.lang_abv != 'en'] = test_data_not_eng

## Save the translations for future use

In [None]:
import pickle

# Serialize both frames to pickle files in the working directory, train first.
for frame, out_path in ((train_data, 'train_data.pickle'),
                        (test_data, 'test_data.pickle')):
    with open(out_path, 'wb') as handle:
        pickle.dump(frame, handle)

## Encode the text

In [None]:
# Every premise/hypothesis pair is truncated or padded to this many tokens.
SEQ_LEN = 256


def albert_encode(df, tokenizer):
    """Tokenize the premise/hypothesis pairs of ``df`` into model inputs.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'premise' and 'hypothesis' columns.
    tokenizer : callable
        Called with the list of premises, the list of hypotheses and the
        encoding options below; its result must be indexable by
        'input_ids', 'attention_mask' and 'token_type_ids'.

    Returns
    -------
    dict
        Exactly the three entries 'input_ids', 'attention_mask' and
        'token_type_ids' taken from the tokenizer result.
    """
    encoded = tokenizer(
        df['premise'].tolist(),
        df['hypothesis'].tolist(),
        max_length=SEQ_LEN,
        truncation=True,
        padding='max_length',
        add_special_tokens=True,
        return_attention_mask=True,
        return_token_type_ids=True,
        return_tensors='tf',
    )
    wanted_keys = ('input_ids', 'attention_mask', 'token_type_ids')
    return {key: encoded[key] for key in wanted_keys}

In [None]:
# Encode the training frame into the model's input dictionary.
train_input = albert_encode(df=train_data, tokenizer=tokenizer)

In [None]:
# Sanity check: decode the first encoded training example back to text.
first_example_ids = train_input['input_ids'][0]
tokenizer.decode(first_example_ids)

## Create the ML model

In [None]:
from tensorflow.keras import regularizers

def build_model(learning_rate=1e-6, model_name="albert-base-v2"):
    """Build and compile a 3-class classifier on top of a pretrained encoder.

    Parameters
    ----------
    learning_rate : float, optional
        Learning rate for the Adam optimizer (default 1e-6).
    model_name : str, optional
        Checkpoint name passed to ``TFAutoModel.from_pretrained``
        (default "albert-base-v2").

    Returns
    -------
    tf.keras.Model
        Compiled model taking the three ``SEQ_LEN``-long int32 inputs
        'input_ids', 'attention_mask' and 'token_type_ids' and producing a
        3-way softmax output. Compiled with sparse categorical
        cross-entropy loss and an accuracy metric.
    """
    input_ids = tf.keras.Input(shape=(SEQ_LEN,), dtype=tf.int32, name="input_ids")
    attention_mask = tf.keras.Input(shape=(SEQ_LEN,), dtype=tf.int32, name="attention_mask")
    token_type_ids = tf.keras.Input(shape=(SEQ_LEN,), dtype=tf.int32, name="token_type_ids")
    inputs = [input_ids, attention_mask, token_type_ids]

    encoder = TFAutoModel.from_pretrained(model_name)
    # First element of the encoder output -- presumably the per-token hidden
    # states; TODO confirm against the transformers documentation.
    embedding = encoder(inputs)[0]
    # Classify from the vector at position 0 along the second axis.
    first_position = embedding[:, 0, :]
    output = tf.keras.layers.Dense(3, activation='softmax')(first_position)

    final_model = tf.keras.Model(inputs=inputs, outputs=output)
    # `learning_rate` is the supported keyword; the `lr` alias is deprecated.
    final_model.compile(
        tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return final_model

In [None]:
# Build the model under the distribution strategy's scope, then print its
# layer summary.
with strategy.scope():
    albert_model = build_model()

albert_model.summary()

In [None]:
# Shape of the training label array (one entry per training row).
label_array = train_data['label'].values
label_array.shape

In [None]:
# Train for 5 epochs in batches of 30, holding out 20% of the data for validation.
albert_model.fit(
    x=train_input,
    y=train_data['label'].values,
    batch_size=30,
    epochs=5,
    validation_split=0.2,
    verbose=1,
)

## Create a submission file

In [None]:
# Encode the test frame the same way as the training frame.
test_input = albert_encode(df=test_data, tokenizer=tokenizer)

In [None]:
# Predicted class per test row: index of the largest model output in that row.
class_scores = albert_model.predict(test_input)
predictions = list(np.argmax(class_scores, axis=1))

In [None]:
# Two-column submission frame: the test ids alongside the predicted class.
submission = pd.DataFrame({
    'id': test_data['id'].copy(),
    'prediction': predictions,
})

submission.head()

In [None]:
# Write the submission file without the index column.
submission.to_csv(path_or_buf="submission.csv", index=False)