In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import tensorflow as tf
# Pick a tf.distribute strategy: use the TPU when one can be resolved, otherwise
# fall back to TensorFlow's default strategy.
# NOTE(review): tf.distribute.experimental.TPUStrategy is the older, deprecated
# spelling of tf.distribute.TPUStrategy -- consider switching once the minimum
# TensorFlow version of this environment is confirmed.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except ValueError:
    # No TPU could be resolved (TPUClusterResolver signals this with ValueError).
    strategy = tf.distribute.get_strategy() # for CPU and single GPU

# Report the replica count on BOTH paths; previously this line lived inside the
# except branch, so nothing was printed when a TPU was actually in use.
print('Number of replicas:', strategy.num_replicas_in_sync)

In [None]:
# Read the competition's train and test CSV files into DataFrames.
train_data, test_data = [
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/contradictory-my-dear-watson/train.csv",
        "../input/contradictory-my-dear-watson/test.csv",
    )
]

In [None]:
from transformers import TFAutoModel, AutoTokenizer

In [None]:
# Load the tokenizer for the 'xlm-roberta-large' checkpoint; the same checkpoint
# name must be used for the encoder so the vocabulary matches.
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-large')

In [None]:
# Fixed token length: every premise/hypothesis pair is truncated or padded to this size.
SEQ_LEN = 256


def xlm_roberta_encode(df, tokenizer):
    """Tokenize the premise/hypothesis pairs of ``df`` into model inputs.

    Args:
        df: Table with 'premise' and 'hypothesis' columns (a pandas DataFrame
            in this notebook); each column is converted with ``.tolist()``.
        tokenizer: Callable tokenizer invoked on the two text lists. It is
            asked to truncate and pad each pair to ``SEQ_LEN`` tokens, add
            special tokens, and return TensorFlow tensors
            (``return_tensors='tf'``).

    Returns:
        dict with exactly the keys 'input_ids', 'attention_mask' and
        'token_type_ids', taken from the tokenizer's output.
    """
    premises, hypotheses = (df[column].tolist() for column in ('premise', 'hypothesis'))

    encoded = tokenizer(
        premises,
        hypotheses,
        max_length=SEQ_LEN,
        truncation=True,
        padding='max_length',
        add_special_tokens=True,
        return_attention_mask=True,
        return_token_type_ids=True,
        return_tensors='tf',
    )

    # Keep only the three entries the model consumes, in a plain dict.
    model_input_keys = ('input_ids', 'attention_mask', 'token_type_ids')
    return {key: encoded[key] for key in model_input_keys}

In [None]:
# Encode the training premise/hypothesis pairs into model-ready inputs.
train_input = xlm_roberta_encode(df=train_data, tokenizer=tokenizer)

In [None]:
# Sanity check: decode the first encoded training example back into text so the
# pair layout, special tokens and padding can be inspected.
tokenizer.decode(train_input['input_ids'][0])

In [None]:
from tensorflow.keras import regularizers

def build_model(model_name='xlm-roberta-large', num_labels=3, learning_rate=1e-6):
    """Build and compile a Keras classifier on top of a pretrained encoder.

    The network takes three int32 inputs of length ``SEQ_LEN`` ('input_ids',
    'attention_mask', 'token_type_ids'), runs them through the pretrained
    encoder, takes the hidden state at the first token position and feeds it
    to a softmax ``Dense`` layer with ``num_labels`` units.

    Args:
        model_name: Pretrained checkpoint to load. Should match the checkpoint
            used for the tokenizer.
        num_labels: Number of output classes of the softmax head.
        learning_rate: Learning rate passed to the Adam optimizer.

    Returns:
        A compiled ``tf.keras.Model`` using sparse categorical cross-entropy
        loss and an accuracy metric.
    """
    input_ids = tf.keras.Input(shape=(SEQ_LEN,), dtype=tf.int32, name="input_ids")
    attention_mask = tf.keras.Input(shape=(SEQ_LEN,), dtype=tf.int32, name="attention_mask")
    token_type_ids = tf.keras.Input(shape=(SEQ_LEN,), dtype=tf.int32, name="token_type_ids")
    inputs = [input_ids, attention_mask, token_type_ids]

    # Use the TensorFlow encoder class. The previous AutoModelForMaskedLM is the
    # PyTorch masked-LM model: it cannot be called on Keras symbolic inputs, and
    # its first output is vocabulary logits rather than hidden states.
    encoder = TFAutoModel.from_pretrained(model_name)

    # Output [0] of the base encoder is the per-token hidden-state sequence;
    # position 0 is the leading special token, used as the pair representation.
    sequence_output = encoder(inputs)[0]
    pooled = sequence_output[:, 0, :]
    output = tf.keras.layers.Dense(num_labels, activation='softmax')(pooled)

    final_model = tf.keras.Model(inputs=inputs, outputs=output)
    final_model.compile(
        # `learning_rate` replaces the deprecated `lr` keyword.
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return final_model

In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

In [None]:
# Create and compile the model inside the distribution strategy's scope.
with strategy.scope():
    xlm_roberta_model = build_model()

# Printing the layer summary does not need the scope.
xlm_roberta_model.summary()

In [None]:
# Train on the encoded pairs; validation_split=0.2 holds out a fifth of the
# training data for validation.
xlm_roberta_model.fit(
    x=train_input,
    y=train_data['label'].values,
    batch_size=30,
    epochs=5,
    verbose=1,
    validation_split=0.2,
)

In [None]:
# Encode the test premise/hypothesis pairs exactly like the training pairs.
test_input = xlm_roberta_encode(df=test_data, tokenizer=tokenizer)

In [None]:
# Predicted class per test example = index of the highest model output.
# Assumes predict() returns a 2-D (examples, classes) array -- TODO confirm.
test_probabilities = xlm_roberta_model.predict(test_input)
predictions = list(np.argmax(test_probabilities, axis=1))

In [None]:
# Submission table: the test-row ids next to their predicted class.
submission = test_data[['id']].copy()
submission['prediction'] = predictions

submission.head()

In [None]:
# Write the submission file to the working directory, without the index column.
submission.to_csv(path_or_buf="submission.csv", index=False)