In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
# Show the NVIDIA driver / GPU status for this session.
# NOTE(review): the following cells install and use torch_xla and run on an XLA device;
# confirm that a GPU check is still wanted here.
!nvidia-smi


In [2]:
# Download the PyTorch/XLA environment setup script from the pytorch/xla `master` branch,
# then run it, additionally requesting the apt packages libomp5 and libopenblas-dev.
# NOTE(review): the script is fetched from `master` with no pinned revision, so what gets
# installed can change between runs — consider pinning a revision.
!curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
!python pytorch-xla-env-setup.py --apt-packages libomp5 libopenblas-dev

In [3]:
import torch_xla
import torch_xla.core.xla_model as xm

In [4]:
#huggingface libraries 
import transformers
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification
from transformers import AdamW
#torch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, Dataset, TensorDataset

#random libraries
from tqdm import tqdm 
import pandas as pd
import numpy as np
import os
import gc
import random

# Seed PyTorch's global RNG for the set-up cells.
# Note: the RNGs are seeded again (with 1024) in a later cell, right before the training loop.
torch.manual_seed(555)

from sklearn.utils import shuffle
from sklearn.metrics import roc_auc_score, accuracy_score

import warnings
warnings.filterwarnings("ignore")


print(torch.__version__)

In [6]:
# Read the competition's train and test splits into DataFrames.
train_data, test_data = map(
    pd.read_csv,
    (
        "/kaggle/input/contradictory-my-dear-watson/train.csv",
        "/kaggle/input/contradictory-my-dear-watson/test.csv",
    ),
)

# Display the training frame as the cell output.
train_data

In [None]:
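# PRE-PROCESSING THE DATA TYPE
# Alternative model checkpoints, kept commented out (the active `model_name` is set in the next cell):
#model_name = 'bert-base-multilingual-uncased'
#model_name = 'xlm-roberta-base'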

In [7]:
# --- Run configuration -------------------------------------------------------

# Optimisation settings: number of passes over the training set, learning rate
# handed to the optimizer, and the mini-batch size used by both DataLoaders.
NUM_EPOCHS = 3
L_RATE = 1e-5
batch_size = 32

# Every premise/hypothesis pair is padded or truncated to this many tokens.
MAX_LENGTH = 256

# Checkpoint name passed to the tokenizer and model `from_pretrained` calls.
model_name = 'joeddav/xlm-roberta-large-xnli'

# CPU count reported by the OS; shown as the cell output for information.
NUM_CORES = os.cpu_count()

NUM_CORES

In [8]:
# Acquire the XLA device through torch_xla; the model and every batch are moved onto it
# in the cells below.
device = xm.xla_device()

# Show which device was selected.
print(device)

In [9]:
# Load the tokenizer that belongs to the `model_name` checkpoint.
# NOTE(review): `do_lower_case` looks like a carry-over from the BERT tokenizer; confirm
# that XLMRobertaTokenizer actually honours it — TODO verify.
tokenizer=XLMRobertaTokenizer.from_pretrained(model_name, do_lower_case=True)

In [10]:
def preprocess(data1, tokenizer, max_length=None):
    """Tokenize premise/hypothesis pairs into fixed-length tensors.

    Args:
        data1: Table-like object whose ``'premise'`` and ``'hypothesis'`` entries are
            equal-length sequences of text (for example a pandas DataFrame).
        tokenizer: Object exposing ``batch_encode_plus(pairs, **options)`` whose result
            has ``input_ids``, ``attention_mask`` and ``token_type_ids`` attributes.
        max_length: Number of tokens each pair is padded or truncated to. When left as
            ``None`` the notebook-level ``MAX_LENGTH`` constant is used, which keeps
            existing two-argument calls working unchanged.

    Returns:
        A tuple ``(input_ids, attention_masks, token_type_ids)`` of ``torch.LongTensor``
        objects, one row per premise/hypothesis pair.
    """
    if max_length is None:
        # Resolved at call time so a later change to MAX_LENGTH is picked up.
        max_length = MAX_LENGTH

    encode_options = {
        'truncation': True,
        'max_length': max_length,
        # Pad every pair to the same width so the rows stack into one rectangular tensor.
        'padding': 'max_length',
        'return_attention_mask': True,
        'return_token_type_ids': True,
    }

    # Each element is a (premise, hypothesis) tuple, encoded by the tokenizer as one pair.
    pairs = list(zip(data1['premise'], data1['hypothesis']))
    tokenized = tokenizer.batch_encode_plus(pairs, **encode_options)

    return (
        torch.LongTensor(tokenized.input_ids),
        torch.LongTensor(tokenized.attention_mask),
        torch.LongTensor(tokenized.token_type_ids),
    )

In [11]:
# Tokenize every training premise/hypothesis pair.
input_ids, attention_masks, token_type_ids = preprocess(train_data, tokenizer)

# Class labels are stored as an integer (long) tensor of shape (N,). The model is built
# with num_labels=3, and its cross-entropy loss expects integer class indices; a float
# tensor would be treated as a regression / multi-label target instead.
labels = torch.tensor(train_data['label'].to_numpy(), dtype=torch.long)

# Bundle inputs and labels row-wise, then serve them in shuffled mini-batches.
train_dataset_final = TensorDataset(input_ids, attention_masks, token_type_ids, labels)
train_dataloader = DataLoader(
    train_dataset_final,
    sampler=RandomSampler(train_dataset_final),
    batch_size=batch_size,
)

# Display the dataset object as the cell output.
train_dataset_final

In [12]:
# Number of mini-batches the training DataLoader yields per epoch.
print(len(train_dataloader))


In [13]:
# Tokenize the test pairs; the three tensors are kept under their own names as well.
test_tensors = preprocess(test_data, tokenizer)
input_ids_test, attention_masks_test, token_type_ids_test = test_tensors

# No labels here, and the order is kept sequential so predictions line up with the rows.
test_dataset_final = TensorDataset(*test_tensors)
test_dataloader = DataLoader(
    test_dataset_final,
    batch_size=batch_size,
    sampler=SequentialSampler(test_dataset_final),
)

# Number of test mini-batches, shown as the cell output.
len(test_dataloader)

In [14]:
# Load the pretrained sequence-classification checkpoint, configured for 3 output labels.
model = XLMRobertaForSequenceClassification.from_pretrained(model_name, num_labels=3)
# Move the model onto the XLA device. As the last expression of the cell, the value
# returned by `.to()` is also displayed.
model.to(device)

In [None]:
# TESTING MODEL

In [15]:
# Pull a single mini-batch and move its four tensors (ids, mask, token types, labels)
# onto the device for a quick forward-pass check.
batch = next(iter(train_dataloader))
b_input_ids, b_input_mask, b_token_type_ids, b_labels = (
    tensor.to(device) for tensor in batch
)

In [16]:
# Forward the sample batch through the model; labels are supplied along with the inputs.
outputs = model(
    input_ids=b_input_ids,
    attention_mask=b_input_mask,
    token_type_ids=b_token_type_ids,
    labels=b_labels,
)

In [17]:
# Show the first element of the forward-pass result, first as a tensor and then as a
# plain Python number.
first_output = outputs[0]
print(first_output)
print(first_output.item())

In [None]:
# TRAINING MODEL

In [18]:
# AdamW over all model parameters, using the configured learning rate and a small epsilon.
optimizer = AdamW(params=model.parameters(), lr=L_RATE, eps=1e-8)

In [19]:
# Run a garbage-collection pass before training; the call's return value is the cell output.
gc.collect()

In [20]:
# Seed every RNG used here (Python, NumPy, PyTorch CPU, PyTorch CUDA) with one value.
seed_val = 1024

for seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
):
    seed_fn(seed_val)


In [21]:
# Fine-tune the model for NUM_EPOCHS passes over the training DataLoader.
for epoch in range(NUM_EPOCHS):
    model.train()
    torch.set_grad_enabled(True)
    total_train_loss = 0

    # Wrap the DataLoader itself (not the enumerate object) so tqdm can read its length
    # and show a total and an ETA.
    progress = tqdm(train_dataloader, desc=f'epoch {epoch}')
    for i, batch in enumerate(progress):
        # Batch tensors get their own names, so the notebook-level dataset tensors
        # (input_ids, attention_masks, token_type_ids, labels) are not overwritten.
        b_input_ids, b_input_mask, b_token_type_ids, b_labels = (
            tensor.to(device) for tensor in batch
        )

        # One reset per step is enough: the optimizer holds all of model.parameters().
        optimizer.zero_grad()

        outputs = model(
            b_input_ids,
            token_type_ids=b_token_type_ids,
            attention_mask=b_input_mask,
            labels=b_labels,
        )
        # With labels supplied, the first output element is the training loss.
        loss = outputs[0]
        if i % 10 == 0:
            print(f'loss of batch {i}: {loss}')
        total_train_loss += loss.item()

        loss.backward()
        # Cap the global gradient norm at 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        # Step through torch_xla's helper, with the barrier flag set.
        xm.optimizer_step(optimizer, barrier=True)

    print(f'total loss of epoch {epoch}: {total_train_loss}')
    gc.collect()


In [22]:
# Run the fine-tuned model over the test set and collect the per-example scores.
model.eval()

val_pred_batches = []

# Disable autograd only for the duration of inference; the context manager restores the
# previous setting afterwards, so re-running other cells is unaffected.
with torch.no_grad():
    for h_batch in test_dataloader:
        b_input_ids, b_input_mask, b_token_type_ids = (
            tensor.to(device) for tensor in h_batch
        )
        outputs = model(
            b_input_ids,
            token_type_ids=b_token_type_ids,
            attention_mask=b_input_mask,
        )
        # No labels were passed, so the first output element holds the prediction scores.
        val_pred_batches.append(outputs[0].detach().cpu().numpy())

# Stack once at the end instead of re-copying the growing array on every batch.
stacked_val_preds = np.vstack(val_pred_batches)

print('\nPrediction complete.')

In [23]:
# Print the stacked per-example prediction scores collected by the inference loop.
print(stacked_val_preds)


In [24]:
# For every test row, take the index of the highest score as the predicted class.
test_preds = stacked_val_preds.argmax(axis=1)

In [25]:
# Fill the competition's sample submission with the predicted classes and save it.
path = '../input/contradictory-my-dear-watson/sample_submission.csv'

df_sample = pd.read_csv(path)
print(df_sample.shape)

# Fail with a clear message if the predictions do not cover the template row-for-row.
assert len(df_sample) == len(test_preds), (
    f'expected {len(df_sample)} predictions, got {len(test_preds)}'
)
df_sample['prediction'] = test_preds

df_sample.to_csv('submission.csv', index=False)

# Preview the written rows; placed last so the notebook actually displays it.
df_sample.head()