In [None]:
import os
# Fail fast with a readable hint when the Colab runtime has no TPU attached.
# .get() is used so that a missing variable trips the assertion (and shows the
# hint below) instead of raising a bare KeyError before the message is reached.
assert os.environ.get('COLAB_TPU_ADDR'), 'Make sure to select TPU from Edit > Notebook settings > Hardware accelerator'

In [None]:
# Environment setup for the Colab TPU runtime.
# Alternative (disabled) install line targeting a torch-neuron build of torch_xla:
# !pip install cloud-tpu-client==0.10 https://pip.repos.neuron.amazonaws.com/torch-xla/torch_xla-1.13.0%2Btorchneuron3-cp39-cp39-linux_x86_64.whl
# Pinned install: torch 2.0.0, torchvision 0.15.1 and the matching torch_xla 2.0 wheel (CPython 3.9).
!pip install cloud-tpu-client==0.10 torch==2.0.0 torchvision==0.15.1 https://storage.googleapis.com/tpu-pytorch/wheels/colab/torch_xla-2.0-cp39-cp39-linux_x86_64.whl

# NOTE(review): the two installs below are unpinned and `-U` asks pip to upgrade torch_xla;
# they may replace the versions pinned above -- confirm this is intended.
!pip install torch torchvision pytorch-lightning
!pip install -U torch_xla
# Hugging Face libraries used for the tokenizer, model and datasets.
!pip install transformers
!pip install datasets
# Translation client used for the back-translation augmentation (pinned pre-release).
!pip install googletrans=='3.1.0-alpha'

# Imports:

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import torchvision
import torch_xla
import torch_xla.core.xla_model as xm
import torch_xla.distributed.parallel_loader as pl
import torch_xla.distributed.xla_multiprocessing as xmp
from torch.utils.data.distributed import DistributedSampler

import json
import time
import pandas as pd
import numpy as np
from tqdm import tqdm
from collections import defaultdict
from torch.optim.lr_scheduler import StepLR

from sklearn.model_selection import StratifiedShuffleSplit, train_test_split, GroupShuffleSplit
from sklearn.metrics import classification_report
from datasets import Dataset
import transformers
from transformers import (
    AutoTokenizer,
    AutoModel,
    get_linear_schedule_with_warmup,
    AutoModelForSequenceClassification,
    XLMRobertaModel, XLMRobertaConfig
)
transformers.logging.set_verbosity_error()

from googletrans import Translator
# Shared googletrans client; the back-translation helper defined later in the
# notebook reads this module-level name.
translator = Translator()

# **Load and Augment Data**

Load the data and augment it by back-translating the non-English samples (original language → English → original language).

In [None]:
# Read the raw premise/hypothesis pairs.
data = pd.read_csv('data.csv')

# Rewrite the 'zh' language code as 'zh-cn' before any translation call.
# NOTE(review): presumably 'zh-cn' is the code the translation client expects -- confirm.
is_chinese = data['lang_abv'] == 'zh'
data.loc[is_chinese, 'lang_abv'] = 'zh-cn'

def translate_back_to_original_language(row, client=None):
    """Back-translate the premise and hypothesis of one row.

    Each text is translated to English and then back to the row's own
    language (``row['lang_abv']``), producing a paraphrase of the original.

    Args:
        row: A pandas Series (one DataFrame row) with at least the keys
            ``'premise'``, ``'hypothesis'`` and ``'lang_abv'``.
        client: Optional object exposing ``translate(text, dest=...)`` whose
            result has a ``.text`` attribute. Defaults to the module-level
            ``translator``.

    Returns:
        A copy of ``row`` whose ``'premise'`` and ``'hypothesis'`` hold the
        back-translated text. If any translation call fails, both fields are
        set to ``None`` and a warning is emitted, so one transient error does
        not abort a long ``apply`` run; such rows can be removed afterwards
        with ``dropna()``.
    """
    import warnings  # local import keeps this cell self-contained

    if client is None:
        client = translator  # module-level googletrans client

    lang_abv = row['lang_abv']
    # Work on a copy: pandas does not support functions that mutate the
    # object passed to them by apply().
    result = row.copy()

    try:
        for column in ('premise', 'hypothesis'):
            # Original language -> English -> original language.
            english_text = client.translate(row[column], dest='en').text
            result[column] = client.translate(english_text, dest=lang_abv).text
    except Exception as exc:  # the client can fail in many ways (network, parsing, ...)
        warnings.warn(
            f"Back-translation failed for language {lang_abv!r}: {exc!r}",
            RuntimeWarning,
        )
        # Blank both fields so a half-translated pair is never kept.
        result['premise'] = None
        result['hypothesis'] = None

    return result

# Register tqdm's progress_apply on pandas objects.
tqdm.pandas()

# Only non-English rows are back-translated (with a progress bar).
non_en_rows = data.loc[data['lang_abv'] != 'en']
translated_rows = non_en_rows.progress_apply(
    translate_back_to_original_language,
    axis=1,
)

# Augmented set = every original row followed by the back-translated copies.
augmented_data = pd.concat((data, translated_rows))


100%|██████████| 5250/5250 [45:33<00:00,  1.92it/s]


# **Tokenize and Prepare Datasets**

Prepare the datasets: split the data into training and validation sets, tokenize each premise–hypothesis pair with the XLM-R tokenizer, and create a dataset for each split.

In [None]:
# load data
# Work on a NaN-free copy so the augmented frame from the previous cell stays untouched.
df = augmented_data.copy().dropna() #pd.read_csv('augmented_data.csv').dropna()

# split to train and val
# The split is made on unique ids, so a row and its back-translated copy
# (which keeps the same id) always end up in the same split.
# random_state pins the shuffle so the split is reproducible across runs.
train_ids, val_ids = train_test_split(df.id.unique(), test_size=0.2, shuffle=True, random_state=42)
train_df = df[df.id.isin(train_ids)]
val_df = df[df.id.isin(val_ids)]

# Wrap each split as a Hugging Face dataset.
train_ds = Dataset.from_pandas(train_df)
val_ds = Dataset.from_pandas(val_df)

# load tokenizer
model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    """Tokenize the premise/hypothesis pairs of a batch with the module-level tokenizer.

    Args:
        batch: Mapping with ``'premise'`` and ``'hypothesis'`` entries, as
            passed by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the sentence pairs, padded to the longest
        pair in the batch and truncated to the tokenizer's maximum length.
    """
    # truncation=True guards against pairs longer than the model can accept;
    # without it an over-long pair would produce an over-long input sequence.
    return tokenizer(batch['premise'], batch['hypothesis'], padding=True, truncation=True)

# Tokenize each split in one pass: batch_size=None hands the whole split to
# `tokenize` as a single batch.
train_ds, val_ds = [
    split.map(tokenize, batched=True, batch_size=None)
    for split in (train_ds, val_ds)
]

# Make both datasets return torch tensors.
for split in (train_ds, val_ds):
    split.set_format("torch")

# Persist the prepared datasets so the training processes can load them from disk.
for split, path in ((train_ds, 'train_dataset.pth'), (val_ds, 'val_dataset.pth')):
    torch.save(split, path)

Downloading (…)lve/main/config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Map:   0%|          | 0/13913 [00:00<?, ? examples/s]

Map:   0%|          | 0/3457 [00:00<?, ? examples/s]

# **Model Definition**

Define the model class: a pretrained XLM-R model as the base, followed by an additional two-layer transformer encoder, a dropout layer, and finally a linear classification layer.

In [None]:
import torch.nn as nn
import torch_xla.core.xla_model as xm
import torch_xla.distributed.parallel_loader as pl
import torch_xla.distributed.xla_multiprocessing as xmp

from transformers import AutoModel, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset

class CustomModel(torch.nn.Module):
    """Pretrained XLM-R encoder with an extra Transformer encoder and a linear head.

    Pipeline: XLM-R -> 2-layer ``TransformerEncoder`` -> hidden state of the
    first token -> dropout -> linear classifier.

    Args:
        num_classes: Number of output classes of the classifier.
    """

    def __init__(self, num_classes):
        super(CustomModel, self).__init__()
        self.xlmr = XLMRobertaModel.from_pretrained('symanto/xlm-roberta-base-snli-mnli-anli-xnli')
        self.num_classes = num_classes
        self.dropout = torch.nn.Dropout(0.5)
        hidden_size = self.xlmr.config.hidden_size
        # batch_first=True: the XLM-R output is laid out as (batch, seq, hidden),
        # while PyTorch's default layout is (seq, batch, hidden). With the default,
        # self-attention would run across the examples of a batch instead of
        # across the tokens of each example.
        self.transformer_layer = torch.nn.TransformerEncoderLayer(
            d_model=hidden_size, nhead=8, batch_first=True
        )
        # TransformerEncoder clones the layer it is given; the attribute above is
        # kept only so existing attribute / state-dict names stay available.
        self.transformer_encoder = torch.nn.TransformerEncoder(self.transformer_layer, num_layers=2)
        self.classifier = torch.nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the model and optionally compute the classification loss.

        Args:
            input_ids: Token ids passed to XLM-R.
            attention_mask: Mask passed to XLM-R (non-zero = real token,
                0 = padding); may be ``None``.
            labels: Optional class indices. When given, a cross-entropy loss
                is returned alongside the logits.

        Returns:
            ``{"loss": ..., "logits": ...}`` when ``labels`` is given,
            otherwise ``{"logits": ...}``.
        """
        outputs = self.xlmr(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = outputs.last_hidden_state
        # The extra encoder must ignore padding too: True marks positions that
        # may not be attended to.
        padding_mask = None if attention_mask is None else attention_mask == 0
        transformer_output = self.transformer_encoder(
            sequence_output, src_key_padding_mask=padding_mask
        )
        # Use the first token's hidden state as the sequence representation.
        pooled_output = transformer_output[:, 0, :]
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        if labels is not None:
            loss_fct = torch.nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_classes), labels.view(-1))
            return {"loss": loss, "logits": logits}
        else:
            return {"logits": logits}



# **Model Finetuning: Training and Evaluation**

The training process is designed to run over multicore TPUs.

In [None]:
from sklearn.metrics import accuracy_score

def compute_metrics(eval_preds):
    """Compute classification accuracy for an evaluation pass.

    Args:
        eval_preds: A pair ``(logits, labels)``.

    Returns:
        A dict with a single ``"accuracy"`` entry.
    """
    logits, labels = eval_preds
    # The predicted class is the index of the largest logit along the last axis.
    predicted_classes = np.argmax(logits, axis=-1)
    return {"accuracy": accuracy_score(labels, predicted_classes)}

def _mp_fn(index, flags):
    """Per-process entry point: fine-tune ``CustomModel`` and save validation predictions.

    Args:
        index: Process index supplied by the spawner (not used in the body).
        flags: Dict of run options; ``flags['tpu_cores']`` is forwarded to
            ``TrainingArguments`` as ``tpu_num_cores``.

    Side effects:
        Loads ``train_dataset.pth`` / ``val_dataset.pth`` from the working
        directory, trains the model, and -- on the master ordinal only --
        writes ``val_predictions.csv``.
    """
    # Build the 3-class model and place it on this process's XLA device.
    net = CustomModel(num_classes=3).to(xm.xla_device())

    # Datasets are read from the .pth files in the working directory.
    train_split = torch.load('train_dataset.pth')
    val_split = torch.load('val_dataset.pth')

    # Hyper-parameters for the Hugging Face Trainer.
    hyperparams = dict(
        output_dir='./results',          # where checkpoints / outputs go
        num_train_epochs=6,              # number of passes over the training set
        per_device_train_batch_size=8,   # training batch size on each device
        per_device_eval_batch_size=8,    # evaluation batch size on each device
        warmup_steps=500,                # LR-scheduler warm-up steps
        weight_decay=0.5,                # weight-decay strength
        learning_rate=2e-5,
        logging_dir='./logs',            # where logs are written
        evaluation_strategy="epoch",     # evaluate at the end of every epoch
        tpu_num_cores=flags['tpu_cores'],
    )

    trainer = Trainer(
        model=net,
        args=TrainingArguments(**hyperparams),
        train_dataset=train_split,
        eval_dataset=val_split,
        compute_metrics=compute_metrics,
    )

    # Fine-tune, then run inference on the validation split.
    trainer.train()
    predicted_labels = np.argmax(trainer.predict(val_split).predictions, axis=-1)

    # Only the master ordinal writes the CSV.
    if not xm.is_master_ordinal():
        return

    pd.DataFrame({
        "id": val_split['id'],
        "label": val_split['label'],
        "language": val_split['language'],
        "prediction": predicted_labels,
    }).to_csv("val_predictions.csv", index=False)


In [None]:
# One training process is started per TPU core; the process count is taken
# from the same flag that is forwarded to the training function.
flags = {'tpu_cores': 8}
xmp.spawn(
    _mp_fn,
    args=(flags,),
    nprocs=flags['tpu_cores'],
    start_method='fork',
)




{'eval_loss': 0.30014169216156006, 'eval_accuracy': 0.924211744286954, 'eval_runtime': 5.4339, 'eval_samples_per_second': 636.192, 'eval_steps_per_second': 10.122, 'epoch': 1.0}
{'eval_loss': 0.3887871205806732, 'eval_accuracy': 0.9207405264680358, 'eval_runtime': 5.2481, 'eval_samples_per_second': 658.72, 'eval_steps_per_second': 10.48, 'epoch': 2.0}
{'loss': 0.392, 'learning_rate': 2e-05, 'epoch': 2.29}
{'eval_loss': 0.4951157867908478, 'eval_accuracy': 0.9132195545270466, 'eval_runtime': 5.2241, 'eval_samples_per_second': 661.74, 'eval_steps_per_second': 10.528, 'epoch': 3.0}
{'eval_loss': 0.553794264793396, 'eval_accuracy': 0.91293028637547, 'eval_runtime': 5.0594, 'eval_samples_per_second': 683.279, 'eval_steps_per_second': 10.871, 'epoch': 4.0}
{'loss': 0.1874, 'learning_rate': 7.6237623762376246e-06, 'epoch': 4.59}
{'eval_loss': 0.5846182703971863, 'eval_accuracy': 0.9169800404975412, 'eval_runtime': 5.2165, 'eval_samples_per_second': 662.699, 'eval_steps_per_second': 10.543, 'e

# **Results**

In [None]:
results = pd.read_csv('val_predictions.csv')

# Boolean series: True where the prediction matches the label.
is_correct = results['prediction'] == results['label']

# Accuracy per language, in percent, rounded to two decimals.
acc_by_lang = is_correct.groupby(results['language']).mean().mul(100).round(2)

print('Accuracy by language:')
print(acc_by_lang)

# Accuracy over all validation rows, in percent, rounded to two decimals.
overall_acc = round(is_correct.mean() * 100, 2)

print('\nOverall accuracy:', overall_acc)

Accuracy by language:
language
Arabic         98.59
Bulgarian      98.00
Chinese        93.51
English        82.60
French         99.34
German         97.69
Greek          98.78
Hindi          98.55
Russian        99.19
Spanish       100.00
Swahili        95.27
Thai           98.73
Turkish        98.63
Urdu           95.24
Vietnamese     98.67
dtype: float64

Overall accuracy: 91.7
