In [3]:
!pip install evaluate



In [4]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import os, re, random, datasets, evaluate

import tensorflow_hub as hub
import tensorflow_text as text

pd.set_option('display.max_colwidth', None)
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer

This notebook is an adapdation of a tutorial that I found [here](https://www.kaggle.com/code/rajkumarl/nlp-tutorial-fine-tuning-with-trainer-api/notebook). I've been looking for tutorials where we fine-tune BERT (and BERT-equivalent) for the purpose of classifying text using **transformers** on Youtube and Kaggle, and I found the one linked above really helpful. This serves as a way to try out and learn: **AutoTokenizer**, **AutoModelForSequenceClassification**, **DataCollatorWithPadding**, **TrainingArguments**, and **Trainer**.

## Import and Preprocess Data Set

In [5]:
train = pd.read_csv('/kaggle/input/contradictory-my-dear-watson/train.csv')
test_df = pd.read_csv('/kaggle/input/contradictory-my-dear-watson/test.csv')

In [6]:
train['text'] = train.hypothesis + " [SEP] " + train.premise
test_df['text'] = test_df.hypothesis + " [SEP] " + test_df.premise

In [7]:
train_df, val_df = np.split(train.sample(frac = 1), [int(0.8 * len(train))])

In [8]:
train_dict = datasets.Dataset.from_dict(train_df.to_dict(orient="list"))
val_dict = datasets.Dataset.from_dict(val_df.to_dict(orient="list"))
test_dict = datasets.Dataset.from_dict(test_df.to_dict(orient="list"))

In [9]:
contradiction_ds = datasets.DatasetDict({"train": train_dict, "val": val_dict, "test": test_dict})

In [10]:
contradiction_ds

DatasetDict({
    train: Dataset({
        features: ['id', 'premise', 'hypothesis', 'lang_abv', 'language', 'label', 'text'],
        num_rows: 9696
    })
    val: Dataset({
        features: ['id', 'premise', 'hypothesis', 'lang_abv', 'language', 'label', 'text'],
        num_rows: 2424
    })
    test: Dataset({
        features: ['id', 'premise', 'hypothesis', 'lang_abv', 'language', 'text'],
        num_rows: 5195
    })
})

In [11]:
model_name = 'symanto/xlm-roberta-base-snli-mnli-anli-xnli'
tokenizer = AutoTokenizer.from_pretrained(model_name)
data_collator = DataCollatorWithPadding(tokenizer = tokenizer)

Downloading (…)okenizer_config.json:   0%|          | 0.00/398 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [12]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels = 3)

Downloading (…)lve/main/config.json:   0%|          | 0.00/921 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

In [13]:
def Tokenize_function(example):
    return tokenizer(example['text'], truncation=True)

tokenized_data = contradiction_ds.map(Tokenize_function, batched=True)

  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

In [14]:
tokenized_data

DatasetDict({
    train: Dataset({
        features: ['id', 'premise', 'hypothesis', 'lang_abv', 'language', 'label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 9696
    })
    val: Dataset({
        features: ['id', 'premise', 'hypothesis', 'lang_abv', 'language', 'label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 2424
    })
    test: Dataset({
        features: ['id', 'premise', 'hypothesis', 'lang_abv', 'language', 'text', 'input_ids', 'attention_mask'],
        num_rows: 5195
    })
})

In [15]:
tokenized_data = tokenized_data.remove_columns(['premise','hypothesis', 'lang_abv', 'language', 'text'])
tokenized_data.with_format('pt')

DatasetDict({
    train: Dataset({
        features: ['id', 'label', 'input_ids', 'attention_mask'],
        num_rows: 9696
    })
    val: Dataset({
        features: ['id', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2424
    })
    test: Dataset({
        features: ['id', 'input_ids', 'attention_mask'],
        num_rows: 5195
    })
})

In [16]:
training_args = TrainingArguments(model_name,  
                                  evaluation_strategy='epoch',
                                  num_train_epochs = 5,
                                  learning_rate = 5e-5,
                                  weight_decay = 0.005,
                                  per_device_train_batch_size = 16,
                                  per_device_eval_batch_size = 16,
                                  report_to = 'none')

In [17]:
def compute_metrics(eval_pred):
    metric = evaluate.load("accuracy")
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [18]:
trainer = Trainer(
    model,
    training_args,
    train_dataset = tokenized_data["train"],
    eval_dataset = tokenized_data["val"],
    data_collator = data_collator,
    tokenizer = tokenizer,
    compute_metrics = compute_metrics
)

In [19]:
trainer.train()

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.429494,0.832096
2,0.508200,0.452934,0.831271
3,0.508200,0.509621,0.842409
4,0.207800,0.721203,0.846947
5,0.080700,0.897081,0.842409


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]



TrainOutput(global_step=1515, training_loss=0.2632287571729213, metrics={'train_runtime': 991.1404, 'train_samples_per_second': 48.913, 'train_steps_per_second': 1.529, 'total_flos': 2424540810597696.0, 'train_loss': 0.2632287571729213, 'epoch': 5.0})

## Prepare for Submission

In [20]:
yhat = trainer.predict(tokenized_data["test"])
preds = np.argmax(yhat.predictions, axis=1)

In [26]:
submission = pd.DataFrame(list(zip(test_df.id, preds)), 
                          columns = ["id", "prediction"])
submission.to_csv("submission.csv", index=False)