# Installing libraries

In [None]:
!pip install --upgrade pip
##sentencepiece is unsupervised text tokenizer and detokenizer
!pip install sentencepiece
##HuggingFace community-driven open-source library of datasets
!pip install datasets
!pip install transformers

# Training model on custom data

In [None]:
import os

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import classification_report
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import AutoTokenizer

In [None]:
import torch
from transformers import AutoTokenizer

# Create The Dataset Class 
class TheDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one tweet per item, on access.

    Args:
        tweets: Indexable collection of tweets; each item is passed through ``str()``.
        labels: Indexable collection of integer class ids, aligned with ``tweets``.
        tokenizer: Callable Hugging Face-style tokenizer used to encode each tweet.
        max_length: Length every example is padded / truncated to. Defaults to
            512, the value that used to be hard-coded.
    """

    def __init__(self, tweets, labels, tokenizer, max_length=512):
        self.tweets = tweets
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of tweets in the dataset."""
        return len(self.tweets)

    def __getitem__(self, index):
        """Encode the tweet at ``index``.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (first row of the
            tokenizer output) and ``labels`` as a ``torch.long`` tensor.
        """
        tweet = str(self.tweets[index])
        label = self.labels[index]

        # Call the tokenizer directly: `encode_plus` is deprecated in favour of
        # `__call__`, which accepts the same arguments for a single text.
        # For background on attention masks see
        # https://huggingface.co/transformers/v2.11.0/glossary.html#attention-mask
        encoded_tweet = self.tokenizer(
            tweet,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
        )

        # The [0] index takes the first row of each returned tensor so that the
        # DataLoader / Trainer can add the batch dimension itself.
        return {
            'input_ids': encoded_tweet['input_ids'][0],
            'attention_mask': encoded_tweet['attention_mask'][0],
            'labels': torch.tensor(label, dtype=torch.long),
        }

In [None]:
# Checkpoint used as the starting point for transfer learning; many other
# options are available on the Hugging Face Hub.
MODEL = "cardiffnlp/twitter-xlm-roberta-base-sentiment"

# Optimisation hyper-parameters.
EPOCHS = 10       # number of passes over the training set
BATCH_SIZE = 8    # examples per batch
LR = 2e-4         # learning rate

In [None]:
# Tokenizer that belongs to the MODEL checkpoint; use_fast=True requests the
# "fast" tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL,
    use_fast=True,
)

## Data Loading

In [None]:
##relabelling target variable based on classification problem 
def relabel(label):
    """Map a sentiment label to its integer class id.

    'NEGATIVE' -> 0, 'NEUTRAL' -> 1, and every other value -> 2.
    Note that the last case is a catch-all: any label that is not exactly
    'NEGATIVE' or 'NEUTRAL' ends up in class 2.
    """
    if label == 'NEGATIVE':
        return 0
    return 1 if label == 'NEUTRAL' else 2

In [None]:
# Normalise the column types of every split: tweets become strings and the
# textual labels become integer class ids (see relabel). The raw files are read
# from ./finaldata and the cleaned copies are written to ./dataloader.

# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before the first write.
os.makedirs("./dataloader", exist_ok=True)

for split in ['train', 'dev', 'test']:
    df = pd.read_csv(f"./finaldata/{split}.csv")
    df['tweet'] = df.tweet.astype(str)
    df['label'] = df.label.apply(relabel)

    df.to_csv(f"./dataloader/{split}.csv", index=False)

In [None]:
# Read the three prepared splits back from disk.
train_df = pd.read_csv("./dataloader/train.csv")
val_df = pd.read_csv("./dataloader/dev.csv")
test_df = pd.read_csv("./dataloader/test.csv")

# Cast the tweet column to str on every frame (the frames are updated in place).
for frame in (train_df, val_df, test_df):
    frame['tweet'] = frame.tweet.astype(str).values

In [None]:
# Wrap the training and validation frames in TheDataset; each tweet is
# tokenized when its item is requested.
train_dataset = TheDataset(
    train_df["tweet"].tolist(),
    train_df["label"].tolist(),
    tokenizer,
)
val_dataset = TheDataset(
    val_df["tweet"].tolist(),
    val_df["label"].tolist(),
    tokenizer,
)

# Batched loaders over the same datasets.
# NOTE(review): the Trainer further down is handed the datasets directly, so
# these loaders are only needed for a hand-written training / evaluation loop.
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, num_workers=2)
valid_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=BATCH_SIZE, num_workers=2)



## Model training

In [None]:
# Number of target classes; relabel() produces the ids 0, 1 and 2.
num_labels = 3
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL,
    num_labels=num_labels,
)

In [None]:
##defining output metrics
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """Compute accuracy and macro-averaged precision / recall / F1.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and
            ``predictions`` (per-class scores; the arg-max over the last axis
            is taken as the predicted class).

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    macro_precision, macro_recall, macro_f1, _support = precision_recall_fscore_support(
        y_true, y_pred, average='macro'
    )
    return dict(
        accuracy=accuracy_score(y_true, y_pred),
        f1=macro_f1,
        precision=macro_precision,
        recall=macro_recall,
    )

In [None]:
# Configure the Hugging Face Trainer, fine-tune the model on the custom
# dataset, and persist the final weights.
training_args = TrainingArguments(
    output_dir                  = "./sentiment-analysis",
    num_train_epochs            = EPOCHS,
    # LR was defined in the config cell but never handed over, so the Trainer
    # silently fell back to its own default learning rate.
    learning_rate               = LR,
    per_device_train_batch_size = BATCH_SIZE,
    per_device_eval_batch_size  = BATCH_SIZE,
    warmup_steps                = 500,
    weight_decay                = 0.01,
    save_strategy               = "epoch",
    evaluation_strategy         = "steps"
)

trainer = Trainer(
    model           = model,
    args            = training_args,
    train_dataset   = train_dataset,
    eval_dataset    = val_dataset,
    compute_metrics = compute_metrics
)

trainer.train()

# Checkpoints saved during training go into checkpoint-* sub-folders of
# output_dir. Save the final model into output_dir itself so that the ONNX
# section can load it with from_pretrained("./sentiment-analysis").
trainer.save_model()

In [None]:
# # # To share your model on the Hugging Face Hub, log in first and then add the push_to_hub argument as shown below
# # !pip install huggingface_hub
# # from huggingface_hub import notebook_login

# # notebook_login()
# # # training_args = TrainingArguments(
# # #     output_dir                  = "./sentiment-analysis",
# # #     num_train_epochs            = EPOCHS,
# # #     per_device_train_batch_size = BATCH_SIZE,
# # #     per_device_eval_batch_size  = BATCH_SIZE,
# # #     warmup_steps                = 500,
# # #     weight_decay                = 0.01,
# # #     save_strategy               = "epoch",
# # #     evaluation_strategy         = "steps",
# # #     push_to_hub                 = True

# # # )
# trainer.push_to_hub()

In [None]:
##The model can also be pushed to the Hugging Face Hub with a callback; see https://huggingface.co/docs/transformers/model_sharing for more options

In [None]:
##evaluating the trained model on the eval_dataset (val_dataset) given to the Trainer;
##the value returned by evaluate() is displayed as the cell output
trainer.evaluate()

# Convert to onnx

Why use ONNX? ONNX lets models trained in different frameworks run in a common production pipeline, and it can also speed up inference.

In [None]:
##converting to ONNX format to speed up the process
!pip install optimum[onnxruntime]
!pip install transformers[onnx]

In [None]:

import os
from pathlib import Path
import transformers
from transformers.onnx import FeaturesManager
from transformers import AutoConfig, AutoTokenizer, AutoModelForSequenceClassification

In [None]:
# model_dir is the folder the fine-tuned model is loaded from; the exported
# ONNX file is written into this same folder.
model_dir = "./sentiment-analysis"
# Task name used to look up the matching ONNX export configuration.
feature = "sequence-classification"

# XLM-RoBERTa is a multilingual version of RoBERTa. It is pre-trained on 2.5TB
# of filtered CommonCrawl data containing 100 languages.
MODEL = "cardiffnlp/twitter-xlm-roberta-base-sentiment"

# The tokenizer comes from the original Hub checkpoint, while the weights come
# from the local model_dir.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(model_dir, num_labels=3)

In [None]:
# Look up the ONNX configuration class for this model / feature pair and
# instantiate it from the model's own config.
model_kind, model_onnx_config = FeaturesManager.check_supported_model_or_raise(
    model,
    feature=feature,
)
onnx_config = model_onnx_config(model.config)

# Export to <model_dir>/model.onnx using ONNX opset 13.
onnx_inputs, onnx_outputs = transformers.onnx.export(
    preprocessor=tokenizer,
    model=model,
    config=onnx_config,
    opset=13,
    output=Path(model_dir) / "model.onnx",
)


Checking export

In [None]:
# Sanity check of the export: load the ONNX model back from the directory it
# was saved to.
from optimum.onnxruntime import ORTModelForSequenceClassification

model_onnx = ORTModelForSequenceClassification.from_pretrained(
    "./sentiment-analysis",
)