I am using the Hugging Face `transformers` library. The particular model I am fine-tuning is [Multilingual DistilBERT](https://huggingface.co/distilbert-base-multilingual-cased).

In [None]:
import pandas as pd

# Load the resampled comments dataset; later cells read its 'trans_comment' and 'tags' columns.
df = pd.read_csv('comments_resampled.csv')

In [None]:
!pip -q install transformers datasets torch tabulate tqdm

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m97.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.6/485.6 kB[0m [31m35.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m24.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m117.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m66.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m23.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from transformers import AutoTokenizer, pipeline, DistilBertTokenizer, DistilBertForSequenceClassification, AdamW, get_linear_schedule_with_warmup

# classifier = pipeline('text-classification', model = 'distilbert-base-multilingual-cased')

We need the tokenizer that matches our DistilBERT model.

In [None]:
# Load the tokenizer that belongs to the 'distilbert-base-multilingual-cased' checkpoint
# (its vocabulary and config files are downloaded on first use).
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-multilingual-cased')

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

We also separate the comment texts and their labels into two arrays, encoding each tag as an integer.

In [None]:
# Raw comment texts, one entry per row of `df`.
comments = df['trans_comment'].values

# Give every distinct tag an integer id, numbered in order of first appearance.
possible_tags = df['tags'].unique()
label_dict = {possible_tag: index for index, possible_tag in enumerate(possible_tags)}

# `map` is a plain dictionary lookup and produces the integer column directly.
# `replace` was being used as a lookup here and depended on pandas silently
# down-casting the replaced object column to integers, which recent pandas
# releases deprecate.
df['num_tags'] = df['tags'].map(label_dict)

# Integer labels aligned with `comments`.
tags = df['num_tags'].values

We import the helper libraries, look at how the tokenizer splits a comment, and then encode the whole dataset.

In [None]:
import numpy as np
import random
import torch
from tabulate import tabulate
from tqdm import trange

In [None]:
def print_rand_sentence():
  '''Print a table of the tokens and matching token IDs for one randomly chosen comment.'''
  picked_comment = comments[random.randint(0, len(comments)-1)]
  # Tokenize once and reuse the pieces for the id lookup.
  pieces = tokenizer.tokenize(picked_comment)
  piece_ids = tokenizer.convert_tokens_to_ids(pieces)
  # Transpose so that each table row is one (token, id) pair.
  rows = np.array([pieces, piece_ids]).T
  print(tabulate(rows,
                 headers = ['Tokens', 'Token IDs'],
                 tablefmt = 'fancy_grid'))

print_rand_sentence()

╒══════════╤═════════════╕
│ Tokens   │   Token IDs │
╞══════════╪═════════════╡
│ अ        │         851 │
├──────────┼─────────────┤
│ ##च्छा    │      102752 │
├──────────┼─────────────┤
│ और       │       10977 │
├──────────┼─────────────┤
│ ज्ञान     │       77342 │
├──────────┼─────────────┤
│ ##वर     │       22568 │
├──────────┼─────────────┤
│ ##्       │       20429 │
├──────────┼─────────────┤
│ ##ध      │       27694 │
├──────────┼─────────────┤
│ ##क      │       12151 │
├──────────┼─────────────┤
│ व        │         895 │
├──────────┼─────────────┤
│ ##ी      │       10914 │
├──────────┼─────────────┤
│ ##ड      │       20691 │
├──────────┼─────────────┤
│ ##ियो    │       43237 │
├──────────┼─────────────┤
│ ,        │         117 │
├──────────┼─────────────┤
│ थे        │       17798 │
├──────────┼─────────────┤
│ ##ंक      │       90696 │
├──────────┼─────────────┤
│ ##्स      │       18869 │
├──────────┼─────────────┤
│ दो       │       29784 │
├──────────┼──────────

In [None]:
# Per-sample accumulators filled by the encoding loop further down in this cell:
# encoded token ids and their attention masks.
token_id, attention_masks = [], []

def preprocessing(input_text, tokenizer, max_length = 32):
  '''
  Encode one text into fixed-length model inputs.

  Args:
    input_text: the text to encode.
    tokenizer: object exposing `encode_plus`, e.g. the DistilBertTokenizer loaded earlier in the notebook.
    max_length: length every encoding is padded or truncated to (default 32).

  Returns:
    The value returned by `tokenizer.encode_plus`. The caller reads these fields from it:
      - input_ids: token ids of the text
      - attention_mask: 0/1 flags specifying which tokens should be considered by the model
        (requested with return_attention_mask = True).
  '''
  return tokenizer.encode_plus(
                        input_text,
                        add_special_tokens = True,
                        max_length = max_length,
                        # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
                        padding = 'max_length',
                        # Request truncation explicitly; without it the tokenizer only warns
                        # and falls back to a default strategy.
                        truncation = True,
                        return_attention_mask = True,
                        return_tensors = 'pt'
                   )


# Encode every comment once, then collect ids and masks into the accumulator lists.
encodings = [preprocessing(sample, tokenizer) for sample in comments]
token_id.extend(encoding['input_ids'] for encoding in encodings)
attention_masks.extend(encoding['attention_mask'] for encoding in encodings)

# Concatenate the per-sample tensors along the first dimension so that each
# row belongs to one comment, and turn the integer tags into a tensor.
token_id = torch.cat(token_id, dim = 0)
attention_masks = torch.cat(attention_masks, dim = 0)
labels = torch.tensor(tags)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
# Peek at the encoded token ids of the sample at index 6 (padding ids fill the tail).
token_id[6]

tensor([   101,    851, 102752,    102,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0])

In [None]:
def print_rand_sentence_encoding():
  '''
  Displays tokens, token IDs and attention mask of a random text sample.

  Reads the module-level `comments`, `token_id`, `attention_masks` and
  `tokenizer`, and prints one table row per encoded position.
  '''
  index = random.randint(0, len(comments) - 1)
  token_ids = token_id[index].tolist()
  attention = attention_masks[index].tolist()
  # Map the ids straight back to vocabulary tokens. Decoding to text and
  # tokenizing that text again is not guaranteed to give the same number of
  # tokens as there are ids, which would leave the columns with different lengths.
  tokens = tokenizer.convert_ids_to_tokens(token_ids)

  # One (token, id, mask) row per position.
  table = list(zip(tokens, token_ids, attention))
  print(tabulate(table,
                 headers = ['Tokens', 'Token IDs', 'Attention Mask'],
                 tablefmt = 'fancy_grid'))

print_rand_sentence_encoding()

╒══════════╤═════════════╤══════════════════╕
│ Tokens   │   Token IDs │   Attention Mask │
╞══════════╪═════════════╪══════════════════╡
│ [CLS]    │         101 │                1 │
├──────────┼─────────────┼──────────────────┤
│ भ        │         888 │                1 │
├──────────┼─────────────┼──────────────────┤
│ ##ै       │       18438 │                1 │
├──────────┼─────────────┼──────────────────┤
│ ##या     │       15168 │                1 │
├──────────┼─────────────┼──────────────────┤
│ प        │         885 │                1 │
├──────────┼─────────────┼──────────────────┤
│ ##ह      │       17110 │                1 │
├──────────┼─────────────┼──────────────────┤
│ ##ल      │       11714 │                1 │
├──────────┼─────────────┼──────────────────┤
│ ##गा     │       43263 │                1 │
├──────────┼─────────────┼──────────────────┤
│ ##म      │       13841 │                1 │
├──────────┼─────────────┼──────────────────┤
│ से        │       11072 │      

We now split our data into training and validation sets.

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

Here we make use of `TensorDataset` and `DataLoader` from PyTorch to make the training part easier.

In [None]:
val_ratio = 0.2
batch_size = 16
# Fixed seed for the split. The notebook's global seeds are only set in the
# training cell, after this split has run, so without this the
# train/validation membership changes from run to run.
split_seed = 42

# Stratified split over row indices so both subsets keep the label proportions.
train_idx, val_idx = train_test_split(
    np.arange(len(labels)),
    test_size = val_ratio,
    shuffle = True,
    stratify = labels,
    random_state = split_seed)

# Each dataset item is (token ids, attention mask, label).
train_set = TensorDataset(token_id[train_idx],
                          attention_masks[train_idx],
                          labels[train_idx])

val_set = TensorDataset(token_id[val_idx],
                        attention_masks[val_idx],
                        labels[val_idx])

# Training batches are drawn in random order.
train_dataloader = DataLoader(
            train_set,
            sampler = RandomSampler(train_set),
            batch_size = batch_size
        )

# Validation batches keep their original order.
validation_dataloader = DataLoader(
            val_set,
            sampler = SequentialSampler(val_set),
            batch_size = batch_size
        )

Metrics are often presented with binary classification in mind. Since we have a multiclass problem, we compute a weighted F1 score and report the accuracy for each class separately.

In [None]:
from sklearn.metrics import f1_score

def f1_score_func(preds, labels):
    """Return the weighted F1 score of the argmax class predictions against `labels`.

    `preds` holds one row of class scores per sample; the predicted class is
    the index of the highest score in each row.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    return f1_score(true_classes, predicted_classes, average='weighted')

def accuracy_per_class(preds, labels):
    """Print, for every class present in `labels`, how many of its samples were predicted correctly.

    `preds` holds one row of class scores per sample. Class names are looked
    up through the module-level `label_dict` (name -> integer id).
    """
    id_to_name = dict(zip(label_dict.values(), label_dict.keys()))

    predicted = np.argmax(preds, axis=1).flatten()
    actual = labels.flatten()

    for class_id in np.unique(actual):
        belongs_to_class = actual == class_id
        n_total = int(np.count_nonzero(belongs_to_class))
        n_correct = int(np.count_nonzero(predicted[belongs_to_class] == class_id))
        print(f'Class: {id_to_name[class_id]}')
        print(f'Accuracy: {n_correct}/{n_total}\n')

We are now ready to create our model.

In [None]:
!pip install --upgrade nvidia-pyindex

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting nvidia-pyindex
  Downloading nvidia-pyindex-1.0.9.tar.gz (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: nvidia-pyindex
  Building wheel for nvidia-pyindex (setup.py) ... [?25l[?25hdone
  Created wheel for nvidia-pyindex: filename=nvidia_pyindex-1.0.9-py3-none-any.whl size=8418 sha256=e58578c2bd80961f6ff232b1feade02d870497dcde0f35e49f2317c821bfcb49
  Stored in directory: /root/.cache/pip/wheels/2c/af/d0/7a12f82cab69f65d51107f48bcd6179e29b9a69a90546332b3
Successfully built nvidia-pyindex
Installing collected packages: nvidia-pyindex
Successfully installed nvidia-pyindex-1.0.9


In [None]:
# Show the GPU attached to this runtime (model, driver/CUDA version, memory in use).
!nvidia-smi

Mon Jun 19 12:12:57 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P8     9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Use the GPU when one is present and fall back to the CPU otherwise,
# instead of failing on an unconditional `.cuda()` call.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Pretrained multilingual DistilBERT with a sequence-classification head that
# has one output per entry of `label_dict`.
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-multilingual-cased',
    num_labels = len(label_dict),
    output_attentions = False,
    output_hidden_states = False,
)

# Move the parameters to the target device before handing them to the optimizer.
model.to(device)

# torch's AdamW replaces the deprecated implementation exported by transformers.
# weight_decay is pinned to 0.0, the default of the transformers implementation,
# because torch's own default is 0.01.
optimizer = torch.optim.AdamW(model.parameters(),
                              lr = 5e-5,
                              eps = 1e-08,
                              weight_decay = 0.0
                              )

# Last expression of the cell: display the model architecture.
model

Downloading model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-multilingual-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weigh

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


We have arrived at the training part.

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
seed_val = 42
epochs = 4

# Seed Python's, NumPy's and torch's (CPU and all CUDA devices) random number generators.
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(seed_val)

# Linear learning-rate schedule without warm-up, spanning every optimizer
# step of training (batches per epoch times number of epochs).
total_training_steps = len(train_dataloader)*epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_training_steps,
)

def evaluate(dataloader_val):
    """Run the module-level `model` over `dataloader_val` without gradient tracking.

    Each batch is expected to provide (input ids, attention mask, labels) in
    that order. Returns a tuple of:
      - the mean of the per-batch losses,
      - the logits of all batches concatenated along axis 0 (NumPy array),
      - the true labels of all batches concatenated along axis 0 (NumPy array).
    """
    model.eval()

    running_loss = 0
    collected_logits = []
    collected_labels = []

    for batch in dataloader_val:
        on_device = tuple(tensor.to(device) for tensor in batch)
        input_ids, attention_mask, label_ids = on_device[0], on_device[1], on_device[2]

        with torch.no_grad():
            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            labels=label_ids)

        # With labels supplied, the first output is the loss and the second the logits.
        running_loss += outputs[0].item()
        collected_logits.append(outputs[1].detach().cpu().numpy())
        collected_labels.append(label_ids.cpu().numpy())

    loss_val_avg = running_loss/len(dataloader_val)

    predictions = np.concatenate(collected_logits, axis=0)
    true_vals = np.concatenate(collected_labels, axis=0)
    return loss_val_avg, predictions, true_vals

# Fine-tune for `epochs` passes over the training data. After every epoch the
# weights are checkpointed and the model is scored on the validation set.
for epoch in tqdm(range(1, epochs+1)):

    model.train()

    loss_train_total = 0

    progress_bar = tqdm(train_dataloader, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        # Clear gradients left over from the previous step.
        model.zero_grad()

        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        outputs = model(**inputs)

        # With labels supplied, the first output is the loss.
        loss = outputs[0]
        batch_loss = loss.item()
        loss_train_total += batch_loss
        loss.backward()

        # Cap the gradient norm at 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        # `batch` is the tuple (ids, mask, labels), so dividing by len(batch)
        # divided the loss by 3, a number unrelated to the batch. Report the
        # same unscaled value that is accumulated into `loss_train_total`.
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(batch_loss)})


    # Checkpoint the weights reached at the end of this epoch.
    torch.save(model.state_dict(), f'finetuned_distilbert-base-multilingual-cased_epoch_{epoch}.model')

    tqdm.write(f'\nEpoch {epoch}')

    loss_train_avg = loss_train_total/len(train_dataloader)
    tqdm.write(f'Training loss: {loss_train_avg}')

    val_loss, predictions, true_vals = evaluate(validation_dataloader)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')


  0%|          | 0/4 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/79 [00:00<?, ?it/s]


Epoch 1
Training loss: 1.5732531351379202
Validation loss: 1.1478660464286805
F1 Score (Weighted): 0.5615956216413254


Epoch 2:   0%|          | 0/79 [00:00<?, ?it/s]


Epoch 2
Training loss: 0.8450646630570858
Validation loss: 0.7857549831271171
F1 Score (Weighted): 0.7002471343162915


Epoch 3:   0%|          | 0/79 [00:00<?, ?it/s]


Epoch 3
Training loss: 0.5043457705008833
Validation loss: 0.7320617973804474
F1 Score (Weighted): 0.7226863170775211


Epoch 4:   0%|          | 0/79 [00:00<?, ?it/s]


Epoch 4
Training loss: 0.3194361387551585
Validation loss: 0.6764917746186256
F1 Score (Weighted): 0.736980637136504


We now check how accurately our model performs on each class of the validation set.

In [None]:
# Checkpoint written by the training loop after the fourth epoch.
checkpoint_path = 'finetuned_distilbert-base-multilingual-cased_epoch_4.model'

# Rebuild the same architecture, then overwrite its weights with the fine-tuned ones.
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-multilingual-cased',
    num_labels=len(label_dict),
    output_attentions=False,
    output_hidden_states=False)

model.to(device)

# The checkpoint tensors are loaded onto the CPU before being applied to the model.
finetuned_weights = torch.load(checkpoint_path,
                               map_location=torch.device('cpu'))
model.load_state_dict(finetuned_weights)

# Score the validation set and print the per-class hit counts.
_, predictions, true_vals = evaluate(validation_dataloader)
accuracy_per_class(predictions, true_vals)

Some weights of the model checkpoint at distilbert-base-multilingual-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weigh

Class: appreciation
Accuracy: 38/52

Class: general
Accuracy: 28/54

Class: informative
Accuracy: 31/53

Class: negative
Accuracy: 0/7

Class: planning
Accuracy: 42/50

Class: personal experience
Accuracy: 47/50

Class: recommendation
Accuracy: 50/50



It's now time to predict using this model.

In [None]:
# Create a small test dataset of unseen comments (English and Hindi).

test = ['Nice sharing video. Well explained pandavas story with Hidumba rakshasa. Fantastic visuals... Natural waterfalls  🪂flying, shops.... Superb 👌👌',
        'Beautiful I’ve been here last week.',
        'बहुत जानकारीपूर्ण वीडियो। बस एक सवाल है कि आप अपने सभी होटल कैसे बुक करते हैं? ऑनलाइन या ऑफलाइन']
test_df = pd.DataFrame({'comments': test})

# Encode all comments together, padded to the longest one. `max_length` only
# takes effect when truncation is requested, so enable it explicitly.
encoded_data_test = tokenizer.batch_encode_plus(
                    test_df['comments'].values.astype(str),
                    add_special_tokens=True,
                    return_attention_mask=True,
                    padding='longest',
                    truncation=True,
                    max_length=256,
                    return_tensors='pt'
                    )
input_ids_test = encoded_data_test['input_ids']
attention_masks_test = encoded_data_test['attention_mask']
dataset_test = TensorDataset(input_ids_test, attention_masks_test)

dataloader_test = DataLoader(dataset_test,
                            sampler=SequentialSampler(dataset_test),
                            )

# Reverse lookup (integer id -> tag name), built once instead of scanning
# the dictionary for every prediction.
id_to_label = {index: tag for tag, index in label_dict.items()}

# Put the model in evaluation mode so dropout is inactive during inference.
model.eval()

pred_label=[]
for batch in dataloader_test:
    batch = tuple(b.to(device) for b in batch)
    inputs = {'input_ids': batch[0],
    'attention_mask': batch[1],
    }
    with torch.no_grad():
        outputs = model(**inputs)
    # Without labels in the inputs, the first output is the logits.
    logits = outputs[0].detach().cpu().numpy()
    # Take the argmax per row so the result stays correct for any batch size,
    # not only for the DataLoader default of one sample per batch.
    pred_label.extend(id_to_label[class_id] for class_id in np.argmax(logits, axis=1).tolist())

test_df['predictions'] = pred_label
test_df

Unnamed: 0,comments,predictions
0,Nice sharing video. Well explained pandavas st...,general
1,Beautiful I’ve been here last week.,personal experience
2,बहुत जानकारीपूर्ण वीडियो। बस एक सवाल है कि आप ...,informative
