**Multilingual BERT**

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m60.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m103.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m27.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1


In [None]:
import sys
import random
import torch
import torch.nn as nn
import shutil
import numpy as np
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, random_split
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
import time
import datetime
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score

In [None]:
if torch.cuda.is_available():
    # torch.cuda.set_device(0)
    device = torch.device('cuda')
    print('Using GPU: ', torch.cuda.current_device())
else:
    device = torch.device('cpu')

seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
print(device)

Using GPU:  0
cuda


In [None]:
import pandas as pd
import csv
import string

In [None]:
df = pd.read_csv('drive/MyDrive/hindi_dataset.tsv',sep='\t')
df

Unnamed: 0,text_id,text,task_1,task_2,task_3
0,hasoc_hi_5556,"बांग्लादेश की शानदार वापसी, भारत को 314 रन पर ...",NOT,NONE,NONE
1,hasoc_hi_5648,सब रंडी नाच देखने मे व्यस्त जैसे ही कोई #शांती...,HOF,PRFN,UNT
2,hasoc_hi_164,तुम जैसे हरामियों के लिए बस जूतों की कमी है शु...,HOF,PRFN,TIN
3,hasoc_hi_3530,"बीजेपी MLA आकाश विजयवर्गीय जेल से रिहा, जमानत ...",NOT,NONE,NONE
4,hasoc_hi_5206,चमकी बुखार: विधानसभा परिसर में आरजेडी का प्रदर...,NOT,NONE,NONE
...,...,...,...,...,...
4660,hasoc_hi_6606,पाकिस्तान ने हिंदुओं के ख़िलाफ़ बोलने वाले को ...,NOT,NONE,NONE
4661,hasoc_hi_4931,कोहली है #नेहरू नहीं जो अंग्रेजों के तलवे चाटन...,HOF,PRFN,TIN
4662,hasoc_hi_1059,परशुराम? वही जिसने अपनी मां की हत्या की थीं?,NOT,NONE,NONE
4663,hasoc_hi_5429,जिस देश में #कन्हैया_कुमार जैसा पढ़ा लिखा युवा...,HOF,HATE,TIN


In [None]:
df.drop(['text_id','task_2','task_3'],inplace=True,axis=1)
df

Unnamed: 0,text,task_1
0,"बांग्लादेश की शानदार वापसी, भारत को 314 रन पर ...",NOT
1,सब रंडी नाच देखने मे व्यस्त जैसे ही कोई #शांती...,HOF
2,तुम जैसे हरामियों के लिए बस जूतों की कमी है शु...,HOF
3,"बीजेपी MLA आकाश विजयवर्गीय जेल से रिहा, जमानत ...",NOT
4,चमकी बुखार: विधानसभा परिसर में आरजेडी का प्रदर...,NOT
...,...,...
4660,पाकिस्तान ने हिंदुओं के ख़िलाफ़ बोलने वाले को ...,NOT
4661,कोहली है #नेहरू नहीं जो अंग्रेजों के तलवे चाटन...,HOF
4662,परशुराम? वही जिसने अपनी मां की हत्या की थीं?,NOT
4663,जिस देश में #कन्हैया_कुमार जैसा पढ़ा लिखा युवा...,HOF


In [None]:
df1 = df.assign(Offensive=0)
df1

Unnamed: 0,text,task_1,Offensive
0,"बांग्लादेश की शानदार वापसी, भारत को 314 रन पर ...",NOT,0
1,सब रंडी नाच देखने मे व्यस्त जैसे ही कोई #शांती...,HOF,0
2,तुम जैसे हरामियों के लिए बस जूतों की कमी है शु...,HOF,0
3,"बीजेपी MLA आकाश विजयवर्गीय जेल से रिहा, जमानत ...",NOT,0
4,चमकी बुखार: विधानसभा परिसर में आरजेडी का प्रदर...,NOT,0
...,...,...,...
4660,पाकिस्तान ने हिंदुओं के ख़िलाफ़ बोलने वाले को ...,NOT,0
4661,कोहली है #नेहरू नहीं जो अंग्रेजों के तलवे चाटन...,HOF,0
4662,परशुराम? वही जिसने अपनी मां की हत्या की थीं?,NOT,0
4663,जिस देश में #कन्हैया_कुमार जैसा पढ़ा लिखा युवा...,HOF,0


In [None]:
train_df = df1.assign(NotOffensive=0)
train_df

Unnamed: 0,text,task_1,Offensive,NotOffensive
0,"बांग्लादेश की शानदार वापसी, भारत को 314 रन पर ...",NOT,0,0
1,सब रंडी नाच देखने मे व्यस्त जैसे ही कोई #शांती...,HOF,0,0
2,तुम जैसे हरामियों के लिए बस जूतों की कमी है शु...,HOF,0,0
3,"बीजेपी MLA आकाश विजयवर्गीय जेल से रिहा, जमानत ...",NOT,0,0
4,चमकी बुखार: विधानसभा परिसर में आरजेडी का प्रदर...,NOT,0,0
...,...,...,...,...
4660,पाकिस्तान ने हिंदुओं के ख़िलाफ़ बोलने वाले को ...,NOT,0,0
4661,कोहली है #नेहरू नहीं जो अंग्रेजों के तलवे चाटन...,HOF,0,0
4662,परशुराम? वही जिसने अपनी मां की हत्या की थीं?,NOT,0,0
4663,जिस देश में #कन्हैया_कुमार जैसा पढ़ा लिखा युवा...,HOF,0,0


In [None]:
for index in train_df.index:
    k = train_df['task_1'][index]
    if k == 'HOF':
        train_df['Offensive'][index] = 1
        train_df['NotOffensive'][index] = 0
    else:
        train_df['Offensive'][index] = 0
        train_df['NotOffensive'][index] = 1
train_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['Offensive'][index] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['NotOffensive'][index] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['Offensive'][index] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['NotOffensive'][index] = 0


Unnamed: 0,text,task_1,Offensive,NotOffensive
0,"बांग्लादेश की शानदार वापसी, भारत को 314 रन पर ...",NOT,0,1
1,सब रंडी नाच देखने मे व्यस्त जैसे ही कोई #शांती...,HOF,1,0
2,तुम जैसे हरामियों के लिए बस जूतों की कमी है शु...,HOF,1,0
3,"बीजेपी MLA आकाश विजयवर्गीय जेल से रिहा, जमानत ...",NOT,0,1
4,चमकी बुखार: विधानसभा परिसर में आरजेडी का प्रदर...,NOT,0,1
...,...,...,...,...
4660,पाकिस्तान ने हिंदुओं के ख़िलाफ़ बोलने वाले को ...,NOT,0,1
4661,कोहली है #नेहरू नहीं जो अंग्रेजों के तलवे चाटन...,HOF,1,0
4662,परशुराम? वही जिसने अपनी मां की हत्या की थीं?,NOT,0,1
4663,जिस देश में #कन्हैया_कुमार जैसा पढ़ा लिखा युवा...,HOF,1,0


In [None]:
train_df.columns

Index(['text', 'task_1', 'Offensive', 'NotOffensive'], dtype='object')

In [None]:
categories = train_df.columns[2:]
counts = []
for category in categories:
    counts.append((category, train_df[category].sum()))
df_stats = pd.DataFrame(counts, columns=['category', 'number of comments'])
df_stats

Unnamed: 0,category,number of comments
0,Offensive,2469
1,NotOffensive,2196


In [None]:
target_list = categories

In [None]:
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-uncased",do_lower_case=True)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

In [None]:
def tokenizeWithBert(example):
  encodings = tokenizer.encode_plus(
    example,
    add_special_tokens = True,   # tokens CLS, PAD, SEP
    max_length = 512, #MAX_LEN
    padding = 'max_length',
    truncation = True,
    return_attention_mask = True,
    return_tensors = 'pt'
  )
  return encodings

In [None]:
def get_dataset(df, tokenizer, mode='train'):
    sentences, labels = df['text'], df.iloc[:,2:].to_numpy()
    max_length = 300
    in_T = []
    in_T_attn_masks = []
    for sentence in sentences:
        enc_sent_dict = tokenizer.encode_plus(
            sentence[:300],
            max_length = max_length,
            add_special_tokens = True,
            pad_to_max_length = True,
            return_attention_mask = True,
            return_tensors = 'pt'
        )
        in_T.append(enc_sent_dict['input_ids'])
        in_T_attn_masks.append(enc_sent_dict['attention_mask'])
    
    in_T = torch.cat(in_T, dim=0)
    in_T_attn_masks = torch.cat(in_T_attn_masks, dim=0)
    labels = torch.tensor(labels, dtype = torch.float32)
    print('Text Input: ' , in_T.shape)
    print('Text Input Attention: ' , in_T_attn_masks.shape)    
    print('Labels: ' , labels.shape)
    
    dataset = TensorDataset(
        in_T,
        in_T_attn_masks,
        labels
    )
    
    train_size = int(0.8 * len(dataset))
    val_size = len(dataset) - train_size
    
    train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
    return train_dataset, val_dataset

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased', do_lower_case=True)
train_dataset, val_dataset = get_dataset(
    train_df,
    tokenizer = tokenizer,
    mode = 'train'
)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Text Input:  torch.Size([4665, 300])
Text Input Attention:  torch.Size([4665, 300])
Labels:  torch.Size([4665, 2])


In [None]:
batch_size = 8
train_dataloader = DataLoader(
    train_dataset,
    batch_size = batch_size,
    sampler = RandomSampler(train_dataset)
)
val_dataloader = DataLoader(
    val_dataset,
    batch_size = batch_size,
    sampler = SequentialSampler(val_dataset)
)

print('Data Ready!!')

Data Ready!!


In [None]:
import torch.nn.functional as F
from transformers import BertModel

class MultiClassClassifier(nn.Module):
    def __init__(self, hidden_dim, num_labels):
        super(MultiClassClassifier, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_labels = num_labels
        
        self.bertmodel = BertModel.from_pretrained('bert-base-multilingual-uncased')
        self.ffn1 = nn.Linear(768, hidden_dim)
        self.dp1 = nn.Dropout()
        self.ffn2 = nn.Linear(hidden_dim, num_labels)
        
    def forward(self, in_T, in_T_attn_masks):
        outputs = self.bertmodel(in_T, in_T_attn_masks)
        x = torch.mean(outputs.last_hidden_state, dim=1)
        x = F.relu(self.ffn1(x))
        x = self.dp1(x)
        x = torch.sigmoid(self.ffn2(x))
        return x

In [None]:
model = MultiClassClassifier(100, 2).to(device) # 100 hidden dimension, 2 lables
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8) # Adam with weight decay
criterion = nn.BCELoss()

In [None]:
#TRAINING and VALIDATION
epochs = 3  
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer,
                                           num_warmup_steps = 0,
                                           num_training_steps = total_steps)


training_stats = []
total_t0 = time.time()

best_val_loss = 1e8
true_labels = val_dataset[:][2].numpy()

for epoch_i in range(0, epochs):

    #############               Training
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()

    total_train_loss = 0
    model.train()

    for step, batch in enumerate(train_dataloader):

        if step % 5 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}. Loss: {:.5f}'.format(step, len(train_dataloader), elapsed, total_train_loss/step))

        b_in_T            = batch[0].to(device)
        b_in_T_attn_masks = batch[1].to(device)
        b_labels          = batch[2].to(device)
        
        model.zero_grad()

        logits = model(b_in_T, b_in_T_attn_masks)
        loss = criterion(logits, b_labels)

        total_train_loss += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader)

    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))

    ##########               Validation
   
    print("")
    print("Running Validation...")

    t0 = time.time()

    model.eval()

    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0

    pred_labels = np.empty((0,2))

    # Evaluate data for one epoch
    for batch in val_dataloader:
        
        b_in_T            = batch[0].to(device)
        b_in_T_attn_masks = batch[1].to(device)
        b_labels          = batch[2].to(device)

        with torch.no_grad():
            logits = model(b_in_T, b_in_T_attn_masks)
            loss = criterion(logits, b_labels)

        # Accumulate the validation loss.
        total_eval_loss += loss.item()

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        pred_labels = np.concatenate((pred_labels, logits), axis=0)


    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(val_dataloader)

    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)

    pred_labels = np.array([[int(x >= 0.25) for x in pred_labels[:,i]] for i  in range(2)]).transpose()

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

#     Report the final accuracy, f1-score for this validation run.
    for i in range(2):
        print("  Accuracy: {0:.2f}".format(accuracy_score(true_labels[:,i], pred_labels[:,i])))

    for i in range(2):
        print("  Macro F1-score: {0:.2f}".format(f1_score(true_labels[:,i], pred_labels[:,i], average='macro')))

    for i in range(2):
        print("  Weighted F1-score: {0:.2f}".format(f1_score(true_labels[:,i], pred_labels[:,i], average='weighted')))

    print('Classification Report:')
    for i in range(2):
        print(classification_report(true_labels[:,i], pred_labels[:,i]))

    print('Confusion Matrix:')
    for i in range(2):
        print(confusion_matrix(true_labels[:,i], pred_labels[:,i]))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'training_loss': avg_train_loss,
            'val_loss': avg_val_loss,
            'val_accuracy': np.mean([accuracy_score(true_labels[:,i], pred_labels[:,i]) for i in range(2)]),
            'val_macro_f1': np.mean([f1_score(true_labels[:,i], pred_labels[:,i], average='macro') for i in range(2)]),
            'val_weighted_f1': np.mean([f1_score(true_labels[:,i], pred_labels[:,i], average='weighted') for i in range(2)]),
            'training_time': training_time,
            'val_tim': validation_time
        }
    )

    model_path = 'model_state_dict_'+str(epoch_i)+'.pt'
    torch.save(model.state_dict(), model_path)


Training...
  Batch     5  of    467.    Elapsed: 0:00:02. Loss: 1.21615
  Batch    10  of    467.    Elapsed: 0:00:05. Loss: 0.95545
  Batch    15  of    467.    Elapsed: 0:00:07. Loss: 0.86350
  Batch    20  of    467.    Elapsed: 0:00:09. Loss: 0.80869
  Batch    25  of    467.    Elapsed: 0:00:11. Loss: 0.77111
  Batch    30  of    467.    Elapsed: 0:00:14. Loss: 0.72568
  Batch    35  of    467.    Elapsed: 0:00:16. Loss: 0.69815
  Batch    40  of    467.    Elapsed: 0:00:18. Loss: 0.67549
  Batch    45  of    467.    Elapsed: 0:00:21. Loss: 0.64542
  Batch    50  of    467.    Elapsed: 0:00:23. Loss: 0.63418
  Batch    55  of    467.    Elapsed: 0:00:25. Loss: 0.62850
  Batch    60  of    467.    Elapsed: 0:00:28. Loss: 0.61370
  Batch    65  of    467.    Elapsed: 0:00:30. Loss: 0.61986
  Batch    70  of    467.    Elapsed: 0:00:32. Loss: 0.61298
  Batch    75  of    467.    Elapsed: 0:00:35. Loss: 0.61912
  Batch    80  of    467.    Elapsed: 0:00:37. Loss: 0.60529
  Batch    

In [None]:
model_path = 'model_state_dict.pt'
torch.save(model.state_dict(), model_path)

In [None]:
modelPathDrive = '/content/drive/MyDrive/mBert.pt'
torch.save(model.state_dict(), modelPathDrive)

In [None]:
state_dict = torch.load('/content/drive/MyDrive/mBert.pt')
print(state_dict.keys())

odict_keys(['bertmodel.embeddings.position_ids', 'bertmodel.embeddings.word_embeddings.weight', 'bertmodel.embeddings.position_embeddings.weight', 'bertmodel.embeddings.token_type_embeddings.weight', 'bertmodel.embeddings.LayerNorm.weight', 'bertmodel.embeddings.LayerNorm.bias', 'bertmodel.encoder.layer.0.attention.self.query.weight', 'bertmodel.encoder.layer.0.attention.self.query.bias', 'bertmodel.encoder.layer.0.attention.self.key.weight', 'bertmodel.encoder.layer.0.attention.self.key.bias', 'bertmodel.encoder.layer.0.attention.self.value.weight', 'bertmodel.encoder.layer.0.attention.self.value.bias', 'bertmodel.encoder.layer.0.attention.output.dense.weight', 'bertmodel.encoder.layer.0.attention.output.dense.bias', 'bertmodel.encoder.layer.0.attention.output.LayerNorm.weight', 'bertmodel.encoder.layer.0.attention.output.LayerNorm.bias', 'bertmodel.encoder.layer.0.intermediate.dense.weight', 'bertmodel.encoder.layer.0.intermediate.dense.bias', 'bertmodel.encoder.layer.0.output.dense.

**Indic BERT**

In [None]:
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


In [None]:
import sys
import random
import torch
import torch.nn as nn
import shutil
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, random_split
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from transformers import AutoModel, AutoTokenizer
import time
import datetime
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score

In [None]:
train_df = pd.read_csv('drive/MyDrive/final_hindi_backtranslated.csv')
train_df

Unnamed: 0,text,task_1,Offensive,NotOffensive
0,"बांग्लादेश की शानदार वापसी, भारत को 314 रन पर ...",NOT,0,1
1,सब रंडी नाच देखने मे व्यस्त जैसे ही कोई #शांती...,HOF,1,0
2,तुम जैसे हरामियों के लिए बस जूतों की कमी है शु...,HOF,1,0
3,"बीजेपी MLA आकाश विजयवर्गीय जेल से रिहा, जमानत ...",NOT,0,1
4,चमकी बुखार: विधानसभा परिसर में आरजेडी का प्रदर...,NOT,0,1
...,...,...,...,...
6060,कश्मीर भगवान से है,HOF,1,0
6061,अंग्रेजों के खिलाफ पहला संगठित संघर्ष रानी लक्...,NOT,0,1
6062,ऑटो से मोबाइल तक टेक की 5 बड़ी खबरें जानें ...,NOT,0,1
6063,90 % लोग पहले से ही जानते थे कि भारत मैच हार ज...,HOF,1,0


In [None]:
tokenizer = AutoTokenizer.from_pretrained('ai4bharat/indic-bert')

Downloading (…)lve/main/config.json:   0%|          | 0.00/507 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/5.65M [00:00<?, ?B/s]

In [None]:
def tokenizeWithBert(example):
  encodings = tokenizer.encode_plus(
    example,
    add_special_tokens = True,   # tokens CLS, PAD, SEP
    max_length = 512, #MAX_LEN
    padding = 'max_length',
    truncation = True,
    return_attention_mask = True,
    return_tensors = 'pt'
  )
  return encodings

In [None]:
def get_dataset(df, tokenizer, mode='train'):
    sentences, labels = df['text'], df.iloc[:,2:].to_numpy()
    max_length = 300
    in_T = []
    in_T_attn_masks = []
    for sentence in sentences:
        enc_sent_dict = tokenizer.encode_plus(
            sentence[:300],
            max_length = max_length,
            add_special_tokens = True,
            pad_to_max_length = True,
            return_attention_mask = True,
            return_tensors = 'pt'
        )
        in_T.append(enc_sent_dict['input_ids'])
        in_T_attn_masks.append(enc_sent_dict['attention_mask'])
    
    in_T = torch.cat(in_T, dim=0)
    in_T_attn_masks = torch.cat(in_T_attn_masks, dim=0)
    labels = torch.tensor(labels, dtype = torch.float32)
    print('Text Input: ' , in_T.shape)
    print('Text Input Attention: ' , in_T_attn_masks.shape)    
    print('Labels: ' , labels.shape)
    
    dataset = TensorDataset(
        in_T,
        in_T_attn_masks,
        labels
    )
    
    train_size = int(0.8 * len(dataset))
    val_size = len(dataset) - train_size
    
    train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
    return train_dataset, val_dataset

In [None]:
tokenizer = AutoTokenizer.from_pretrained('ai4bharat/indic-bert')
train_dataset, val_dataset = get_dataset(
    train_df,
    tokenizer = tokenizer,
    mode = 'train'
)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Text Input:  torch.Size([6065, 300])
Text Input Attention:  torch.Size([6065, 300])
Labels:  torch.Size([6065, 2])


In [None]:
batch_size = 8
train_dataloader = DataLoader(
    train_dataset,
    batch_size = batch_size,
    sampler = RandomSampler(train_dataset)
)
val_dataloader = DataLoader(
    val_dataset,
    batch_size = batch_size,
    sampler = SequentialSampler(val_dataset)
)

print('Data Ready!!')

Data Ready!!


In [None]:
import torch.nn.functional as F
from transformers import BertModel

class MultiClassClassifier(nn.Module):
    def __init__(self, hidden_dim, num_labels):
        super(MultiClassClassifier, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_labels = num_labels
        
        self.bertmodel = AutoModel.from_pretrained('ai4bharat/indic-bert')
        self.ffn1 = nn.Linear(768, hidden_dim)
        self.dp1 = nn.Dropout()
        self.ffn2 = nn.Linear(hidden_dim, num_labels)
        
    def forward(self, in_T, in_T_attn_masks):
        outputs = self.bertmodel(in_T, in_T_attn_masks)
        x = torch.mean(outputs.last_hidden_state, dim=1) 
        x = F.relu(self.ffn1(x))
        x = self.dp1(x)
        x = torch.sigmoid(self.ffn2(x))
        return x

In [None]:
model = MultiClassClassifier(100, 2).to(device) # 100 hidden dimension, 2 lables
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8) # Adam with weight decay
criterion = nn.BCELoss()

Downloading pytorch_model.bin:   0%|          | 0.00/135M [00:00<?, ?B/s]

Some weights of the model checkpoint at ai4bharat/indic-bert were not used when initializing AlbertModel: ['predictions.LayerNorm.weight', 'predictions.decoder.weight', 'predictions.bias', 'sop_classifier.classifier.bias', 'predictions.LayerNorm.bias', 'predictions.decoder.bias', 'sop_classifier.classifier.weight', 'predictions.dense.bias', 'predictions.dense.weight']
- This IS expected if you are initializing AlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
import datetime

def format_time(elapsed):
    elapsed_rounded = int(round((elapsed)))
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [None]:
epochs = 3   
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer,
                                           num_warmup_steps = 0,
                                           num_training_steps = total_steps)


training_stats = []
total_t0 = time.time()

best_val_loss = 1e8
true_labels = val_dataset[:][2].numpy()

for epoch_i in range(0, epochs):

    #############               Training
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()

    total_train_loss = 0
    model.train()

    for step, batch in enumerate(train_dataloader):

        if step % 5 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}. Loss: {:.5f}'.format(step, len(train_dataloader), elapsed, total_train_loss/step))

        b_in_T            = batch[0].to(device)
        b_in_T_attn_masks = batch[1].to(device)
        b_labels          = batch[2].to(device)
        
        model.zero_grad()

        logits = model(b_in_T, b_in_T_attn_masks)
        loss = criterion(logits, b_labels)

        total_train_loss += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader)

    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))

    ##########               Validation
   
    print("")
    print("Running Validation...")

    t0 = time.time()

    model.eval()

    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0

    pred_labels = np.empty((0,2))

    # Evaluate data for one epoch
    for batch in val_dataloader:
        
        b_in_T            = batch[0].to(device)
        b_in_T_attn_masks = batch[1].to(device)
        b_labels          = batch[2].to(device)

        with torch.no_grad():
            logits = model(b_in_T, b_in_T_attn_masks)
            loss = criterion(logits, b_labels)

        # Accumulate the validation loss.
        total_eval_loss += loss.item()

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        pred_labels = np.concatenate((pred_labels, logits), axis=0)


    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(val_dataloader)

    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)

    pred_labels = np.array([[int(x >= 0.25) for x in pred_labels[:,i]] for i  in range(2)]).transpose()

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

#     Report the final accuracy, f1-score for this validation run.
    for i in range(2):
        print("  Accuracy: {0:.2f}".format(accuracy_score(true_labels[:,i], pred_labels[:,i])))

    for i in range(2):
        print("  Macro F1-score: {0:.2f}".format(f1_score(true_labels[:,i], pred_labels[:,i], average='macro')))

    for i in range(2):
        print("  Weighted F1-score: {0:.2f}".format(f1_score(true_labels[:,i], pred_labels[:,i], average='weighted')))

    print('Classification Report:')
    for i in range(2):
        print(classification_report(true_labels[:,i], pred_labels[:,i]))

    print('Confusion Matrix:')
    for i in range(2):
        print(confusion_matrix(true_labels[:,i], pred_labels[:,i]))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'training_loss': avg_train_loss,
            'val_loss': avg_val_loss,
            'val_accuracy': np.mean([accuracy_score(true_labels[:,i], pred_labels[:,i]) for i in range(2)]),
            'val_macro_f1': np.mean([f1_score(true_labels[:,i], pred_labels[:,i], average='macro') for i in range(2)]),
            'val_weighted_f1': np.mean([f1_score(true_labels[:,i], pred_labels[:,i], average='weighted') for i in range(2)]),
            'training_time': training_time,
            'val_tim': validation_time
        }
    )

    model_path = 'model_state_dict_'+str(epoch_i)+'.pt'
    torch.save(model.state_dict(), model_path)


Training...
  Batch     5  of    607.    Elapsed: 0:00:02. Loss: 0.69297
  Batch    10  of    607.    Elapsed: 0:00:04. Loss: 0.69223
  Batch    15  of    607.    Elapsed: 0:00:06. Loss: 0.68966
  Batch    20  of    607.    Elapsed: 0:00:08. Loss: 0.68973
  Batch    25  of    607.    Elapsed: 0:00:10. Loss: 0.69051
  Batch    30  of    607.    Elapsed: 0:00:12. Loss: 0.68815
  Batch    35  of    607.    Elapsed: 0:00:14. Loss: 0.68645
  Batch    40  of    607.    Elapsed: 0:00:16. Loss: 0.68501
  Batch    45  of    607.    Elapsed: 0:00:17. Loss: 0.68584
  Batch    50  of    607.    Elapsed: 0:00:19. Loss: 0.68466
  Batch    55  of    607.    Elapsed: 0:00:21. Loss: 0.68348
  Batch    60  of    607.    Elapsed: 0:00:23. Loss: 0.68295
  Batch    65  of    607.    Elapsed: 0:00:25. Loss: 0.68126
  Batch    70  of    607.    Elapsed: 0:00:27. Loss: 0.68186
  Batch    75  of    607.    Elapsed: 0:00:29. Loss: 0.67958
  Batch    80  of    607.    Elapsed: 0:00:31. Loss: 0.67817
  Batch    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Training...
  Batch     5  of    607.    Elapsed: 0:00:02. Loss: 0.54910
  Batch    10  of    607.    Elapsed: 0:00:04. Loss: 0.55483
  Batch    15  of    607.    Elapsed: 0:00:07. Loss: 0.55794
  Batch    20  of    607.    Elapsed: 0:00:09. Loss: 0.54812
  Batch    25  of    607.    Elapsed: 0:00:11. Loss: 0.56313
  Batch    30  of    607.    Elapsed: 0:00:13. Loss: 0.56686
  Batch    35  of    607.    Elapsed: 0:00:15. Loss: 0.56733
  Batch    40  of    607.    Elapsed: 0:00:18. Loss: 0.55898
  Batch    45  of    607.    Elapsed: 0:00:20. Loss: 0.55664
  Batch    50  of    607.    Elapsed: 0:00:22. Loss: 0.55361
  Batch    55  of    607.    Elapsed: 0:00:24. Loss: 0.55887
  Batch    60  of    607.    Elapsed: 0:00:26. Loss: 0.56403
  Batch    65  of    607.    Elapsed: 0:00:29. Loss: 0.56281
  Batch    70  of    607.    Elapsed: 0:00:31. Loss: 0.56631
  Batch    75  of    607.    Elapsed: 0:00:33. Loss: 0.56677
  Batch    80  of    607.    Elapsed: 0:00:35. Loss: 0.56434
  Batch    

**Back translation**

In [None]:
!pip install googletrans==4.0.0-rc1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting googletrans==4.0.0-rc1
  Downloading googletrans-4.0.0rc1.tar.gz (20 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting httpx==0.13.3
  Downloading httpx-0.13.3-py3-none-any.whl (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.1/55.1 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting httpcore==0.9.*
  Downloading httpcore-0.9.1-py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.6/42.6 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting chardet==3.*
  Downloading chardet-3.0.4-py2.py3-none-any.whl (133 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.4/133.4 kB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting hstspreload
  Downloading hstspreload-2023.1.1-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [

In [None]:
from googletrans import Translator
import pandas as pd
import httpx

In [None]:
batch_1 = train_df['text'][:100]
timeout = httpx.Timeout(15)
translator = Translator(service_urls=['translate.google.com'],timeout=timeout)

# Back translate the Hindi tweets to English and then back to Hindi
back_translated = []
for text in batch_1:
    hindi_tweet = text
    english_translation = translator.translate(hindi_tweet, src='hi', dest='en').text
    back_translation = translator.translate(english_translation, src='en', dest='hi').text
    back_translated.append(back_translation)


In [None]:
len(back_translated)

100

In [None]:
batch_2 = train_df['text'][100:300]
timeout = httpx.Timeout(15)
translator = Translator(service_urls=['translate.google.com'],timeout=timeout)

# Back translate the Hindi tweets to English and then back to Hindi
for text in batch_2:
    hindi_tweet = text
    english_translation = translator.translate(hindi_tweet, src='hi', dest='en').text
    back_translation = translator.translate(english_translation, src='en', dest='hi').text
    back_translated.append(back_translation)

In [None]:
len(back_translated)

300

In [None]:
batch_3 = train_df['text'][300:500]
timeout = httpx.Timeout(15)
translator = Translator(service_urls=['translate.google.com'],timeout=timeout)

# Back translate the Hindi tweets to English and then back to Hindi
for text in batch_3:
    hindi_tweet = text
    english_translation = translator.translate(hindi_tweet, src='hi', dest='en').text
    back_translation = translator.translate(english_translation, src='en', dest='hi').text
    back_translated.append(back_translation)

In [None]:
batch_8 = train_df['text'][1400:2000]
timeout = httpx.Timeout(15)
translator = Translator(service_urls=['translate.google.com'],timeout=timeout)
back_translated = []
# Back translate the Hindi tweets to English and then back to Hindi
for text in batch_8:
    hindi_tweet = text
    english_translation = translator.translate(hindi_tweet, src='hi', dest='en').text
    back_translation = translator.translate(english_translation, src='en', dest='hi').text
    back_translated.append(back_translation)

In [None]:
len(back_translated)

500

In [None]:
batch_L1 = train_df['task_1'][:1400]
batch_L1

0       NOT
1       HOF
2       HOF
3       NOT
4       NOT
       ... 
1395    HOF
1396    NOT
1397    NOT
1398    HOF
1399    NOT
Name: task_1, Length: 1400, dtype: object

In [None]:
batch_L2 = batch_L1.append(train_df['task_1'][:1400])
batch_L2

  batch_L2 = batch_L1.append(train_df['task_1'][:1400])


0       NOT
1       HOF
2       HOF
3       NOT
4       NOT
       ... 
1395    HOF
1396    NOT
1397    NOT
1398    HOF
1399    NOT
Name: task_1, Length: 2800, dtype: object

In [None]:
back_translated = pd.read_csv('drive/MyDrive/backoutput.csv',sep='\t')
back_translated['task_1'] = batch_L1
back_translated

Unnamed: 0,0,task_1
0,"बांग्लादेश की शानदार वापसी, भारत 314 रन के लिए...",NOT
1,जैसे ही कोई वेश्या के नृत्य को देखने में व्यस्...,HOF
2,"आप जैसे हरविस के लिए जूतों की कमी है, धन्यवाद,...",HOF
3,"भाजपा के विधायक आकाश विजयवर्गिया, जेल से रिहा,...",NOT
4,शाइनिंग बुखार: विधानसभा परिसर में आरजेडी प्रदर...,NOT
...,...,...
1395,कश्मीर भगवान से है,HOF
1396,अंग्रेजों के खिलाफ पहला संगठित संघर्ष रानी लक्...,NOT
1397,ऑटो से मोबाइल तक टेक की 5 बड़ी खबरें जानें ...,NOT
1398,90 % लोग पहले से ही जानते थे कि भारत मैच हार ज...,HOF


In [None]:
dfb1 = back_translated.assign(Offensive=0)
dfb1

Unnamed: 0,0,task_1,Offensive
0,"बांग्लादेश की शानदार वापसी, भारत 314 रन के लिए...",NOT,0
1,जैसे ही कोई वेश्या के नृत्य को देखने में व्यस्...,HOF,0
2,"आप जैसे हरविस के लिए जूतों की कमी है, धन्यवाद,...",HOF,0
3,"भाजपा के विधायक आकाश विजयवर्गिया, जेल से रिहा,...",NOT,0
4,शाइनिंग बुखार: विधानसभा परिसर में आरजेडी प्रदर...,NOT,0
...,...,...,...
1395,कश्मीर भगवान से है,HOF,0
1396,अंग्रेजों के खिलाफ पहला संगठित संघर्ष रानी लक्...,NOT,0
1397,ऑटो से मोबाइल तक टेक की 5 बड़ी खबरें जानें ...,NOT,0
1398,90 % लोग पहले से ही जानते थे कि भारत मैच हार ज...,HOF,0


In [None]:
train_dfb = dfb1.assign(NotOffensive=0)
train_dfb

Unnamed: 0,0,task_1,Offensive,NotOffensive
0,"बांग्लादेश की शानदार वापसी, भारत 314 रन के लिए...",NOT,0,0
1,जैसे ही कोई वेश्या के नृत्य को देखने में व्यस्...,HOF,0,0
2,"आप जैसे हरविस के लिए जूतों की कमी है, धन्यवाद,...",HOF,0,0
3,"भाजपा के विधायक आकाश विजयवर्गिया, जेल से रिहा,...",NOT,0,0
4,शाइनिंग बुखार: विधानसभा परिसर में आरजेडी प्रदर...,NOT,0,0
...,...,...,...,...
1395,कश्मीर भगवान से है,HOF,0,0
1396,अंग्रेजों के खिलाफ पहला संगठित संघर्ष रानी लक्...,NOT,0,0
1397,ऑटो से मोबाइल तक टेक की 5 बड़ी खबरें जानें ...,NOT,0,0
1398,90 % लोग पहले से ही जानते थे कि भारत मैच हार ज...,HOF,0,0


In [None]:
for index in train_dfb.index:
    k = train_dfb['task_1'][index]
    if k == 'HOF':
        train_dfb['Offensive'][index] = 1
        train_dfb['NotOffensive'][index] = 0
    else:
        train_dfb['Offensive'][index] = 0
        train_dfb['NotOffensive'][index] = 1
train_dfb

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_dfb['Offensive'][index] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_dfb['NotOffensive'][index] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_dfb['Offensive'][index] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_dfb['NotOffensive'][index] = 0


Unnamed: 0,0,task_1,Offensive,NotOffensive
0,"बांग्लादेश की शानदार वापसी, भारत 314 रन के लिए...",NOT,0,1
1,जैसे ही कोई वेश्या के नृत्य को देखने में व्यस्...,HOF,1,0
2,"आप जैसे हरविस के लिए जूतों की कमी है, धन्यवाद,...",HOF,1,0
3,"भाजपा के विधायक आकाश विजयवर्गिया, जेल से रिहा,...",NOT,0,1
4,शाइनिंग बुखार: विधानसभा परिसर में आरजेडी प्रदर...,NOT,0,1
...,...,...,...,...
1395,कश्मीर भगवान से है,HOF,1,0
1396,अंग्रेजों के खिलाफ पहला संगठित संघर्ष रानी लक्...,NOT,0,1
1397,ऑटो से मोबाइल तक टेक की 5 बड़ी खबरें जानें ...,NOT,0,1
1398,90 % लोग पहले से ही जानते थे कि भारत मैच हार ज...,HOF,1,0


In [None]:
train_dfb.columns = ['text','task_1','Offensive','NotOffensive']
train_dfb

Unnamed: 0,text,task_1,Offensive,NotOffensive
0,"बांग्लादेश की शानदार वापसी, भारत 314 रन के लिए...",NOT,0,1
1,जैसे ही कोई वेश्या के नृत्य को देखने में व्यस्...,HOF,1,0
2,"आप जैसे हरविस के लिए जूतों की कमी है, धन्यवाद,...",HOF,1,0
3,"भाजपा के विधायक आकाश विजयवर्गिया, जेल से रिहा,...",NOT,0,1
4,शाइनिंग बुखार: विधानसभा परिसर में आरजेडी प्रदर...,NOT,0,1
...,...,...,...,...
1395,कश्मीर भगवान से है,HOF,1,0
1396,अंग्रेजों के खिलाफ पहला संगठित संघर्ष रानी लक्...,NOT,0,1
1397,ऑटो से मोबाइल तक टेक की 5 बड़ी खबरें जानें ...,NOT,0,1
1398,90 % लोग पहले से ही जानते थे कि भारत मैच हार ज...,HOF,1,0


In [None]:
frames = [train_df, train_dfb]
final_train_df = pd.concat(frames)
final_train_df
final_train_df.to_csv('drive/MyDrive/final_hindi_backtranslated.csv',index = False)

In [None]:
final_train_df = pd.read_csv('drive/MyDrive/final_hindi_backtranslated.csv')
final_train_df

Unnamed: 0,text,task_1,Offensive,NotOffensive
0,"बांग्लादेश की शानदार वापसी, भारत को 314 रन पर ...",NOT,0,1
1,सब रंडी नाच देखने मे व्यस्त जैसे ही कोई #शांती...,HOF,1,0
2,तुम जैसे हरामियों के लिए बस जूतों की कमी है शु...,HOF,1,0
3,"बीजेपी MLA आकाश विजयवर्गीय जेल से रिहा, जमानत ...",NOT,0,1
4,चमकी बुखार: विधानसभा परिसर में आरजेडी का प्रदर...,NOT,0,1
...,...,...,...,...
6060,कश्मीर भगवान से है,HOF,1,0
6061,अंग्रेजों के खिलाफ पहला संगठित संघर्ष रानी लक्...,NOT,0,1
6062,ऑटो से मोबाइल तक टेक की 5 बड़ी खबरें जानें ...,NOT,0,1
6063,90 % लोग पहले से ही जानते थे कि भारत मैच हार ज...,HOF,1,0


In [None]:
categories = final_train_df.columns[2:]
counts = []
for category in categories:
    counts.append((category, final_train_df[category].sum()))
df_stats = pd.DataFrame(counts, columns=['category', 'number of comments'])
df_stats

Unnamed: 0,category,number of comments
0,Offensive,2888
1,NotOffensive,3177


In [None]:
target_list = categories

In [None]:
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-uncased",do_lower_case=True)

In [None]:
def tokenizeWithBert(example):
  encodings = tokenizer.encode_plus(
    example,
    add_special_tokens = True,   # tokens CLS, PAD, SEP
    max_length = 512, #MAX_LEN
    padding = 'max_length',
    truncation = True,
    return_attention_mask = True,
    return_tensors = 'pt'
  )
  return encodings

In [None]:
def get_dataset(df, tokenizer, mode='train'):
    sentences, labels = df['text'], df.iloc[:,2:].to_numpy()
    max_length = 300
    in_T = []
    in_T_attn_masks = []
    for sentence in sentences:
        enc_sent_dict = tokenizer.encode_plus(
            sentence[:300],
            max_length = max_length,
            add_special_tokens = True,
            pad_to_max_length = True,
            return_attention_mask = True,
            return_tensors = 'pt'
        )
        in_T.append(enc_sent_dict['input_ids'])
        in_T_attn_masks.append(enc_sent_dict['attention_mask'])
    
    in_T = torch.cat(in_T, dim=0)
    in_T_attn_masks = torch.cat(in_T_attn_masks, dim=0)
    labels = torch.tensor(labels, dtype = torch.float32)
    print('Text Input: ' , in_T.shape)
    print('Text Input Attention: ' , in_T_attn_masks.shape)    
    print('Labels: ' , labels.shape)
    
    dataset = TensorDataset(
        in_T,
        in_T_attn_masks,
        labels
    )
    
    train_size = int(0.8 * len(dataset))
    val_size = len(dataset) - train_size
    
    train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
    return train_dataset, val_dataset

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased', do_lower_case=True)
train_dataset, val_dataset = get_dataset(
    final_train_df,
    tokenizer = tokenizer,
    mode = 'train'
)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Text Input:  torch.Size([6065, 300])
Text Input Attention:  torch.Size([6065, 300])
Labels:  torch.Size([6065, 2])


In [None]:
batch_size = 8
train_dataloader = DataLoader(
    train_dataset,
    batch_size = batch_size,
    sampler = RandomSampler(train_dataset)
)
val_dataloader = DataLoader(
    val_dataset,
    batch_size = batch_size,
    sampler = SequentialSampler(val_dataset)
)

print('Data Ready!!')

Data Ready!!


In [None]:
import torch.nn.functional as F
from transformers import BertModel

class MultiClassClassifier(nn.Module):
    def __init__(self, hidden_dim, num_labels):
        super(MultiClassClassifier, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_labels = num_labels
        
        self.bertmodel = BertModel.from_pretrained('bert-base-multilingual-uncased')
        self.ffn1 = nn.Linear(768, hidden_dim)
        self.dp1 = nn.Dropout()
        self.ffn2 = nn.Linear(hidden_dim, num_labels)
        
    def forward(self, in_T, in_T_attn_masks):
        outputs = self.bertmodel(in_T, in_T_attn_masks)
        x = torch.mean(outputs.last_hidden_state, dim=1)
        x = F.relu(self.ffn1(x))
        x = self.dp1(x)
        x = torch.sigmoid(self.ffn2(x))
        return x

In [None]:
model = MultiClassClassifier(100, 2).to(device) # 100 hidden dimension, 2 lables
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8) # Adam with weight decay
criterion = nn.BCELoss()

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
import datetime

def format_time(elapsed):
    elapsed_rounded = int(round((elapsed)))
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [None]:
#TRAINING and VALIDATION
epochs = 3  
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer,
                                           num_warmup_steps = 0,
                                           num_training_steps = total_steps)


training_stats = []
total_t0 = time.time()

best_val_loss = 1e8
true_labels = val_dataset[:][2].numpy()

for epoch_i in range(0, epochs):

    #############               Training
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()

    total_train_loss = 0
    model.train()

    for step, batch in enumerate(train_dataloader):

        if step % 5 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}. Loss: {:.5f}'.format(step, len(train_dataloader), elapsed, total_train_loss/step))

        b_in_T            = batch[0].to(device)
        b_in_T_attn_masks = batch[1].to(device)
        b_labels          = batch[2].to(device)
        
        model.zero_grad()

        logits = model(b_in_T, b_in_T_attn_masks)
        loss = criterion(logits, b_labels)

        total_train_loss += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader)

    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))

    ##########               Validation
   
    print("")
    print("Running Validation...")

    t0 = time.time()

    model.eval()

    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0

    pred_labels = np.empty((0,2))

    # Evaluate data for one epoch
    for batch in val_dataloader:
        
        b_in_T            = batch[0].to(device)
        b_in_T_attn_masks = batch[1].to(device)
        b_labels          = batch[2].to(device)

        with torch.no_grad():
            logits = model(b_in_T, b_in_T_attn_masks)
            loss = criterion(logits, b_labels)

        # Accumulate the validation loss.
        total_eval_loss += loss.item()

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        pred_labels = np.concatenate((pred_labels, logits), axis=0)


    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(val_dataloader)

    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)

    pred_labels = np.array([[int(x >= 0.25) for x in pred_labels[:,i]] for i  in range(2)]).transpose()

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

#     Report the final accuracy, f1-score for this validation run.
    for i in range(2):
        print("  Accuracy: {0:.2f}".format(accuracy_score(true_labels[:,i], pred_labels[:,i])))

    for i in range(2):
        print("  Macro F1-score: {0:.2f}".format(f1_score(true_labels[:,i], pred_labels[:,i], average='macro')))

    for i in range(2):
        print("  Weighted F1-score: {0:.2f}".format(f1_score(true_labels[:,i], pred_labels[:,i], average='weighted')))

    print('Classification Report:')
    for i in range(2):
        print(classification_report(true_labels[:,i], pred_labels[:,i]))

    print('Confusion Matrix:')
    for i in range(2):
        print(confusion_matrix(true_labels[:,i], pred_labels[:,i]))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'training_loss': avg_train_loss,
            'val_loss': avg_val_loss,
            'val_accuracy': np.mean([accuracy_score(true_labels[:,i], pred_labels[:,i]) for i in range(2)]),
            'val_macro_f1': np.mean([f1_score(true_labels[:,i], pred_labels[:,i], average='macro') for i in range(2)]),
            'val_weighted_f1': np.mean([f1_score(true_labels[:,i], pred_labels[:,i], average='weighted') for i in range(2)]),
            'training_time': training_time,
            'val_tim': validation_time
        }
    )

    model_path = 'model_state_dict_'+str(epoch_i)+'.pt'
    torch.save(model.state_dict(), model_path)


Training...
  Batch     5  of    607.    Elapsed: 0:00:02. Loss: 0.69452
  Batch    10  of    607.    Elapsed: 0:00:05. Loss: 0.69682
  Batch    15  of    607.    Elapsed: 0:00:07. Loss: 0.69629
  Batch    20  of    607.    Elapsed: 0:00:09. Loss: 0.69504
  Batch    25  of    607.    Elapsed: 0:00:12. Loss: 0.69238
  Batch    30  of    607.    Elapsed: 0:00:14. Loss: 0.69056
  Batch    35  of    607.    Elapsed: 0:00:17. Loss: 0.68681
  Batch    40  of    607.    Elapsed: 0:00:19. Loss: 0.68260
  Batch    45  of    607.    Elapsed: 0:00:21. Loss: 0.67987
  Batch    50  of    607.    Elapsed: 0:00:24. Loss: 0.67449
  Batch    55  of    607.    Elapsed: 0:00:26. Loss: 0.66914
  Batch    60  of    607.    Elapsed: 0:00:29. Loss: 0.66273
  Batch    65  of    607.    Elapsed: 0:00:31. Loss: 0.66474
  Batch    70  of    607.    Elapsed: 0:00:34. Loss: 0.66113
  Batch    75  of    607.    Elapsed: 0:00:36. Loss: 0.65576
  Batch    80  of    607.    Elapsed: 0:00:39. Loss: 0.64697
  Batch    

In [None]:
!pip install nlpaug

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl (410 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: nlpaug
Successfully installed nlpaug-1.1.11


In [None]:
import pandas as pd
import nlpaug.augmenter.word as naw

In [None]:
def augment_text(text):
    # Define the augmentation technique (in this case, synonym replacement)
    aug = naw.SynonymAug()

    # Augment the text using the specified technique
    augmented_text = aug.augment(text)

    return augmented_text

In [None]:
!pip install -U wn==0.0.23

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting wn==0.0.23
  Downloading wn-0.0.23.tar.gz (31.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.6/31.6 MB[0m [31m35.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wn
  Building wheel for wn (setup.py) ... [?25l[?25hdone
  Created wheel for wn: filename=wn-0.0.23-py3-none-any.whl size=31792928 sha256=dad3a88c0085550de5c25b2c6147b77c4c247477da311ce91645735e219a556b
  Stored in directory: /root/.cache/pip/wheels/a1/1a/7d/23a76ce45998af60e47466a694c237fa26023c5674b47672b2
Successfully built wn
Installing collected packages: wn
Successfully installed wn-0.0.23


In [None]:
!pip3 install pywsd==1.0.2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pywsd==1.0.2
  Downloading pywsd-1.0.2.tar.gz (8.6 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pywsd
  Building wheel for pywsd (setup.py) ... [?25l[?25hdone
  Created wheel for pywsd: filename=pywsd-1.0.2-py3-none-any.whl size=12122 sha256=d8734aee011b5aa4281177b0bed354f1d0a29ec43f806e0612eadc6c7a77ae91
  Stored in directory: /root/.cache/pip/wheels/93/4a/27/3c91e7c499b777b847997c8e15b4a4dd83c114b619f1e64987
Successfully built pywsd
[31mERROR: Operation cancelled by user[0m[31m
[0m

In [None]:
!pip install nltk==3.6.2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting nltk==3.6.2
  Downloading nltk-3.6.2-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: nltk
  Attempting uninstall: nltk
    Found existing installation: nltk 3.8.1
    Uninstalling nltk-3.8.1:
      Successfully uninstalled nltk-3.8.1
Successfully installed nltk-3.6.2


In [None]:
!pip install --upgrade pyiwn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyiwn
  Downloading pyiwn-0.0.5-py3-none-any.whl (12 kB)
Installing collected packages: pyiwn
Successfully installed pyiwn-0.0.5
