In [1]:
import pandas as pd
import numpy as np

In [2]:
# Reading CSV from link
def read_csv_from_link(url):
    """Download a tab-separated file shared on Google Drive into a DataFrame.

    Parameters
    ----------
    url : str
        Google Drive sharing link whose second-to-last ``/``-separated
        component is the file id, e.g.
        ``https://drive.google.com/file/d/<file_id>/view?usp=sharing``.

    Returns
    -------
    pandas.DataFrame
        The parsed file, read without a header row (columns are numbered
        from 0).  Malformed rows are dropped with a warning instead of
        aborting the read.
    """
    import inspect  # local import: only needed for the pandas-version check below

    path = 'https://drive.google.com/uc?export=download&id='+url.split('/')[-2]
    if 'on_bad_lines' in inspect.signature(pd.read_csv).parameters:
        # pandas >= 1.3: `error_bad_lines` is deprecated (and removed in 2.0).
        # 'warn' matches the old error_bad_lines=False with its default
        # warn_bad_lines=True: skip the row and report it.
        df = pd.read_csv(path, delimiter="\t", on_bad_lines='warn', header=None)
    else:
        # Older pandas only understands the legacy flag.
        df = pd.read_csv(path, delimiter="\t", error_bad_lines=False, header=None)
    return df

In [3]:
# Loading All Data
# Tamil training and development splits, fetched from their Google Drive share links.
tamil_train = read_csv_from_link(
    'https://drive.google.com/file/d/15auwrFAlq52JJ61u7eSfnhT9rZtI5sjk/view?usp=sharing'
)
tamil_dev = read_csv_from_link(
    'https://drive.google.com/file/d/1Jme-Oftjm7OgfMNLKQs1mO_cnsQmznRI/view?usp=sharing'
)

In [4]:
# Test
# Unlabeled test splits, one per language.
# NOTE(review): only `tamil_test` is referenced again in the cells visible here.
tamil_test = read_csv_from_link(
    'https://drive.google.com/file/d/17o-spkU5JnI_18qDJXO-F_DoIG9Zzv9q/view?usp=sharing'
)
malayalam_test = read_csv_from_link(
    'https://drive.google.com/file/d/1waRFe4yTG8TMkMruICaavd9JH0xiO_rb/view?usp=sharing'
)
kannada_test = read_csv_from_link(
    'https://drive.google.com/file/d/14DQvnNZCXSgmiZxJqGtPYFdRqBH7TOSr/view?usp=sharing'
)

In [5]:
# Tamil Preprocess
# Keep the first two columns of the labeled splits (text, label) and the first
# column of the test split (text), and give them readable names.
labeled_columns = {0: "text", 1: "label"}

tamil_train = tamil_train.iloc[:, :2].rename(columns=labeled_columns)
tamil_dev = tamil_dev.iloc[:, :2].rename(columns=labeled_columns)
tamil_test = tamil_test.iloc[:, :1].rename(columns={0: "text"})

# Stats
# Store the labels of both labeled splits as pandas categoricals.
for labeled_frame in (tamil_train, tamil_dev):
    labeled_frame['label'] = pd.Categorical(labeled_frame['label'])

In [6]:
# Preview the first rows of the renamed training frame (text + label columns).
print(tamil_train.head())

                                                text          label
0                  movie vara level la Erika poguthu  Not_offensive
1  I love Ajith Kumar Vivegam movie inki mjy bht ...      not-Tamil
2          Padam nalla comedy padama irukum polaye..  Not_offensive
3  karthick subburaj anne .... intha padam vetri ...  Not_offensive
4  கவுண்டர் தேவர்.சார்பாக வெற்றி பெற வாழ்த்துக்கள் 🦁  Not_offensive


### Load Model

In [7]:
import torch

In [8]:
# Shell escape: print the current GPU status using the external `gpustat` tool.
!gpustat -p

[1m[37mdevi                   [m  Fri Jan  8 18:44:56 2021  [1m[30m450.51.05[m
[36m[0][m [34mTesla P100-PCIE-12GB[m |[31m 44'C[m, [32m  0 %[m | [36m[1m[33m    6[m / [33m12198[m MB | [1m[30mgdm[m/1412([33m4M[m)
[36m[1][m [34mTesla P100-PCIE-16GB[m |[31m 42'C[m, [32m  0 %[m | [36m[1m[33m    6[m / [33m16280[m MB | [1m[30mgdm[m/1412([33m4M[m)


In [9]:
# Report how many CUDA devices PyTorch can see.
print("GPU Nos: {}".format(torch.cuda.device_count()))
# print(torch.cuda.get_device_name(0))
# print(torch.cuda.get_device_name(1))

# Change Device - CPU/GPU-0/GPU-1
# Select a GPU only when one is available: torch.cuda.set_device() fails on a
# machine without CUDA, which used to abort this cell before the CPU fallback
# could take effect.
if torch.cuda.is_available():
    torch.cuda.set_device(0)
    device = 'cuda'
else:
    device = 'cpu'

## Choose the fine-tuned encoder checkpoints to load (directory names under `../../finetuned_berts/`)

In [10]:
# Model Select
import os
from os import listdir
from os.path import isfile, join
# Browse the fine-tuned checkpoints whose directory name contains 'kannada'.
# NOTE(review): the rest of this notebook works on Tamil data; confirm the filter is intended.
list(filter(lambda entry: 'kannada' in entry, listdir('../../finetuned_berts/')))

['Indic_bert_offensive_only_kannada',
 'MURIL_cased_temp_offensive_only_kannada_weighted',
 'XLMroberta_large_kannada_weighted',
 'Mbert_base_cased_offensive_only_kannada',
 'Indic_bert_collated_kannada_weighted',
 'Indic_bert_kannada',
 'Distilbert_m_base_cased_kannada',
 'XLMroberta_base_kannada_weighted',
 'MURIL_cased_temp_collated_kannada_weighted',
 'XLMroberta_custom_pretrained_collated_kannada',
 'MURIL_cased_temp_collated_kannada',
 'Distilbert_m_base_cased_kannada_weighted',
 'XLMroberta_base_offensive_only_kannada_weighted',
 'XLMroberta_custom_pretrained_kannada_weighted',
 'XLMroberta_large_collated_kannada_weighted',
 'XLMroberta_base_kannada',
 'XLMroberta_custom_pretrained_offensive_only_kannada',
 'Indic_bert_collated_kannada',
 'MURIL_cased_temp_kannada_weighted',
 'XLMroberta_large_kannada',
 'Mbert_base_cased_collated_kannada_weighted',
 'XLMroberta_custom_pretrained_collated_kannada_weighted',
 'XLMroberta_custom_pretrained_offensive_only_kannada_weighted',
 'Mbert

In [11]:
# Start with empty, independent containers for the encoders and their tokenizers.
models, tokenizers = [], []

In [12]:
from transformers import AutoTokenizer, AutoModel, BertTokenizer, XLMRobertaTokenizer

# Fine-tuned encoder checkpoints to fuse (directory names under ../../finetuned_berts/).
model_names = [
    'Mbert_base_cased_Tamil',
    'XLMroberta_custom_pretrained_tamil_weighted',
]
# One tokenizer per entry of `model_names`, in the same order.
tokenizers = [
    BertTokenizer.from_pretrained('bert-base-multilingual-cased'),
    XLMRobertaTokenizer.from_pretrained('xlm-roberta-base'),
]
assert len(tokenizers) == len(model_names), "need exactly one tokenizer per model"

# Rebuild `models` from scratch instead of appending to whatever the list
# already holds: re-running this cell would otherwise add duplicate encoders
# and leave `models` out of step with `tokenizers` and `model_names`.
models = []
for name in model_names:
    model = AutoModel.from_pretrained(os.path.join('../../finetuned_berts/', name))
    model.eval()  # evaluation mode for the loaded encoder
    models.append(model)

Some weights of XLMRobertaModel were not initialized from the model checkpoint at ../finetuned_berts/XLMroberta_custom_pretrained_tamil_weighted and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Width of the concatenated encoder outputs: 1024 is assumed for checkpoint
# names containing 'large', 768 for every other checkpoint.
lin_dim = np.sum([768 if 'large' not in name else 1024 for name in model_names])
lin_dim

1536

In [14]:
# # Loading Model
# #saved_model_filename = 'XLMroberta_base_kannada_weighted.pth'
# saved_model_filename = 'XLMroberta_large_kannada.pth'
# from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, XLMRobertaModel
# tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-large')
# model = XLMRobertaModel.from_pretrained('xlm-roberta-large')

# load_dict = torch.load(os.path.join('../../finetuned_models/', saved_model_filename))
# load_dict = {k.split('roberta.')[-1]: v for k, v in load_dict.items()}
# load_dict = {k: v for k, v in load_dict.items() if k in model.state_dict()}
# model.load_state_dict(load_dict, strict=False)
# model.eval()

# models.append(model)
# tokenizers.append(tokenizer)

In [15]:
# Using Indic Bert
# saved_model_filename = 'Indic_bert_Tamil.pth'
# from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
# tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert")
# model = AutoModel.from_pretrained("ai4bharat/indic-bert")

# load_dict = torch.load(os.path.join('../../finetuned_models/', saved_model_filename))
# load_dict = {k.split('albert.')[-1]: v for k, v in load_dict.items()}
# load_dict = {k: v for k, v in load_dict.items() if k in model.state_dict()}
# model.load_state_dict(load_dict, strict=False)
# model.eval()

# models.append(model)
# tokenizers.append(tokenizer)

In [16]:
# Number of loaded encoders; the batch loops below iterate over range(n_models).
n_models = len(models)

In [17]:
# Freeze the encoders: turn off gradient tracking for every parameter of every model.
for model in models:
    model.requires_grad_(False)

### Dataset Stuff

In [18]:
# Class name -> integer id. The insertion order is also used later as the
# order of `target_names` in the classification reports.
label_mapping = {
    'Not_offensive': 0,
    'not-Tamil': 1,
    'Offensive_Targeted_Insult_Other': 2,
    'Offensive_Targeted_Insult_Group': 3,
    'Offensive_Untargetede': 4,
    'Offensive_Targeted_Insult_Individual': 5
}


def _encode_with_all_tokenizers(sentences):
    """Tokenize `sentences` once per tokenizer (padded/truncated to 64 tokens, PyTorch tensors)."""
    return [
        tokenizer(sentences, padding='max_length', truncation=True, max_length=64, return_tensors="pt")
        for tokenizer in tokenizers
    ]


# Collecting Text and Labels
train_batch_sentences = tamil_train['text'].tolist()
train_batch_labels = [label_mapping[label] for label in tamil_train['label']]
dev_batch_sentences = tamil_dev['text'].tolist()
dev_batch_labels = [label_mapping[label] for label in tamil_dev['label']]
test_batch_sentences = tamil_test['text'].tolist()

# Convert to Tensor
train_encodings = _encode_with_all_tokenizers(train_batch_sentences)
train_labels = torch.tensor(train_batch_labels)
dev_encodings = _encode_with_all_tokenizers(dev_batch_sentences)
dev_labels = torch.tensor(dev_batch_labels)
test_encodings = _encode_with_all_tokenizers(test_batch_sentences)

In [20]:
# Dataset
from torch.utils.data import Dataset

class Tamil_Offensive_Dataset(Dataset):
    """Dataset serving the tokenizer outputs of several encoders for each example.

    Parameters
    ----------
    encodings : sequence of mappings
        One mapping per encoder; each maps a field name (``'input_ids'`` must
        be present, it defines the dataset length) to a per-example indexable
        container.
    labels : indexable, optional
        Class index per example.  When omitted (unlabeled data) every item
        carries the placeholder label ``1``.
    """

    def __init__(self, encodings, labels = None):
        self.encodings = encodings
        self.labels = labels
        self.n_models = len(encodings)

    @staticmethod
    def _as_tensor_copy(value):
        """Return an independent tensor holding `value`."""
        if torch.is_tensor(value):
            # torch.tensor(existing_tensor) emits a copy-construct UserWarning
            # on every call; clone().detach() is the recommended equivalent.
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return one example as a dict.

        Keys are ``'<field>_<i>'`` for every field of encoder ``i``, plus
        ``'index'`` (the example position, used to look up precomputed
        embeddings) and ``'labels'``.
        """
        item = {}
        for i in range(self.n_models):
            for key, val in self.encodings[i].items():
                item[key+'_'+str(i)] = self._as_tensor_copy(val[idx])
        item['index'] = idx
        # Identity check: `!= None` is evaluated element-wise by array-like
        # labels and then cannot be used as a condition.
        if self.labels is not None:
            item['labels'] = self._as_tensor_copy(self.labels[idx])
        else:
            item['labels'] = torch.tensor(1)
        return item

    def __len__(self):
        """Number of examples, taken from the first encoder's ``'input_ids'``."""
        return len(self.encodings[0]['input_ids'])

# Defining Datasets
# Labeled train/dev sets; the test set has no labels and gets placeholder ones.
train_dataset = Tamil_Offensive_Dataset(encodings=train_encodings, labels=train_labels)
dev_dataset = Tamil_Offensive_Dataset(encodings=dev_encodings, labels=dev_labels)
test_dataset = Tamil_Offensive_Dataset(encodings=test_encodings)

In [21]:
import torch.nn.functional as F
import torch.nn as nn

# Basic Fully-Connected (Linear => BatchNorm => ReLU)
class BasicFC(nn.Module):
    """Linear layer followed by 1-D batch normalisation (eps=0.001) and ReLU.

    Extra keyword arguments are forwarded to ``nn.Linear``.
    """

    def __init__(self, in_channels, out_channels, **kwargs):
        super().__init__()
        self.fc = nn.Linear(in_channels, out_channels, **kwargs)
        self.bn = nn.BatchNorm1d(out_channels, eps=0.001)

    def forward(self, x):
        """Apply fc -> bn -> ReLU; the ReLU works in place on the normalised tensor."""
        normalised = self.bn(self.fc(x))
        return F.relu(normalised, inplace=True)

class FusionNet(torch.nn.Module):
    """Classifier head over the fused features.

    Three BasicFC blocks (D_in -> H1 -> H2 -> H3) followed by a bias-free
    linear layer producing D_out logits.  Dropout (p=0.1) is applied after the
    second and third blocks only.
    """

    def __init__(self, D_in, H1, H2, H3, D_out):
        super().__init__()
        self.linear1_1 = BasicFC(D_in, H1)
        self.linear1_2 = BasicFC(H1, H2)
        self.linear1_3 = BasicFC(H2, H3)
        self.dp = nn.Dropout(0.1)
        self.linear2 = torch.nn.Linear(H3, D_out, bias = False)

    def forward(self, x):
        """Return the class logits for a batch of fused feature vectors."""
        hidden = self.linear1_1(x)
        # Blocks two and three are each followed by dropout.
        for block in (self.linear1_2, self.linear1_3):
            hidden = self.dp(block(hidden))
        return self.linear2(hidden)

In [22]:
# Optimiser
from transformers import AdamW
from pytorch_pretrained_bert.optimization import BertAdam

In [23]:
## Loss Fn
# Cross-entropy over the class logits, averaged across the batch.
XE_loss_function = nn.CrossEntropyLoss(reduction='mean')
XE_loss_function = XE_loss_function.float()

In [24]:
class F1_Loss(nn.Module):
    '''Differentiable (soft) macro-F1 loss. Can work with gpu tensors

    Per-class true positives, false positives and false negatives are
    accumulated from the softmax probabilities rather than from hard
    predictions.  The loss is ``1 - mean(per-class F1)``.

    The original implementation is written by Michal Haltuf on Kaggle.

    Parameters
    ----------
    epsilon : float
        Small constant added to denominators and used to clamp the per-class
        F1 into ``[epsilon, 1 - epsilon]``.
    n_labels : int
        Number of classes used for the one-hot encoding of the targets.

    Returns
    -------
    torch.Tensor
        `ndim` == 0 (scalar). epsilon <= val <= 1 - epsilon

    Reference
    ---------
    - https://www.kaggle.com/rejpalcz/best-loss-function-for-f1-score-metric
    - https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html#sklearn.metrics.f1_score
    - https://discuss.pytorch.org/t/calculating-precision-recall-and-f1-score-in-case-of-multi-label-classification/28265/6
    - http://www.ryanzhang.info/python/writing-your-own-loss-function-module-for-pytorch/
    '''
    def __init__(self, epsilon=1e-7, n_labels = 6):
        super().__init__()
        self.epsilon = epsilon
        self.n_labels = n_labels

    def forward(self, y_pred, y_true,):
        """Compute the loss from logits `y_pred` (2-D) and class indices `y_true` (1-D)."""
        assert y_pred.ndim == 2
        assert y_true.ndim == 1
        y_true = F.one_hot(y_true, self.n_labels).to(torch.float32)
        y_pred = F.softmax(y_pred, dim=1)

        # Soft confusion counts per class (summed over the batch dimension).
        # True negatives are not needed for precision/recall, so they are not computed.
        tp = (y_true * y_pred).sum(dim=0).to(torch.float32)
        fp = ((1 - y_true) * y_pred).sum(dim=0).to(torch.float32)
        fn = (y_true * (1 - y_pred)).sum(dim=0).to(torch.float32)

        precision = tp / (tp + fp + self.epsilon)
        recall = tp / (tp + fn + self.epsilon)

        f1 = 2* (precision*recall) / (precision + recall + self.epsilon)
        f1 = f1.clamp(min=self.epsilon, max=1-self.epsilon)
        return 1 - f1.mean()

# Instantiate the soft-F1 loss. F1_Loss registers no parameters or buffers,
# so .cuda() has no tensors to move here.
F1_loss_function = F1_Loss().cuda()

In [25]:
# Toggle between the soft-F1 loss and plain cross-entropy.
use_f1_loss = False
loss_function = F1_loss_function if use_f1_loss else XE_loss_function
#loss_function = multi_loss_function

In [26]:
# Names of the precomputed sentence embeddings to append to the encoder outputs.
add_extra_embeds = [
     'cnn_128',
     #'cnn_64',
     #'sentbert',
#      'laser_ta',
#      'laser_en',
]

# Embedding name -> [train file, dev file, test file].
# Some entries currently list only train and dev files.
embeds_files = {
    'cnn_128': ['../../sentence_embeddings/cnn_emb_train_128_tamil.npy', '../../sentence_embeddings/cnn_emb_dev_128_tamil.npy', '../../sentence_embeddings/cnn_emb_test_128_tamil.npy'],
    'cnn_64': ['../../sentence_embeddings/cnn_skipgram_emb_train.npy', '../../sentence_embeddings/cnn_skipgram_emb_dev.npy', '../../sentence_embeddings/cnn_skipgram_emb_test.npy'],
    'laser_ta': ['../../sentence_embeddings/LASER_embeddings_tamil_ta_train.npy', '../../sentence_embeddings/LASER_embeddings_tamil_ta_dev.npy'],
    'laser_en': ['../../sentence_embeddings/LASER_embeddings_tamil_en_train.npy', '../../sentence_embeddings/LASER_embeddings_tamil_en_dev.npy'],
    'sentbert': ['../../sentence_embeddings/sentence_bert_train.npy', '../../sentence_embeddings/sentence_bert_dev.npy'],
}

# Fail early with a clear message when a selected embedding lacks one of the
# three splits, instead of an IndexError after train and dev were already loaded.
for embname in add_extra_embeds:
    if len(embeds_files[embname]) < 3:
        raise ValueError(
            "Extra embedding '{}' needs train, dev and test files, but only {} path(s) are configured".format(
                embname, len(embeds_files[embname])
            )
        )

train_embeddings = [np.load(embeds_files[embname][0]) for embname in add_extra_embeds]
dev_embeddings = [np.load(embeds_files[embname][1]) for embname in add_extra_embeds]
test_embeddings = [np.load(embeds_files[embname][2]) for embname in add_extra_embeds]

In [27]:
# Sanity check: shape of the first extra training embedding matrix.
train_embeddings[0].shape

(35139, 128)

In [28]:
# Number of extra embedding sources selected in `add_extra_embeds`.
len_extra_embeds = len(add_extra_embeds)
len_extra_embeds

1

In [29]:
# Total number of columns contributed by the selected extra embeddings.
dim_extra_embeds = np.sum([emb.shape[1] for emb in train_embeddings[:len_extra_embeds]])
dim_extra_embeds

128

In [30]:
# Input width of the fusion classifier: encoder outputs plus extra embeddings.
print(lin_dim + dim_extra_embeds)

1664


In [31]:
# Placeholder; the training loop stores a deep copy of the best fusion classifier here.
best_model = None

In [32]:
import copy

### Train

In [33]:
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm
from sklearn.metrics import classification_report, f1_score

# Fusion head: concatenated encoder outputs + extra embeddings -> 6 class logits.
fusion_classifier = FusionNet(int(lin_dim + dim_extra_embeds), 1024, 256, 64, 6)
# Optimiser
optimizer = AdamW(fusion_classifier.parameters(), lr=1e-5)
# Run name: encoder names and extra-embedding names, each concatenated without a separator.
mo = "".join(model_names)
ex_em = "".join(add_extra_embeds)
model_name = 'fusion_tamil_'+mo+'_'+ex_em

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
fusion_classifier.to(device)
for model in models:
    model.to(device)
best_val_f1 = 0
count = 0  # epochs since the dev macro-F1 last improved

# Dataloaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=16, shuffle=False)


def _fused_features(batch, extra_embeddings):
    """Concatenate every encoder's second output with the extra embeddings of one batch.

    `extra_embeddings` is the list of per-split matrices (train or dev); rows
    are selected with the example positions stored in ``batch['index']``.
    """
    features = []
    # Only the fusion classifier is optimised, so the encoder forward passes
    # do not need an autograd graph.
    with torch.no_grad():
        for i in range(n_models):
            input_ids = batch['input_ids'+'_'+str(i)].to(device)
            attention_mask = batch['attention_mask'+'_'+str(i)].to(device)
            outputs = models[i](input_ids, attention_mask=attention_mask)
            # NOTE(review): outputs[1] is presumably the pooler output. The load
            # log for the XLM-R checkpoint reports its pooler weights as newly
            # initialised, so these features may differ between kernel sessions
            # - confirm before reusing a saved fusion head.
            features.append(outputs[1])
    for i in range(len_extra_embeds):
        features.append(torch.Tensor(extra_embeddings[i][batch['index'], :]).to(device))
    return torch.cat(features, dim = -1)


for epoch in range(100):
    fusion_classifier.train()
    print("==========================================================")
    print("Epoch {}".format(epoch))
    print("Train")
    for batch in tqdm(train_loader):
        optimizer.zero_grad()
        # Moved out of the per-encoder loop: the labels do not depend on the encoder.
        labels = batch['labels'].to(device)
        out = fusion_classifier(_fused_features(batch, train_embeddings))
        loss = loss_function(out, labels)
        loss.backward()
        optimizer.step()

    print("Dev")
    dev_preds = []
    fusion_classifier.eval()
    total_val_loss = 0
    with torch.set_grad_enabled(False):
        for batch in tqdm(dev_loader):
            labels = batch['labels'].to(device)
            out = fusion_classifier(_fused_features(batch, dev_embeddings))
            loss = loss_function(out, labels)
            # Average of the per-batch losses.
            total_val_loss += loss.item()/len(dev_loader)

            dev_preds.extend(np.argmax(out.cpu().numpy(), axis=1))

    y_true = dev_batch_labels
    y_pred = dev_preds
    target_names = label_mapping.keys()
    report = classification_report(y_true, y_pred, target_names=target_names)
    val_f1 = f1_score(y_true, y_pred, average='macro')

    if val_f1 > best_val_f1:
        # New best dev macro-F1: persist the weights and keep an in-memory copy.
        PATH = '../../finetuned_models/' + model_name + '.pth'
        torch.save(fusion_classifier.state_dict(), PATH)
        best_val_f1 = val_f1
        best_model = copy.deepcopy(fusion_classifier)
        count = 0
    else:
        count += 1

    print(report)
    print("Epoch {}, Val Loss = {}, Val F1 = {}, Best Val f1 = {}, stagnant_t = {}".format(epoch, total_val_loss, val_f1, best_val_f1, count))
    # Early stopping after 5 consecutive epochs without improvement.
    if count == 5:
        print("No increase for 5 epochs, Stopping ...")
        break

Epoch 0
Train


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2197.0), HTML(value='')))

  del sys.path[0]
  app.launch_new_instance()



Dev


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=275.0), HTML(value='')))


                                      precision    recall  f1-score   support

                       Not_offensive       0.89      0.89      0.89      3193
                           not-Tamil       0.82      0.87      0.85       172
     Offensive_Targeted_Insult_Other       0.04      0.02      0.02        65
     Offensive_Targeted_Insult_Group       0.34      0.41      0.37       295
               Offensive_Untargetede       0.39      0.42      0.40       356
Offensive_Targeted_Insult_Individual       0.51      0.41      0.46       307

                            accuracy                           0.77      4388
                           macro avg       0.50      0.50      0.50      4388
                        weighted avg       0.77      0.77      0.77      4388

Epoch 0, Val Loss = 1.0347464641657742, Val F1 = 0.4977503074658605, Best Val f1 = 0.4977503074658605, stagnant_t = 0
Epoch 1
Train


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2197.0), HTML(value='')))


Dev


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=275.0), HTML(value='')))


                                      precision    recall  f1-score   support

                       Not_offensive       0.89      0.88      0.88      3193
                           not-Tamil       0.84      0.85      0.85       172
     Offensive_Targeted_Insult_Other       0.00      0.00      0.00        65
     Offensive_Targeted_Insult_Group       0.35      0.41      0.38       295
               Offensive_Untargetede       0.39      0.44      0.41       356
Offensive_Targeted_Insult_Individual       0.47      0.44      0.45       307

                            accuracy                           0.77      4388
                           macro avg       0.49      0.50      0.50      4388
                        weighted avg       0.77      0.77      0.77      4388

Epoch 1, Val Loss = 0.91070671341636, Val F1 = 0.496401770756378, Best Val f1 = 0.4977503074658605, stagnant_t = 1
Epoch 2
Train


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2197.0), HTML(value='')))


Dev


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=275.0), HTML(value='')))


                                      precision    recall  f1-score   support

                       Not_offensive       0.86      0.92      0.89      3193
                           not-Tamil       0.88      0.80      0.84       172
     Offensive_Targeted_Insult_Other       0.00      0.00      0.00        65
     Offensive_Targeted_Insult_Group       0.37      0.36      0.36       295
               Offensive_Untargetede       0.47      0.35      0.40       356
Offensive_Targeted_Insult_Individual       0.49      0.39      0.44       307

                            accuracy                           0.78      4388
                           macro avg       0.51      0.47      0.49      4388
                        weighted avg       0.76      0.78      0.77      4388

Epoch 2, Val Loss = 0.7288551826097749, Val F1 = 0.4885005667022395, Best Val f1 = 0.4977503074658605, stagnant_t = 2
Epoch 3
Train


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2197.0), HTML(value='')))


Dev


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=275.0), HTML(value='')))


                                      precision    recall  f1-score   support

                       Not_offensive       0.88      0.90      0.89      3193
                           not-Tamil       0.87      0.81      0.84       172
     Offensive_Targeted_Insult_Other       0.00      0.00      0.00        65
     Offensive_Targeted_Insult_Group       0.38      0.32      0.35       295
               Offensive_Untargetede       0.39      0.49      0.43       356
Offensive_Targeted_Insult_Individual       0.50      0.40      0.44       307

                            accuracy                           0.78      4388
                           macro avg       0.50      0.49      0.49      4388
                        weighted avg       0.77      0.78      0.77      4388

Epoch 3, Val Loss = 0.7443472450700676, Val F1 = 0.492798165024041, Best Val f1 = 0.4977503074658605, stagnant_t = 3
Epoch 4
Train


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2197.0), HTML(value='')))


Dev


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=275.0), HTML(value='')))


                                      precision    recall  f1-score   support

                       Not_offensive       0.87      0.91      0.89      3193
                           not-Tamil       0.87      0.85      0.86       172
     Offensive_Targeted_Insult_Other       0.07      0.03      0.04        65
     Offensive_Targeted_Insult_Group       0.36      0.34      0.35       295
               Offensive_Untargetede       0.40      0.43      0.42       356
Offensive_Targeted_Insult_Individual       0.52      0.31      0.39       307

                            accuracy                           0.78      4388
                           macro avg       0.51      0.48      0.49      4388
                        weighted avg       0.76      0.78      0.77      4388

Epoch 4, Val Loss = 0.7310985839908771, Val F1 = 0.4919166502007308, Best Val f1 = 0.4977503074658605, stagnant_t = 4
Epoch 5
Train


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2197.0), HTML(value='')))


Dev


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=275.0), HTML(value='')))


                                      precision    recall  f1-score   support

                       Not_offensive       0.87      0.91      0.89      3193
                           not-Tamil       0.86      0.83      0.84       172
     Offensive_Targeted_Insult_Other       0.00      0.00      0.00        65
     Offensive_Targeted_Insult_Group       0.32      0.41      0.36       295
               Offensive_Untargetede       0.45      0.36      0.40       356
Offensive_Targeted_Insult_Individual       0.52      0.34      0.41       307

                            accuracy                           0.78      4388
                           macro avg       0.50      0.47      0.48      4388
                        weighted avg       0.76      0.78      0.77      4388

Epoch 5, Val Loss = 0.7610415003110066, Val F1 = 0.4834264749779868, Best Val f1 = 0.4977503074658605, stagnant_t = 5
No increase for 5 epochs, Stopping ...


In [34]:
# Best dev macro-F1 reached during training.
best_val_f1

0.4977503074658605

#### Final Model predictions on Dev

In [35]:
# Predicted class ids and class probabilities of the best fusion classifier on dev.
dev_preds = []
dev_probs = []
best_model.eval()
with torch.set_grad_enabled(False):
    for batch in tqdm(dev_loader):
        outputs_all = []
        for i in range(n_models):
            model = models[i]
            input_ids = batch['input_ids'+'_'+str(i)].to(device)
            attention_mask = batch['attention_mask'+'_'+str(i)].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            outputs_all.append(outputs[1])

        # Precomputed embeddings are looked up by the example positions in the batch.
        for i in range(len_extra_embeds):
            outputs_all.append(torch.Tensor(dev_embeddings[i][batch['index'], :]).to(device))

        bert_output = torch.cat(outputs_all, dim = -1)
        out = best_model(bert_output)

        dev_preds.extend(np.argmax(out.cpu().numpy(), axis=1))
        # torch.softmax is numerically stable; exponentiating raw logits with
        # np.exp overflows to inf/nan for large values.
        dev_probs.extend(torch.softmax(out, dim=1).cpu().numpy())

y_true = dev_batch_labels
y_pred = dev_preds
target_names = label_mapping.keys()
report = classification_report(y_true, y_pred, target_names=target_names)
val_f1 = f1_score(y_true, y_pred, average='macro')

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=275.0), HTML(value='')))

  del sys.path[0]
  app.launch_new_instance()





In [36]:
# Per-class precision/recall/F1 of the best fusion classifier on the dev split.
print(report)

                                      precision    recall  f1-score   support

                       Not_offensive       0.89      0.89      0.89      3193
                           not-Tamil       0.82      0.87      0.85       172
     Offensive_Targeted_Insult_Other       0.04      0.02      0.02        65
     Offensive_Targeted_Insult_Group       0.34      0.41      0.37       295
               Offensive_Untargetede       0.39      0.42      0.40       356
Offensive_Targeted_Insult_Individual       0.51      0.41      0.46       307

                            accuracy                           0.77      4388
                           macro avg       0.50      0.50      0.50      4388
                        weighted avg       0.77      0.77      0.77      4388



In [38]:
# Dev macro-F1 of the best fusion classifier, recomputed in the prediction cell.
val_f1

0.4977503074658605

In [39]:
# Stack the per-example probability vectors into one 2-D array and show its shape.
dev_probs = np.asarray(dev_probs)
dev_probs.shape

(4388, 6)

In [40]:
#### SAVE PREDS
# np.save does not create missing folders, so make sure the target exists first.
os.makedirs('dev_probs', exist_ok=True)
np.save('dev_probs/dev_preds_'+model_name+'.npy', dev_probs)

In [41]:
# Number of examples in the unlabeled test set.
len(test_dataset)

4392

#### Final Model predictions on Test

In [42]:
# Predicted class ids and class probabilities of the best fusion classifier on test.
best_model.eval()
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

test_preds = []
test_probs = []
with torch.set_grad_enabled(False):
    for batch in tqdm(test_loader):
        outputs_all = []
        for i in range(n_models):
            model = models[i]
            input_ids = batch['input_ids'+'_'+str(i)].to(device)
            attention_mask = batch['attention_mask'+'_'+str(i)].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            outputs_all.append(outputs[1])

        # Precomputed embeddings are looked up by the example positions in the batch.
        for i in range(len_extra_embeds):
            outputs_all.append(torch.Tensor(test_embeddings[i][batch['index'], :]).to(device))

        bert_output = torch.cat(outputs_all, dim = -1)
        out = best_model(bert_output)

        test_preds.extend(np.argmax(out.cpu().numpy(), axis=1))
        # torch.softmax is numerically stable; exponentiating raw logits with
        # np.exp overflows to inf/nan for large values.
        test_probs.extend(torch.softmax(out, dim=1).cpu().numpy())

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=275.0), HTML(value='')))

  del sys.path[0]





In [43]:
# Stack the per-example probability vectors into one 2-D array and show its shape.
test_probs = np.asarray(test_probs)
test_probs.shape

(4392, 6)

In [44]:
#### SAVE PREDS
# np.save does not create missing folders, so make sure the target exists first.
os.makedirs('test_probs', exist_ok=True)
np.save('test_probs/test_preds_'+model_name+'.npy', test_probs)

#### Save config in dict for predictions and ensemble

In [None]:
# Configuration of this fusion run, saved for later prediction/ensembling.
# NOTE: the name `dict` shadows the builtin; it is kept because the following cells read it.
dict = {}
dict['pth_name'] = model_name
dict['model'] = model_names
# Base checkpoints of the tokenizers, in the same order as `model_names`
# (mBERT first, then XLM-R base). The previous value listed 'xlm-roberta-large'
# first, which matches neither the tokenizers used above nor the 768-wide
# features counted in `lin_dim`.
# NOTE(review): assumes the consumer of this pickle expects one key per model, in order - confirm.
dict['pretrained_keys'] = ['bert-base-multilingual-cased', 'xlm-roberta-base']
#dict['pretrained_keys'] = ['xlm-roberta-base','xlm-roberta-base','bert-base-multilingual-cased',"ai4bharat/indic-bert"]
dict['extra_embeds'] = add_extra_embeds
dict['length'] = lin_dim + dim_extra_embeds

In [None]:
# Show the run configuration assembled in the previous cell (`dict` is that config, not the builtin).
print(dict)

In [None]:
# Path stem (without extension) of the pickled run configuration.
model_name_tot = 'dicts/'+model_name

In [None]:
import pickle
# open() does not create missing folders, so make sure the target directory exists.
os.makedirs(os.path.dirname(model_name_tot) or '.', exist_ok=True)
# Persist the run configuration (`dict` is the config built above, not the builtin).
with open(str(model_name_tot)+'.pickle', 'wb') as handle:
    pickle.dump(dict, handle)

#### Save Preds - Reload the saved fusion classifier and get its predictions on the dev set

In [None]:
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm
from sklearn.metrics import classification_report, f1_score

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Move every encoder to the device. Previously only `model` was moved, i.e.
# whichever encoder an earlier loop happened to leave in that variable.
for model in models:
    model.to(device)
fusion_classifier.to(device)

# Dataloaders
dev_loader = DataLoader(dev_dataset, batch_size=16, shuffle=False)
# map_location lets a checkpoint saved from a GPU run load on the current device.
fusion_classifier.load_state_dict(torch.load('../../finetuned_models/' + model_name + '.pth', map_location=device))
fusion_classifier.eval()

dev_preds = []
with torch.set_grad_enabled(False):
    for batch in tqdm(dev_loader):
        outputs_all = []
        for i in range(n_models):
            model = models[i]
            input_ids = batch['input_ids'+'_'+str(i)].to(device)
            attention_mask = batch['attention_mask'+'_'+str(i)].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            outputs_all.append(outputs[1])

        # Precomputed embeddings are looked up by the example positions in the batch.
        for i in range(len_extra_embeds):
            outputs_all.append(torch.Tensor(dev_embeddings[i][batch['index'], :]).to(device))

        bert_output = torch.cat(outputs_all, dim = -1)
        out = fusion_classifier(bert_output)

        dev_preds.extend(np.argmax(out.cpu().numpy(), axis=1))

In [None]:
# Classification report of the reloaded fusion classifier on the dev split.
y_true, y_pred = dev_batch_labels, dev_preds
target_names = label_mapping.keys()
report = classification_report(y_true, y_pred, target_names=target_names)

In [None]:
# Per-class precision/recall/F1 computed in the previous cell.
print(report)

In [None]:
# np.savetxt does not create missing folders, so make sure the target exists first.
os.makedirs("../../dev_preds", exist_ok=True)
# Write the predicted class ids, one per line.
np.savetxt("../../dev_preds/" + model_name + ".csv", dev_preds, delimiter=",")