In [1]:
import pandas as pd
import numpy as np

In [2]:
# Reading CSV from link
def read_csv_from_link(url):
    """Download a tab-separated file shared on Google Drive into a DataFrame.

    Parameters
    ----------
    url : str
        Sharing link of the form ``https://drive.google.com/file/d/<id>/view?...``.
        The file id is taken as the second-to-last ``/``-separated segment.

    Returns
    -------
    pandas.DataFrame
        The parsed file, read without a header row (integer column labels).
        Rows with an unexpected number of fields are skipped and reported.
    """
    file_id = url.split('/')[-2]
    path = 'https://drive.google.com/uc?export=download&id=' + file_id
    try:
        # pandas >= 1.3: 'warn' skips malformed rows but still reports them,
        # which is what error_bad_lines=False did with its default warnings.
        df = pd.read_csv(path, delimiter="\t", on_bad_lines='warn', header=None)
    except TypeError:
        # Older pandas rejects the unknown `on_bad_lines` keyword before any
        # download happens, so retry with the legacy spelling.
        df = pd.read_csv(path, delimiter="\t", error_bad_lines=False, header=None)
    return df

In [3]:
# Loading All Data
# Each language has a (train, dev) pair of Google Drive sharing links; the
# downloads run in the order train, dev for Tamil, Malayalam, then Kannada.
tamil_train, tamil_dev = (
    read_csv_from_link(link)
    for link in (
        'https://drive.google.com/file/d/15auwrFAlq52JJ61u7eSfnhT9rZtI5sjk/view?usp=sharing',
        'https://drive.google.com/file/d/1Jme-Oftjm7OgfMNLKQs1mO_cnsQmznRI/view?usp=sharing',
    )
)
mal_train, mal_dev = (
    read_csv_from_link(link)
    for link in (
        'https://drive.google.com/file/d/13JCCr-IjZK7uhbLXeufptr_AxvsKinVl/view?usp=sharing',
        'https://drive.google.com/file/d/1J0msLpLoM6gmXkjC6DFeQ8CG_rrLvjnM/view?usp=sharing',
    )
)
kannada_train, kannada_dev = (
    read_csv_from_link(link)
    for link in (
        'https://drive.google.com/file/d/1XuOhSpdK8qsbO-lZHrIcVaU5FsCXc05T/view?usp=sharing',
        'https://drive.google.com/file/d/164zYZOeXIwt5jl3NggJU0CWRyD2fRT9z/view?usp=sharing',
    )
)

b'Skipping line 2399: expected 2 fields, saw 3\nSkipping line 2525: expected 2 fields, saw 3\n'
b'Skipping line 777: expected 2 fields, saw 3\n'


In [4]:
# Tamil Preprocess
# Keep only the first two columns of each split and name them text/label.
tamil_train = tamil_train.iloc[:, 0:2].rename(columns={0: "text", 1: "label"})
tamil_dev = tamil_dev.iloc[:, 0:2].rename(columns={0: "text", 1: "label"})

# Stats
# Store the labels as a categorical column in both splits.
tamil_train['label'] = pd.Categorical(tamil_train['label'])
tamil_dev['label'] = pd.Categorical(tamil_dev['label'])

### Load Model

In [5]:
import torch

In [6]:
# Report every visible GPU (nothing is listed on a CPU-only machine).
n_gpus = torch.cuda.device_count()
print("GPU Nos: {}".format(n_gpus))
for gpu_index in range(n_gpus):
    print(torch.cuda.get_device_name(gpu_index))

# Change Device - CPU/GPU-0/GPU-1
preferred_gpu = 1
if torch.cuda.is_available() and n_gpus > 0:
    # Clamp to the last existing GPU so single-GPU machines do not crash on
    # an invalid device index.
    torch.cuda.set_device(min(preferred_gpu, n_gpus - 1))
    device = 'cuda'
else:
    # No usable CUDA device: skip all torch.cuda calls and run on the CPU.
    device = 'cpu'

GPU Nos: 2
Tesla P100-PCIE-12GB
Tesla P100-PCIE-16GB


## Enter the path of the saved model in `torch.load()` below

In [7]:
# Model Select
import os
from os import listdir
from os.path import isfile, join
# Names of everything stored in the fine-tuned checkpoint directory.
file_list = os.listdir('../finetuned_models/')

In [15]:
# Checkpoint file name (inside '../finetuned_models/') intended for the fusion
# classifier head; its stem is also used to name the saved dev predictions.
saved_model_filename = 'fusion_v1.pth'

In [10]:
# Load the three fine-tuned encoders that feed the fusion classifier.
from transformers import BertTokenizer, BertForSequenceClassification, BertModel
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification, XLMRobertaModel
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification


def _load_finetuned_encoder(model, checkpoint_filename, prefix):
    """Copy fine-tuned weights from a saved checkpoint into a bare encoder.

    Parameters
    ----------
    model : torch.nn.Module
        Freshly instantiated pretrained encoder that receives the weights.
    checkpoint_filename : str
        File name of the state dict inside '../finetuned_models/'.
    prefix : str
        Sub-module prefix (e.g. 'bert.') in the checkpoint keys; only the
        part of each key after its last occurrence is kept.

    Returns
    -------
    torch.nn.Module
        The same ``model``, in eval mode and with all parameters frozen.

    Raises
    ------
    ValueError
        If no checkpoint key matches the encoder, which would otherwise
        silently leave the encoder without any fine-tuned weights.
    """
    # NOTE(review): torch.load unpickles the file - only load trusted checkpoints.
    # map_location='cpu' lets GPU-saved checkpoints load on CPU-only machines;
    # the models are moved to the target device later.
    checkpoint = torch.load(
        os.path.join('../finetuned_models/', checkpoint_filename),
        map_location='cpu',
    )
    expected_keys = model.state_dict()
    stripped = {key.split(prefix)[-1]: value for key, value in checkpoint.items()}
    # Drop entries (e.g. classification heads) that the bare encoder lacks.
    load_dict = {key: value for key, value in stripped.items() if key in expected_keys}
    if not load_dict:
        raise ValueError(
            "No weights in '{}' match the encoder after stripping prefix '{}'".format(
                checkpoint_filename, prefix
            )
        )
    model.load_state_dict(load_dict, strict=False)
    model.eval()
    # The encoders are used as fixed feature extractors.
    for param in model.parameters():
        param.requires_grad = False
    return model


# (tokenizer class, model class, pretrained name, fine-tuned checkpoint, key prefix)
# The checkpoint names live only in this table so that `saved_model_filename`
# (set in an earlier cell) is not overwritten by this cell.
# Order matters: the encoder outputs are later concatenated in this order.
backbone_specs = [
    # Multilingual BERT
    (BertTokenizer, BertModel, 'bert-base-multilingual-cased',
     'Mbert_base_cased_Tamil.pth', 'bert.'),
    # XLM-RoBERTa
    (XLMRobertaTokenizer, XLMRobertaModel, 'xlm-roberta-base',
     'XLMroberta_from_custom_pretrained_Tamil.pth', 'roberta.'),
    # Using Indic Bert
    (AutoTokenizer, AutoModel, "ai4bharat/indic-bert",
     'Indic_bert_Tamil.pth', 'albert.'),
]

models = []
tokenizers = []
for tokenizer_cls, model_cls, pretrained_name, checkpoint_filename, key_prefix in backbone_specs:
    tokenizers.append(tokenizer_cls.from_pretrained(pretrained_name))
    models.append(
        _load_finetuned_encoder(
            model_cls.from_pretrained(pretrained_name), checkpoint_filename, key_prefix
        )
    )

n_models = len(models)

In [11]:
# Class name -> integer id, numbered in the order listed here.
label_mapping = dict(
    zip(
        [
            'Not_offensive',
            'not-Tamil',
            'Offensive_Targeted_Insult_Other',
            'Offensive_Targeted_Insult_Group',
            'Offensive_Untargetede',
            'Offensive_Targeted_Insult_Individual',
        ],
        range(6),
    )
)

# Collecting Text and Labels
train_batch_sentences = list(tamil_train['text'])
dev_batch_sentences = list(tamil_dev['text'])
# An unknown label raises KeyError instead of being mapped silently.
train_batch_labels = list(map(label_mapping.__getitem__, tamil_train['label']))
dev_batch_labels = list(map(label_mapping.__getitem__, tamil_dev['label']))

# Convert to Tensor
# Every tokenizer encodes the same sentences with identical settings, giving
# one encoding per backbone in the same order as `tokenizers`.
tokenizer_kwargs = dict(padding='max_length', truncation=True, max_length=64, return_tensors="pt")
train_encodings = [tok(train_batch_sentences, **tokenizer_kwargs) for tok in tokenizers]
train_labels = torch.tensor(train_batch_labels)
dev_encodings = [tok(dev_batch_sentences, **tokenizer_kwargs) for tok in tokenizers]
dev_labels = torch.tensor(dev_batch_labels)

In [13]:
# Dataset
from torch.utils.data import Dataset

class Tamil_Offensive_Dataset(Dataset):
    """Dataset serving the encodings of several tokenizers for each example.

    Parameters
    ----------
    encodings : sequence of mappings
        One mapping per model from field name (e.g. 'input_ids') to an
        indexable of per-example values.
    labels : indexable
        One label per example; its length defines the dataset length.

    Each item is a dict whose keys are ``'<field>_<model index>'`` for every
    field of every encoding, plus ``'labels'``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
        self.n_models = len(encodings)

    @staticmethod
    def _copy_as_tensor(value):
        """Return an independent tensor copy of ``value``."""
        if torch.is_tensor(value):
            # torch.tensor(<tensor>) emits a copy-construct UserWarning on
            # every call; clone().detach() is the recommended equivalent.
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        item = {}
        for i in range(self.n_models):
            # Suffix each field with the model index so the encodings of
            # different tokenizers can share one dict.
            for key, val in self.encodings[i].items():
                item[key + '_' + str(i)] = self._copy_as_tensor(val[idx])
        item['labels'] = self._copy_as_tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Defining Datasets
# Only the dev split is wrapped here.
dev_dataset = Tamil_Offensive_Dataset(encodings=dev_encodings, labels=dev_labels)

In [16]:
import torch.nn.functional as F
import torch.nn as nn

# Basic Fully-Connected (Linear => BatchNorm => ReLU)
class BasicFC(nn.Module):
    """Linear layer followed by 1-D batch normalisation and a ReLU.

    Extra keyword arguments are forwarded to ``nn.Linear``.
    """

    def __init__(self, in_channels, out_channels, **kwargs):
        super().__init__()
        self.fc = nn.Linear(in_channels, out_channels, **kwargs)
        self.bn = nn.BatchNorm1d(out_channels, eps=0.001)

    def forward(self, x):
        """Apply linear -> batch norm -> ReLU to ``x``."""
        normalised = self.bn(self.fc(x))
        return F.relu(normalised, inplace=True)

class FusionNet(torch.nn.Module):
    """Classifier head: two BasicFC blocks, dropout, then a bias-free linear layer.

    ``D_in`` is the input feature size, ``H1`` and ``H2`` the hidden sizes and
    ``D_out`` the number of output logits.
    """

    def __init__(self, D_in, H1, H2, D_out):
        super().__init__()
        self.linear1_1 = BasicFC(D_in, H1)
        self.linear1_2 = BasicFC(H1, H2)
        self.dp = nn.Dropout(0.1)
        self.linear2 = nn.Linear(H2, D_out, bias=False)

    def forward(self, x):
        """Return the logits for a batch of input features ``x``."""
        hidden = self.linear1_2(self.linear1_1(x))
        # Dropout sits between the second hidden block and the output layer.
        return self.linear2(self.dp(hidden))

In [18]:
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm
from sklearn.metrics import classification_report, f1_score

# Fusion head: 2304 input features, two hidden layers (256, 64), 6 output logits.
# NOTE(review): 2304 is presumably 3 encoders x 768-dim output - confirm.
fusion_classifier = FusionNet(2304, 256, 64, 6)
# map_location='cpu' lets a GPU-saved checkpoint load on CPU-only machines;
# the module is moved to the target device right after.
# NOTE(review): torch.load unpickles the file - only load trusted checkpoints.
fusion_state = torch.load(
    os.path.join('../finetuned_models/', saved_model_filename),
    map_location='cpu',
)
fusion_classifier.load_state_dict(fusion_state)
fusion_classifier.eval()

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
fusion_classifier.to(device)
for model in models:
    model.to(device)

# Dataloaders
# shuffle=False keeps predictions aligned with the dev examples' order.
dev_loader = DataLoader(dev_dataset, batch_size=16, shuffle=False)

dev_preds = []
with torch.no_grad():
    for batch in tqdm(dev_loader):
        outputs_all = []
        for i in range(n_models):
            model = models[i]
            # Each encoder reads the fields produced by its own tokenizer.
            input_ids = batch['input_ids' + '_' + str(i)].to(device)
            attention_mask = batch['attention_mask' + '_' + str(i)].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            # Second element of the encoder output (presumably the pooled
            # sentence representation - confirm against the transformers docs).
            outputs_all.append(outputs[1])

        # Concatenate the per-encoder features along the feature axis.
        bert_output = torch.cat(outputs_all, dim=-1)
        out = fusion_classifier(bert_output)

        # Predicted class id = index of the largest logit in each row.
        dev_preds.extend(np.argmax(out.cpu().numpy(), axis=-1))

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=275.0), HTML(value='')))

  del sys.path[0]
  





In [21]:
y_true = dev_batch_labels
y_pred = dev_preds
# Pass the label ids explicitly so the report still builds when a class is
# absent from both the gold labels and the predictions, and derive the names
# from the ids instead of relying on dict insertion order.
label_ids = sorted(label_mapping.values())
id_to_name = {label_id: name for name, label_id in label_mapping.items()}
target_names = [id_to_name[label_id] for label_id in label_ids]
report = classification_report(y_true, y_pred, labels=label_ids, target_names=target_names)

In [22]:
# Save one prediction per line in a CSV named after the checkpoint file.
preds_dir = "../dev_preds/"
# Create the output directory on first use instead of failing in np.savetxt.
os.makedirs(preds_dir, exist_ok=True)
# splitext drops the extension whatever its length (the old [:-4] slice
# assumed exactly four characters such as '.pth').
preds_stem = os.path.splitext(saved_model_filename)[0]
# The default np.savetxt format is kept, so class ids are written as floats.
np.savetxt(preds_dir + preds_stem + ".csv", dev_preds, delimiter=",")

In [23]:
# Show the per-class precision/recall/F1 table built in the earlier cell.
print(report)

                                      precision    recall  f1-score   support

                       Not_offensive       0.88      0.90      0.89      3193
                           not-Tamil       0.87      0.84      0.86       172
     Offensive_Targeted_Insult_Other       0.04      0.05      0.04        65
     Offensive_Targeted_Insult_Group       0.39      0.37      0.38       295
               Offensive_Untargetede       0.45      0.39      0.42       356
Offensive_Targeted_Insult_Individual       0.47      0.44      0.45       307

                            accuracy                           0.78      4388
                           macro avg       0.52      0.50      0.51      4388
                        weighted avg       0.77      0.78      0.77      4388

