In [1]:
# Mount Google Drive and make the project folder the working directory, so the
# relative paths used by later cells (dataset, checkpoint, output file) resolve
# inside it.
from google.colab import drive
drive.mount('/content/gdrive', force_remount = True)
import os
# Absolute path keeps this cell re-runnable: a relative path would be resolved
# against the already-changed working directory on a second run and fail.
root_path = '/content/gdrive/My Drive/EACL/'
os.chdir(root_path)

Mounted at /content/gdrive


In [None]:
!pip install transformers
!pip install demoji

In [9]:
import numpy as np
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import copy
from transformers import BertModel, RobertaModel, BertTokenizer, RobertaTokenizer, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, RandomSampler, SequentialSampler, random_split, DataLoader, IterableDataset, ConcatDataset
import sklearn
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import f1_score 
from tqdm import tqdm
import demoji 
import random
# One-time setup: download demoji's emoji data, set the plot defaults, fix the
# seed value used by the notebook and pick the compute device.
demoji.download_codes()
plt.rcParams.update({'figure.figsize': [15, 8], 'font.size': 8})
RANDOM_SEED = 42
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

Downloading emoji data ...
... OK (Got response in 0.13 seconds)
Writing emoji data to /root/.demoji/codes.json ...
... OK


In [10]:
def random_seed(seed_value, use_cuda):
    """Seed NumPy, PyTorch and Python's `random` with the same value.

    Args:
        seed_value: Integer seed handed to every generator.
        use_cuda: When truthy, also seed the CUDA generators and switch cuDNN
            to deterministic mode with benchmarking disabled.
    """
    for seed_fn in (np.random.seed, torch.manual_seed, random.seed):
        seed_fn(seed_value)

    if not use_cuda:
        return

    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
# Seed every RNG once, up front; the CUDA branch is requested unconditionally.
random_seed(RANDOM_SEED, True)

In [70]:
class Dataset():
    """Turns raw, unlabeled test lines into a sequential BERT inference loader.

    Attributes set by the constructor:
        test_data / batch_size: the constructor arguments, stored as given.
        label_dict: class name -> integer id.
        dict_label: inverse of label_dict (integer id -> class name).
        count_dic: empty dict; not used inside this class.
        test_inputs: dict with 'input_ids' and 'attention_masks' tensors.
        test_text: the cleaned-up input lines, before emoji replacement.
        test_dataloader: DataLoader over test_inputs in original order.
    """

    def __init__(self, test_data, batch_size = 32):
        """Tokenize `test_data` (a sequence of text lines) and build the loader."""
        self.test_data = test_data
        self.batch_size = batch_size

        # NOTE(review): 'Offensive_Untargetede' looks misspelled — presumably it
        # mirrors the label string used by the dataset; confirm before changing.
        self.label_dict = {'Not_offensive': 0,
                            'Offensive_Targeted_Insult_Group': 3,
                            'Offensive_Targeted_Insult_Individual': 5,
                            'Offensive_Targeted_Insult_Other': 2,
                            'Offensive_Untargetede': 4,
                            'not-Kannada': 1}
        self.dict_label = dict((idx, name) for name, idx in self.label_dict.items())
        self.count_dic = {}

        self.test_inputs, self.test_text = self.process_data(self.test_data)
        self.test_dataloader = self.get_dataloader(self.test_inputs, train = False)

    def tokenize(self, sentences, padding = True, max_len = 256):
        """Encode sentences with the multilingual cased BERT tokenizer.

        Args:
            sentences: Iterable of strings.
            padding: Accepted but not used; every sentence is padded to
                `max_len` regardless of this value.
            max_len: Length each encoding is padded or truncated to.

        Returns:
            Dict with 'input_ids' and 'attention_masks', each the per-sentence
            tensors concatenated along dimension 0.
        """
        tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        encodings = [
            tokenizer.encode_plus(sent,
                                  add_special_tokens=True,
                                  max_length=max_len,
                                  padding='max_length',
                                  return_attention_mask = True,
                                  return_tensors = 'pt',
                                  truncation = True)
            for sent in sentences
        ]

        return {
            'input_ids': torch.cat([enc['input_ids'] for enc in encodings], dim=0),
            'attention_masks': torch.cat([enc['attention_mask'] for enc in encodings], dim=0),
        }

    def process_data(self, data):
        """Clean raw lines, expand emojis to text and tokenize the result.

        Each line is stripped and its tab-separated fields are re-joined with
        single spaces. Every key of the mapping returned by `demoji.findall`
        is then replaced by its value (padded with spaces) and the whitespace
        is collapsed.

        Returns:
            (inputs, texts): the `tokenize` output for the emoji-expanded
            sentences, and the list of cleaned lines before emoji expansion.
        """
        print(len(data))
        texts, sentences = [], []
        for line in data:
            raw = ' '.join(line.strip().split('\t'))
            texts.append(raw)

            expanded = raw
            # An empty mapping (no emoji found) simply skips the loop.
            for emoji, description in demoji.findall(raw).items():
                expanded = expanded.replace(emoji, ' '+description+' ')
                expanded = ' '.join(expanded.split())
            sentences.append(expanded)

        return self.tokenize(sentences), texts

    def get_dataloader(self, inputs, train = True):
        """Wrap the encoded tensors in a DataLoader.

        Uses random sampling when `train` is truthy and sequential sampling
        otherwise; batches hold `self.batch_size` items.
        """
        tensor_data = TensorDataset(inputs['input_ids'], inputs['attention_masks'])
        sampler_cls = RandomSampler if train else SequentialSampler
        return DataLoader(tensor_data, sampler=sampler_cls(tensor_data), batch_size=self.batch_size)

In [71]:
# Read the raw test lines and build the inference inputs from them.
# The encoding is passed explicitly so the non-ASCII text is decoded the same
# way regardless of the platform's default encoding.
with open('Dataset/kannada_offensive_test.csv', 'r', encoding='utf-8') as f:
    test_data = f.readlines()
data = Dataset(test_data)

778


In [73]:
# Inspect the integer id -> label name mapping used to decode predictions.
data.dict_label

{0: 'Not_offensive',
 1: 'not-Kannada',
 2: 'Offensive_Targeted_Insult_Other',
 3: 'Offensive_Targeted_Insult_Group',
 4: 'Offensive_Untargetede',
 5: 'Offensive_Targeted_Insult_Individual'}

In [63]:
# Save and Load Functions
def save_metrics(save_path, epochs, model, optimizer, F1):
    """Write a training checkpoint to `save_path` with `torch.save`.

    The checkpoint is a dict holding the model and optimizer state dicts, the
    F1 value as given, and `epochs + 1` under the 'epochs' key.
    """
    checkpoint = {}
    checkpoint['model_state_dict'] = model.state_dict()
    checkpoint['optimizer_state_dict'] = optimizer.state_dict()
    checkpoint['epochs'] = epochs+1
    checkpoint['F1'] = F1

    torch.save(checkpoint, save_path)
    print(f'Model saved to ==> {save_path}')


def load_metrics(load_path, model, optimizer):
    """Restore model and optimizer state from a checkpoint, best-effort.

    Args:
        load_path: Checkpoint location passed to `torch.load`.
        model: Module whose `load_state_dict` receives 'model_state_dict'.
        optimizer: Optimizer whose `load_state_dict` receives
            'optimizer_state_dict'.

    Returns:
        Tuple `(epochs, F1)` read from the checkpoint. If anything goes wrong
        while loading, the failure is printed and `(0, 0)` is returned; in
        that case the model may be unchanged or, if only the optimizer step
        failed, already updated.
    """
    try:
        state_dict = torch.load(load_path, map_location=device)
        model.load_state_dict(state_dict['model_state_dict'])
        optimizer.load_state_dict(state_dict['optimizer_state_dict'])
    except Exception as err:
        # Stay best-effort (callers get the (0, 0) defaults), but say so
        # instead of claiming success.
        state_dict = {}
        print(f'Could not load checkpoint from {load_path}: {err!r}')
    else:
        print(f'Model loaded from <== {load_path}')

    return state_dict.get('epochs', 0), state_dict.get('F1', 0)

In [64]:
def flat_accuracy(preds, labels):
    """Fraction of rows in `preds` whose argmax over axis 1 equals the label.

    Args:
        preds: 2-D array of per-class scores, one row per example.
        labels: Array of true class ids; flattened before comparison.
    """
    truth = labels.flatten()
    predicted = np.argmax(preds, axis=1).flatten()
    n_correct = np.sum(predicted == truth)
    return n_correct / len(truth)
 
def get_predicted(preds):
    """Return the index of the highest score in each row of `preds`, as a 1-D array."""
    return np.argmax(preds, axis=1).flatten()
 
def evaluate(test_dataloader, model):
    """Run `model` over every batch and return the predicted class ids.

    Args:
        test_dataloader: Iterable of batches; element 0 of a batch is passed
            as `input_ids` and element 1 as `attention_mask`.
        model: Callable model exposing `.eval()` and returning an object with
            a `.logits` tensor.

    Returns:
        1-D NumPy array with one predicted class id per example, in loader
        order. The dtype is float64 because accumulation starts from an empty
        float array.
    """
    model.eval()
    # The empty float array seeds the stack so the result keeps its float64
    # dtype (and is a valid empty array when the loader yields nothing).
    collected = [np.array([])]

    with torch.no_grad():
        for batch in test_dataloader:
            b_input_ids, b_input_mask = batch[0].to(device), batch[1].to(device)
            logits = model(input_ids = b_input_ids, attention_mask = b_input_mask).logits
            collected.append(get_predicted(logits.cpu().numpy()))

    return np.hstack(collected)

In [72]:
# Build the 6-class multilingual BERT classifier and restore the fine-tuned
# weights from the checkpoint.
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels = 6).to(device)
# The optimizer exists only because load_metrics() restores its state as well.
optimizer = AdamW(model.parameters(), lr=2e-5, eps = 1e-8)
loaded_epochs, loaded_f1 = load_metrics('tamil_mal_kan.pt', model, optimizer)
# load_metrics() signals a failed load by returning (0, 0) instead of raising
# (a saved checkpoint always stores epochs + 1). Stop here rather than write
# predictions produced by a model whose fine-tuned weights were not restored.
if loaded_epochs == 0:
    raise RuntimeError("Checkpoint 'tamil_mal_kan.pt' could not be loaded; refusing to run inference without the fine-tuned weights.")
(loaded_epochs, loaded_f1)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

Model loaded from <== tamil_mal_kan.pt


(4, 0.7171719205487923)

In [74]:
# Predict a class id for every test sentence, then map the ids to label names.
ypred = evaluate(data.test_dataloader, model)
# evaluate() yields the ids as floats; cast to int so the lookup does not
# depend on float and int keys hashing alike.
labels = [data.dict_label[int(class_id)] for class_id in ypred]

In [75]:
# Sanity check: should equal the number of test lines printed when the data was loaded.
len(labels)

778

In [76]:
# Assemble the submission table (1-based id, input text before emoji
# replacement, predicted label name) and write it as a tab-separated file.
df = pd.DataFrame({
    'id': range(1, len(ypred)+1),
    'text': data.test_text,
    'label': labels,
})
df.to_csv('indicnlp@kgp_kannada.tsv', sep='\t', index = False)

In [69]:
# Preview the first rows of the submission table.
# NOTE(review): this cell's execution count (69) is lower than that of the cell
# that builds df (76), so the saved output may be stale — re-run to refresh.
df.head()

Unnamed: 0,id,text,label
0,1,അപ്പൊ ഇതൊരൊന്നൊരാ മൊതലാണല്ലേ Suraj ആണ് നടൻ ന്...,Not_offensive
1,2,എന്ത് ഊള എഡിറ്റിംഗ് ആടോ ഇത് ഒരുമാതിരി vivo vid...,Not_offensive
2,3,Fefka ee padam release cheyyan samadhicho?,Not_offensive
3,4,അആഹാ.. സംഗീതം ജെക്‌സ് ബിജോയ് ആണ് അപ്പൊ പൊട്ടലു...,Not_offensive
4,5,Ravile thane views likes ethra ayyi enn nokan ...,Not_offensive
