In [1]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount = True)
import os
root_path = 'gdrive/My Drive/EACL/'
os.chdir(root_path)

Mounted at /content/gdrive


In [2]:
!pip install sentencepiece==0.1.94
!pip install transformers==4.0.1
!pip install demoji
!pip install tweet-preprocessor
# !pip install transformers[sentencepiece]

Collecting sentencepiece==0.1.94
[?25l  Downloading https://files.pythonhosted.org/packages/e5/2d/6d4ca4bef9a67070fa1cac508606328329152b1df10bdf31fb6e4e727894/sentencepiece-0.1.94-cp36-cp36m-manylinux2014_x86_64.whl (1.1MB)
[K     |▎                               | 10kB 26.0MB/s eta 0:00:01[K     |▋                               | 20kB 16.0MB/s eta 0:00:01[K     |▉                               | 30kB 14.3MB/s eta 0:00:01[K     |█▏                              | 40kB 13.1MB/s eta 0:00:01[K     |█▌                              | 51kB 9.3MB/s eta 0:00:01[K     |█▊                              | 61kB 8.5MB/s eta 0:00:01[K     |██                              | 71kB 9.6MB/s eta 0:00:01[K     |██▍                             | 81kB 10.6MB/s eta 0:00:01[K     |██▋                             | 92kB 10.9MB/s eta 0:00:01[K     |███                             | 102kB 8.9MB/s eta 0:00:01[K     |███▎                            | 112kB 8.9MB/s eta 0:00:01[K     |███▌     

In [3]:
import numpy as np
import pandas as pd
from transformers import AutoModel, AutoTokenizer
import torch.nn as nn
import torch
import copy
from transformers import BertModel, RobertaModel, BertTokenizer, RobertaTokenizer, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, RandomSampler, SequentialSampler, random_split, DataLoader, IterableDataset, ConcatDataset
import sklearn
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import f1_score 
from tqdm import tqdm
import demoji 
import random
demoji.download_codes() 
import preprocessor as p
p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.RESERVED)
plt.rcParams['figure.figsize'] = [15, 8]
plt.rcParams.update({'font.size': 8})
RANDOM_SEED = 42
model_path = 'xlm-roberta-base'
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

Downloading emoji data ...
... OK (Got response in 0.19 seconds)
Writing emoji data to /root/.demoji/codes.json ...
... OK


In [4]:
def random_seed(seed_value, use_cuda):
    np.random.seed(seed_value)  
    torch.manual_seed(seed_value)  
    random.seed(seed_value)
    if use_cuda:
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)  
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
random_seed(RANDOM_SEED, True)

In [23]:
hastags = []

In [24]:
class Dataset():
    def __init__(self, train_data, val_data, batch_size = 32):
        self.train_data = train_data
        self.val_data = val_data
        self.batch_size = batch_size

        self.label_dict = {'Not_offensive': 0,
                            'Offensive_Targeted_Insult_Group': 4,
                            'Offensive_Targeted_Insult_Individual': 3,
                            'Offensive_Targeted_Insult_Other': 2,
                            'Offensive_Untargetede': 1,
                            'not-Tamil': 5}
                                    
        self.count_dic = {}

        self.train_inputs, self.train_labels = self.process_data(self.train_data)
        self.val_inputs, self.val_labels = self.process_data(self.val_data)
        # count_dic = {}
        # for data in self.train_labels:
        #     label = int(data)
        #     count_dic[label] = count_dic.get(label, 0)+1
        # self.weights = torch.Tensor([len(self.train_labels)/count_dic[i] for i in range(2)]).to(device)
        self.train_dataloader = self.get_dataloader(self.train_inputs, self.train_labels)
        self.val_dataloader = self.get_dataloader(self.val_inputs, self. val_labels, train = False)

    def tokenize(self, sentences, padding = True, max_len = 256):
        tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
        input_ids, attention_masks = [], []
        for sent in sentences:
            encoded_dict = tokenizer.encode_plus(sent,
                                                    add_special_tokens=True,
                                                    max_length=max_len, 
                                                    padding='max_length', 
                                                    return_attention_mask = True,
                                                    return_tensors = 'pt', 
                                                    truncation = True)
            input_ids.append(encoded_dict['input_ids'])
            attention_masks.append(encoded_dict['attention_mask'])
        
        input_ids = torch.cat(input_ids, dim=0)
        attention_masks = torch.cat(attention_masks, dim=0)

        return {'input_ids': input_ids, 'attention_masks': attention_masks}
    
    def process_data(self, data):
        sentences, labels = [], []
        print(len(data))
        for line in data:
            sentence = line.strip().split('\t')
            label = sentence.pop()
            # if label == 'not-Kannada': print(sentence)
            if label not in self.label_dict:
                self.label_dict[label] = len(self.label_dict)
            words = sentence
            for word in (' '.join(words)).split():
                if word[0]=='#': hastags.append(word)
            sentence = p.clean(' '.join(sentence)).replace('#','')
            emoji_dict = demoji.findall(sentence)
            if len(emoji_dict): 
                for emoji, text in emoji_dict.items():
                    sentence = sentence.replace(emoji, ' '+text+' ')
                    sentence = ' '.join(sentence.split())
            sentences.append(sentence)
            labels.append(self.label_dict[label])
            # if label == 'not-malayalam': labels.append(1)
            # else: labels.append(0)
            self.count_dic[labels[-1]] = self.count_dic.get(labels[-1], 0) + 1
        inputs = self.tokenize(sentences)

        return inputs, torch.Tensor(labels)
    
    def get_dataloader(self, inputs, labels, train = True):
        data = TensorDataset(inputs['input_ids'], inputs['attention_masks'], labels)
        if train:
            sampler = RandomSampler(data)
        else:
            sampler = SequentialSampler(data)
        return DataLoader(data, sampler=sampler, batch_size=self.batch_size)

In [25]:
with open('Dataset/kannada_offensive_train.csv', 'r') as f:
    train_data = f.readlines()
with open('Dataset/kannada_offensive_dev.csv', 'r') as f:
    val_data = f.readlines()
data = Dataset(train_data, val_data)

6217
777


In [26]:
len(hastags)

73

In [27]:

hastags

['#',
 '#Who',
 '#drrajkumar',
 '#marakkar',
 '#1',
 '#ಇದು',
 '#ASN',
 '#handsup',
 '#RIP',
 '#ASN',
 '#ಕನ್ನಡಿಗರು',
 '#marakkar',
 '#person',
 '#1',
 '#1',
 '#win',
 '#DBOSS',
 '#asn',
 '#Kgf',
 '#2',
 '#ASN',
 '#rcbedarcbbeku',
 '#stunning',
 '#',
 '#ASN',
 '#No',
 '#diya....',
 '#ದಿಯಾ',
 '#Shaanvi.',
 '#vijayprakash',
 '#1',
 '#ASN!',
 '#',
 '#Slowpoison',
 '#ASN',
 '##',
 '##n',
 '#avanesrimannarayana',
 '#ASN',
 '#ASN',
 '#1',
 '#Dia',
 '#trending',
 '#ASN',
 '#DBoss',
 '#Vp',
 '#ASN',
 "#'Hands",
 '#BOSS',
 '#u',
 '#1',
 '#ಮಸ್ತಮಗಾ',
 '#PruthviAmber',
 '##jai',
 '#Shanvi_Srivatsava',
 '#Rakshit_Shetty',
 '#Rashmika_Mandanna',
 '#rashmikamandhana',
 '#ASN',
 '#1.46',
 '#ASN',
 '#KGFCHAPTER2',
 '#KGF2',
 '#DBOSS',
 '#AvanesrimannarayananAll',
 '#ಕ್ರಿಯೇಟಿವ್',
 '#ಕನ್ನಡ',
 '#ಧನ್ಯವಾದಗಳು',
 '#ಕರ್ನಾಟಕ',
 '#ಪ್ರಜ್ವಲ್',
 '#ASN',
 '#dboss',
 '#1']

In [17]:
!pip install wordsegment

Collecting wordsegment
[?25l  Downloading https://files.pythonhosted.org/packages/cf/6c/e6f4734d6f7d28305f52ec81377d7ce7d1856b97b814278e9960183235ad/wordsegment-1.3.1-py2.py3-none-any.whl (4.8MB)
     |████████████████████████████████| 4.8MB 8.3MB/s 
[?25hInstalling collected packages: wordsegment
Successfully installed wordsegment-1.3.1


In [21]:
from wordsegment import segment, load
load()
ans = 'no_rakshaa_marana_mass'
ans = 'Pradesh_______________G'
segment(ans)

['pradesh', 'g']

In [None]:
# Save and Load Functions
def save_metrics(save_path, epochs, model, optimizer, F1):

    state_dict = {'model_state_dict': model.state_dict(),
                  'optimizer_state_dict': optimizer.state_dict(),
                  'epochs': epochs+1,
                  'F1': F1}
    
    torch.save(state_dict, save_path)
    print(f'Model saved to ==> {save_path}')


def load_metrics(load_path, model, optimizer):
    try: 
        state_dict = torch.load(load_path, map_location=device)
        model.load_state_dict(state_dict['model_state_dict'])
        optimizer.load_state_dict(state_dict['optimizer_state_dict'])
    except: 
        state_dict = {}

    print(f'Model loaded from <== {load_path}')
    
    return state_dict.get('epochs', 0), state_dict.get('F1', 0)

def load_metrics_new(load_path, model):
    try: 
        state_dict = torch.load(load_path, map_location=device)
        model.load_state_dict(state_dict['model_state_dict'])
        # optimizer.load_state_dict(state_dict['optimizer_state_dict'])
    except: 
        state_dict = {}

    print(f'Model loaded from <== {load_path}')
    
    return state_dict.get('epochs', 0), state_dict.get('F1', 0)

In [None]:
class Transform(torch.nn.Module):
    def __init__(self, D_in, num_labels):
        super(Transform, self).__init__()
        self.embeddings = AutoModel.from_pretrained(model_path)
        self.dropout = nn.Dropout(0.3)
        self.final = nn.Linear(D_in*2, num_labels, bias = True)

    def forward(self, input_ids, mask):
        outputs = self.embeddings(input_ids, mask)
        out = outputs.last_hidden_state
        mean_pooling = torch.mean(out, 1)
        max_pooling, _ = torch.max(out, 1)

        embed = torch.cat((mean_pooling, max_pooling), 1)
        y_pred = self.final(self.dropout(embed))
        return y_pred

In [None]:
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)
 
def get_predicted(preds):
    pred_flat = np.argmax(preds, axis=1).flatten()
    return pred_flat
 
def evaluate(test_dataloader, model):
    model.eval()
    y_preds, y_test = np.array([]), np.array([])

    for batch in test_dataloader:
        b_input_ids, b_input_mask, b_labels = batch[0].to(device), batch[1].to(device), batch[2].to(device).long()
        with torch.no_grad():        
            ypred = model(b_input_ids, b_input_mask)
        ypred = ypred.cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        y_preds = np.hstack((y_preds, get_predicted(ypred)))
        y_test = np.hstack((y_test, label_ids))

    weighted_f1 = f1_score(y_test, y_preds, average='weighted')
    return weighted_f1, y_preds, y_test
 
def train(training_dataloader, validation_dataloader, model, filepath, weights = None, learning_rate = 2e-5, epochs = 4, print_every = 10):
    total_steps = len(training_dataloader) * epochs
    no_decay = ['bias', 'LayerNorm.weight', 'LayerNorm.bias']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps = 1e-8)
    scheduler = get_linear_schedule_with_warmup(optimizer, 
                                                num_warmup_steps = 0, # Default value in run_glue.py
                                                num_training_steps = total_steps)
    
    current_epoch, best_weighted_f1 = load_metrics(filepath, model, optimizer)
    if weights == None:
        criterion = nn.CrossEntropyLoss()
    else:
        criterion = nn.CrossEntropyLoss(weight=weights)
    for epoch_i in tqdm(range(current_epoch, epochs)):
        model.train()
        for step, batch in enumerate(training_dataloader):
            b_input_ids, b_input_mask, b_labels = batch[0].to(device), batch[1].to(device), batch[2].to(device).long()
            
            outputs = model(b_input_ids, b_input_mask)
            loss = criterion(outputs, b_labels)
 
            if step%print_every == 0:
                print(loss.item())
 
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
 
        print('### Validation Set Stats')
        weighted_f1, ypred, ytest = evaluate(validation_dataloader, model)
        print("  Weighted F1: {0:.2f}".format(weighted_f1))
        if weighted_f1 > best_weighted_f1:
            best_weighted_f1 = weighted_f1
            save_metrics(filepath, epoch_i, model, optimizer, weighted_f1)

In [None]:
model = Transform(768, 5).to(device)
optimizer = AdamW(model.parameters(), lr=2e-5, eps = 1e-8)
no_decay = ['bias', 'LayerNorm.weight', 'LayerNorm.bias']
optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
optimizer = AdamW(optimizer_grouped_parameters, lr=3e-5, eps = 1e-8)
# load_metrics('olid_xlmr_base.pt', model, optimizer)

In [None]:
# save_metrics('olid_xlmr_base_embed.pt', -1, model.embeddings, optimizer, -1)
model = Transform(768, 6).to(device)
load_metrics_new('olid_xlmr_base_embed_new.pt', model.embeddings)

Model loaded from <== olid_xlmr_base_embed_new.pt


(3, 0.8405510904977567)

In [None]:
train(data.train_dataloader, data.val_dataloader, model, 'olid_xlmr_tamil_new.pt')

Model loaded from <== olid_xlmr_tamil_new.pt


  0%|          | 0/4 [00:00<?, ?it/s]

1.3827037811279297
0.707119882106781
0.5697749853134155
0.8126499056816101
0.7258977293968201
0.3760716915130615
0.6903999447822571
0.5676354765892029
0.7841488122940063

0.9662981629371643
0.6774706244468689
0.8256979584693909
0.6239492893218994
0.4127058684825897
0.37586748600006104
0.7699794173240662
0.7174965143203735
1.0236085653305054
0.9283753037452698
0.6038984060287476
0.47299477458000183
0.726555585861206
1.0698517560958862
0.5911938548088074
0.9037986993789673
0.9411525726318359
0.7520509958267212
0.83879154920578
0.3930736482143402
0.5956424474716187
0.5345407128334045
0.6719488501548767
0.5271205306053162
0.6932833194732666
1.1580049991607666
0.4742433428764343
0.9651106595993042
0.7170631289482117
0.5074365735054016
0.8433086276054382
0.9022130370140076
0.8905602097511292
0.8149163722991943
0.5376572608947754
0.7340426445007324
0.5734100341796875
0.7564501762390137
0.416908860206604
0.5136358141899109
0.5415738224983215
0.7176908254623413
0.8215780258178711
0.494366735219

 25%|██▌       | 1/4 [30:44<1:32:14, 1844.76s/it]

0.7933605909347534
0.5151197910308838
0.5166828632354736
0.46889933943748474
0.9520547389984131
0.8959338068962097
0.42427942156791687
0.47711795568466187
0.46560126543045044
0.5400868654251099
0.6178332567214966
0.5327184200286865
0.6397090554237366
0.3624483048915863
0.4193245768547058
0.6319773197174072
0.3282817602157593
0.8783979415893555
0.4085277020931244
0.6466694474220276
0.5888128280639648
0.4689101576805115
0.40321406722068787
0.35843271017074585
0.5046445727348328
0.5234825015068054
0.39294034242630005
0.6316843032836914
0.6147304177284241
0.7901074290275574
0.7911117076873779
0.6955722570419312
0.7148879766464233
0.9732711315155029
0.6275277137756348
0.6134104132652283
0.6413905620574951
0.40025800466537476
0.7028116583824158
0.8608729839324951
0.5303041338920593
0.5391413569450378
0.5851832628250122
0.7702240943908691
0.9642824530601501
0.6209015250205994
0.4587070345878601
0.5528832674026489
0.7994793057441711
0.36921024322509766
0.4919770359992981
0.5487979650497437
0.8

 50%|█████     | 2/4 [1:01:33<1:01:32, 1846.04s/it]

0.4885443449020386
0.8822228312492371
0.7324984669685364
0.3435123860836029

0.5732443928718567
0.5000120401382446
0.42473259568214417
0.5671505928039551
0.44952505826950073
0.43027371168136597
0.46255096793174744
0.3511328101158142
0.549448549747467
0.8108782768249512
0.6124988198280334
0.35077449679374695
0.5348209142684937
0.688344419002533
0.29810523986816406
0.6902356743812561
0.5786144733428955
0.5910938382148743
0.5006276965141296
0.5537449717521667
0.2786235511302948
0.4176810085773468
0.3546854853630066
0.11333626508712769
0.24400822818279266
0.6662166714668274
0.5485678911209106
0.5094572305679321
0.3810337483882904
0.3550266623497009
0.716845691204071
0.3990563452243805
0.36124494671821594
0.6315447092056274
0.422586590051651
0.3099980652332306
0.39722853899002075
0.7008979916572571
0.607639729976654
0.5055176615715027
0.4090156853199005
0.7828459739685059
0.5812380909919739
0.3243483603000641
0.8141909241676331
0.39077743887901306
0.4104311168193817
0.2794378995895386
0.638

 75%|███████▌  | 3/4 [1:32:24<30:47, 1847.48s/it]  

0.6547251343727112
0.7879370450973511
0.6123150587081909
0.445584237575531
0.4572160542011261
0.6254869103431702
0.5910829901695251
0.5909932255744934
0.5006228685379028
0.37875616550445557
0.6272394061088562
0.3367610573768616
0.44425633549690247
0.3940967917442322
0.5128639340400696
0.3253736197948456
0.5020584464073181
0.38453999161720276
0.4458157420158386
0.3809550404548645
0.17242690920829773
0.441678911447525
0.3079788088798523
0.6551818251609802
0.4938412308692932
0.3514661490917206
0.1946377456188202
0.3489604890346527
0.3526236116886139
0.4102954566478729
0.15892374515533447
0.2711177170276642
0.478920578956604
0.1888723373413086
0.3949044644832611
0.2829122543334961
0.740734338760376
0.646075963973999
0.6948750615119934
0.19991141557693481
0.43339142203330994
0.5855204463005066
0.4017508029937744
0.33921656012535095
0.5164597630500793
0.3450016677379608
0.2790840268135071
0.1697201132774353
0.7202146053314209
0.7646333575248718
0.2637281119823456
0.5711539387702942
0.2509796

100%|██████████| 4/4 [2:03:13<00:00, 1848.33s/it]


In [None]:
load_metrics_new('olid_xlmr_tamil_new.pt', model)

In [None]:
_, ypred, ytest = evaluate(data.val_dataloader, model)
from sklearn.metrics import confusion_matrix
array = confusion_matrix(ytest, ypred)

In [None]:
import seaborn as sn
import pandas as pd
import matplotlib.pyplot as plt

df_cm = pd.DataFrame(array, range(6), range(6))
# plt.figure(figsize=(10,7))
sn.set(font_scale=1.4) # for label size
sn.heatmap(df_cm, annot=True, annot_kws={"size": 16}) # font size

plt.show()

In [None]:
train(data.train_dataloader, data.val_dataloader, model, 'tamil_mal.pt')