# Import Packages

In [1]:
import os
import numpy as np
import pandas as pd
import transformers
import torch
from torch.utils.data import (
    Dataset, 
    DataLoader, 
    RandomSampler, 
    SequentialSampler
)

import math 
from transformers import  (
    BertPreTrainedModel, 
    RobertaConfig, 
    RobertaTokenizerFast
)

from transformers.optimization import (
    AdamW, 
    get_linear_schedule_with_warmup
)

from scipy.special import softmax
from torch.nn import CrossEntropyLoss

from sklearn.metrics import (
    confusion_matrix,
    matthews_corrcoef,
    roc_curve,
    auc,
    average_precision_score,
)

from transformers.models.roberta.modeling_roberta import (
    RobertaClassificationHead,
    RobertaConfig,
    RobertaModel,
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
asonam_release_all_tweets = pd.read_csv('data/asonam_release_all_tweets.csv')
annotated_tweets_w_text = pd.read_csv('data/annotated_tweets_w_text.csv')

In [3]:
annotated_tweets_w_text.head(5)

Unnamed: 0,Tweet ID,Text,label
0,1242553623260868608,Are we still allowed to quote ancient Chinese ...,0
1,1246508137638580225,@mamacat2u @VBeltiz More power to you! This C...,0
2,1233468243534372865,"CNBC: WHO, Tedros reiterated that the virus co...",0
3,1243626072387747841,"""The heightened racism experienced by Asian co...",1
4,1225611530978217989,Coronavirus and Nepali in China: KP Oli has di...,0


In [4]:
asonam_release_all_tweets.head(5)

Unnamed: 0,6003796066304,0
0,1230768059503140869,0.0
1,1230773097868873731,0.0
2,1230777259440496640,0.0
3,1230766478275665920,0.0
4,1230772376775692289,0.0


In [5]:
annotated_tweets_w_text['label'] = np.where(annotated_tweets_w_text['label']==2,1,0)

In [6]:
annotated_tweets_w_text['label'].value_counts(normalize=True)

0    0.812664
1    0.187336
Name: label, dtype: float64

In [7]:
# split the data sets into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(annotated_tweets_w_text['Text'],
                                                    annotated_tweets_w_text['label'],
                                                    stratify=annotated_tweets_w_text['label'], 
                                                    test_size=0.2)

# Define parameters for the fine-tuning 

In [8]:
# We first check if GPU is available or not
print(torch.__version__)

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

2.0.1+cu117
There are 1 GPU(s) available.
We will use the GPU: Quadro RTX 5000


In [9]:
# We first define the parameters
model_name = 'roberta-base'
num_labels = 2
device = torch.device("cuda")
tokenizer_name = model_name
max_seq_length = 128  
train_batch_size = 16
test_batch_size = 16
weight_decay=0.01
gradient_accumulation_steps = 1
num_train_epochs = 10
learning_rate = 1e-05
adam_epsilon = 1e-08

In [10]:
class RobertaForSequenceClassification(BertPreTrainedModel):
    
    def __init__(self, config):
        super(RobertaForSequenceClassification, self).__init__(config)
        self.num_labels = config.num_labels
        self.roberta = RobertaModel(config)
        self.classifier = RobertaClassificationHead(config)
        
    def forward(self, input_ids, attention_mask, labels):
        outputs = self.roberta(input_ids,attention_mask=attention_mask)
        sequence_output = outputs[0]
        logits = self.classifier(sequence_output)
        outputs = (logits,) + outputs[2:]
        loss_fct = CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        outputs = (loss,) + outputs
        return outputs

# Load pre-trained Roberta model and tokenizer

In [11]:
config_class = RobertaConfig
model_class = RobertaForSequenceClassification
tokenizer_class = RobertaTokenizerFast

config = config_class.from_pretrained(model_name, num_labels=num_labels)

model = model_class.from_pretrained(model_name, config=config)
print('Model=\n',model,'\n')

tokenizer = tokenizer_class.from_pretrained(tokenizer_name, do_lower_case=False)
print('Tokenizer=',tokenizer,'\n')

Downloading model.safetensors: 100%|██████████| 499M/499M [00:39<00:00, 12.7MB/s] 
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'roberta.poo

Model=
 RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
     

# Define a class to convert text and labels into a Dataset object with encoded text and labels

In [12]:
class ClassificationDataset(Dataset):
    
    def __init__(self, data, tokenizer):
        text, labels = data
        self.examples = tokenizer(text=text,text_pair=None,truncation=True,padding="max_length",
                                  max_length=max_seq_length,return_tensors="pt")
        self.labels = torch.tensor(labels, dtype=torch.long)
        
    def __len__(self):
        return len(self.examples["input_ids"])

    def __getitem__(self, index):
        return {key: self.examples[key][index] for key in self.examples}, self.labels[index]


train_examples = (X_train.astype(str).tolist(), y_train.tolist())
train_dataset = ClassificationDataset(train_examples,tokenizer)

test_examples = (X_test.astype(str).tolist(),  y_test.tolist())
test_dataset = ClassificationDataset(test_examples,tokenizer)

# Methods to prepare a batch from train (and test) datasets

In [13]:
def generate_inputs_dict(batch):
    inputs = {key: value.squeeze(1).to(device) for key, value in batch[0].items()}
    inputs["labels"] = batch[1].to(device)
    return inputs

train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset,sampler=train_sampler,batch_size=train_batch_size)

test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=test_batch_size)

# Extract a batch as sanity-check
batch = generate_inputs_dict(next(iter(train_dataloader)))
input_ids = batch['input_ids'].to(device)
attention_mask = batch['attention_mask'].to(device)
labels = batch['labels'].to(device)

print(batch)

{'input_ids': tensor([[    0,  1039, 10567,  ...,     1,     1,     1],
        [    0,  1039,   510,  ...,     1,     1,     1],
        [    0, 20763,  1603,  ...,     1,     1,     1],
        ...,
        [    0,  1106,    47,  ...,     1,     1,     1],
        [    0,   250,  7159,  ...,     1,     1,     1],
        [    0,  6785,    10,  ...,     1,     1,     1]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0'), 'labels': tensor([0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0], device='cuda:0')}


In [14]:
t_total = len(train_dataloader) // gradient_accumulation_steps * num_train_epochs
optimizer_grouped_parameters = []
custom_parameter_names = set()
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters.extend(
    [
        {
            "params": [
                p
                for n, p in model.named_parameters()
                if n not in custom_parameter_names and not any(nd in n for nd in no_decay)
            ],
            "weight_decay": weight_decay,
        },
        {
            "params": [
                p
                for n, p in model.named_parameters()
                if n not in custom_parameter_names and any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]
)

warmup_steps = 500 # warmup_steps = math.ceil(t_total * warmup_ratio)
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total)



# Method to compute accuracy of predictions

In [15]:
def compute_metrics(preds, model_outputs, labels, eval_examples=None, multi_label=False):
    assert len(preds) == len(labels)
    mismatched = labels != preds
    wrong = [i for (i, v) in zip(eval_examples, mismatched) if v.any()]
    mcc = matthews_corrcoef(labels, preds)
    tn, fp, fn, tp = confusion_matrix(labels, preds, labels=[0, 1]).ravel()
    precision = (tp)/(tp + fp)
    recall = (tp)/(tp + fn)
    f1 = 2*precision*recall/(precision + recall)
    scores = np.array([softmax(element)[1] for element in model_outputs])
    fpr, tpr, thresholds = roc_curve(labels, scores)
    auroc = auc(fpr, tpr)
    auprc = average_precision_score(labels, scores)
    return (
        {
            **{"mcc": mcc, "tp": tp, "tn": tn, "fp": fp, "fn": fn, "auroc": auroc, "auprc": auprc, "precision": precision, "recall": recall, "f1": f1},
        },
        wrong,
        scores
    )

In [16]:
X_test_final =  X_test.reset_index().copy()
X_test_final 

Unnamed: 0,index,Text
0,711,#coronavirus #COVID19 You can't even say goodb...
1,517,The President is knowingly inciting hate again...
2,1652,@ChenXiHao Don’t discriminate us Asians ! 💙🙏🏻 ...
3,1502,it breaks my heart that you fucking racists ar...
4,512,@SkyNews @Independent @Channel4News @itvnews @...
...,...,...
453,1246,"Lol, shut your ass up you fat covid 19 virus h..."
454,615,"After Corona virus in China, now it's Pakistan..."
455,63,@Jay_Qi37__China @EnzoMazak @eille68577793 @an...
456,529,@globaltimesnews Fuck China! #ChineseBioterror...


In [18]:
model.to(device)

model.zero_grad()

for epoch in range(num_train_epochs):

    model.train()
    epoch_loss = []
    
    for batch in train_dataloader:
        batch = generate_inputs_dict(batch)
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optimizer.step()
        scheduler.step()
        model.zero_grad()
        epoch_loss.append(loss.item())
        
    # evaluate model after each epoch
    eval_loss = 0.0
    nb_eval_steps = 0
    n_batches = len(test_dataloader)
    preds = np.empty((len(test_dataset), num_labels))
    out_label_ids = np.empty((len(test_dataset)))
    model.eval()
    
    for i,test_batch in enumerate(test_dataloader):
        with torch.no_grad():
            test_batch = generate_inputs_dict(test_batch)
            input_ids = test_batch['input_ids'].to(device)
            attention_mask = test_batch['attention_mask'].to(device)
            labels = test_batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            tmp_eval_loss, logits = outputs[:2]
            eval_loss += tmp_eval_loss.item()
            
        nb_eval_steps += 1
        start_index = test_batch_size * i
        end_index = start_index + test_batch_size if i != (n_batches - 1) else len(test_dataset)
        preds[start_index:end_index] = logits.detach().cpu().numpy()
        out_label_ids[start_index:end_index] = test_batch["labels"].detach().cpu().numpy()
        
    eval_loss = eval_loss / nb_eval_steps
    model_outputs = preds
    preds = np.argmax(preds, axis=1)
    result, wrong, scores = compute_metrics(preds, model_outputs, out_label_ids, test_examples)
    X_test_final['preds'] = preds
    X_test_final['scores'] = scores # predicting 1
    print('epoch',epoch,'Training avg loss',np.mean(epoch_loss))
    print('epoch',epoch,'Testing  avg loss',eval_loss)
    print(result)

  precision = (tp)/(tp+fp)


epoch 0 Training avg loss 0.4605375173299209
epoch 0 Testing  avg loss 0.36912028902563554
{'mcc': 0.0, 'tp': 0, 'tn': 372, 'fp': 0, 'fn': 86, 'auroc': 0.8910040010002501, 'auprc': 0.6804892784139719, 'precision': nan, 'recall': 0.0, 'f1': nan}
epoch 1 Training avg loss 0.32475626215986586
epoch 1 Testing  avg loss 0.32061040439996225
{'mcc': 0.5362980289990233, 'tp': 31, 'tn': 370, 'fp': 2, 'fn': 55, 'auroc': 0.9233870967741935, 'auprc': 0.7997807095987548, 'precision': 0.9393939393939394, 'recall': 0.36046511627906974, 'f1': 0.5210084033613445}
epoch 2 Training avg loss 0.2351519814328007
epoch 2 Testing  avg loss 0.2215027079500001
{'mcc': 0.7209090086804603, 'tp': 60, 'tn': 361, 'fp': 11, 'fn': 26, 'auroc': 0.9467679419854964, 'auprc': 0.8633869023931652, 'precision': 0.8450704225352113, 'recall': 0.6976744186046512, 'f1': 0.7643312101910827}
epoch 3 Training avg loss 0.1551587715583003
epoch 3 Testing  avg loss 0.20180056629509763
{'mcc': 0.7834318747275633, 'tp': 73, 'tn': 354, '

In [19]:
model.save_pretrained("model")

In [20]:
X_test_final['label'] = y_test.values
X_test_hate = X_test_final[X_test_final['preds']==1].sort_values(['scores'])

In [21]:
X_test_hate.to_csv('output/X_test_hate.csv')

In [22]:
X_test_hate

Unnamed: 0,index,Text,preds,scores,label
425,2278,"#YoMuzziesSoBad , they h@te Hindus so much, th...",1,0.506634,0
203,2259,"@alx @realDonaldTrump No,now we Chinese have b...",1,0.510949,0
281,294,Absolute joke of a man. Cut their salaries off...,1,0.524943,0
146,2159,@SpicyKoreanQ The world would be a much better...,1,0.527265,1
239,369,Chinese people be weird af... now there’s goin...,1,0.656011,1
...,...,...,...,...,...
115,2092,@zlj517 Mother fucker china virus..what about...,1,0.997734,1
291,45,#COVID19 so this all started cuz some chinks w...,1,0.997773,1
205,1618,@yoogieboobie Fuck u and your chink obsession,1,0.997814,1
174,1699,@CNN Fucking chinese virus Fucking chinese liars,1,0.997878,1
