In [None]:
!pip install wilds
!pip install transformers
!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-1.10.0+cu113.html



In [None]:
import dill
import os

import numpy as np
from sklearn.metrics import precision_recall_fscore_support
from sklearn import preprocessing
import torch
from torch import nn
from transformers import RobertaModel, RobertaTokenizer
from torch.optim import AdamW
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from tqdm import tqdm

In [None]:
from wilds import get_dataset

# Load the WILDS "civilcomments" dataset object; download=True presumably
# fetches the data when it is not already available locally.
dataset = get_dataset(dataset="civilcomments", download=True)

In [None]:
train = dataset.get_subset("train")
test = dataset.get_subset("test")

# Each subset item is indexed positionally below: [0] is the input that later
# goes to the tokenizer, [1] the label, [2] the metadata (test split only).
trainX = [data[0] for data in train]
trainY = torch.stack(([data[1] for data in train]))

testX = [data[0] for data in test]
testY = torch.stack(([data[1] for data in test]))
testMeta = torch.stack(([data[2] for data in test]))

# Validation labels are kept as a plain Python list (not stacked into a tensor).
val = dataset.get_subset('val')
valX = [data[0] for data in val]
valY = [data[1] for data in val]

In [None]:
# distilroberta-base 
# mrm8488/distilroberta-finetuned-tweets-hate-speech

def convert_txt2tokenid(tokenizer, text):
  """
  Encode every entry of `text` with the given HuggingFace tokenizer.

  Each entry is passed to `tokenizer.encode` with truncation enabled,
  padding='max_length' and return_tensors='pt'. The per-entry tensors are
  concatenated along dim 0 and returned as a single torch tensor.
  """
  encoded_rows = [
      tokenizer.encode(sent, truncation=True, padding='max_length', return_tensors = 'pt')
      for sent in text
  ]
  return torch.cat(encoded_rows, dim=0)

# Tokenizers used in the domain-adapted versions of RoBERTa are identical to roberta-base.
# The tokenizer is loaded from the same checkpoint name that the model cell uses.
roberta_tokenizer = RobertaTokenizer.from_pretrained("mrm8488/distilroberta-finetuned-tweets-hate-speech")

# Tokenize the train and test texts into padded id tensors (one row per text).
# The validation texts (valX) are not tokenized here.
encoded_trainX = convert_txt2tokenid(roberta_tokenizer, trainX)
encoded_testX = convert_txt2tokenid(roberta_tokenizer, testX)

In [None]:
batch_size = 16

# Training batches are (token ids, labels), drawn through a RandomSampler.
train_dataset = TensorDataset(encoded_trainX, trainY)
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    sampler=RandomSampler(train_dataset),
)

# Test batches are (token ids, labels, metadata), iterated without an explicit sampler.
test_dataset = TensorDataset(encoded_testX, testY, testMeta)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

In [None]:
from torch import nn
from transformers import RobertaModel, RobertaTokenizer

# RobertaForSequenceClassification could also be used.
# Drop out rate as used in the paper
class CustomRoberta(nn.Module):
    """RoBERTa encoder followed by a small 2-way classification head.

    The head is Linear(768 -> 2) -> Dropout(0.1) -> Tanh, applied to element
    [1] of the encoder output (the pooled output).

    The attribute names (`robert`, `linear`, `dropout`, `activation`) are kept
    as-is: this notebook saves and reloads `state_dict()` checkpoints, whose
    keys are derived from these names.
    """

    def __init__(self):
        super().__init__()
        self.robert = RobertaModel.from_pretrained(
            "mrm8488/distilroberta-finetuned-tweets-hate-speech",
            output_attentions=True,
            output_hidden_states=True,
        )
        self.linear = nn.Linear(768, 2)
        self.dropout = nn.Dropout(0.1)
        self.activation = nn.Tanh()

    def forward(self, ids, attention_mask=None):
        """Return the head's output for a batch of token ids.

        Args:
            ids: token-id tensor passed straight to the encoder.
            attention_mask: optional mask forwarded to the encoder so that
                padding positions can be excluded from attention. The inputs
                in this notebook are padded to the maximum length, so passing
                a mask is recommended for new training runs. When omitted
                (the default) the encoder is called with `ids` only, exactly
                as before, which keeps existing checkpoints and callers
                behaving identically.

        Returns:
            Tensor with 2 scores per input row, each squashed by tanh.
        """
        if attention_mask is None:
            encoder_out = self.robert(ids)
        else:
            encoder_out = self.robert(ids, attention_mask=attention_mask)

        # Index 1 of the encoder output is the pooled output.
        pooled_output = encoder_out[1]

        # NOTE(review): dropout is applied to the 2 output scores and tanh then
        # bounds them to [-1, 1]; the rest of this file feeds the result to
        # cross_entropy. The order is left unchanged so previously saved
        # checkpoints keep their behavior - confirm this head design is intended.
        scores = self.linear(pooled_output)
        scores = self.dropout(scores)
        return self.activation(scores)

# Build the classifier and move its parameters to the GPU (requires a CUDA device).
# `model.cuda()` is the last expression of the cell, so its return value is what
# the notebook displays as the cell output.
model = CustomRoberta()
model.cuda()

Some weights of the model checkpoint at mrm8488/distilroberta-finetuned-tweets-hate-speech were not used when initializing RobertaModel: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at mrm8488/distilroberta-finetuned-tweets-hate-speech and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to us

CustomRoberta(
  (robert): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768

In [None]:
# Seed PyTorch's global RNG.
# NOTE(review): in file order the model is constructed in an earlier cell, i.e.
# before this seed is set, so any random initialisation done there would not be
# covered by it - confirm whether the seed should move ahead of model creation.
torch.manual_seed(42)
# Number of full passes over the training dataloader.
epochs = 5
# AdamW over every model parameter (encoder and head) with a single learning rate.
optimizer = AdamW(model.parameters(), lr = 1e-5)

In [None]:
def get_loss_value(model, loader, device, cal_f1=True, benchmark_val=False):
    """
    Evaluation loop for the multi-class classification problem.

    Puts `model` in eval mode (it is NOT switched back afterwards) and runs it
    over every batch of `loader` with gradients disabled.

    Args:
        model: callable module mapping an input batch to per-class scores.
        loader: iterable of batches. Each batch is indexed positionally:
            [0] inputs, [1] integer class labels, and optionally [2] metadata.
        device: device the inputs and labels are moved to before the forward pass.
        cal_f1: when True (default) also compute macro precision/recall/F1.
        benchmark_val: when True, skip the loss/metric computation and return
            the raw predictions, labels and metadata instead.

    Returns:
        * benchmark_val=True: (pred_labels, true_labels, meta) where the first
          two are FloatTensors and meta is the batch metadata concatenated
          along dim 0.
        * cal_f1=True: (loss, accuracy, precision_macro, recall_macro, f1_macro).
        * otherwise: (loss, accuracy).
        loss and accuracy are per-sample means returned as 0-d numpy arrays.

    Raises:
        ValueError: if benchmark_val=True but the batches carry no metadata.
    """
    model.eval()
    losses = []
    accuracies = []
    pred_labels = []
    true_labels = []
    meta_info = []

    with torch.no_grad():
        for batch in loader:
            images = batch[0].to(device)
            labels = batch[1].to(device)
            # Metadata is optional so loaders without it can still be scored.
            if len(batch) > 2:
                meta_info.append(batch[2])

            # Forward pass
            outputs = model(images)
            # reduction='none' keeps one loss value per sample. The previous
            # `reduce=None` was a no-op that left the default 'mean' reduction,
            # so the final figure was a mean of batch means and a smaller last
            # batch was over-weighted.
            loss = torch.nn.functional.cross_entropy(outputs, labels, reduction='none')
            losses.append(loss.reshape(-1))

            preds = torch.argmax(outputs, dim=1)
            accuracies.append((preds == labels).float().reshape(-1))
            pred_labels += preds.cpu().tolist()
            true_labels += labels.cpu().tolist()

        if benchmark_val:
            if not meta_info:
                raise ValueError("benchmark_val=True requires batches of (inputs, labels, metadata)")
            return torch.FloatTensor(pred_labels), torch.FloatTensor(true_labels), torch.cat(meta_info, dim=0)

        losses = torch.cat(losses, dim=0).mean().cpu().numpy()
        accuracies = torch.cat(accuracies, dim=0).mean().cpu().numpy()

        ## As the original paper used the macro F1 score to evaluate the fine-tuned models
        ## additional argument (cal_f1) defined to calculate macro F1 score within this function
        if cal_f1:
            p_macro, r_macro, f1_macro, _support = precision_recall_fscore_support(
                y_true=np.array(true_labels), y_pred=np.array(pred_labels), average='macro'
            )
            return losses, accuracies, p_macro, r_macro, f1_macro
        return losses, accuracies


In [None]:
import glob

# glob returns matches in arbitrary order; sort them so the checkpoints are
# always visited in the same (lexicographic) order across runs and machines.
model_path = sorted(glob.glob("./drive/MyDrive/CS699/homework #3/distilroberta_hate_speech_backup/*"))

device = torch.device("cuda")

def load_ckp(checkpoint_fpath, model):
    """
    Restore weights from a checkpoint file into `model` and return that model.

    checkpoint_fpath: path handed to torch.load; the loaded object is passed
        directly to model.load_state_dict, so the file must hold a state dict.
    model: module whose parameters are overwritten in place.

    NOTE: torch.load unpickles the file, so only load checkpoints you trust.
    """
    model.load_state_dict(torch.load(checkpoint_fpath))
    return model

# Load a saved checkpoint into `model`, run it on the test dataloader and print
# the metrics computed by `dataset.eval`.
# NOTE(review): the unconditional `break` at the end of the body stops the loop
# after the first entry of model_path, so only one checkpoint is ever evaluated -
# confirm whether that is intentional or a leftover from debugging.
for ckp_path in model_path:
  print(ckp_path)
  model = load_ckp(ckp_path, model)
  #loss, acc, prec, recall, f1 = get_loss_value(model, test_dataloader, device=device, benchmark_val=True)
  # benchmark_val=True returns (predictions, labels, metadata) rather than loss/accuracy.
  pred, label, meta = get_loss_value(model, test_dataloader, device=device, benchmark_val=True)
  #print("\t Loss: %f, Accuracy on the test dataset: %f" %(loss, acc))
  #print("\t prec: %f, recall: %f, macro f1: %f" %(prec, recall, f1))
  print(dataset.eval(pred, label, meta))
  print('--------------------------')
  break

./drive/MyDrive/CS699/homework #3/distilroberta_hate_speech_backup/1_model.pt
({'acc_avg': 0.923158586025238, 'acc_y:0_male:1': 0.9447568655014038, 'count_y:0_male:1': 12092.0, 'acc_y:1_male:1': 0.5760326981544495, 'count_y:1_male:1': 2203.0, 'acc_y:0_female:1': 0.9560617804527283, 'count_y:0_female:1': 14179.0, 'acc_y:1_female:1': 0.5616739988327026, 'count_y:1_female:1': 2270.0, 'acc_y:0_LGBTQ:1': 0.9031152725219727, 'count_y:0_LGBTQ:1': 3210.0, 'acc_y:1_LGBTQ:1': 0.5254934430122375, 'count_y:1_LGBTQ:1': 1216.0, 'acc_y:0_christian:1': 0.9742169976234436, 'count_y:0_christian:1': 12101.0, 'acc_y:1_christian:1': 0.4793650805950165, 'count_y:1_christian:1': 1260.0, 'acc_y:0_muslim:1': 0.9084967374801636, 'count_y:0_muslim:1': 5355.0, 'acc_y:1_muslim:1': 0.5556238293647766, 'count_y:1_muslim:1': 1627.0, 'acc_y:0_other_religions:1': 0.9442952871322632, 'count_y:0_other_religions:1': 2980.0, 'acc_y:1_other_religions:1': 0.5538461804389954, 'count_y:1_other_religions:1': 520.0, 'acc_y:0_bla

In [None]:
RESULT_FOLDER = "./drive/MyDrive/CS699/homework #3"
os.makedirs(f"{RESULT_FOLDER}/distilroberta-finetuned-tweets-hate-speech/", exist_ok=True)

device = torch.device("cuda")

# Fine-tune for `epochs` passes over train_dataloader. After every pass the
# metrics on the test dataloader are printed and the weights are checkpointed.
# The progress bar advances once per optimizer step.
with tqdm(total=epochs*len(train_dataloader)) as pbar:
  for epoch in range(epochs):
    # get_loss_value() leaves the model in eval mode, so re-enable train mode each epoch.
    model.train()

    for batch in train_dataloader:
      d_input_id = batch[0].to(device)
      d_labels = batch[1].to(device)
      loss = torch.nn.functional.cross_entropy(model(d_input_id), d_labels)

      # Clear old gradients, backpropagate, then apply one optimizer step.
      model.zero_grad()
      loss.backward()
      optimizer.step()
      pbar.update(1)

    pred, label, meta = get_loss_value(model, test_dataloader, device=device, benchmark_val=True)
    print(dataset.eval(pred, label, meta))

    # One checkpoint per epoch, numbered from 1.
    checkpoint_file = f'{RESULT_FOLDER}/distilroberta-finetuned-tweets-hate-speech/{epoch + 1}_model.pt'
    torch.save(model.state_dict(), checkpoint_file, pickle_module=dill)
    


 20%|██        | 16815/84075 [2:01:23<7:45:30,  2.41it/s]

({'acc_avg': 0.9200714826583862, 'acc_y:0_male:1': 0.92449551820755, 'count_y:0_male:1': 12092.0, 'acc_y:1_male:1': 0.6631865501403809, 'count_y:1_male:1': 2203.0, 'acc_y:0_female:1': 0.941392183303833, 'count_y:0_female:1': 14179.0, 'acc_y:1_female:1': 0.6356828212738037, 'count_y:1_female:1': 2270.0, 'acc_y:0_LGBTQ:1': 0.8314641714096069, 'count_y:0_LGBTQ:1': 3210.0, 'acc_y:1_LGBTQ:1': 0.6842105388641357, 'count_y:1_LGBTQ:1': 1216.0, 'acc_y:0_christian:1': 0.9619866013526917, 'count_y:0_christian:1': 12101.0, 'acc_y:1_christian:1': 0.5571428537368774, 'count_y:1_christian:1': 1260.0, 'acc_y:0_muslim:1': 0.8802987933158875, 'count_y:0_muslim:1': 5355.0, 'acc_y:1_muslim:1': 0.63368159532547, 'count_y:1_muslim:1': 1627.0, 'acc_y:0_other_religions:1': 0.9369127750396729, 'count_y:0_other_religions:1': 2980.0, 'acc_y:1_other_religions:1': 0.5923076868057251, 'count_y:1_other_religions:1': 520.0, 'acc_y:0_black:1': 0.830584704875946, 'count_y:0_black:1': 3335.0, 'acc_y:1_black:1': 0.697462

 40%|████      | 33630/84075 [4:23:10<5:50:23,  2.40it/s]

({'acc_avg': 0.9275911450386047, 'acc_y:0_male:1': 0.9594773650169373, 'count_y:0_male:1': 12092.0, 'acc_y:1_male:1': 0.532001793384552, 'count_y:1_male:1': 2203.0, 'acc_y:0_female:1': 0.966640830039978, 'count_y:0_female:1': 14179.0, 'acc_y:1_female:1': 0.5400881171226501, 'count_y:1_female:1': 2270.0, 'acc_y:0_LGBTQ:1': 0.9193146228790283, 'count_y:0_LGBTQ:1': 3210.0, 'acc_y:1_LGBTQ:1': 0.5189144611358643, 'count_y:1_LGBTQ:1': 1216.0, 'acc_y:0_christian:1': 0.9790926575660706, 'count_y:0_christian:1': 12101.0, 'acc_y:1_christian:1': 0.45793649554252625, 'count_y:1_christian:1': 1260.0, 'acc_y:0_muslim:1': 0.936881422996521, 'count_y:0_muslim:1': 5355.0, 'acc_y:1_muslim:1': 0.5070682168006897, 'count_y:1_muslim:1': 1627.0, 'acc_y:0_other_religions:1': 0.9624161124229431, 'count_y:0_other_religions:1': 2980.0, 'acc_y:1_other_religions:1': 0.5057692527770996, 'count_y:1_other_religions:1': 520.0, 'acc_y:0_black:1': 0.9142428636550903, 'count_y:0_black:1': 3335.0, 'acc_y:1_black:1': 0.56

 60%|██████    | 50445/84075 [6:45:01<3:53:31,  2.40it/s]

({'acc_avg': 0.9258420467376709, 'acc_y:0_male:1': 0.9444260597229004, 'count_y:0_male:1': 12092.0, 'acc_y:1_male:1': 0.6082614660263062, 'count_y:1_male:1': 2203.0, 'acc_y:0_female:1': 0.9585302472114563, 'count_y:0_female:1': 14179.0, 'acc_y:1_female:1': 0.5969163179397583, 'count_y:1_female:1': 2270.0, 'acc_y:0_LGBTQ:1': 0.9233644604682922, 'count_y:0_LGBTQ:1': 3210.0, 'acc_y:1_LGBTQ:1': 0.49424341320991516, 'count_y:1_LGBTQ:1': 1216.0, 'acc_y:0_christian:1': 0.9711593985557556, 'count_y:0_christian:1': 12101.0, 'acc_y:1_christian:1': 0.5007936358451843, 'count_y:1_christian:1': 1260.0, 'acc_y:0_muslim:1': 0.9305322170257568, 'count_y:0_muslim:1': 5355.0, 'acc_y:1_muslim:1': 0.5365703701972961, 'count_y:1_muslim:1': 1627.0, 'acc_y:0_other_religions:1': 0.9563758373260498, 'count_y:0_other_religions:1': 2980.0, 'acc_y:1_other_religions:1': 0.5346153974533081, 'count_y:1_other_religions:1': 520.0, 'acc_y:0_black:1': 0.8896551728248596, 'count_y:0_black:1': 3335.0, 'acc_y:1_black:1': 0

 80%|████████  | 67260/84075 [9:06:54<1:56:31,  2.41it/s]

({'acc_avg': 0.922844648361206, 'acc_y:0_male:1': 0.9341713786125183, 'count_y:0_male:1': 12092.0, 'acc_y:1_male:1': 0.6441216468811035, 'count_y:1_male:1': 2203.0, 'acc_y:0_female:1': 0.9456238150596619, 'count_y:0_female:1': 14179.0, 'acc_y:1_female:1': 0.6334801912307739, 'count_y:1_female:1': 2270.0, 'acc_y:0_LGBTQ:1': 0.8598130941390991, 'count_y:0_LGBTQ:1': 3210.0, 'acc_y:1_LGBTQ:1': 0.6365131735801697, 'count_y:1_LGBTQ:1': 1216.0, 'acc_y:0_christian:1': 0.963556706905365, 'count_y:0_christian:1': 12101.0, 'acc_y:1_christian:1': 0.5452380776405334, 'count_y:1_christian:1': 1260.0, 'acc_y:0_muslim:1': 0.8898226022720337, 'count_y:0_muslim:1': 5355.0, 'acc_y:1_muslim:1': 0.627535343170166, 'count_y:1_muslim:1': 1627.0, 'acc_y:0_other_religions:1': 0.9409396052360535, 'count_y:0_other_religions:1': 2980.0, 'acc_y:1_other_religions:1': 0.6000000238418579, 'count_y:1_other_religions:1': 520.0, 'acc_y:0_black:1': 0.8179910182952881, 'count_y:0_black:1': 3335.0, 'acc_y:1_black:1': 0.735

100%|██████████| 84075/84075 [11:28:43<00:00,  2.41it/s]

({'acc_avg': 0.925834596157074, 'acc_y:0_male:1': 0.9496361017227173, 'count_y:0_male:1': 12092.0, 'acc_y:1_male:1': 0.5601452589035034, 'count_y:1_male:1': 2203.0, 'acc_y:0_female:1': 0.9623386859893799, 'count_y:0_female:1': 14179.0, 'acc_y:1_female:1': 0.5414096713066101, 'count_y:1_female:1': 2270.0, 'acc_y:0_LGBTQ:1': 0.9059190154075623, 'count_y:0_LGBTQ:1': 3210.0, 'acc_y:1_LGBTQ:1': 0.5222039222717285, 'count_y:1_LGBTQ:1': 1216.0, 'acc_y:0_christian:1': 0.9737212061882019, 'count_y:0_christian:1': 12101.0, 'acc_y:1_christian:1': 0.4611110985279083, 'count_y:1_christian:1': 1260.0, 'acc_y:0_muslim:1': 0.9297852516174316, 'count_y:0_muslim:1': 5355.0, 'acc_y:1_muslim:1': 0.5230485796928406, 'count_y:1_muslim:1': 1627.0, 'acc_y:0_other_religions:1': 0.9570469856262207, 'count_y:0_other_religions:1': 2980.0, 'acc_y:1_other_religions:1': 0.5192307829856873, 'count_y:1_other_religions:1': 520.0, 'acc_y:0_black:1': 0.8707646131515503, 'count_y:0_black:1': 3335.0, 'acc_y:1_black:1': 0.6

100%|██████████| 84075/84075 [11:49:06<00:00,  1.98it/s]
