In [None]:
!pip install wilds
!pip install transformers
!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-1.11.0+cu113.html
!pip install pillow==7.2.0

Collecting wilds
  Downloading wilds-2.0.0-py3-none-any.whl (126 kB)
[?25l[K     |██▋                             | 10 kB 35.0 MB/s eta 0:00:01[K     |█████▏                          | 20 kB 39.0 MB/s eta 0:00:01[K     |███████▉                        | 30 kB 44.7 MB/s eta 0:00:01[K     |██████████▍                     | 40 kB 38.0 MB/s eta 0:00:01[K     |█████████████                   | 51 kB 25.2 MB/s eta 0:00:01[K     |███████████████▋                | 61 kB 28.5 MB/s eta 0:00:01[K     |██████████████████▏             | 71 kB 29.3 MB/s eta 0:00:01[K     |████████████████████▊           | 81 kB 30.4 MB/s eta 0:00:01[K     |███████████████████████▍        | 92 kB 32.9 MB/s eta 0:00:01[K     |██████████████████████████      | 102 kB 34.0 MB/s eta 0:00:01[K     |████████████████████████████▋   | 112 kB 34.0 MB/s eta 0:00:01[K     |███████████████████████████████▏| 122 kB 34.0 MB/s eta 0:00:01[K     |████████████████████████████████| 126 kB 34.0 MB/s 
Collecti

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 29.4 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 73.2 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 60.0 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 7.6 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 66.7 MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel fo

In [None]:
import dill
import os

import numpy as np
from sklearn.metrics import precision_recall_fscore_support
from sklearn import preprocessing
import torch
from torch import nn
from transformers import RobertaModel, RobertaTokenizer
from transformers import AdamW
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from tqdm import tqdm

In [13]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from wilds import get_dataset

dataset = get_dataset(dataset="civilcomments", download=True)

Downloading dataset to data/civilcomments_v1.0...
You can also download the dataset manually at https://wilds.stanford.edu/downloads.
Downloading https://worksheets.codalab.org/rest/bundles/0x8cd3de0634154aeaad2ee6eb96723c6e/contents/blob/ to data/civilcomments_v1.0/archive.tar.gz


  0%|          | 0/90644480 [00:00<?, ?Byte/s]

Extracting data/civilcomments_v1.0/archive.tar.gz to data/civilcomments_v1.0

It took 0.3 minutes to download and uncompress the dataset.



In [None]:
test = dataset.get_subset("test")

testX = [data[0] for data in test]
testY = torch.stack(([data[1] for data in test]))
testMeta = torch.stack(([data[2] for data in test]))

pretrained_path = 'cardiffnlp/twitter-roberta-base-hate'


# Tokenizers used in the domain adapted versions of RoBERTa are identical to roberta-base
roberta_tokenizer = RobertaTokenizer.from_pretrained(pretrained_path)
encoded_testX = roberta_tokenizer(testX, truncation=True, max_length = 300, padding='max_length', return_tensors = 'pt', return_attention_mask = True)

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/588 [00:00<?, ?B/s]

In [None]:
import random

from wilds.common.data_loaders import get_train_loader
from wilds.common.grouper import CombinatorialGrouper

target_groups = ['black', 'y']
n_groups = len(target_groups) * 2
batch_size = 16

grouper = CombinatorialGrouper(dataset, target_groups)

train = dataset.get_subset("train")
train_loader = get_train_loader(
    "group", train, grouper=grouper, n_groups_per_batch=n_groups, batch_size=batch_size
)


test_dataset = TensorDataset(encoded_testX['input_ids'],encoded_testX['attention_mask'], testY, testMeta)
test_dataloader = DataLoader(
            test_dataset,
            batch_size = batch_size
        )

In [None]:
from torch import nn
from transformers import RobertaModel, RobertaTokenizer

# RobertaForSequenceClassification could also be used.
# Drop out rate as used in the paper
class CustomRoberta(nn.Module):
    def __init__(self):
          super(CustomRoberta, self).__init__()
          self.roberta = RobertaModel.from_pretrained(pretrained_path, output_hidden_states = True)
          self.hidden_layer = nn.Linear(768, 768)
          self.dropout = nn.Dropout(0.1)
          self.activation = nn.ReLU() # or tanh()
          self.output_layer = nn.Linear(768, 2)
          
    def forward(self, d_ids, d_mask):
          # index 1 represents the pooled_output, the cls token.
          sequence_output = self.roberta(input_ids = d_ids,attention_mask=d_mask)[1]
          sequence_output = self.dropout(sequence_output)
          hidden_output = self.hidden_layer(sequence_output)
          dropout = self.dropout(hidden_output)
          act = self.activation(dropout)
          output = self.output_layer(act)

          return output

model = CustomRoberta()
model.cuda()

Downloading:   0%|          | 0.00/476M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-hate were not used when initializing RobertaModel: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-hate and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inf

CustomRoberta(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((76

In [None]:
torch.manual_seed(42)
epochs = 5

optimizer = AdamW(model.parameters(), lr=1e-5, weight_decay=0.01)
group_weights = [1] * (batch_size//n_groups)



In [None]:
import math

def update_dro_group_weights(weights, group_idx, loss, eta_q = 0.01):
  new_weight = weights[group_idx] * math.exp(eta_q * loss.item())
  weights[group_idx] = new_weight
  return [weight/sum(weights) for weight in weights]

In [None]:
def get_loss_value(model, loader, device, cal_f1=True, benchmark_val=False):
    """
    Evaluation loop for the multi-class classification problem.
    return (loss, accuracy)
    """
    model.eval()
    losses = []
    accuracies = []
    pred_labels = []
    true_labels = []
    meta_info = []

    with torch.no_grad():
        for i, (ids, masks, labels, meta) in enumerate(loader):
            ids = ids.to(device)
            masks = masks.to(device)
            labels = labels.to(device)
            
            # Forward pass
            outputs = model(ids,masks)
            loss = torch.nn.functional.cross_entropy(outputs, labels, reduce=None).detach()
            losses.append(loss.reshape(-1))
            preds = torch.argmax(outputs, dim=1)
            acc = (preds == labels).float().detach()
            pred_labels+=preds.detach().cpu().tolist()
            true_labels+=labels.detach().cpu().tolist()
            accuracies.append(acc.reshape(-1))
            meta_info.append(meta)

        if benchmark_val:
          return torch.FloatTensor(pred_labels), torch.FloatTensor(true_labels), torch.cat(meta_info, dim=0)

        losses = torch.cat(losses, dim=0).mean().cpu().data.numpy()
        accuracies = torch.cat(accuracies, dim=0).mean().cpu().data.numpy()

        ## As the original paper used the macro F1 score to evaluate the fine-tuned models
        ## additional argument (cal_f1) defined to calculate macro F1 score within this function
        if cal_f1:
          p_macro, r_macro, f1_macro, support_macro = \
                  precision_recall_fscore_support(y_true=np.array(true_labels), y_pred=np.array(pred_labels), average='macro')
          return losses, accuracies, p_macro, r_macro, f1_macro
        else:
          return losses, accuracies


In [None]:
## Defining step sizes in DRO
eta_q = 0.01

RESULT_FOLDER = "./drive/MyDrive/CS699/homework #3/DRO"
os.makedirs(f"{RESULT_FOLDER}/{pretrained_path}/", exist_ok=True)

device = torch.device("cuda")

with tqdm(total=epochs*len(train_loader)) as pbar:
  for epoch in range(epochs):
    model.train()
    
    for i, batch in enumerate(train_loader):
      selected_group = random.randint(0, (batch_size//n_groups) - 1)
      selected_idx = selected_group * 4

      batch_text = batch[0][selected_idx:selected_idx+4]#.to(device)
      d_labels = batch[1][selected_idx:selected_idx+4].to(device)

      tokenized_text = roberta_tokenizer(batch_text, truncation=True, max_length = 300, padding='max_length', return_tensors = 'pt', return_attention_mask = True)
      d_input_id = tokenized_text['input_ids'].to(device)
      d_att_mask = tokenized_text['attention_mask'].to(device)
      
      outputs = model(d_input_id,d_att_mask)
      loss = torch.nn.functional.cross_entropy(outputs, d_labels)

      group_weights = update_dro_group_weights(group_weights, selected_group, loss, eta_q = eta_q)
      optimizer.param_groups[0]['lr'] = 1e-5 * group_weights[selected_group]

      model.zero_grad()
      loss.backward()
      optimizer.step()
      pbar.update(1)


    torch.save(
        model.state_dict(), f'{RESULT_FOLDER}/{pretrained_path}/{epoch + 1}_model.pt',
        pickle_module=dill
    )
    
    pred, label, meta = get_loss_value(model, test_dataloader, device=device, benchmark_val=True)
    print(dataset.eval(pred, label, meta))

 20%|██        | 16814/84070 [1:08:08<4:33:52,  4.09it/s]

({'acc_avg': 0.7735121250152588, 'acc_y:0_male:1': 0.6661428809165955, 'count_y:0_male:1': 12092.0, 'acc_y:1_male:1': 0.9278256893157959, 'count_y:1_male:1': 2203.0, 'acc_y:0_female:1': 0.6740249395370483, 'count_y:0_female:1': 14179.0, 'acc_y:1_female:1': 0.9392070770263672, 'count_y:1_female:1': 2270.0, 'acc_y:0_LGBTQ:1': 0.4367601275444031, 'count_y:0_LGBTQ:1': 3210.0, 'acc_y:1_LGBTQ:1': 0.9350329041481018, 'count_y:1_LGBTQ:1': 1216.0, 'acc_y:0_christian:1': 0.7593587040901184, 'count_y:0_christian:1': 12101.0, 'acc_y:1_christian:1': 0.9198412895202637, 'count_y:1_christian:1': 1260.0, 'acc_y:0_muslim:1': 0.4154995381832123, 'count_y:0_muslim:1': 5355.0, 'acc_y:1_muslim:1': 0.9631223082542419, 'count_y:1_muslim:1': 1627.0, 'acc_y:0_other_religions:1': 0.6120805144309998, 'count_y:0_other_religions:1': 2980.0, 'acc_y:1_other_religions:1': 0.9307692050933838, 'count_y:1_other_religions:1': 520.0, 'acc_y:0_black:1': 0.6011993885040283, 'count_y:0_black:1': 3335.0, 'acc_y:1_black:1': 0.

 40%|████      | 33628/84070 [2:55:30<3:24:30,  4.11it/s]

({'acc_avg': 0.8805220723152161, 'acc_y:0_male:1': 0.8454350233078003, 'count_y:0_male:1': 12092.0, 'acc_y:1_male:1': 0.7898320555686951, 'count_y:1_male:1': 2203.0, 'acc_y:0_female:1': 0.8607800006866455, 'count_y:0_female:1': 14179.0, 'acc_y:1_female:1': 0.7960352301597595, 'count_y:1_female:1': 2270.0, 'acc_y:0_LGBTQ:1': 0.6834890842437744, 'count_y:0_LGBTQ:1': 3210.0, 'acc_y:1_LGBTQ:1': 0.8133223652839661, 'count_y:1_LGBTQ:1': 1216.0, 'acc_y:0_christian:1': 0.8909181356430054, 'count_y:0_christian:1': 12101.0, 'acc_y:1_christian:1': 0.7841269969940186, 'count_y:1_christian:1': 1260.0, 'acc_y:0_muslim:1': 0.7055088877677917, 'count_y:0_muslim:1': 5355.0, 'acc_y:1_muslim:1': 0.8322064876556396, 'count_y:1_muslim:1': 1627.0, 'acc_y:0_other_religions:1': 0.8130872249603271, 'count_y:0_other_religions:1': 2980.0, 'acc_y:1_other_religions:1': 0.7942307591438293, 'count_y:1_other_religions:1': 520.0, 'acc_y:0_black:1': 0.8413792848587036, 'count_y:0_black:1': 3335.0, 'acc_y:1_black:1': 0.

 60%|██████    | 50442/84070 [4:42:49<2:15:52,  4.12it/s]

({'acc_avg': 0.8471618294715881, 'acc_y:0_male:1': 0.7859742045402527, 'count_y:0_male:1': 12092.0, 'acc_y:1_male:1': 0.8647299408912659, 'count_y:1_male:1': 2203.0, 'acc_y:0_female:1': 0.7973058819770813, 'count_y:0_female:1': 14179.0, 'acc_y:1_female:1': 0.8709251284599304, 'count_y:1_female:1': 2270.0, 'acc_y:0_LGBTQ:1': 0.5470405220985413, 'count_y:0_LGBTQ:1': 3210.0, 'acc_y:1_LGBTQ:1': 0.8824012875556946, 'count_y:1_LGBTQ:1': 1216.0, 'acc_y:0_christian:1': 0.8381952047348022, 'count_y:0_christian:1': 12101.0, 'acc_y:1_christian:1': 0.8444444537162781, 'count_y:1_christian:1': 1260.0, 'acc_y:0_muslim:1': 0.4750700294971466, 'count_y:0_muslim:1': 5355.0, 'acc_y:1_muslim:1': 0.9366933107376099, 'count_y:1_muslim:1': 1627.0, 'acc_y:0_other_religions:1': 0.7322147488594055, 'count_y:0_other_religions:1': 2980.0, 'acc_y:1_other_religions:1': 0.8615384697914124, 'count_y:1_other_religions:1': 520.0, 'acc_y:0_black:1': 0.7382308840751648, 'count_y:0_black:1': 3335.0, 'acc_y:1_black:1': 0.

 80%|████████  | 67256/84070 [6:30:10<1:07:36,  4.15it/s]

({'acc_avg': 0.8457565307617188, 'acc_y:0_male:1': 0.7778696417808533, 'count_y:0_male:1': 12092.0, 'acc_y:1_male:1': 0.8597367405891418, 'count_y:1_male:1': 2203.0, 'acc_y:0_female:1': 0.7894068956375122, 'count_y:0_female:1': 14179.0, 'acc_y:1_female:1': 0.8647577166557312, 'count_y:1_female:1': 2270.0, 'acc_y:0_LGBTQ:1': 0.5442367792129517, 'count_y:0_LGBTQ:1': 3210.0, 'acc_y:1_LGBTQ:1': 0.8807565569877625, 'count_y:1_LGBTQ:1': 1216.0, 'acc_y:0_christian:1': 0.8417485952377319, 'count_y:0_christian:1': 12101.0, 'acc_y:1_christian:1': 0.8460317254066467, 'count_y:1_christian:1': 1260.0, 'acc_y:0_muslim:1': 0.6244631409645081, 'count_y:0_muslim:1': 5355.0, 'acc_y:1_muslim:1': 0.8733866214752197, 'count_y:1_muslim:1': 1627.0, 'acc_y:0_other_religions:1': 0.7640939354896545, 'count_y:0_other_religions:1': 2980.0, 'acc_y:1_other_religions:1': 0.8480769395828247, 'count_y:1_other_religions:1': 520.0, 'acc_y:0_black:1': 0.7586206793785095, 'count_y:0_black:1': 3335.0, 'acc_y:1_black:1': 0.

100%|██████████| 84070/84070 [8:56:41<00:00,  2.61it/s]

({'acc_avg': 0.8430581092834473, 'acc_y:0_male:1': 0.777125358581543, 'count_y:0_male:1': 12092.0, 'acc_y:1_male:1': 0.8597367405891418, 'count_y:1_male:1': 2203.0, 'acc_y:0_female:1': 0.7860921025276184, 'count_y:0_female:1': 14179.0, 'acc_y:1_female:1': 0.8744493126869202, 'count_y:1_female:1': 2270.0, 'acc_y:0_LGBTQ:1': 0.5124610662460327, 'count_y:0_LGBTQ:1': 3210.0, 'acc_y:1_LGBTQ:1': 0.8922697305679321, 'count_y:1_LGBTQ:1': 1216.0, 'acc_y:0_christian:1': 0.8220807909965515, 'count_y:0_christian:1': 12101.0, 'acc_y:1_christian:1': 0.8698412775993347, 'count_y:1_christian:1': 1260.0, 'acc_y:0_muslim:1': 0.5512605309486389, 'count_y:0_muslim:1': 5355.0, 'acc_y:1_muslim:1': 0.8998156189918518, 'count_y:1_muslim:1': 1627.0, 'acc_y:0_other_religions:1': 0.7271811962127686, 'count_y:0_other_religions:1': 2980.0, 'acc_y:1_other_religions:1': 0.8615384697914124, 'count_y:1_other_religions:1': 520.0, 'acc_y:0_black:1': 0.7946026921272278, 'count_y:0_black:1': 3335.0, 'acc_y:1_black:1': 0.6




In [None]:
import glob
pretrained_path = 'roberta-base'
model_path  = glob.glob(f"/home/hhamad/CSCI699-HW1/homework3/log/{pretrained_path}/*")

device = torch.device("cuda")

def load_ckp(checkpoint_fpath, model):
    checkpoint = torch.load(checkpoint_fpath)
    model.load_state_dict(checkpoint)
    return model

for ckp_path in model_path:
  print(ckp_path)
  model = load_ckp(ckp_path, model)
  #loss, acc, prec, recall, f1 = get_loss_value(model, test_dataloader, device=device, benchmark_val=True)
  pred, label, meta = get_loss_value(model, test_dataloader, device=device, benchmark_val=True)
  #print("\t Loss: %f, Accuracy on the test dataset: %f" %(loss, acc))
  #print("\t prec: %f, recall: %f, macro f1: %f" %(prec, recall, f1))
  print(dataset.eval(pred, label, meta))
  print('--------------------------')
