In [2]:
!pip install wilds
!pip install transformers
!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-1.11.0+cu113.html
!pip install pillow==7.2.0

Collecting wilds
  Downloading wilds-2.0.0-py3-none-any.whl (126 kB)
[K     |████████████████████████████████| 126 kB 14.7 MB/s 
[?25hCollecting pillow>=7.2.0
  Downloading Pillow-9.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.3 MB)
[K     |████████████████████████████████| 4.3 MB 89.0 MB/s 
[?25hCollecting outdated>=0.2.0
  Downloading outdated-0.2.1-py3-none-any.whl (7.5 kB)
Collecting scipy>=1.5.4
  Downloading scipy-1.7.3-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (38.1 MB)
[K     |████████████████████████████████| 38.1 MB 1.2 MB/s 
[?25hCollecting ogb>=1.2.6
  Downloading ogb-1.3.3-py3-none-any.whl (78 kB)
[K     |████████████████████████████████| 78 kB 7.9 MB/s 
Collecting littleutils
  Downloading littleutils-0.2.2.tar.gz (6.6 kB)
Building wheels for collected packages: littleutils
  Building wheel for littleutils (setup.py) ... [?25l[?25hdone
  Created wheel for littleutils: filename=littleutils-0.2.2-py3-none-any.whl size=7048 sha256

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 14.7 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 68.7 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 78.4 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 89.0 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 6.3 MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel fo

In [3]:
import dill
import os

import numpy as np
from sklearn.metrics import precision_recall_fscore_support
from sklearn import preprocessing
import torch
from torch import nn
from transformers import RobertaModel, RobertaTokenizer
from transformers import AdamW
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from tqdm import tqdm

In [4]:
from wilds import get_dataset

dataset = get_dataset(dataset="civilcomments", download=True)

Downloading dataset to data/civilcomments_v1.0...
You can also download the dataset manually at https://wilds.stanford.edu/downloads.
Downloading https://worksheets.codalab.org/rest/bundles/0x8cd3de0634154aeaad2ee6eb96723c6e/contents/blob/ to data/civilcomments_v1.0/archive.tar.gz


  0%|          | 0/90644480 [00:00<?, ?Byte/s]

Extracting data/civilcomments_v1.0/archive.tar.gz to data/civilcomments_v1.0

It took 0.29 minutes to download and uncompress the dataset.



In [5]:
test = dataset.get_subset("test")

testX = [data[0] for data in test]
testY = torch.stack(([data[1] for data in test]))
testMeta = torch.stack(([data[2] for data in test]))

pretrained_path = 'mrm8488/distilroberta-finetuned-tweets-hate-speech'


# Tokenizers used in the domain adapted versions of RoBERTa are identical to roberta-base
roberta_tokenizer = RobertaTokenizer.from_pretrained(pretrained_path)
encoded_testX = roberta_tokenizer(testX, truncation=True, max_length = 300, padding='max_length', return_tensors = 'pt', return_attention_mask = True)

Downloading:   0%|          | 0.00/780k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/261 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/713 [00:00<?, ?B/s]

In [6]:
import random

from wilds.common.data_loaders import get_train_loader
from wilds.common.grouper import CombinatorialGrouper

target_groups = ['black', 'y']
n_groups = len(target_groups) * 2
batch_size = 16

grouper = CombinatorialGrouper(dataset, target_groups)

train = dataset.get_subset("train")
train_loader = get_train_loader(
    "group", train, grouper=grouper, n_groups_per_batch=n_groups, batch_size=batch_size
)


test_dataset = TensorDataset(encoded_testX['input_ids'],encoded_testX['attention_mask'], testY, testMeta)
test_dataloader = DataLoader(
            test_dataset,
            batch_size = batch_size
        )

In [7]:
from torch import nn
from transformers import RobertaModel, RobertaTokenizer

# RobertaForSequenceClassification could also be used.
# Drop out rate as used in the paper
class CustomRoberta(nn.Module):
    def __init__(self):
          super(CustomRoberta, self).__init__()
          self.roberta = RobertaModel.from_pretrained(pretrained_path, output_hidden_states = True)
          self.hidden_layer = nn.Linear(768, 768)
          self.dropout = nn.Dropout(0.1)
          self.activation = nn.ReLU() # or tanh()
          self.output_layer = nn.Linear(768, 2)
          
    def forward(self, d_ids, d_mask):
          # index 1 represents the pooled_output, the cls token.
          sequence_output = self.roberta(input_ids = d_ids,attention_mask=d_mask)[1]
          sequence_output = self.dropout(sequence_output)
          hidden_output = self.hidden_layer(sequence_output)
          dropout = self.dropout(hidden_output)
          act = self.activation(dropout)
          output = self.output_layer(act)

          return output

model = CustomRoberta()
model.cuda()

Downloading:   0%|          | 0.00/313M [00:00<?, ?B/s]

Some weights of the model checkpoint at mrm8488/distilroberta-finetuned-tweets-hate-speech were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at mrm8488/distilroberta-finetuned-tweets-hate-speech and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to us

CustomRoberta(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((76

In [8]:
torch.manual_seed(42)
epochs = 5

optimizer = AdamW(model.parameters(), lr=1e-5, weight_decay=0.01)
group_weights = [1] * (batch_size//n_groups)



In [9]:
import math

def update_dro_group_weights(weights, group_idx, loss, eta_q = 0.01):
  new_weight = weights[group_idx] * math.exp(eta_q * loss.item())
  weights[group_idx] = new_weight
  return [weight/sum(weights) for weight in weights]

In [10]:
def get_loss_value(model, loader, device, cal_f1=True, benchmark_val=False):
    """
    Evaluation loop for the multi-class classification problem.
    return (loss, accuracy)
    """
    model.eval()
    losses = []
    accuracies = []
    pred_labels = []
    true_labels = []
    meta_info = []

    with torch.no_grad():
        for i, (ids, masks, labels, meta) in enumerate(loader):
            ids = ids.to(device)
            masks = masks.to(device)
            labels = labels.to(device)
            
            # Forward pass
            outputs = model(ids,masks)
            loss = torch.nn.functional.cross_entropy(outputs, labels, reduce=None).detach()
            losses.append(loss.reshape(-1))
            preds = torch.argmax(outputs, dim=1)
            acc = (preds == labels).float().detach()
            pred_labels+=preds.detach().cpu().tolist()
            true_labels+=labels.detach().cpu().tolist()
            accuracies.append(acc.reshape(-1))
            meta_info.append(meta)

        if benchmark_val:
          return torch.FloatTensor(pred_labels), torch.FloatTensor(true_labels), torch.cat(meta_info, dim=0)

        losses = torch.cat(losses, dim=0).mean().cpu().data.numpy()
        accuracies = torch.cat(accuracies, dim=0).mean().cpu().data.numpy()

        ## As the original paper used the macro F1 score to evaluate the fine-tuned models
        ## additional argument (cal_f1) defined to calculate macro F1 score within this function
        if cal_f1:
          p_macro, r_macro, f1_macro, support_macro = \
                  precision_recall_fscore_support(y_true=np.array(true_labels), y_pred=np.array(pred_labels), average='macro')
          return losses, accuracies, p_macro, r_macro, f1_macro
        else:
          return losses, accuracies


In [11]:
## Defining step sizes in DRO
eta_q = 0.01

RESULT_FOLDER = "./drive/MyDrive/CS699/homework #3/DRO"
os.makedirs(f"{RESULT_FOLDER}/{pretrained_path}/", exist_ok=True)

device = torch.device("cuda")

with tqdm(total=epochs*len(train_loader)) as pbar:
  for epoch in range(epochs):
    model.train()
    
    for i, batch in enumerate(train_loader):
      selected_group = random.randint(0, (batch_size//n_groups) - 1)
      selected_idx = selected_group * 4

      batch_text = batch[0][selected_idx:selected_idx+4]#.to(device)
      d_labels = batch[1][selected_idx:selected_idx+4].to(device)

      tokenized_text = roberta_tokenizer(batch_text, truncation=True, max_length = 300, padding='max_length', return_tensors = 'pt', return_attention_mask = True)
      d_input_id = tokenized_text['input_ids'].to(device)
      d_att_mask = tokenized_text['attention_mask'].to(device)
      
      outputs = model(d_input_id,d_att_mask)
      loss = torch.nn.functional.cross_entropy(outputs, d_labels)

      group_weights = update_dro_group_weights(group_weights, selected_group, loss, eta_q = eta_q)
      optimizer.param_groups[0]['lr'] = 1e-5 * group_weights[selected_group]

      model.zero_grad()
      loss.backward()
      optimizer.step()
      pbar.update(1)


    torch.save(
        model.state_dict(), f'{RESULT_FOLDER}/{pretrained_path}/{epoch + 1}_model.pt',
        pickle_module=dill
    )
    
    pred, label, meta = get_loss_value(model, test_dataloader, device=device, benchmark_val=True)
    print(dataset.eval(pred, label, meta))

 20%|██        | 16815/84070 [19:33<651:14:49, 34.86s/it]

({'acc_avg': 0.807836651802063, 'acc_y:0_male:1': 0.700049638748169, 'count_y:0_male:1': 12092.0, 'acc_y:1_male:1': 0.9087607860565186, 'count_y:1_male:1': 2203.0, 'acc_y:0_female:1': 0.7239579558372498, 'count_y:0_female:1': 14179.0, 'acc_y:1_female:1': 0.8947136402130127, 'count_y:1_female:1': 2270.0, 'acc_y:0_LGBTQ:1': 0.48348909616470337, 'count_y:0_LGBTQ:1': 3210.0, 'acc_y:1_LGBTQ:1': 0.9136512875556946, 'count_y:1_LGBTQ:1': 1216.0, 'acc_y:0_christian:1': 0.8266258835792542, 'count_y:0_christian:1': 12101.0, 'acc_y:1_christian:1': 0.8484126925468445, 'count_y:1_christian:1': 1260.0, 'acc_y:0_muslim:1': 0.5676937699317932, 'count_y:0_muslim:1': 5355.0, 'acc_y:1_muslim:1': 0.8948985934257507, 'count_y:1_muslim:1': 1627.0, 'acc_y:0_other_religions:1': 0.7214764952659607, 'count_y:0_other_religions:1': 2980.0, 'acc_y:1_other_religions:1': 0.8673076629638672, 'count_y:1_other_religions:1': 520.0, 'acc_y:0_black:1': 0.6536731719970703, 'count_y:0_black:1': 3335.0, 'acc_y:1_black:1': 0.8

 40%|████      | 33632/84070 [39:03<340:31:23, 24.30s/it]

({'acc_avg': 0.8376911878585815, 'acc_y:0_male:1': 0.7658782601356506, 'count_y:0_male:1': 12092.0, 'acc_y:1_male:1': 0.8615524172782898, 'count_y:1_male:1': 2203.0, 'acc_y:0_female:1': 0.7846110463142395, 'count_y:0_female:1': 14179.0, 'acc_y:1_female:1': 0.8577092289924622, 'count_y:1_female:1': 2270.0, 'acc_y:0_LGBTQ:1': 0.5461059212684631, 'count_y:0_LGBTQ:1': 3210.0, 'acc_y:1_LGBTQ:1': 0.8881579041481018, 'count_y:1_LGBTQ:1': 1216.0, 'acc_y:0_christian:1': 0.8507561087608337, 'count_y:0_christian:1': 12101.0, 'acc_y:1_christian:1': 0.8238095045089722, 'count_y:1_christian:1': 1260.0, 'acc_y:0_muslim:1': 0.6154995560646057, 'count_y:0_muslim:1': 5355.0, 'acc_y:1_muslim:1': 0.8678549528121948, 'count_y:1_muslim:1': 1627.0, 'acc_y:0_other_religions:1': 0.7446308732032776, 'count_y:0_other_religions:1': 2980.0, 'acc_y:1_other_religions:1': 0.8615384697914124, 'count_y:1_other_religions:1': 520.0, 'acc_y:0_black:1': 0.7238380908966064, 'count_y:0_black:1': 3335.0, 'acc_y:1_black:1': 0.

 60%|██████    | 50446/84070 [58:26<244:08:16, 26.14s/it]

({'acc_avg': 0.8226069211959839, 'acc_y:0_male:1': 0.7427224516868591, 'count_y:0_male:1': 12092.0, 'acc_y:1_male:1': 0.8760780692100525, 'count_y:1_male:1': 2203.0, 'acc_y:0_female:1': 0.7645814418792725, 'count_y:0_female:1': 14179.0, 'acc_y:1_female:1': 0.8718061447143555, 'count_y:1_female:1': 2270.0, 'acc_y:0_LGBTQ:1': 0.49937695264816284, 'count_y:0_LGBTQ:1': 3210.0, 'acc_y:1_LGBTQ:1': 0.8963815569877625, 'count_y:1_LGBTQ:1': 1216.0, 'acc_y:0_christian:1': 0.8232377767562866, 'count_y:0_christian:1': 12101.0, 'acc_y:1_christian:1': 0.8563492298126221, 'count_y:1_christian:1': 1260.0, 'acc_y:0_muslim:1': 0.5818861126899719, 'count_y:0_muslim:1': 5355.0, 'acc_y:1_muslim:1': 0.8819913864135742, 'count_y:1_muslim:1': 1627.0, 'acc_y:0_other_religions:1': 0.740604043006897, 'count_y:0_other_religions:1': 2980.0, 'acc_y:1_other_religions:1': 0.8500000238418579, 'count_y:1_other_religions:1': 520.0, 'acc_y:0_black:1': 0.7451274394989014, 'count_y:0_black:1': 3335.0, 'acc_y:1_black:1': 0.

 80%|████████  | 67260/84070 [1:17:50<122:14:36, 26.18s/it]

({'acc_avg': 0.8283401131629944, 'acc_y:0_male:1': 0.7430532574653625, 'count_y:0_male:1': 12092.0, 'acc_y:1_male:1': 0.8679074048995972, 'count_y:1_male:1': 2203.0, 'acc_y:0_female:1': 0.7624655961990356, 'count_y:0_female:1': 14179.0, 'acc_y:1_female:1': 0.8726872205734253, 'count_y:1_female:1': 2270.0, 'acc_y:0_LGBTQ:1': 0.5242990851402283, 'count_y:0_LGBTQ:1': 3210.0, 'acc_y:1_LGBTQ:1': 0.8856908082962036, 'count_y:1_LGBTQ:1': 1216.0, 'acc_y:0_christian:1': 0.8329063653945923, 'count_y:0_christian:1': 12101.0, 'acc_y:1_christian:1': 0.8380952477455139, 'count_y:1_christian:1': 1260.0, 'acc_y:0_muslim:1': 0.5992530584335327, 'count_y:0_muslim:1': 5355.0, 'acc_y:1_muslim:1': 0.8715426921844482, 'count_y:1_muslim:1': 1627.0, 'acc_y:0_other_religions:1': 0.755033552646637, 'count_y:0_other_religions:1': 2980.0, 'acc_y:1_other_religions:1': 0.8442307710647583, 'count_y:1_other_religions:1': 520.0, 'acc_y:0_black:1': 0.7718141078948975, 'count_y:0_black:1': 3335.0, 'acc_y:1_black:1': 0.7

100%|██████████| 84070/84070 [1:37:20<00:00, 14.40it/s]

({'acc_avg': 0.8442690372467041, 'acc_y:0_male:1': 0.7804333567619324, 'count_y:0_male:1': 12092.0, 'acc_y:1_male:1': 0.8393100500106812, 'count_y:1_male:1': 2203.0, 'acc_y:0_female:1': 0.7998448610305786, 'count_y:0_female:1': 14179.0, 'acc_y:1_female:1': 0.8453744649887085, 'count_y:1_female:1': 2270.0, 'acc_y:0_LGBTQ:1': 0.5566978454589844, 'count_y:0_LGBTQ:1': 3210.0, 'acc_y:1_LGBTQ:1': 0.8717105388641357, 'count_y:1_LGBTQ:1': 1216.0, 'acc_y:0_christian:1': 0.8500950336456299, 'count_y:0_christian:1': 12101.0, 'acc_y:1_christian:1': 0.8206349015235901, 'count_y:1_christian:1': 1260.0, 'acc_y:0_muslim:1': 0.6018674373626709, 'count_y:0_muslim:1': 5355.0, 'acc_y:1_muslim:1': 0.8641671538352966, 'count_y:1_muslim:1': 1627.0, 'acc_y:0_other_religions:1': 0.7587248086929321, 'count_y:0_other_religions:1': 2980.0, 'acc_y:1_other_religions:1': 0.8480769395828247, 'count_y:1_other_religions:1': 520.0, 'acc_y:0_black:1': 0.7931034564971924, 'count_y:0_black:1': 3335.0, 'acc_y:1_black:1': 0.




In [None]:
import glob
pretrained_path = 'roberta-base'
model_path  = glob.glob(f"/home/hhamad/CSCI699-HW1/homework3/log/{pretrained_path}/*")

device = torch.device("cuda")

def load_ckp(checkpoint_fpath, model):
    checkpoint = torch.load(checkpoint_fpath)
    model.load_state_dict(checkpoint)
    return model

for ckp_path in model_path:
  print(ckp_path)
  model = load_ckp(ckp_path, model)
  #loss, acc, prec, recall, f1 = get_loss_value(model, test_dataloader, device=device, benchmark_val=True)
  pred, label, meta = get_loss_value(model, test_dataloader, device=device, benchmark_val=True)
  #print("\t Loss: %f, Accuracy on the test dataset: %f" %(loss, acc))
  #print("\t prec: %f, recall: %f, macro f1: %f" %(prec, recall, f1))
  print(dataset.eval(pred, label, meta))
  print('--------------------------')
