# Initialization

In [9]:
import pandas as pd

from transformers import BertTokenizer
from tqdm.notebook import tqdm

# MultiNLI (Just Like JTT and GDRO)

In [2]:
# !wget https://nlp.stanford.edu/data/dro/multinli_bert_features.tar.gz

--2023-09-04 22:13:20--  https://nlp.stanford.edu/data/dro/multinli_bert_features.tar.gz
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/dro/multinli_bert_features.tar.gz [following]
--2023-09-04 22:13:22--  https://downloads.cs.stanford.edu/nlp/data/dro/multinli_bert_features.tar.gz
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 40604486 (39M) [application/octet-stream]
Saving to: ‘multinli_bert_features.tar.gz’

tinli_bert_features   2%[                    ] 983.74K  5.02KB/s    eta 65m 26s^C


In [1]:
## Download the metadata from https://github.com/kohpangwei/group_DRO/tree/master/dataset_metadata/multinli and put it in raw/

In [2]:
# Unpack the feature archive into the current directory, then move the three
# cached feature files it provides into raw/.
# The destination paths are relative, so raw/ must already exist in the
# working directory before this cell runs.
!tar -xzf multinli_bert_features.tar.gz
!mv cached_dev_bert-base-uncased_128_mnli raw/cached_dev_bert-base-uncased_128_mnli
!mv cached_dev_bert-base-uncased_128_mnli-mm raw/cached_dev_bert-base-uncased_128_mnli-mm
!mv cached_train_bert-base-uncased_128_mnli raw/cached_train_bert-base-uncased_128_mnli

# Optional cleanup of the archive, left disabled.
# !rm multinli_bert_features.tar.gz

In [4]:
import os
import torch
import pandas as pd
from PIL import Image
import numpy as np
import torchvision.transforms as transforms
from torch.utils.data import Dataset, Subset

class MultiNLIDataset(Dataset):
    """
    MultiNLI dataset built from a metadata CSV and cached BERT feature files.

    Everything is read from ``<root_dir>/raw``:

    * ``metadata_<suffix>.csv`` where ``<suffix>`` is the last ``_``-separated
      part of ``target_name`` (``preset`` or ``random``). The columns
      ``gold_label``, ``split`` and the confounder column are used.
    * ``cached_train_bert-base-uncased_128_mnli``,
      ``cached_dev_bert-base-uncased_128_mnli`` and
      ``cached_dev_bert-base-uncased_128_mnli-mm``, loaded with ``torch.load``
      and concatenated in that order.

    Label encoding::

        label_dict = {
            'contradiction': 0,
            'entailment': 1,
            'neutral': 2
        }

    Negation words taken from https://arxiv.org/pdf/1803.02324.pdf::

        negation_words = ['nobody', 'no', 'never', 'nothing']

    Each example belongs to the group
    ``label * (number of distinct confounder values) + confounder``.

    Args:
        root_dir: Directory that contains the ``raw`` folder.
        target_name: ``'gold_label_preset'`` or ``'gold_label_random'``.
        confounder_names: Must be ``['sentence2_has_negation']``.
        augment_data: Must be ``False``.
        model_type: Must be ``'bert'``.

    Raises:
        AssertionError: If an argument is outside the supported values above,
            or if the cached feature labels disagree with the metadata labels.
        ValueError: If ``<root_dir>/raw`` does not exist.
    """

    def __init__(self, root_dir,
                 target_name, confounder_names,
                 augment_data=False,
                 model_type='bert'):
        self.root_dir = root_dir
        self.target_name = target_name
        self.confounder_names = confounder_names
        self.model_type = model_type
        self.augment_data = augment_data

        # Only this single configuration is supported.
        assert len(confounder_names) == 1
        assert confounder_names[0] == 'sentence2_has_negation'
        assert target_name in ['gold_label_preset', 'gold_label_random']
        assert augment_data == False
        assert model_type == 'bert'

        # Metadata and cached features both live in the same 'raw' folder.
        self.data_dir = os.path.join(
            self.root_dir,
            'raw')
        self.glue_dir = os.path.join(
            self.root_dir,
            'raw')
        if not os.path.exists(self.data_dir):
            raise ValueError(
                f'{self.data_dir} does not exist yet. Please generate the dataset first.')
        if not os.path.exists(self.glue_dir):
            raise ValueError(
                f'{self.glue_dir} does not exist yet. Please generate the dataset first.')

        # Read in metadata
        type_of_split = target_name.split('_')[-1]
        self.metadata_df = pd.read_csv(
            os.path.join(
                self.data_dir,
                f'metadata_{type_of_split}.csv'),
            index_col=0)

        # Get the y values
        # gold_label is hardcoded
        self.y_array = self.metadata_df['gold_label'].values
        self.n_classes = len(np.unique(self.y_array))

        self.confounder_array = self.metadata_df[confounder_names[0]].values
        self.n_confounders = len(confounder_names)

        # Map to groups: group = label * n_confounder_values + confounder.
        # Integer arithmetic throughout (no float round trip).
        n_confounder_values = len(np.unique(self.confounder_array))
        self.n_groups = n_confounder_values * self.n_classes
        self.group_array = (
            self.y_array * n_confounder_values + self.confounder_array
        ).astype('int')

        # Extract splits
        self.split_array = self.metadata_df['split'].values
        self.split_dict = {
            'train': 0,
            'val': 1,
            'test': 2
        }

        # Load features
        # NOTE: torch.load unpickles Python objects, so only load feature files
        # obtained from a trusted source.
        self.features_array = []
        for feature_file in [
            'cached_train_bert-base-uncased_128_mnli',
            'cached_dev_bert-base-uncased_128_mnli',
            'cached_dev_bert-base-uncased_128_mnli-mm'
            ]:
            features = torch.load(
                os.path.join(
                    self.glue_dir,
                    feature_file))
            self.features_array.extend(features)

        self.all_input_ids = torch.tensor([f.input_ids for f in self.features_array], dtype=torch.long)
        self.all_input_masks = torch.tensor([f.input_mask for f in self.features_array], dtype=torch.long)
        self.all_segment_ids = torch.tensor([f.segment_ids for f in self.features_array], dtype=torch.long)
        self.all_label_ids = torch.tensor([f.label_id for f in self.features_array], dtype=torch.long)

        # Stack along a new last axis so that x[..., 0] are input ids,
        # x[..., 1] the input mask and x[..., 2] the segment ids.
        self.x_array = torch.stack((
            self.all_input_ids,
            self.all_input_masks,
            self.all_segment_ids), dim=2)

        # Features and metadata must describe the same examples in the same
        # order. array_equal also fails cleanly when the lengths differ.
        assert np.array_equal(self.all_label_ids.numpy(), self.y_array), \
            'cached feature labels do not match the metadata gold_label column'

    def __len__(self):
        """Return the number of examples (rows of the metadata)."""
        return len(self.y_array)

    def __getitem__(self, idx):
        """Return ``(x, y, g)``: stacked features, label and group of example ``idx``."""
        y = self.y_array[idx]
        g = self.group_array[idx]
        x = self.x_array[idx, ...]
        return x, y, g

    def get_splits(self, splits, train_frac=1.0):
        """
        Build a ``Subset`` of this dataset for each requested split.

        Args:
            splits: Iterable of split names, each one of 'train', 'val', 'test'.
            train_frac: When below 1, the 'train' subset keeps only this
                fraction of its examples, drawn with ``np.random.permutation``
                and returned in ascending index order.

        Returns:
            dict mapping each split name to a ``torch.utils.data.Subset``.

        Raises:
            AssertionError: If a split name is not valid.
        """
        subsets = {}
        for split in splits:
            # f-string so that a non-string split still produces the intended
            # AssertionError rather than a TypeError from str concatenation.
            assert split in ('train','val','test'), f'{split} is not a valid split'
            mask = self.split_array == self.split_dict[split]
            indices = np.where(mask)[0]
            if train_frac<1 and split == 'train':
                num_to_retain = int(np.round(float(len(indices)) * train_frac))
                indices = np.sort(np.random.permutation(indices)[:num_to_retain])
            subsets[split] = Subset(self, indices)
        return subsets

    def group_str(self, group_idx):
        """Return a readable ``'<target> = y, <confounder> = c'`` name for a group index."""
        groups_per_class = self.n_groups // self.n_classes
        y, c = divmod(group_idx, groups_per_class)

        attr_name = self.confounder_names[0]
        group_name = f'{self.target_name} = {int(y)}, {attr_name} = {int(c)}'
        return group_name

In [5]:
pwd

'/home/user01/hamidreza/Learning-How-to-Mask-Text-Input-for-Better-Generalization/dataset'

In [7]:
# Build the dataset from the notebook's working directory instead of a
# machine-specific absolute path. The class reads the metadata CSV and the
# cached features from <root_dir>/raw, the same relative raw/ folder the
# extraction cell moves the feature files into.
random_dataset = MultiNLIDataset(
    root_dir=os.getcwd(),
    target_name='gold_label_random', confounder_names=['sentence2_has_negation'],
    )
# One torch Subset per split, keyed by 'train' / 'val' / 'test'.
random_splited_dataset = random_dataset.get_splits(['train','val','test'])

In [10]:
# Checkpoint name; it matches the 'bert-base-uncased' part of the cached
# feature file names loaded by MultiNLIDataset.
PRE_TRAINED_MODEL_NAME = 'bert-base-uncased'
# Tokenizer used in the cells below to decode input ids back into text.
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME, do_lower_case=True)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [11]:
# Decode the 'train' split back into plain text rows.
data_type = 'train'
input_list = []
label_list = []
group_list = []
segment_list = []
# Tokens that tokenizer.decode leaves in the text and that are stripped out.
special_tokens = ('[CLS]', '[SEP]', '[PAD]')
split_subset = random_splited_dataset[data_type]
for index in tqdm(range(len(split_subset))):
    # Fetch the example once. The last axis of the feature tensor holds
    # (input ids, input mask, segment ids).
    features, label, group = split_subset[index]
    input_ids = features[:, 0]
    segment_ids = features[:, 2]

    string_input = tokenizer.decode(input_ids)
    clean_string_input = [x for x in string_input.split() if x not in special_tokens]
    string_input = " ".join(clean_string_input)

    input_list.append(string_input)
    label_list.append(label)
    group_list.append(group)
    segment_list.append(segment_ids.numpy())

  0%|          | 0/206175 [00:00<?, ?it/s]

In [13]:
# Collect the decoded train rows into a DataFrame and preview the first rows.
train_pdf = pd.DataFrame({
    'text': input_list,
    'label': label_list,
    'group': group_list,
    'segments': segment_list,
})
train_pdf.head()

Unnamed: 0,text,label,group,segments
0,you know during the season and i guess at at y...,1,2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,yeah i tell you what though if you go price so...,2,4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,at the end of rue des francs - bourgeois is wh...,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"i burst through a set of cabin doors, and fell...",1,2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,it's not that the questions they asked weren't...,2,4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [14]:
# Create the output directory first so the write does not fail when it is missing.
# NOTE(review): the 'segments' column holds NumPy arrays, which are presumably
# written as their string form; confirm downstream readers parse that format.
os.makedirs('MultiNLI_dataset/MultiNLI_dataset', exist_ok=True)
train_pdf.to_csv('MultiNLI_dataset/MultiNLI_dataset/train.csv', index=False)

In [15]:
# Decode the 'test' split back into plain text rows.
data_type = 'test'
test_input_list = []
test_label_list = []
test_group_list = []
test_segment_list = []
# Tokens that tokenizer.decode leaves in the text and that are stripped out.
special_tokens = ('[CLS]', '[SEP]', '[PAD]')
split_subset = random_splited_dataset[data_type]
for index in tqdm(range(len(split_subset))):
    # Fetch the example once. The last axis of the feature tensor holds
    # (input ids, input mask, segment ids).
    features, label, group = split_subset[index]
    input_ids = features[:, 0]
    segment_ids = features[:, 2]

    string_input = tokenizer.decode(input_ids)
    clean_string_input = [x for x in string_input.split() if x not in special_tokens]
    string_input = " ".join(clean_string_input)

    test_input_list.append(string_input)
    test_label_list.append(label)
    test_group_list.append(group)
    test_segment_list.append(segment_ids.numpy())

  0%|          | 0/123712 [00:00<?, ?it/s]

In [16]:
# Collect the decoded test rows into a DataFrame and preview the last rows.
test_pdf = pd.DataFrame({
    'text': test_input_list,
    'label': test_label_list,
    'group': test_group_list,
    'segments': test_segment_list,
})
test_pdf.tail()

Unnamed: 0,text,label,group,segments
123707,he trained in desktop publishing and combined ...,1,2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
123708,"so, i have my sister's kid here and i'm going ...",0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
123709,each week's demand has been divided by the ave...,1,2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
123710,that's a good attitude! you feel good about th...,2,4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, ..."
123711,"bloomer ( for ` flower'), butter ( for ` ram')...",1,2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [17]:
# Create the output directory first so the write does not fail when it is missing.
# NOTE(review): the 'segments' column holds NumPy arrays, which are presumably
# written as their string form; confirm downstream readers parse that format.
os.makedirs('MultiNLI_dataset/MultiNLI_dataset', exist_ok=True)
test_pdf.to_csv('MultiNLI_dataset/MultiNLI_dataset/test.csv', index=False)

In [18]:
# Decode the 'val' split back into plain text rows.
data_type = 'val'
val_input_list = []
val_label_list = []
val_group_list = []
val_segment_list = []
# Tokens that tokenizer.decode leaves in the text and that are stripped out.
special_tokens = ('[CLS]', '[SEP]', '[PAD]')
split_subset = random_splited_dataset[data_type]
for index in tqdm(range(len(split_subset))):
    # Fetch the example once. The last axis of the feature tensor holds
    # (input ids, input mask, segment ids).
    features, label, group = split_subset[index]
    input_ids = features[:, 0]
    segment_ids = features[:, 2]

    string_input = tokenizer.decode(input_ids)
    clean_string_input = [x for x in string_input.split() if x not in special_tokens]
    string_input = " ".join(clean_string_input)

    val_input_list.append(string_input)
    val_label_list.append(label)
    val_group_list.append(group)
    val_segment_list.append(segment_ids.numpy())

  0%|          | 0/82462 [00:00<?, ?it/s]

In [19]:
# Collect the decoded val rows into a DataFrame and preview the last rows.
val_pdf = pd.DataFrame({
    'text': val_input_list,
    'label': val_label_list,
    'group': val_group_list,
    'segments': val_segment_list,
})
val_pdf.tail()

Unnamed: 0,text,label,group,segments
82457,"today, bodenheim's novel might be of interest ...",0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
82458,"thus, step down ( or back ) and give me a shot...",1,2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
82459,"for indianapolis, that public university must ...",0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
82460,do you watch that? can you see?,0,0,"[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, ..."
82461,the recorder captured the sounds of loud thump...,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [20]:
# Create the output directory first so the write does not fail when it is missing.
# NOTE(review): the 'segments' column holds NumPy arrays, which are presumably
# written as their string form; confirm downstream readers parse that format.
os.makedirs('MultiNLI_dataset/MultiNLI_dataset', exist_ok=True)
val_pdf.to_csv('MultiNLI_dataset/MultiNLI_dataset/val.csv', index=False)