In [1]:
import pandas as pd
import numpy as np
import os
import pickle
import re
from collections import Counter
from tqdm import tqdm

import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config

import ilm.ilm.tokenize_util
from ilm.ilm.infer import infill_with_ilm
from perturbation_functions import calculate_necc_and_suff, gen_num_samples_table, gen_probs_table

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Location of the ILM checkpoint files and the mask class identifier
# (MASK_CLS is not used in this cell).
MODEL_DIR = '../Models/ILM/'
MASK_CLS = 'ilm.mask.hierarchical.MaskHierarchical'

# Select the GPT-2 tokenizer and register the additional token ids stored
# next to the model.
tokenizer = ilm.ilm.tokenize_util.Tokenizer.GPT2
# NOTE(review): pickle.load can execute arbitrary code; only point MODEL_DIR
# at checkpoints you trust.
with open(os.path.join(MODEL_DIR, 'additional_ids_to_tokens.pkl'), 'rb') as f:
    additional_ids_to_tokens = pickle.load(f)
additional_tokens_to_ids = {v: k for k, v in additional_ids_to_tokens.items()}
try:
    ilm.ilm.tokenize_util.update_tokenizer(additional_ids_to_tokens, tokenizer)
except ValueError:
    # Presumably raised when the tokenizer was already updated by a previous
    # run of this cell -- TODO confirm against update_tokenizer.
    print('Already updated')
print(additional_tokens_to_ids)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Build the architecture from config.json and load the weights from
# model_weights.pt instead of calling GPT2LMHeadModel.from_pretrained(MODEL_DIR).
# Paths are built with os.path.join so they do not depend on MODEL_DIR ending
# with a slash.
config = GPT2Config.from_json_file(os.path.join(MODEL_DIR, 'config.json'))
model = GPT2LMHeadModel(config)
# Load the weights on CPU first; the model is moved to `device` below.
# NOTE(review): torch.load unpickles the file; same trust caveat as above.
model.load_state_dict(
    torch.load(os.path.join(MODEL_DIR, 'model_weights.pt'), map_location=torch.device('cpu'))
)
model.eval()
# Assign to `_` so the notebook does not print the model repr.
_ = model.to(device)

Already updated
{'<|startofinfill|>': 50257, '<|endofinfill|>': 50258, '<|infill_document|>': 50259, '<|infill_paragraph|>': 50260, '<|infill_sentence|>': 50261, '<|infill_ngram|>': 50262, '<|infill_word|>': 50263}


In [6]:
# Generate approximately 100 perturbations for each token.
num_samples = gen_num_samples_table(30, 100)
probs_table = gen_probs_table(30)
mask_tokn = additional_tokens_to_ids['<|infill_ngram|>']

SAMPLES_PATH = "extension_sexism/data_sexism/samples_sexism_small.txt"
PERTURBATIONS_PATH = 'extension_sexism/data_sexism/sexism_necc_suff_perturbations_small.pickle'

orig_texts = []
necc_perturbed = []
suff_perturbed = []
necc_masks = []
suff_masks = []

# Read all lines up front so the progress bar total reflects the real number
# of samples instead of a hard-coded guess.
with open(SAMPLES_PATH, "r") as ff:
    sample_lines = ff.readlines()

with tqdm(total=len(sample_lines)) as pbar:
    # Each line is passed exactly as read (trailing newline included).
    for text in sample_lines:
        necc_pp, suff_pp, necc_mm, suff_mm = calculate_necc_and_suff(
            text,
            ilm_tokenizer=tokenizer,
            ilm_model=model,
            cl_tokenizer=None,
            cl_model=None,
            num_samples=num_samples,
            mask_tokn=mask_tokn,
            additional_tokens_to_ids=additional_tokens_to_ids,
            probs_table=probs_table,
            return_pert_only=True,
        )

        orig_texts.append(text)
        necc_perturbed.append(necc_pp)
        suff_perturbed.append(suff_pp)
        necc_masks.append(necc_mm)
        suff_masks.append(suff_mm)
        pbar.update(1)

# Parallel lists: index i of every list corresponds to orig_texts[i].
necc_suff_perturbations = {'orig_texts': orig_texts,
                           'necc_perturbed': necc_perturbed,
                           'suff_perturbed': suff_perturbed,
                           'necc_masks': necc_masks,
                           'suff_masks': suff_masks}

# Use a context manager so the output file is flushed and closed
# deterministically.
with open(PERTURBATIONS_PATH, 'wb') as out_file:
    pickle.dump(necc_suff_perturbations, out_file)

  0%|          | 0/120 [00:00<?, ?it/s]

 67%|██████▋   | 80/120 [1:28:31<44:15, 66.39s/it]   


In [8]:
# Display the complete list stored under the 'necc_perturbed' key.
necc_suff_perturbations['necc_perturbed']

[["I'll change she is the daughter of president uhmoustafa.",
  'I thought that she gets raped',
  "i hope she'd raped",
  'i hope she gets raped',
  'ok, gotta close she gets it,',
  'I hope she loves you so, too',
  'I hope she gets a few months now.',
  'I think she gets raped',
  'i hope you gets shot',
  'I hope she gets better',
  'I think she gets raped',
  'I hope she is raped',
  'no, she gets messed up.',
  'I hope she gets her revenge',
  'I hope she gets what she deserves.',
  'I hope she gets what she deserves.',
  'I think it gets pregnant she she gets raped',
  'I hope it gets raped',
  'i hope she gets a chuckle.',
  'I know if she gets hated the moment',
  "I'm accused of raped",
  "i hope she doesn't get it ;)",
  'i hope she gets raped',
  'we hope she gets raped',
  'i hope she gets raped',
  'I think of she gets had any blood relation to death,',
  "byne time she's 18, than she'll be removed from comment",
  'I hope you are raped',
  'I hope she is raped',
  'you h