In [2]:
import pandas as pd
import numpy as np
import json
import random
import spacy
from transformers import BertTokenizer, BertForMaskedLM
class DataPerturbation:
    """Collection of text perturbations applied to a task definition string.

    Every public perturbation takes the text as ``Definition`` and returns a new
    string. Word-level perturbations tokenize with the multilingual BERT
    WordPiece tokenizer and merge ``##`` continuation pieces back into whole
    words first; sentence-level perturbations split with the spaCy English
    pipeline.

    Helpers are declared as ``@staticmethod`` on purpose: code that discovers
    perturbations by enumerating the *bound methods* of an instance
    (``inspect.ismethod``) does not see static methods, so the helpers are never
    mistaken for perturbations.
    """

    # Word-piece budget above which the masked-LM perturbations give up early
    # and return the text unchanged.
    MAX_WORDPIECES = 512

    def __init__(self):
        """Load the multilingual BERT tokenizer/masked-LM and the spaCy English pipeline."""
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
        self.model = BertForMaskedLM.from_pretrained('bert-base-multilingual-cased')
        self.en = spacy.load('en_core_web_sm')

    @staticmethod
    def connect_token_segments(tokens):
        """Merge WordPiece continuation tokens (``##xyz``) into the preceding token.

        Args:
            tokens: Iterable of token strings.

        Returns:
            A new list in which every ``##``-prefixed token has been appended
            (without the prefix) to the token before it. A ``##`` token with no
            predecessor starts a new word with the prefix removed instead of
            raising ``IndexError``.
        """
        connected_tokens = []
        for token in tokens:
            if token.startswith("##"):
                if connected_tokens:
                    connected_tokens[-1] += token[2:]
                else:
                    # Nothing to attach to: keep the piece as its own word.
                    connected_tokens.append(token[2:])
            else:
                connected_tokens.append(token)
        return connected_tokens

    @staticmethod
    def _sample_positions(num_tokens, num):
        """Draw up to ``num`` distinct positions from ``range(num_tokens)``.

        The request is clamped to ``num_tokens`` so short texts do not make
        ``random.sample`` raise ``ValueError``. When ``num <= num_tokens`` the
        call is identical to sampling ``num`` items from the full index list.
        """
        return random.sample(list(range(num_tokens)), min(num, num_tokens))

    @staticmethod
    def _fill_masks(tokenizer, model, masked_tokens, fallback):
        """Replace every mask token with the masked-LM's top prediction.

        Args:
            tokenizer: Tokenizer used to encode/decode the text.
            model: Masked language model called on the encoded text.
            masked_tokens: Word-level tokens, some of which are the mask token.
            fallback: Value returned when the encoded sequence (including the
                special tokens added by the tokenizer) is longer than the
                model's ``max_position_embeddings``.

        Returns:
            The decoded string with masks filled in, or ``fallback``.
        """
        # Local import keeps this block self-contained; torch is already
        # required for ``return_tensors='pt'``.
        import torch

        masked_text = tokenizer.convert_tokens_to_string(masked_tokens)
        inputs = tokenizer(masked_text, return_tensors='pt')
        input_ids = inputs['input_ids'][0]

        # Check the real model input length: inserted masks and the special
        # tokens added by the tokenizer both count against the position limit.
        if len(input_ids) > model.config.max_position_embeddings:
            return fallback

        with torch.no_grad():  # inference only, no autograd graph needed
            logits = model(**inputs)[0][0]

        # Top-1 prediction per position; equivalent to taking the first column
        # of a full descending sort, but without sorting the whole vocabulary.
        best_ids = logits.argmax(dim=-1)
        is_mask = input_ids == tokenizer.mask_token_id
        input_ids[is_mask] = best_ids[is_mask]

        return tokenizer.decode(input_ids, skip_special_tokens=True)

    def delete_words(self, Definition, num=10):
        """Delete ``num`` randomly chosen words (all of them if there are fewer).

        Args:
            Definition: Text to perturb.
            num: Number of words to remove.

        Returns:
            The text rebuilt from the remaining words.
        """
        tokens = self.connect_token_segments(self.tokenizer.tokenize(Definition))
        deleted_index = set(self._sample_positions(len(tokens), num))
        kept_tokens = [tok for i, tok in enumerate(tokens) if i not in deleted_index]
        return self.tokenizer.convert_tokens_to_string(kept_tokens)

    def delete_stopwords(self, Definition):
        """Remove every word whose lowercase form is in spaCy's English stop-word set.

        Args:
            Definition: Text to perturb.

        Returns:
            The text rebuilt from the non-stop-word tokens.
        """
        tokens = self.connect_token_segments(self.tokenizer.tokenize(Definition))
        stopwords = self.en.Defaults.stop_words
        kept_tokens = [tok for tok in tokens if tok.lower() not in stopwords]
        return self.tokenizer.convert_tokens_to_string(kept_tokens)

    def insert_words(self, Definition, num_mask=10):
        """Insert up to ``num_mask`` mask tokens and let the masked LM fill them.

        Args:
            Definition: Text to perturb.
            num_mask: Number of mask tokens to insert (clamped to the word count).

        Returns:
            The text with model-predicted words inserted, or ``Definition``
            unchanged when the text is too long for the model.
        """
        tokens = self.tokenizer.tokenize(Definition)
        if len(tokens) > self.MAX_WORDPIECES:
            return Definition
        tokens = self.connect_token_segments(tokens)

        # Positions are sampled against the original length and inserted one
        # after another, so later insertions see the already-grown list.
        for i in self._sample_positions(len(tokens), num_mask):
            tokens.insert(i, self.tokenizer.mask_token)

        return self._fill_masks(self.tokenizer, self.model, tokens, Definition)

    def replace_words(self, Definition, num_mask=10):
        """Replace up to ``num_mask`` random words with masked-LM predictions.

        Args:
            Definition: Text to perturb.
            num_mask: Number of words to mask (clamped to the word count).

        Returns:
            The text with the masked words re-predicted, or ``Definition``
            unchanged when the text is too long for the model.
        """
        tokens = self.tokenizer.tokenize(Definition)
        if len(tokens) > self.MAX_WORDPIECES:
            return Definition
        tokens = self.connect_token_segments(tokens)

        for i in self._sample_positions(len(tokens), num_mask):
            tokens[i] = self.tokenizer.mask_token

        return self._fill_masks(self.tokenizer, self.model, tokens, Definition)

    def shuffle_words(self, Definition):
        """Return the text with its words in random order.

        Args:
            Definition: Text to perturb.
        """
        tokens = self.connect_token_segments(self.tokenizer.tokenize(Definition))
        random.shuffle(tokens)
        return self.tokenizer.convert_tokens_to_string(tokens)

    def shuffle_sentences(self, Definition):
        """Return the text with its sentences in random order, joined by spaces.

        Args:
            Definition: Text to perturb.
        """
        doc = self.en(Definition)
        sents = list(map(str, doc.sents))
        random.shuffle(sents)
        return " ".join(sents)

    def repeat_sentences(self, Definition, index=None):
        """Duplicate one sentence of the text in place.

        Args:
            Definition: Text to perturb.
            index: Position of the sentence to repeat; a random sentence is
                chosen when ``None``.

        Returns:
            The sentences joined by spaces with the chosen one appearing twice.
            Text in which no sentence is found is returned unchanged.
        """
        doc = self.en(Definition)
        sents = list(map(str, doc.sents))
        if not sents:
            # Nothing to repeat; random.randint(0, -1) would raise ValueError.
            return Definition
        if index is None:
            index = random.randint(0, len(sents) - 1)
        sents = sents[:index] + [sents[index]] + sents[index:]
        return " ".join(sents)

In [3]:
import inspect

# Build the perturbation object once (loads the BERT and spaCy models), then
# collect every bound method it exposes. Static methods are plain functions when
# looked up on an instance, so inspect.ismethod leaves them out; __init__ is
# included and has to be skipped by the consumers of `methods`.
data_perturbation = DataPerturbation()
attrs = (getattr(data_perturbation, name) for name in dir(data_perturbation))
methods = [attr for attr in attrs if inspect.ismethod(attr)]


Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [42]:
# Perturb the definition of every task in the default dev split with each
# perturbation method, once per random seed. For every (method, seed) pair this
# writes one perturbed task JSON per task plus a split file listing their names.
with open('/home/gujiashe/Tk-Instruct/data/splits/default/dev_tasks copy.txt', 'r') as f:
    # Skip blank lines so a trailing newline does not turn into an empty task name.
    task_names = [line.strip() for line in f if line.strip()]

for seed_id in range(1, 4):
    random.seed(seed_id)
    for method in methods:
        if method.__name__ == '__init__':
            continue
        print(method.__name__)
        split_path = f'/home/gujiashe/Tk-Instruct/data/perturb/splits/default/dev_{method.__name__}_{seed_id}.txt'
        # `with` guarantees the split file is flushed and closed even on error.
        with open(split_path, 'w') as perturb_txt:
            for task_name in task_names:
                # Build the extension-less name directly: str.strip('.json')
                # removes any of the characters ".", "j", "s", "o", "n" from
                # both ends, not the ".json" suffix.
                perturbed_name = f'{task_name}_{method.__name__}_{seed_id}'
                path = '/home/gujiashe/Tk-Instruct/data/tasks/' + task_name + '.json'
                new_path = '/home/gujiashe/Tk-Instruct/data/perturb/tasks/' + perturbed_name + '.json'
                print(perturbed_name, file=perturb_txt)

                with open(path, encoding="utf-8") as task_f:
                    task_data = json.load(task_f)
                # Only the first definition is perturbed; the result replaces the list.
                task_data["Definition"] = [method(task_data["Definition"][0])]
                with open(new_path, 'w', encoding="utf-8") as out_f:
                    json.dump(task_data, out_f, indent=4)
            

delete_stopwords
delete_words
insert_words
repeat_sentences
replace_words
shuffle_sentences
shuffle_words
delete_stopwords
delete_words
insert_words
repeat_sentences
replace_words
shuffle_sentences
shuffle_words
delete_stopwords
delete_words
insert_words
repeat_sentences
replace_words
shuffle_sentences
shuffle_words


In [8]:
# "shuffle_instructions" perturbation for the default dev split: each task keeps
# its own data but receives the Definition of another task, chosen by shuffling
# the task list once per seed.
with open('/home/gujiashe/Tk-Instruct/data/splits/default/dev_tasks copy.txt', 'r') as f:
    # Skip blank lines so a trailing newline does not turn into an empty task name.
    task_names = [line.strip() for line in f if line.strip()]


for seed_id in range(1, 4):
    random.seed(seed_id)
    # Shuffled copy used as the donor list; a plain shuffle can leave a task
    # paired with itself.
    task_names_another = task_names.copy()
    random.shuffle(task_names_another)
    method_name = "shuffle_instructions"
    print(method_name)
    split_path = f'/home/gujiashe/Tk-Instruct/data/perturb/splits/default/dev_{method_name}_{seed_id}.txt'
    # `with` guarantees the split file is flushed and closed even on error.
    with open(split_path, 'w') as perturb_txt:
        for i, task_name in enumerate(task_names):
            # Build the extension-less name directly: str.strip('.json') removes
            # any of the characters ".", "j", "s", "o", "n" from both ends, not
            # the ".json" suffix.
            perturbed_name = f'{task_name}_{method_name}_{seed_id}'
            path = '/home/gujiashe/Tk-Instruct/data/tasks/' + task_name + '.json'
            new_path = '/home/gujiashe/Tk-Instruct/data/perturb/tasks/' + perturbed_name + '.json'
            path_another = '/home/gujiashe/Tk-Instruct/data/tasks/' + task_names_another[i] + '.json'
            print(perturbed_name, file=perturb_txt)

            with open(path, encoding="utf-8") as task_f:
                task_data = json.load(task_f)
            with open(path_another, encoding="utf-8") as task_f_another:
                task_data_another = json.load(task_f_another)

            task_data["Definition"] = task_data_another["Definition"]
            with open(new_path, 'w', encoding="utf-8") as out_f:
                json.dump(task_data, out_f, indent=4)
            

shuffle_instructions
shuffle_instructions
shuffle_instructions


In [4]:
# Perturb the definition of every task in the xlingual dev split with each
# perturbation method, once per random seed. For every (method, seed) pair this
# writes one perturbed task JSON per task plus a split file listing their names.
with open('/home/gujiashe/Tk-Instruct/data/splits/xlingual/dev_tasks.txt', 'r') as f:
    # Skip blank lines so a trailing newline does not turn into an empty task name.
    task_names = [line.strip() for line in f if line.strip()]

for seed_id in range(1, 4):
    random.seed(seed_id)
    for method in methods:
        if method.__name__ == '__init__':
            continue
        print(method.__name__)
        split_path = f'/home/gujiashe/Tk-Instruct/data/perturb/splits/xlingual/dev_{method.__name__}_{seed_id}.txt'
        # `with` guarantees the split file is flushed and closed even on error.
        with open(split_path, 'w') as perturb_txt:
            for task_name in task_names:
                # Build the extension-less name directly: str.strip('.json')
                # removes any of the characters ".", "j", "s", "o", "n" from
                # both ends, not the ".json" suffix.
                perturbed_name = f'{task_name}_{method.__name__}_{seed_id}'
                path = '/home/gujiashe/Tk-Instruct/data/tasks/' + task_name + '.json'
                new_path = '/home/gujiashe/Tk-Instruct/data/perturb/tasks/' + perturbed_name + '.json'
                print(perturbed_name, file=perturb_txt)

                with open(path, encoding="utf-8") as task_f:
                    task_data = json.load(task_f)
                # Only the first definition is perturbed; the result replaces the list.
                task_data["Definition"] = [method(task_data["Definition"][0])]
                with open(new_path, 'w', encoding="utf-8") as out_f:
                    json.dump(task_data, out_f, indent=4)
            

delete_stopwords
delete_words
insert_words
repeat_sentences
replace_words
shuffle_sentences
shuffle_words
delete_stopwords
delete_words
insert_words
repeat_sentences
replace_words
shuffle_sentences
shuffle_words
delete_stopwords
delete_words
insert_words
repeat_sentences
replace_words
shuffle_sentences
shuffle_words


In [9]:
# "shuffle_instructions" perturbation for the xlingual dev split: each task keeps
# its own data but receives the Definition of another task, chosen by shuffling
# the task list once per seed.
with open('/home/gujiashe/Tk-Instruct/data/splits/xlingual/dev_tasks.txt', 'r') as f:
    # Skip blank lines so a trailing newline does not turn into an empty task name.
    task_names = [line.strip() for line in f if line.strip()]


for seed_id in range(1, 4):
    random.seed(seed_id)
    # Shuffled copy used as the donor list; a plain shuffle can leave a task
    # paired with itself.
    task_names_another = task_names.copy()
    random.shuffle(task_names_another)
    method_name = "shuffle_instructions"
    print(method_name)
    split_path = f'/home/gujiashe/Tk-Instruct/data/perturb/splits/xlingual/dev_{method_name}_{seed_id}.txt'
    # `with` guarantees the split file is flushed and closed even on error.
    with open(split_path, 'w') as perturb_txt:
        for i, task_name in enumerate(task_names):
            # Build the extension-less name directly: str.strip('.json') removes
            # any of the characters ".", "j", "s", "o", "n" from both ends, not
            # the ".json" suffix.
            perturbed_name = f'{task_name}_{method_name}_{seed_id}'
            path = '/home/gujiashe/Tk-Instruct/data/tasks/' + task_name + '.json'
            new_path = '/home/gujiashe/Tk-Instruct/data/perturb/tasks/' + perturbed_name + '.json'
            path_another = '/home/gujiashe/Tk-Instruct/data/tasks/' + task_names_another[i] + '.json'
            print(perturbed_name, file=perturb_txt)

            with open(path, encoding="utf-8") as task_f:
                task_data = json.load(task_f)
            with open(path_another, encoding="utf-8") as task_f_another:
                task_data_another = json.load(task_f_another)

            task_data["Definition"] = task_data_another["Definition"]
            with open(new_path, 'w', encoding="utf-8") as out_f:
                json.dump(task_data, out_f, indent=4)
            

shuffle_instructions
shuffle_instructions
shuffle_instructions
