In [1]:
import json
import random
import os
import openai
from colorama import Fore, Back, Style

In [2]:
def load_jsonl(filename):
    records = []
    with open(filename, 'r') as file:
        for line in file:
            records.append(json.loads(line))
    return records

def save_json(data, filename):
    with open(filename, "w") as file:
        json.dump(data, indent=4)

def save_jsonl(data_list, filename):
    with open(filename, 'w') as file:
        for record in data_list:
            json.dump(record, file)
            file.write('\n')

def pickle_save(obj, filename):
    with open(filename, "wb") as file:
        pickle.dump(obj, file)

def pickle_load(filename):
    with open(filename, 'rb') as file:
        return pickle.load(file)

# Read dataset

In [3]:
test_set = load_jsonl("DECODE[dialogue-contradiction-detection]/decode_v0.1/test.jsonl")
len(test_set)

4216

In [4]:
samples_with_contradiction = []
samples_no_contradiction = []
for i, x in enumerate(test_set):
    if x["is_contradiction"]:
        samples_with_contradiction.append((i, x))
    else:
        samples_no_contradiction.append((i, x))
print(len(samples_with_contradiction), len(samples_no_contradiction))

2108 2108


In [5]:
with_contra_random_samples = random.sample(samples_with_contradiction, 100)
no_contra_random_samples = random.sample(samples_no_contradiction, 100)

In [6]:
def show_instance(sample):
    index, instance = sample
    contradiction_indices = instance['aggregated_contradiction_indices']
    contradiction_indices = [x for x in contradiction_indices if x != len(instance['turns']) - 1]  # remove the last utterance, because it is always there
    # print("contra ids", contradiction_indices)
    for t in sorted(instance['turns'], key=lambda x: x['turn_id']):
        if t['turn_id'] in contradiction_indices:
            print(Back.YELLOW + f"{t['agent_id']}: {t['text']}", end="")
            print(Style.RESET_ALL)
        elif t['auxiliary']['contradiction']:
            print(Fore.RED + f"{t['agent_id']}: {t['text']}", end="")
            print(Style.RESET_ALL)
        else:
            print(f"{t['agent_id']}: {t['text']}")

In [None]:
# an example with contradiction
# the last utterance (red) is contradicting to the 4-th utterance (highlighed in yellow)
show_instance(with_contra_random_samples[0])

In [None]:
# an example without contradiction
show_instance(no_contra_random_samples[0])

# Construct Prompt

Note: no need to distinguish between samples with/without contradiction when constructing prompt (I previously did this to test label-reversing modifications. But not required at the moment).

Differences compared to sent's version
* mention that modification should be done on LAST TURN
* two fields: DIALOGUE and LAST TURN
* mention that the modified LAST TURN is still coherent with the previous dialogue context.

In [7]:
system_instruction = '''You are a linguist specializing in doing text annotation in the English language. You will be tasked with making minimal modification to the LAST TURN of a conversation based on some linguistics aspects to expose biases in machine learning models. Maker sure the modified LAST TURN is still coherent with previous dialogue context.

The given text are samples in the conversational contradiction detection task.

Each sample has two fields: 
- DIALOGUE: a conversation with multiple turns.
- LAST TURN: the last turn/utterance of the conversation

A task may ask for one or multiple modifications for LAST TURN. Each modification should be an object with 3 fields: 
type: the type of modification
modified_last_turn: the modified last turn of the dialogue.
rationale: the reason why and how the modifications are made.

Please return a json object which consists of one or multiple modifications.
'''

## example: antiderivation

Differences compared to sent's version
* the prompt specifies that modification should be done for the LAST TURN
* only gives task instruction and examples (so we can append the DIALOGUE and the LAST TURN later)

In [None]:
antiderivation_replacement_instruction ='''Find any non-derived word (a word without suffixes or prefixes) in the LAST TURN below and change it into a derived word (word with a prefix or a suffix). Do not add grammatical suffixes (-s, -ed, -er, -est). Make sure the conversation is natural after modification.

Example: a sometimes dull film ->  a sometimes tedious film (tedious is derived from tedium using a -ios suffix)

Example: an very hard task -> an increasingly hard task (increasingly is derived from increasing using a -ly suffix)

Example: amazing accomplishment -> Skip (both words are already derived)
'''

In [None]:
prompt = create_prompt(with_contra_random_samples[0], antiderivation_replacement_instruction)
print(prompt)

In [8]:
import openai
import pandas as pd
import numpy as np
import os
import json
import random
import nltk
from nltk.corpus import stopwords
from google_pygram import GooglePyGram as gpg
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
from datasets import load_dataset
from string import punctuation
import json

In [9]:
from tqdm import tqdm

In [10]:
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)  # for exponential backoff


In [12]:
stoplist = set(stopwords.words('english'))

In [13]:
MODEL = "gpt-4-1106-preview"

In [14]:
def format_prompt(system_instruction, instruction, input):
    message = create_prompt(input, instruction)
    # message = instruction.format(sample = str(input))
    messages = [
        {"role": "system", "content": system_instruction},
        {"role": "user", "content": message},
    ]
    # print(messages)
    return messages

In [None]:
%pprint

In [None]:
print(format_prompt(system_instruction, frequency_bias_instruction, with_contra_random_samples[0])[0]['content'])


In [None]:
print(format_prompt(system_instruction, frequency_bias_instruction, with_contra_random_samples[0])[1]['content'])

In [114]:
# Example OpenAI Python library request
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def request(samples, prompt_type):
    modified_samples = []
    for sample in tqdm(samples):
        # print(samples)
        # create_prompt(sample, prompt_type)
        messages = format_prompt(system_instruction, prompt_type, sample)
        # print(messages)
        response = openai.ChatCompletion.create(
            model=MODEL,
            response_format={ "type": "json_object" },
            messages= messages,
            temperature=0,
        )
        ans_model = response['choices'][0]['message']['content']
        modified_samples.append(ans_model)
        # print(messages[1]['content'])
        # print(sample)
        # print(ans_model)
        # print('===================')
    return modified_samples

In [16]:
subsaharan_africa = ['Angola', 'Benin', 'Botswana', 'Burkina Faso', 'Burundi', 'Cameroon', 'Cape Verde',
          'Central African Republic', 'Chad', 'Comoros', 'Djibouti', 'Republic of the Congo', 'Democratic Republic of the Congo (Zaire)',
          "Côte d'Ivoire", 'Equatorial Guinea', 'Eritrea', 'Ethiopia', 'Gabon', 'Gambia', 'Ghana', 'Guinea', 
          'Guinea-Bissau', 'Kenya', 'Lesotho', 'Liberia', 'Madagascar', 'Malawi', 'Mali', 'Mauritius', 'Mozambique', 
          'Namibia', 'Niger', 'Nigeria', 'Rwanda', 'São Tomé and Príncipe', 'Senegal', 'Seychelles', 'Sierra Leone', 
          'Somalia', 'South Africa', 'South Sudan', 'Swaziland', 'Tanzania', 'Togo', 'Uganda']
          

#Middle East, North Africa, and Turkey		   
menat = ['Algeria', 'Bahrain', 'Iran', 'Iraq', 'Jordan', 'Egypt', 'Kuwait', 'Lebanon', 'Libya', 'Mauritania',
          'Morocco', 'Oman', 'Palestine', 'Qatar', 'Saudi Arabia', 'Sudan', 'Syria', 'Turkey',
          'Tunisia', 'United Arab Emirates', 'Yemen']


southeast_asia = ['Brunei', 'Cambodia', 'Timor Leste', 'Indonesia', 'Laos', 'Malaysia', 'Myanmar',
        'Philippines', 'Singapore', 'Thailand', 'Vietnam']

east_asia = ['China', 'Japan', 'Mongolia', 'North Korea', 'South Korea', 'Taiwan']

south_asia = ['Bangladesh', 'Bhutan', 'India', 'The Maldives', 'Nepal', 'Pakistan', 'Sri Lanka']

central_asia = ['Afghanistan', 'Armenia', 'Azerbaijan', 'Georgia', 'Kazakhstan', 'Kyrgyzstan', 
        'Tajikistan', 'Turkmenistan', 'Uzbekistan']


#Oceania, Melanesia, and Polynesia
oceania = ['Fiji', 'Federated States of Micronesia', 'Kiribati', 'Marshall Islands', 'Nauru', 
           'Palau', 'Papua New Guinea', 'Samoa', 'Solomon Islands', 'Tonga', 'Tuvalu', 'Vanuatu']

australia_nz = ['Australia', 'New Zealand']


#Carribean and Latin America
latin_america = ['Mexico', 'Puerto Rico', 'Dominican Republic', 'Cuba', 'Haiti', 'Belize', 'Grenada', 'Saint Lucia',
           'Costa Rica', 'El Salvador', 'Guatemala', 'Honduras', 'Nicaragua', 'Panama', 'Jamaica', 'Bahamas', 'Barbados',
           'Dominica', 'Brazil', 'Argentina', 'Bolivia', 'Chile', 'Colombia', 'Ecuador', 'Guyana', 'Paraguay', 'Peru',
           'Suriname', 'Trinidad and Tobago', 'Uruguay', 'Venezuela', 'Antigua and Barbuda', 'Saint Kitts and Nevis']

north_america = ['Canada', 'United States of America']

#Northern Europe (nordic)
northern_europe = ['Denmark', 'Estonia', 'Finland', 'Iceland', 'Latvia', 'Lithuania', 'Norway', 'Sweden']
		   
western_europe = ['Belgium', 'France', 'Republic of Ireland', 'Luxembourg', 'Monaco', 'Netherlands', 'United Kingdom']

central_europe = ['Austria', 'Czech Republic', 'Germany', 'Hungary', 'Liechtenstein', 'Poland', 'Slovakia', 'Switzerland']

southern_europe = ['Andorra', 'Italy', 'Malta', 'Portugal', 'San Marino', 'Spain', 'Vatican City']

#Southeastern Europe (mostly Balkan)
southeastern_europe = ['Albania', 'Bosnia and Herzegovina', 'Bulgaria', 'Croatia', 'Cyprus', 'Greece',
          'Kosovo', 'North Macedonia', 'Moldova', 'Montenegro', 'Romania', 'Serbia', 'Slovenia']
		   
eastern_europe = ['Russia', 'Belarus', 'Ukraine']
regions = [subsaharan_africa, menat, southeast_asia, east_asia, central_asia, oceania, latin_america, southeastern_europe]

In [17]:
def format_prompt_loc(system_instruction, instruction, input, region):
    # print(input)
    loc = random.sample(region,1)
    message = create_prompt(input, instruction, loc = loc[0])
    # message = instruction.format(sample = input, loc=loc)
    messages = [
        {"role": "system", "content": system_instruction},
        {"role": "user", "content": message},
    ]
    return messages

In [19]:
def create_prompt(sample, modification_instruction, loc = None, nonce = None):
    index, instance = sample
        
    utterances = sorted(instance['turns'], key=lambda x: x['turn_id'])
    last_utterance = utterances[-1]['text']
    
    if loc != None:
        # print(loc)
        modification_instruction = modification_instruction.format(loc = loc)
        # print(modification_instruction)
    if nonce != None:
        modification_instruction = modification_instruction.format(nonce = nonce)

    prompt = modification_instruction
    prompt += "\n\nDIALOGUE:\n" + "\n".join([f"Agent {u['agent_id']}: {u['text']}" for u in utterances])
    prompt += "\n\nLAST TURN:\n" + last_utterance
    
    return prompt

In [20]:
format_prompt_loc(system_instruction, geographical_bias_instruction, no_contra_random_samples[0], subsaharan_africa)

NameError: name 'geographical_bias_instruction' is not defined

In [21]:
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def request_loc(samples, prompt_type):
    modified_samples = []
    for sample in tqdm(samples):
        for region in regions:
            # print(region)
            messages = format_prompt_loc(system_instruction, prompt_type, sample, region)
            # print(messages)
            response = openai.ChatCompletion.create(
                model=MODEL,
                response_format={ "type": "json_object" },
                messages= messages,
                temperature=0,
            )
            ans_model = response['choices'][0]['message']['content']
            modified_samples.append(ans_model)
            # print(sample)
            # print(ans_model)
        # print('===================')
    return modified_samples

In [22]:
nonce_words = ["roagly", "vibble", "drok", "scrop", "plard", "hif", "tepable", "plawic", "bluth", "sprat", "flurf"]

In [23]:
def format_prompt_concepts(system_instruction, instruction, input):
    
    nonce = random.sample(nonce_words,1)
    message = create_prompt(input, instruction, nonce = nonce)
    # message = instruction.format(sample = input, nonce= nonce)
    messages = [
        {"role": "system", "content": system_instruction},
        {"role": "user", "content": message},
    ]
    return messages

In [24]:
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def request_concepts(samples, prompt_type):
    modified_samples = []
    for sample in tqdm(samples):
        messages = format_prompt_concepts(system_instruction, prompt_type, sample)
        response = openai.ChatCompletion.create(
            model=MODEL,
            response_format={ "type": "json_object" },
            messages= messages,
            temperature=0,
        )
        ans_model = response['choices'][0]['message']['content']
        modified_samples.append(ans_model)
        # print(messages[1]['content'])
        # print(sample)
        # print(ans_model)
        # print('===================')
    return modified_samples

In [25]:
def show_instance(sample):
    index, instance = sample
    contradiction_indices = instance['aggregated_contradiction_indices']
    contradiction_indices = [x for x in contradiction_indices if x != len(instance['turns']) - 1]  # remove the last utterance, because it is always there
    # print("contra ids", contradiction_indices)
    dialog = []
    obj = {}
    utterances = sorted(instance['turns'], key=lambda x: x['turn_id'])
    for t in utterances:
        # if t['turn_id'] in contradiction_indices:
        #     print(Back.YELLOW + f"{t['agent_id']}: {t['text']}", end="")
        #     print(Style.RESET_ALL)
        # elif t['auxiliary']['contradiction']:
        #     print(Fore.RED + f"{t['agent_id']}: {t['text']}", end="")
        #     print(Style.RESET_ALL)
        # else:
        #     print(f"{t['agent_id']}: {t['text']}")
        dialog.append(f"{t['agent_id']}: {t['text']}")
    obj['id'] = index
    obj['dialog'] = '\n'.join(dialog)
    obj['last_turn'] = utterances[-1]
    return obj



In [26]:
show_instance(no_contra_random_samples[0])

{'id': 3258,
 'dialog': "0: Oh really? what kind of effects does it have? I am all for benefits!\n1: It is especially good for people with high blood pressure\n0: That's great, I should mention that when selling mine at the market\n1: Yes, beetroot has many benefits, I juice mine with apples and carrots daily and love it!\n0: That sounds really good. Do you ever add any spices or anything?\n1: Not really mostly lemons, kale or greens of some kind.\n0: I could never put kale in it, but I'd like to try adding lemons\n1: You really can't taste the kale, as the kale has calcium and great benefits as well as the beetroot.\n0: No kale for me, but if you enjoy it that's great!\n1: lol, to each there own I have heard beetroot is good for the kidneys?\n0: Yeah, probably helps circulation quite a bit. ",
 'last_turn': {'turn_id': 10,
  'agent_id': 0,
  'text': 'Yeah, probably helps circulation quite a bit. ',
  'turn_context': '',
  'auxiliary': {'contradiction': None}}}

In [27]:
# write original
with open('decode_no/original.jsonl', 'w') as f:
    for sample in no_contra_random_samples:
        # object = show_instance(sample)
        line = json.dumps(sample) + "\n"
        f.write(line)
f.close()


# Bias tests

### Frequency bias

In [28]:
frequency_bias_instruction = '''Replace one word of higher frequency in English vocabulary with a less frequent word. Add "replaced_word" and "new_word" fields to json output with the replaced and new words, accordingly.

Example:

Text: The cat licked its paw.
Modified Text: The yak licked its paw.

type: "frequency_bias"'''

In [29]:
modified_samples = request(no_contra_random_samples, frequency_bias_instruction)
with open('decode_no/frequency_bias.jsonl', 'w') as f:
    for item in modified_samples:
        object = json.loads(item)
        line = json.dumps(object) + "\n"
        f.write(line)
f.close()

100%|██████████| 100/100 [07:31<00:00,  4.52s/it]


### Temporal bias

In [30]:
temporal_bias_instruction = '''Please replace one word with its old-fashioned synonym. 

type: "temporal_bias"'''


In [31]:
modified_samples = request(no_contra_random_samples, temporal_bias_instruction)
with open('decode_no/temporal_bias.jsonl', 'w') as f:
    for item in modified_samples:
        object = json.loads(item)
        line = json.dumps(object) + "\n"
        f.write(line)
f.close()

100%|██████████| 100/100 [06:33<00:00,  3.93s/it]


In [32]:
geographical_bias_instruction = '''Change the entity name in the sentence with name commonly used in {loc}. 
Change the other words in the sentence so that it is culturally fitted with context of {loc}.
Write the modified sentences in English.

type: "geographical_bias"'''

### Geographical bias

In [33]:
modified_samples = request_loc(no_contra_random_samples, geographical_bias_instruction)
with open('decode_no/geographical_bias.jsonl', 'w') as f:
    for item in modified_samples:
        object = json.loads(item)
        line = json.dumps(object) + "\n"
        f.write(line)
f.close()

 49%|████▉     | 49/100 [36:46<38:17, 45.04s/it]  
 16%|█▌        | 16/100 [35:49<3:08:02, 134.32s/it]
100%|██████████| 100/100 [1:15:20<00:00, 45.21s/it]


### Position bias

In [34]:
position_bias_instruction = '''Move important sentiment words to another position in the same sentence.
type: "position_bias"'''

In [35]:
modified_samples = request(no_contra_random_samples, position_bias_instruction)
with open('decode_no/position_bias.jsonl', 'w') as f:
    for item in modified_samples:
        object = json.loads(item)
        line = json.dumps(object) + "\n"
        f.write(line)
f.close()

100%|██████████| 100/100 [07:53<00:00,  4.74s/it]


### Length Bias 

In [36]:
length_bias_instruction = '''Modify the sentence length, but retain sentence meaning.

1. "shorter_sentence": Remove 1 - 3 words from the sentence.
2. "longer_sentence": Add 2 - 5 words to the sentence.'''

In [37]:
modified_samples = request(no_contra_random_samples, length_bias_instruction)
with open('decode_no/length_bias.jsonl', 'w') as f:
    for item in modified_samples:
        object = json.loads(item)
        line = json.dumps(object) + "\n"
        f.write(line)
f.close()

100%|██████████| 100/100 [12:01<00:00,  7.22s/it]


## Orthography

### Typos

In [115]:
typo_instruction = '''Add typos to the text. Common types of typos are:

1. "addition": Adding a letter: Forty (correct) vs. Fourty
2. "omission": Omitting a letter: Embarrass (correct) vs. Embarass
3. "flipping": flipping letters: Friend (correct) vs. Freind'''

In [116]:
modified_samples = request(no_contra_random_samples, typo_instruction)
with open('decode_no/typo_bias.jsonl', 'w') as f:
    for item in modified_samples:
        object = json.loads(item)
        line = json.dumps(object) + "\n"
        f.write(line)
f.close()

100%|██████████| 100/100 [13:48<00:00,  8.29s/it]


## Capitalization

In [117]:
capitalization_instruction = '''Modify the text capitalization by:

1. "lower": change a word with upper case to lower case.

2. "upper": change a word starting with lower case to upper case.

3. "all_caps": change a word to ALL CAPS.

4. "sponge": change a word to SpOnGeBoBcASe.

Reply with the modified text for each type of change.'''

In [118]:
modified_samples = request(no_contra_random_samples, capitalization_instruction)
with open('decode_no/capitalization.jsonl', 'w') as f:
    for  i,item in enumerate(modified_samples):
        object = json.loads(item)
        print(i,object)
        line = json.dumps(object) + "\n"
        f.write(line)
f.close()

100%|██████████| 100/100 [17:40<00:00, 10.60s/it]

0 {'modifications': [{'type': 'lower', 'modified_last_turn': 'yeah, probably helps circulation quite a bit.', 'rationale': "Changed the first word 'Yeah' to lowercase to demonstrate the effect of lowercasing a word that typically starts with an uppercase letter at the beginning of a sentence."}, {'type': 'upper', 'modified_last_turn': 'Yeah, Probably helps circulation quite a bit.', 'rationale': "Changed the second word 'probably' to uppercase to demonstrate the effect of capitalizing a word that typically starts with a lowercase letter."}, {'type': 'all_caps', 'modified_last_turn': 'Yeah, probably helps CIRCULATION quite a bit.', 'rationale': "Changed the word 'circulation' to ALL CAPS to emphasize it and demonstrate the effect of all capital letters on a word within a sentence."}, {'type': 'sponge', 'modified_last_turn': 'Yeah, probably helps circulation qUiTe a bit.', 'rationale': "Changed the word 'quite' to SpOnGeBoBcASe to demonstrate the effect of alternating capitalization with




## Punctuation

In [119]:
punctuation_instruction = '''Make change to the punctuation of the text:

1. "addition": add a random comma, semi-colon etc 

2. "replacement": change the full stop to the exclamation or question mark 

3. "glueing": remove white space between two words (glue them together). 

Reply with the modified text for each type of change.'''

In [120]:
modified_samples = request(no_contra_random_samples, punctuation_instruction)
with open('decode_no/punctuation.jsonl', 'w') as f:
    for item in modified_samples:
        object = json.loads(item)
        line = json.dumps(object) + "\n"
        f.write(line)
f.close()

100%|██████████| 100/100 [14:41<00:00,  8.82s/it]


## Morphology

## Derivation

In [46]:
derivation_replacement_instruction ='''Find a derived word (a word with a suffix or a prefix) in the text below and replace it with a non-derived word (word without any prefixes or suffixes).


Example: a sometimes tedious film -> a sometimes dull film (tedious is derived from tedium using a -ios suffix)

Example: a very good film -> Skip

type: "derivation"'''

In [47]:
modified_samples = request(no_contra_random_samples, derivation_replacement_instruction)
with open('decode_no/derivation.jsonl', 'w') as f:
    for item in modified_samples:
        object = json.loads(item)
        line = json.dumps(object) + "\n"
        f.write(line)
f.close()

100%|██████████| 100/100 [06:43<00:00,  4.03s/it]


In [48]:
antiderivation_replacement_instruction ='''Find any non-derived word (a word without suffixes or prefixes) in the text below and change it into a derived word (word with a prefix or a suffix). Do not add grammatical suffixes (-s, -ed, -er, -est).

Example: a sometimes dull film ->  a sometimes tedious film (tedious is derived from tedium using a -ios suffix)

Example: an very hard task -> an increasingly hard task (increasingly is derived from increasing using a -ly suffix)

Example: amazing accomplishment -> Skip (both words are already derived)

type: "anti_derivation"'''

In [49]:
modified_samples = request(no_contra_random_samples, antiderivation_replacement_instruction)
with open('decode_no/anti_derivation.jsonl', 'w') as f:
    for item in modified_samples:
        object = json.loads(item)
        line = json.dumps(object) + "\n"
        f.write(line)
f.close()

100%|██████████| 100/100 [06:34<00:00,  3.94s/it]


## Compound words

In [50]:
compound_word_instruction ='''Find any non-compond (single-root) word in the text below and change it into a compound word (word with several roots).

type: "compound_word"

Example: 
"a sequence of ridiculous shooting scenes" -> "a sequence of ridiculous shoot-'em-up scenes"
Example: dull acting ->  lacklustre acting'''

In [51]:
modified_samples = request(no_contra_random_samples, compound_word_instruction)
with open('decode_no/compound_word.jsonl', 'w') as f:
    for item in modified_samples:
        object = json.loads(item)
        line = json.dumps(object) + "\n"
        f.write(line)
f.close()

100%|██████████| 100/100 [06:28<00:00,  3.89s/it]


## Irregular verbs

In [52]:
regular_verb_instruction ='''Find a regular verb in the past tense and replace it with an irregular verb.

Example: he received a prize -> he got a prize

Example: amazing stuff  ->  Skip'''

In [53]:
modified_samples = request(no_contra_random_samples, regular_verb_instruction)
with open('decode_no/irregular_verb.jsonl', 'w') as f:
    for item in modified_samples:
        object = json.loads(item)
        line = json.dumps(object) + "\n"
        f.write(line)
f.close()

100%|██████████| 100/100 [07:00<00:00,  4.21s/it]


## Syntax

### Active to passive

In [54]:
passive_voice_instruction = '''Rewrite the text in passive voice.

type: "passive_voice"'''

In [55]:
modified_samples = request(no_contra_random_samples, passive_voice_instruction)
with open('decode_no/active_to_passive.jsonl', 'w') as f:
    for item in modified_samples:
        object = json.loads(item)
        line = json.dumps(object) + "\n"
        f.write(line)
f.close()

100%|██████████| 100/100 [07:51<00:00,  4.72s/it]


### Coordinating conjunctions

In [56]:
coordinating_conjunction_instruction = '''Change a noun or verb in this sentence into multiple nouns or verbs combined with coordinating conjunction.

type: "coordinating_conjunction"'''

In [57]:
modified_samples = request(no_contra_random_samples, coordinating_conjunction_instruction)
with open('decode_no/coordinating_conjunction.jsonl', 'w') as f:
    for item in modified_samples:
        object = json.loads(item)
        line = json.dumps(object) + "\n"
        f.write(line)
f.close()

100%|██████████| 100/100 [07:09<00:00,  4.30s/it]


### Tense

In [58]:
tense_instruction = '''Change the tense of verbs in the sentence. Keep the tenses consistent across the passage.'''

In [59]:
modified_samples = request(no_contra_random_samples, tense_instruction)
with open('decode_no/tense.jsonl', 'w') as f:
    for item in modified_samples:
        object = json.loads(item)
        line = json.dumps(object) + "\n"
        f.write(line)
f.close()

100%|██████████| 100/100 [06:59<00:00,  4.19s/it]


### Grammatical role

In [60]:
grammatical_role_instruction = '''Modify the position of grammatical role in the sentence.

1. "subject_object": swap the subject with object
Example: 
Miranda asked him a question. --> He asked Miranda a question.

2. "entities": swap the position of other entities
Example:
Samantha, the older Rico's friend, will be appointed as Chair of Student Body replacing Marie. --> Marie, the older Samantha's friend, will be appointed as Chair of Student Body replacing Rico.'''

In [61]:
modified_samples = request(no_contra_random_samples, grammatical_role_instruction)
with open('decode_no/grammatical_role.jsonl', 'w') as f:
    for item in modified_samples:
        object = json.loads(item)
        line = json.dumps(object) + "\n"
        f.write(line)
f.close()

100%|██████████| 100/100 [09:26<00:00,  5.66s/it]


### Clause structure

In [62]:
sentence_structure_instruction = '''Change the position of main and subordinate clause. If the input is simple sentence, skip.
type: "clause_structure"'''

In [63]:
modified_samples = request(no_contra_random_samples, sentence_structure_instruction)
with open('decode_no/clause_structure.jsonl', 'w') as f:
    for item in modified_samples:
        object = json.loads(item)
        line = json.dumps(object) + "\n"
        f.write(line)
f.close()

100%|██████████| 100/100 [06:22<00:00,  3.82s/it]


## Semantics and lexicon

## Concept replacement

In [64]:
concept_replacement_instruction ='''Replace a concept with one that is similar but fail the quality.

Replacement types include:

1. "synonym": synonyms
Example: embodies the character with an effortlessly regal charisma . -> embodies the character with an effortlessly regal charm. (charm and charisma are synonyms)

2. "hierarchy": hyper/hyponyms
Example: The title not only describes its main characters, but the lazy people behind the camera as well. -> The title not only describes its main characters, but the lazy people behind the equipment as well. (equipment is a hypernym of camera)

3. "nonce": the following nonce word: {nonce}
Example: Has a lot of the virtues of eastwood at his best. - Has a lot of the virtues of bluth at his best. (bluth is a nonce word)

4. "idiom": metaphors/idioms
Example: This is a train wreck of an action film -> This is a disastrous action film (train wreck is a metaphor for disastrous)'''

In [65]:
modified_samples = request_concepts(no_contra_random_samples, concept_replacement_instruction)
with open('decode_no/concept_replacement.jsonl', 'w') as f:
    for item in modified_samples:
        object = json.loads(item)
        line = json.dumps(object) + "\n"
        f.write(line)
f.close()

100%|██████████| 100/100 [18:26<00:00, 11.06s/it]


## Adjectives/adverbs change

In [66]:
adjective_adverb_remove_instruction = '''Remove an adjective or adverb from the sentence.
type: "adjective_adverb_remove"'''

In [67]:
modified_samples = request(no_contra_random_samples, adjective_adverb_remove_instruction)
with open('decode_no/adjective_adverbs_remove.jsonl', 'w') as f:
    for item in modified_samples:
        object = json.loads(item)
        line = json.dumps(object) + "\n"
        f.write(line)
f.close()

100%|██████████| 100/100 [06:44<00:00,  4.04s/it]


In [68]:
adjective_adverb_add_instruction = '''Add an adjective or an adverb to the sentence.

type: "adjective_adverb_add"'''

In [69]:
modified_samples = request(no_contra_random_samples, adjective_adverb_add_instruction)
with open('decode_no/adjective_adverbs_add.jsonl', 'w') as f:
    for item in modified_samples:
        object = json.loads(item)
        line = json.dumps(object) + "\n"
        f.write(line)
f.close()

100%|██████████| 100/100 [07:45<00:00,  4.66s/it]


## Pragmatics and discourse

### Negation

In [70]:
negation_instruction = '''Negate the text by making minimal modifications to introduce different types of negation. The types of negation include:

1. "verbal": verbal negation: when the negation is grammatically associated with the verb, the head of the clause.
Examples:
He trusts them. => He does not trust them.

2. "absolute": Absolute negator: no (including compounds nobody, nothing, etc., and the independent form none), neither, nor, never.
Example:
He trusts them. => He trusts noone.

3. "approximate": Approximate negators: few, little; barely, hardly, scarcely; rarely, seldom.
Example:
He trusts people. => He rarely trusts people.

4. "affixal": Affixal negators: un-, in-, non-, -less, etc. Do not change the root of the word you add the affix to.
Example:
He is healthy => He is unhealthy.

5. "lexical": Lexical negation: when the negation is added by substituting the main predicate of the sentence with its antonym or word carrying negative meaning.
Examples:
The house is big. => The house is small.

6. "unimportant": Unimportant negation: when the negation does not affect the main clause of the text.
Examples:
A man is driving his car. => A shirtless man is driving his car.

7. "double": Double negation: when there are two instances of negation that cancel each other and the meaning is affirmative.'''

In [71]:
modified_samples = request(no_contra_random_samples, negation_instruction)
with open('decode_no/negation.jsonl', 'w') as f:
    for item in modified_samples:
        object = json.loads(item)
        line = json.dumps(object) + "\n"
        f.write(line)
f.close()

100%|██████████| 100/100 [35:06<00:00, 21.06s/it]


In [72]:
antinegation_instruction = '''Check if the text contains negation, and if it does, remove it. Do not make any other modifications. If there is no negation, output Skip. 

type: "remove_negation"

Examples:  

Text: The house is not pretty. 
Modified text: The house is pretty.  

 
Text: The story didn't leave anyone unaffected. 
Modified text: The story left everyone unaffected. 


Text: The house is pretty. 
Modified text: Skip.  ''' 

In [73]:
modified_samples = request(no_contra_random_samples, antinegation_instruction)
with open('decode_no/remove_negation.jsonl', 'w') as f:
    for item in modified_samples:
        object = json.loads(item)
        line = json.dumps(object) + "\n"
        f.write(line)
f.close()

100%|██████████| 100/100 [04:47<00:00,  2.88s/it]


## Discourse markers

In [74]:
discourse_instruction = '''Make modification to the discourse makers in the text. The types of modification include:

1. "addition": Add discourse markers to the sentence.
Example:
He was hungry, he went out to eat => He was hungry, so he went out to eat.

2. "replacement": Change the discourse marker into a different one. If there is no discourse marker, output Skip.
Example:
He was hungry although he had dinner => He was hungry so he had dinner. (although => so)

3. "deletion": Delete the discourse marker. If there is no discourse marker, output Skip.
Example:
He was hungry so he had dinner => He was hungry, he had dinner.

For each type of modification, reply with the modified sentences.'''

In [75]:
modified_samples = request(no_contra_random_samples, discourse_instruction)
with open('decode_no/discourse.jsonl', 'w') as f:
    for item in modified_samples:
        object = json.loads(item)
        line = json.dumps(object) + "\n"
        f.write(line)
f.close()

100%|██████████| 100/100 [14:35<00:00,  8.75s/it]


## Sentiment

In [76]:
sentiment_instruction = '''Add a word or a phrase with a positive or negative sentiment to the sentence. 

Example: We beat the competition -> We are happy to beat the competition.'''

In [77]:
modified_samples = request(no_contra_random_samples, sentiment_instruction)
with open('decode_no/sentiment.jsonl', 'w') as f:
    for item in modified_samples:
        object = json.loads(item)
        line = json.dumps(object) + "\n"
        f.write(line)
f.close()

100%|██████████| 100/100 [08:49<00:00,  5.30s/it]


## Style

### Casual vs formal style

In [78]:
casual_instruction = '''Rewrite the sentence in informal way.'''

In [79]:
modified_samples = request(no_contra_random_samples, casual_instruction)
with open('decode_no/casual.jsonl', 'w') as f:
    for item in modified_samples:
        object = json.loads(item)
        line = json.dumps(object) + "\n"
        f.write(line)
f.close()

100%|██████████| 100/100 [08:21<00:00,  5.01s/it]


### Simple vs Complex English

### Plain English

In [80]:
plain_english_instruction = '''Please rewrite the sentence in order to make it easier to understand by non-native speakers of English. 
You can do so by replacing complex words with simpler synonyms (i.e. paraphrasing), deleting unimportant information (i.e. compres-sion), and/or splitting a long complex sentence into several simpler ones. The final simplified sentence needs to be grammatical, fluent, and retain the main ideas of its original counterpart without altering its meaning.

type: "plain_english"'''

In [81]:
modified_samples = request(no_contra_random_samples, plain_english_instruction)
with open('decode_no/plain_english.jsonl', 'w') as f:
    for item in modified_samples:
        object = json.loads(item)
        line = json.dumps(object) + "\n"
        f.write(line)
f.close()

100%|██████████| 100/100 [08:23<00:00,  5.04s/it]


### Complex English

In [82]:
complex_english_instruction = '''Transform the original sentence into more complex English. 
Change some words with their more sophisticated technical synonym.
Change the sentence structure into more complex one.
Keep the modified sentence has nearly similar length with original one and do not alter its meaning.

type: "complex_english"'''

In [83]:
modified_samples = request(no_contra_random_samples, complex_english_instruction)
with open('decode_no/complex_english.jsonl', 'w') as f:
    for item in modified_samples:
        object = json.loads(item)
        line = json.dumps(object) + "\n"
        f.write(line)
f.close()

100%|██████████| 100/100 [09:43<00:00,  5.83s/it]


### Dialectal features

#### African American English

In [84]:
aave_english_instruction = '''Rewrite the text in African American Vernacular English.

type: "aave_english"'''

In [85]:
modified_samples = request(no_contra_random_samples, aave_english_instruction)
with open('decode_no/aave_english.jsonl', 'w') as f:
    for item in modified_samples:
        object = json.loads(item)
        line = json.dumps(object) + "\n"
        f.write(line)
f.close()

100%|██████████| 100/100 [08:43<00:00,  5.23s/it]


In [86]:
singlish_instruction = '''Rewrite the text in Singapore Colloquial English (Singlish, Basilect).

type: "singlish"'''

In [87]:
modified_samples = request(no_contra_random_samples, singlish_instruction)
with open('decode_no/singlish.jsonl', 'w') as f:
    for item in modified_samples:
        object = json.loads(item)
        line = json.dumps(object) + "\n"
        f.write(line)
f.close()

100%|██████████| 100/100 [08:45<00:00,  5.26s/it]


#### Indian English 

In [88]:
higlish_instruction = '''Rewrite the text using Indianisms and Indian English grammar.

type: "indian_english"'''

In [89]:
modified_samples = request(no_contra_random_samples, higlish_instruction)
with open('decode_no/indian_english.jsonl', 'w') as f:
    for item in modified_samples:
        object = json.loads(item)
        line = json.dumps(object) + "\n"
        f.write(line)
f.close()

100%|██████████| 100/100 [08:30<00:00,  5.11s/it]
