<a href="https://colab.research.google.com/github/jacinthes/slovene-nli-benchmark/blob/main/GPT3.5%20synthetic%20data%20generation/synthetic_data_generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Synthetic data generation**
This script generates labeled premise-hypothesis pairs.
It is given a list of sentences, which are used as premises, and a prompt that instructs GPT-3 to generate three hypotheses for each premise - one for each NLI label (entailment, neutral, contradiction).

In [None]:
pip install openai

In [8]:
import openai
import pandas as pd
import re
from time import time, sleep
from tqdm import tqdm
import os


# Read the OpenAI API key from the OPENAI_API_KEY environment variable so the
# key never has to be saved inside the notebook. If the variable is not set,
# the key stays empty, exactly as before - provide your key in that case.
openai.api_key = os.environ.get('OPENAI_API_KEY', '')

In [29]:
def open_file(filepath):
    """Return the complete contents of the UTF-8 text file at *filepath*."""
    with open(filepath, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents

# This is the call to GPT3.
def gpt3_completion(prompt, engine='text-davinci-003', temp=0.05, top_p=1.0, tokens=1000,
                    freq_pen=0.0, pres_pen=0.0, log_dir='gpt3_logs'):
    """Send *prompt* to the OpenAI Completion endpoint and return the reply text.

    The reply is stripped and every run of whitespace (including newlines) is
    collapsed to a single space, so the result is always a single line. The
    prompt and the cleaned reply are also written to a timestamped file in
    *log_dir* so they can be inspected later.

    Args:
        prompt: Prompt text. Characters that cannot be encoded as UTF-8 are
            dropped before the request is sent.
        engine: Model name, passed to the API as ``engine``.
        temp: Passed to the API as ``temperature``.
        top_p: Passed to the API as ``top_p``.
        tokens: Passed to the API as ``max_tokens``.
        freq_pen: Passed to the API as ``frequency_penalty``.
        pres_pen: Passed to the API as ``presence_penalty``.
        log_dir: Directory for the prompt/response logs; created if missing.

    Returns:
        The cleaned completion text, or ``None`` if the API call failed.
        A failure to write the log file is reported but does not discard
        the response.
    """
    # NOTE(review): 'text-davinci-003' appears to have been retired by OpenAI;
    # confirm and pass a currently available model via `engine` if needed.
    prompt = prompt.encode(encoding='utf-8', errors='ignore').decode()

    try:
        response = openai.Completion.create(
            engine=engine,
            prompt=prompt,
            temperature=temp,
            max_tokens=tokens,
            top_p=top_p,
            frequency_penalty=freq_pen,
            presence_penalty=pres_pen)
        text = response['choices'][0]['text'].strip()
    except Exception as e:
        # Best-effort boundary: report the problem and let the caller skip this prompt.
        print('Error communicating with OpenAI:', e)
        return None

    text = re.sub(r'\s+', ' ', text)

    # Save the whole prompt and the response so that we can inspect it when necessary.
    # Logging is kept separate from the API call so that a disk problem is not
    # reported as an OpenAI error and does not throw away a valid response.
    try:
        # Create the logs folder if it does not exist
        os.makedirs(log_dir, exist_ok=True)
        log_path = os.path.join(log_dir, '%s_gpt3.txt' % time())
        # Explicit UTF-8 so non-ASCII text is written correctly on every platform.
        with open(log_path, 'w', encoding='utf-8') as outfile:
            outfile.write('PROMPT:\n\n' + prompt + '\n\n###############\n\nRESPONSE:\n\n' + text)
    except OSError as e:
        print('Could not write the GPT-3 log file:', e)

    return text

In [76]:
# Read input sentences - one sentence per line.
# The file is decoded as UTF-8 explicitly (like the prompt file) so non-ASCII
# text is read the same way on every platform. Surrounding whitespace is
# stripped and blank lines are dropped so they do not trigger API calls.
with open("sample_sentences.txt", "r", encoding="utf-8") as f:
    sentences = [line.strip() for line in f if line.strip()]

premises = list()
hypotheses = list()
labels = list()

# Base prompt which instructs GPT to generate three training samples and returns them using a defined format so that it can then be parsed
prompt_base = open_file('NLI_generation_prompt.txt')
for sentence in tqdm(sentences):
    prompt = prompt_base.replace('<<PREMISE>>', sentence)  # Replace the premise with the new sentence
    gpt3_response = gpt3_completion(prompt)

    # gpt3_completion returns None when the API call failed (it has already
    # printed the error), so there is nothing to parse for this sentence.
    if gpt3_response is None:
        print(f'No response for sentence: {sentence}\n')
        continue

    try:
        # Parse the response, which is expected to look like
        # "Contradiction: ... Entailment: ... Neutral: ..."
        contradiction = re.search(r'Contradiction: (.*?) Entailment', gpt3_response).group(1)
        entailment = re.search(r'Entailment: (.*?) Neutral', gpt3_response).group(1)
        neutral = gpt3_response.split('Neutral: ')[1]
    except (AttributeError, IndexError) as e:
        # AttributeError: a regex did not match; IndexError: no 'Neutral: ' part.
        print('Error parsing the response:', e)
        print(f'Bad response for sentence: {sentence}')
        print(f'Response: {gpt3_response}\n')
        continue

    # Save one (premise, hypothesis, label) row per NLI label; rows are only
    # added once all three hypotheses have been parsed successfully.
    for label, hypothesis in (('entailment', entailment),
                              ('contradiction', contradiction),
                              ('neutral', neutral)):
        premises.append(sentence)
        hypotheses.append(hypothesis)
        labels.append(label)

In [None]:
# Collect the three parallel lists into one table and write it to an Excel file
# in the current working directory
df = pd.DataFrame({
    'PREMISES': premises,
    'HYPOTHESES': hypotheses,
    'LABELS': labels,
})
df.to_excel('synthetic_data.xlsx')