# Create similar prompts datasets

## Goal

Use the GPT-4 API to create a diverse, high-quality dataset.

## Imports

In [None]:
import json
import os
import random
import re
import time
from concurrent.futures import ProcessPoolExecutor
from concurrent.futures import ThreadPoolExecutor, as_completed

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from openai import OpenAI
from tqdm.auto import tqdm
from transformers import AutoTokenizer

from prometeo.evaluation import get_sharpened_cosine_similarity, estimate_mean

# Notebook-wide plotting and display defaults.
# NOTE(review): the empty plot that is closed right away looks like a warm-up
# so the rcParams changes below take effect — confirm before removing.
plt.plot()
plt.close('all')
plt.rcParams["figure.figsize"] = (20, 5)  # default figure size (width, height)
mpl.rcParams['lines.linewidth'] = 3
mpl.rcParams['font.size'] = 16

# Allow up to 200 characters per DataFrame cell when displaying long texts.
pd.set_option('display.max_colwidth', 200)

## Code

In [None]:
def chat_with_gpt4(prompt, temperature=0.7, model="gpt-4-0125-preview"):
    """Send a single-turn prompt to an OpenAI chat model and return its reply.

    The request sets ``response_format={"type": "json_object"}``, so the
    returned string is meant to be parsed with ``json.loads`` by the caller.

    Args:
        prompt: Text sent as the user message.
        temperature: Sampling temperature forwarded to the API.
        model: Chat model identifier. Defaults to the GPT-4 snapshot this
            notebook was written against.

    Returns:
        The message content of the first choice in the completion.

    Raises:
        KeyError: If ``OPENAI_API_KEY`` or ``OPENAI_API_ORG`` is not set in
            the environment.
    """
    # The client is created per call; this function is handed to executor
    # workers elsewhere in the notebook, so it keeps no shared state.
    client = OpenAI(
        api_key=os.environ['OPENAI_API_KEY'],
        organization=os.environ['OPENAI_API_ORG'],
    )
    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
        temperature=temperature,
        response_format={"type": "json_object"},
    )
    return completion.choices[0].message.content

In [None]:
def monitor_progress(submits):
    """Block until every future in ``submits`` is done, showing a tqdm bar.

    Exceptions raised inside the futures are not re-raised here; they surface
    only when the caller asks for ``.result()``.

    Args:
        submits: Collection of ``concurrent.futures.Future`` objects.
    """
    with tqdm(total=len(submits), smoothing=0) as progress_bar:
        # as_completed yields each future the moment it finishes, so the bar
        # advances immediately and an empty or already-finished list returns
        # at once instead of sleeping for a polling interval first.
        for _ in as_completed(submits):
            progress_bar.update(1)

In [None]:
# NOTE(review): a bare `raise` with no active exception raises RuntimeError.
# Presumably a deliberate stop so "Run all" does not reach the paid API cells
# below — confirm.
raise

## Select seed prompts

In [None]:
# Collect the previously built datasets into one frame and keep only the
# three text columns.
filepaths = [
    '/mnt/hdd0/Kaggle/llm_prompt_recovery/data/high_quality_dataset_imitation_of_leaked_v1.csv',
    '/mnt/hdd0/Kaggle/llm_prompt_recovery/data/high_quality_dataset_v2.csv',
    '/mnt/hdd0/Kaggle/llm_prompt_recovery/data/high_quality_dataset_v5_gpt4.csv',
    '/mnt/hdd0/Kaggle/llm_prompt_recovery/data/high_quality_multi_instruction_v1.csv',
    '/mnt/hdd0/Kaggle/llm_prompt_recovery/data/mooney_test_with_gpt4_v2.csv',
    '/mnt/hdd0/Kaggle/llm_prompt_recovery/data/gemma_suppl_rewrite_curated_with_gpt4.csv',
]
frames = [pd.read_csv(path) for path in filepaths]
df = pd.concat(frames, ignore_index=True)
columns = ['original_text', 'rewritten_text', 'rewrite_prompt']
df = df[columns]
print(df.shape)
df.head()

In [None]:
# Distinct rewrite prompts; these are sampled as examples when asking for new ones.
seed_prompts = df['rewrite_prompt'].unique()
print(len(seed_prompts))

## Generate new prompts

In [None]:
# Instruction header for the prompt-generation request. generate_new_prompts
# appends a newline-joined sample of seed prompts after the final heading.
new_prompts_template = """Your task is to generate new, realistic and diverse prompts that could be used to rewrite some text.
Below you can find some examples of prompts for inspiration.
Please create 20 new prompts and answer them with a list in json format.

Use this format:

```json
{
  "prompts": [
    ...
  ]
}
```

## Examples of prompts:

"""

def generate_new_prompts(n_runs, random_seed=42, n_examples=10, max_workers=40):
    """Request new rewrite prompts from GPT-4 over ``n_runs`` API calls.

    Every request consists of ``new_prompts_template`` followed by a fresh
    random sample of ``seed_prompts`` used as inspiration examples.

    Args:
        n_runs: Number of API requests to issue.
        random_seed: Seed for NumPy's global RNG, making the sampled examples
            reproducible.
        n_examples: Number of distinct seed prompts shown per request.
        max_workers: Maximum number of requests in flight at once.

    Returns:
        List of raw reply strings, one per request, in submission order.

    Raises:
        ValueError: If ``seed_prompts`` has fewer than ``n_examples`` items
            (raised by ``np.random.choice`` with ``replace=False``).
        Exception: Any error raised by a request is re-raised when its result
            is collected.
    """
    np.random.seed(random_seed)
    # The requests are network-bound, so threads give the same concurrency
    # without pickling the callable and its arguments into worker processes.
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        submits = []
        for _ in tqdm(range(n_runs), desc='Creating submits'):
            examples = np.random.choice(seed_prompts, n_examples, replace=False)
            prompt = new_prompts_template + '\n'.join(examples)
            submits.append(pool.submit(chat_with_gpt4, prompt))
        monitor_progress(submits)
        results = [submit.result() for submit in submits]
    return results

In [None]:
# 100 requests; the template asks the model for 20 prompts per request.
results = generate_new_prompts(n_runs=100, random_seed=42)

In [None]:
# Flatten the per-request JSON replies into a single list of prompts.
# A malformed reply (invalid JSON, missing "prompts" key, wrong type) is
# reported and skipped so one bad response does not discard all the others.
new_prompts = []
for result in results:
    try:
        new_prompts.extend(json.loads(result)['prompts'])
    except (json.JSONDecodeError, KeyError, TypeError) as e:
        print(f'Skipping malformed result: {e!r}')
len(new_prompts)

In [None]:
# Persist the generated prompts as a single-column CSV.
# Note that this rebinds `df`, which previously held the seed datasets.
prompts_csv_path = '/mnt/hdd0/Kaggle/llm_prompt_recovery/data/prompts/prompts_v3_scale.csv'
df = pd.DataFrame(new_prompts, columns=['prompt'])
df.to_csv(prompts_csv_path, index=False)

## Generate train samples

In [None]:
# NOTE: this redefines generate_new_prompts, which is already defined in the
# "Generate new prompts" section; none of the cells visible below call it.
def generate_new_prompts(n_runs, random_seed=42, n_examples=10, max_workers=40):
    """Request new rewrite prompts from GPT-4 over ``n_runs`` API calls.

    Every request consists of ``new_prompts_template`` followed by a fresh
    random sample of ``seed_prompts`` used as inspiration examples.

    Args:
        n_runs: Number of API requests to issue.
        random_seed: Seed for NumPy's global RNG, making the sampled examples
            reproducible.
        n_examples: Number of distinct seed prompts shown per request.
        max_workers: Maximum number of requests in flight at once.

    Returns:
        List of raw reply strings, one per request, in submission order.

    Raises:
        ValueError: If ``seed_prompts`` has fewer than ``n_examples`` items
            (raised by ``np.random.choice`` with ``replace=False``).
        Exception: Any error raised by a request is re-raised when its result
            is collected.
    """
    np.random.seed(random_seed)
    # The requests are network-bound, so threads give the same concurrency
    # without pickling the callable and its arguments into worker processes.
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        submits = []
        for _ in tqdm(range(n_runs), desc='Creating submits'):
            examples = np.random.choice(seed_prompts, n_examples, replace=False)
            prompt = new_prompts_template + '\n'.join(examples)
            submits.append(pool.submit(chat_with_gpt4, prompt))
        monitor_progress(submits)
        results = [submit.result() for submit in submits]
    return results

In [None]:
# Request text for building one train sample. The literal PLACEHOLDER is
# swapped for a rewrite prompt with str.replace before the request is sent;
# the reply is expected to be a JSON object with the two keys shown below.
prompt_template = """
Given the following text prompt your task is to:

1. Write a short text that could have sense to be modified with the given text prompt. The number of words should be less than 200.
2. Rewrite the text using the given text prompt.

The output should be in json, with the following format:

{"original_text": "...", "rewritten_text": "..."}

## Text prompt

```PLACEHOLDER```
"""

In [None]:
# Issue one request per generated prompt. The calls are network-bound, so a
# thread pool gives the same concurrency without pickling work into separate
# processes.
with ThreadPoolExecutor(max_workers=40) as pool:
    submits = []
    for prompt in tqdm(new_prompts, desc='Creating submits'):
        # Use a separate name for the filled-in template so the loop variable
        # is not overwritten.
        request_text = prompt_template.replace('PLACEHOLDER', prompt)
        submits.append(pool.submit(chat_with_gpt4, request_text))
    monitor_progress(submits)
    # Results are collected in submission order, so they line up with new_prompts.
    results = [submit.result() for submit in submits]

The cost of this generation was around $20.

In [None]:
# Turn each raw reply into a row dict tagged with the prompt that produced it.
# Replies that cannot be parsed or tagged are reported and left out.
rows = []
for prompt, result in tqdm(zip(new_prompts, results), total=len(new_prompts)):
    try:
        row = json.loads(result)
        row['rewrite_prompt'] = prompt
    except Exception as e:
        print(f'Error with prompt: {prompt}')
        print(e)
    else:
        rows.append(row)

In [None]:
# One row per successfully parsed reply; `df` is rebound once more here.
df = pd.DataFrame(rows)
df.head()

In [None]:
# Write the generated train samples to disk without the index column.
df.to_csv('/mnt/hdd0/Kaggle/llm_prompt_recovery/data/high_quality_dataset_v6_scale.csv', index=False)