# Create similar prompts datasets

## Goal

Use the GPT-4 API to create a diverse, high-quality dataset.

## Imports

In [None]:
import os
from openai import OpenAI
import pandas as pd
from tqdm.auto import tqdm
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import re
import random
import json
import time
from concurrent.futures import ProcessPoolExecutor

from transformers import AutoTokenizer

from prometeo.evaluation import get_sharpened_cosine_similarity, estimate_mean

# Draw and immediately close an empty figure before touching rcParams.
# NOTE(review): presumably a notebook workaround so the settings below take effect — confirm.
plt.plot()
plt.close('all')
# Default figure size, line width and font size for every plot in this notebook.
plt.rcParams["figure.figsize"] = (20, 5)
mpl.rcParams['lines.linewidth'] = 3
mpl.rcParams['font.size'] = 16

# Allow up to 200 characters per cell when DataFrames are displayed.
pd.set_option('display.max_colwidth', 200)

## Code

In [None]:
def chat_with_gpt4(prompt, temperature=0.7, model="gpt-4-0125-preview"):
    """Send a single user prompt to the OpenAI chat API and return the reply text.

    The request is made with ``response_format={"type": "json_object"}``, so
    the returned string is expected to be a JSON document.
    NOTE(review): OpenAI's JSON mode is documented to require that the
    messages themselves ask for JSON — confirm against the API reference.

    Args:
        prompt: Text sent as the user message, after a fixed system message.
        temperature: Sampling temperature forwarded to the API.
        model: Name of the chat model to query. Defaults to the model that
            was previously hard-coded in this function.

    Returns:
        The ``content`` of the first choice of the completion.

    Raises:
        KeyError: If ``OPENAI_API_KEY`` or ``OPENAI_API_ORG`` is missing from
            the environment.
    """
    # The client is created on every call; the function keeps no module-level state.
    client = OpenAI(
        api_key=os.environ['OPENAI_API_KEY'],
        organization=os.environ['OPENAI_API_ORG'],
    )
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]
    completion = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature,
        response_format={"type": "json_object"},
    )
    return completion.choices[0].message.content

In [None]:
def monitor_progress(submits, poll_interval=1):
    """Block until every item in ``submits`` reports ``done()``, showing a progress bar.

    Args:
        submits: Sized collection of objects exposing a ``done()`` method
            (for example the futures returned by ``Executor.submit``).
        poll_interval: Seconds to sleep between two polls of the futures.
    """
    total = len(submits)
    progress = 0
    with tqdm(total=total, smoothing=0) as progress_bar:
        while True:
            # Poll before sleeping so that already-finished (or empty) work
            # returns immediately instead of waiting a full interval.
            current_progress = sum(submit.done() for submit in submits)
            if current_progress > progress:
                progress_bar.update(current_progress - progress)
                progress = current_progress
            if progress >= total:
                break
            time.sleep(poll_interval)

In [None]:
# A bare `raise` with no active exception raises RuntimeError and stops execution here.
# NOTE(review): presumably a deliberate guard so "Run All" does not execute the cells below — confirm.
raise

## Create superdataset

As a first step, let's concatenate all the training datasets into a single one for easier handling.

In [None]:
# CSV files that are merged into the super dataset.
filepaths = [
    '/mnt/hdd0/Kaggle/llm_prompt_recovery/data/high_quality_dataset_imitation_of_leaked_v1.csv',
    '/mnt/hdd0/Kaggle/llm_prompt_recovery/data/high_quality_dataset_v2.csv',
    '/mnt/hdd0/Kaggle/llm_prompt_recovery/data/high_quality_dataset_v3.csv',
    '/mnt/hdd0/Kaggle/llm_prompt_recovery/data/high_quality_dataset_v4_with_hints.csv',
    '/mnt/hdd0/Kaggle/llm_prompt_recovery/data/high_quality_dataset_v5_gpt4.csv',
    '/mnt/hdd0/Kaggle/llm_prompt_recovery/data/high_quality_multi_instruction_v1.csv',
    '/mnt/hdd0/Kaggle/llm_prompt_recovery/data/mooney_test_with_gpt4_v2.csv',
    '/mnt/hdd0/Kaggle/llm_prompt_recovery/data/gemma_suppl_rewrite_curated_with_gpt4.csv',
]
# Stack every file row-wise and keep only the three columns used downstream.
df = pd.concat(
    [pd.read_csv(csv_path) for csv_path in filepaths],
    ignore_index=True,
)[['original_text', 'rewritten_text', 'rewrite_prompt']]
print(df.shape)
# Shuffle the rows (no seed is set) and renumber the index from zero.
df = df.sample(frac=1)
df = df.reset_index(drop=True)
df.head()

In [None]:
# Persist the shuffled, concatenated dataset; the index is not written to the file.
df.to_csv('/mnt/hdd0/Kaggle/llm_prompt_recovery/data/super_dataset_v1.csv', index=False)

## Get similar prompts

In [None]:
# Reload the saved super dataset so this section can run without the cells above.
df = pd.read_csv('/mnt/hdd0/Kaggle/llm_prompt_recovery/data/super_dataset_v1.csv')
df.head()

In [None]:
# Distinct rewrite prompts; each is sent to GPT-4 only once further below.
unique_prompts = df['rewrite_prompt'].unique()
# Number of distinct prompts versus total number of rows.
print(len(unique_prompts), len(df))

Some prompts are repeated.

In [None]:
# Instruction sent to GPT-4 for every prompt. The literal PLACEHOLDER token is
# substituted with the prompt to rewrite via str.replace in the cells below, and
# the reply is requested as a JSON object with keys rewrite_1 ... rewrite_10.
prompt_template = """Your task is to rewrite the following text prompt while preserving the meaning.
You can reword the text, use synonyms, or change the structure of the sentences.
Please provide 10 different rewrites using this json format:
{"rewrite_1": "...", "rewrite_2": "...", ...., "rewrite_10": "..."}

```text
PLACEHOLDER
```
"""

I made a first naive generation with 10 prompts and it took 90 seconds, which means it would take about 3.5 hours to generate prompts for the whole dataset. However, I know that the rate limit for gpt-4-turbo is 500 RPM (requests per minute), so ideally I could do the task in 2 minutes. I have to parallelize the generation.

In [None]:
# Sequential baseline: query GPT-4 for the first 10 prompts only and echo
# every request/response pair for manual inspection.
responses = []
for unique_prompt in tqdm(unique_prompts[:10]):
    formatted_prompt = prompt_template.replace('PLACEHOLDER', unique_prompt)
    print(formatted_prompt)
    response = chat_with_gpt4(formatted_prompt)
    responses.append(response)
    print(response)
    print('*'*100)

In [None]:
# The API calls are network-bound, so threads overlap the waiting time without
# having to start 40 separate worker processes.
from concurrent.futures import ThreadPoolExecutor

with ThreadPoolExecutor(max_workers=40) as pool:
    # One request per unique prompt; the order of `submits` matches `unique_prompts`.
    submits = [
        pool.submit(chat_with_gpt4, prompt_template.replace('PLACEHOLDER', unique_prompt))
        for unique_prompt in tqdm(unique_prompts, desc='Creating submits')
    ]
    monitor_progress(submits)
    # Collect the raw JSON strings in submission order; a failed request re-raises here.
    results = [submit.result() for submit in submits]

In just 5 minutes the work is done! I could probably have done it faster using 80 workers, but I didn't want to risk hitting the rate limit. This is already pretty fast.

The cost of generating these new prompts was $11.

In [None]:
# Decode every raw GPT-4 reply from its JSON string.
parsed_results = list(map(json.loads, results))

In [None]:
# Distinct numbers of entries per parsed reply, with how often each occurs.
np.unique(list(map(len, parsed_results)), return_counts=True)

All the results seem to have 10 prompts as required, which is very good.

In [None]:
# Pair every original prompt with its parsed reply and store the mapping as JSON.
prompt_to_variations = {
    original_prompt: parsed for original_prompt, parsed in zip(unique_prompts, parsed_results)
}
with open('/mnt/hdd0/Kaggle/llm_prompt_recovery/data/prompt_to_variations.json', 'w') as json_file:
    json_file.write(json.dumps(prompt_to_variations))

## Measure similarity with T5

Let's measure the similarity of the prompts using T5 embeddings.

In [None]:
# For every prompt, score each of its variations against the original prompt.
prompt_to_similarity = {}
for prompt, variations in tqdm(prompt_to_variations.items(), total=len(prompt_to_variations)):
    keys = list(variations)
    prompt_variations = list(variations.values())
    # NOTE(review): assumes the helper returns one score per variation, in input order — confirm.
    similarity = get_sharpened_cosine_similarity([prompt], prompt_variations)
    prompt_to_similarity[prompt] = {key: score for key, score in zip(keys, similarity)}

Let's visualize a histogram of the similarity values.

In [None]:
# Flatten the per-prompt scores into a single array and plot their distribution.
similarity_values = np.concatenate(
    [list(scores.values()) for scores in prompt_to_similarity.values()]
)
plt.hist(similarity_values, bins=100)
plt.title('Distribution of SCS between prompts and their variations')
plt.xlabel('Sharpened cosine similarity');

In [None]:
# Fraction of variations whose similarity exceeds 0.8.
(similarity_values > 0.8).mean()

The distribution looks pretty good: 85% of the prompt variations score above `0.8`.
I believe that is a good threshold.

## Filter the prompts

In [None]:
# Keep, for every prompt, only the variations scoring strictly above the threshold.
similarity_threshold = 0.8
prompt_to_variations_filtered = {
    prompt: [
        prompt_to_variations[prompt][key]
        for key, score in similarity.items()
        if score > similarity_threshold
    ]
    for prompt, similarity in prompt_to_similarity.items()
}

In [None]:
# Histogram of how many variations survive the filter per prompt (one bin per count).
plt.hist(list(map(len, prompt_to_variations_filtered.values())), bins=np.arange(0.5, 11));

In [None]:
# Total number of variations kept across all prompts.
np.sum(list(map(len, prompt_to_variations_filtered.values())))

## Create the superdataset v2

In [None]:
# Expand every row of `df` into one row per surviving variation of its prompt:
# the texts are kept and the prompt is replaced by the variation.
super_dataset = []
# Iterate the three columns in lockstep instead of DataFrame.iterrows, which
# builds a Series object for every row and is much slower on large frames.
rows = zip(df['original_text'], df['rewritten_text'], df['rewrite_prompt'])
for original_text, rewritten_text, prompt in tqdm(rows, total=len(df)):
    for variation in prompt_to_variations_filtered[prompt]:
        super_dataset.append([original_text, rewritten_text, variation])
# convert to pandas dataframe
super_df = pd.DataFrame(super_dataset, columns=['original_text', 'rewritten_text', 'rewrite_prompt'])
print(super_df.shape)
super_df.head()

In [None]:
# Shuffle the rows (no seed is set) and write them out without the index column.
super_df.sample(frac=1).to_csv('/mnt/hdd0/Kaggle/llm_prompt_recovery/data/super_dataset_v2.csv', index=False)