# Create dataset with GPT4

## Goal

Use the GPT-4 API to create a diverse, high-quality dataset.

## Imports

In [None]:
import os
from openai import OpenAI
import pandas as pd
from tqdm.auto import tqdm
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import re
import random
import json

from transformers import AutoTokenizer

from prometeo.evaluation import get_sharpened_cosine_similarity, estimate_mean

# Create and immediately close a throwaway figure before touching rcParams.
# NOTE(review): presumably a workaround so the notebook plotting backend is
# initialised before the defaults below are set - confirm it is still needed.
plt.plot()
plt.close('all')
# Default plot appearance: 20x5 figures, thicker lines, larger font.
plt.rcParams["figure.figsize"] = (20, 5)
mpl.rcParams['lines.linewidth'] = 3
mpl.rcParams['font.size'] = 16

# Allow up to 200 characters per column when displaying DataFrames.
pd.set_option('display.max_colwidth', 200)

## Definitions

In [None]:
# Kinds of text that can be substituted for the `{text}` placeholder in prompts.
text_types = [
    'text',
    'email',
    'letter',
    'memo',
    'dialogue',
    'news',
    'poem',
    'essay',
    'report',
    'academic paper',
    'story',
    'play',
    'blog',
    'advertisement',
    'brochure',
    'social media post',
    'manual',
    'guide',
    'legal document',
    'diary',
    'speech',
    'lecture',
    'review',
]
print(text_types)

In [None]:
# Value lists used to expand the matching placeholders in prompts.
poem_types = [
    'haiku',
    'sonnet',
    'limerick',
]
languages = [
    'chinese',
    'spanish',
    'hindi',
    'portuguese',
    'russian',
    'french',
    'german',
]
# 'Ruby' and 'PHP' were removed because the created samples were wrong.
programming_languages = [
    'javascript',
    'python',
    'java',
    'C++',
]
codes = [
    'morse code',
    'base64',
]
# One entry per Caesar shift from 1 to 5 inclusive.
encryptions = [f'Caesar cipher shift {shift}' for shift in (1, 2, 3, 4, 5)]

In [None]:
# Writing styles that can be substituted for the `{writing_style}` placeholder.
writing_styles = [
    'academic',
    'action-packed',
    'advertising copy',
    'blog writing',
    'business',
    'descriptive',
    'fantasy writing',
    'fiction',
    'non-fiction',
    'formal',
    'informal',
    'horror writing',
    'humorous',
    'imaginative',
    'journalistic',
    'legal writing',
    'medical writing',
    'mystery writing',
    'narrative style',
    'scientific report',
    'white paper',
]
print(writing_styles)

# Authors that can be substituted for the `{author}` placeholder.
authors = [
    'Edgar Allan Poe',
    'Emily Dickinson',
    'Ernest Hemingway',
    'F. Scott Fitzgerald',
    'Jane Austen',
    'J.D. Salinger',
    'Langston Hughes',
    'Mark Twain',
    'Maya Angelou',
    'Toni Morrison',
    'Dr. Seuss',
    'Tupac Shakur',
    'William Shakespeare',
]
print(authors)

In [None]:
# Tones that can be substituted for the `{tone}` placeholder.
tones = [
    'formal',
    'informal',
    'humorous',
    'persuasive',
    'objective',
    'subjective',
    'optimistic',
    'pessimistic',
    'sarcastic',
    'sincere',
    'inspirational',
    'critical',
    'descriptive',
    'narrative',
    'didactic',
]

In [None]:
# Values substituted for the `{person}` placeholder.
persons = [
    'first',
    'third',
]

In [None]:
# Markup formats for the `{text_format}` placeholder.
text_formats = [
    'Markdown',
    'HTML',
    'RST (Restructured Text)',
]
# Serialization formats for the `{data_format}` placeholder.
data_formats = [
    'CSV',
    'JSON',
    'YAML',
]

In [None]:
# Music genres substituted for the `{song}` placeholder.
songs = [
    'country',
    'rock',
    'pop',
    'rap',
    'reggae',
    'metal',
    'folk',
]

## Code

In [None]:
# API credentials are read from the environment; a missing variable raises KeyError.
client = OpenAI(
    api_key=os.environ['OPENAI_API_KEY'],
    organization=os.environ['OPENAI_API_ORG'],
)

In [None]:
def chat_with_gpt4(prompt, temperature=0.7, model="gpt-4-0125-preview"):
    """Send one user prompt to the chat completions API and return the reply text.

    The request sets ``response_format={"type": "json_object"}``, so the reply
    is expected to be a JSON document. Per the OpenAI documentation, JSON mode
    also requires that the messages themselves instruct the model to produce
    JSON, so ``prompt`` should mention it.

    Args:
        prompt: Text sent as the user message.
        temperature: Sampling temperature forwarded to the API.
        model: Name of the chat model to query. Defaults to the model the
            notebook was originally written for.

    Returns:
        The ``content`` of the first choice's message.
    """
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]
    completion = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature,
        response_format={"type": "json_object"},
    )
    return completion.choices[0].message.content

In [None]:
# Instruction sent to the model for every rewrite prompt. The literal `{prompt}`
# marker is filled in with str.replace (not str.format), which is why the braces
# of the JSON example below do not need to be escaped.
prompt_template = """
Given the following text prompt your task is to:

1. Write a short text that could have sense to be modified with the given text prompt. The number of words should be less than 200.
2. Rewrite the text using the given text prompt.

The output should be in json, with the following format:

{"original_text": "...", "rewritten_text": "..."}

## Text prompt

```{prompt}```
"""

In [None]:
def parse_prompt(response):
    """Extract the prompt text from a response that embeds a JSON-like object.

    The text between the first ``{`` and the next ``}`` is parsed as data. It
    is never evaluated as code: the response comes from a language model and
    must be treated as untrusted input.

    Args:
        response: Raw response text.

    Returns:
        ``response`` unchanged when it contains no ``{``; otherwise the first
        value of the parsed dict, or the single element of a parsed set
        literal such as ``{"some prompt"}``.

    Raises:
        ValueError: If the braces do not contain a non-empty dict or a set
            literal made only of plain Python/JSON literals.
    """
    import ast  # local import: only this function needs it

    if '{' not in response:
        print(f'Response does not contain json: {response}')
        return response

    json_text = '{' + response.split('{')[1].split('}')[0] + '}'
    try:
        # Strict JSON first (handles true/false/null) ...
        data = json.loads(json_text)
    except ValueError:
        try:
            # ... then Python literal syntax (single quotes, set literals).
            data = ast.literal_eval(json_text)
        except (ValueError, SyntaxError, TypeError) as err:
            raise ValueError(f'Could not parse prompt from {response}') from err

    if isinstance(data, dict) and data:
        return next(iter(data.values()))
    if isinstance(data, set):
        return data.pop()
    raise ValueError(f'Could not parse prompt from {response}')

In [None]:
def normalize_response(response, prompt):
    """Rebuild a response so that it ends with a ``{"prompt": ...}`` object.

    Everything from the first ``{`` onwards is dropped and replaced by
    ``{"prompt": "<prompt>"}``. Then each pair of consecutive newlines in the
    result is replaced by a single newline (one pass only).
    """
    preamble = response.partition('{')[0]
    rebuilt = ''.join([preamble, '{"prompt": "', prompt, '"}'])
    return rebuilt.replace('\n\n', '\n')

In [None]:
# Maps a placeholder name (the text between the braces in a prompt) to the list
# of values that can replace it.
key_to_variables = dict(
    text=text_types,
    poem=poem_types,
    language=languages,
    programming_language=programming_languages,
    code=codes,
    encryption=encryptions,
    writing_style=writing_styles,
    author=authors,
    tone=tones,
    person=persons,
    text_format=text_formats,
    data_format=data_formats,
    song=songs,
)

def format_prompts(prompts):
    """Recursively expand every ``{placeholder}`` found in the given prompts.

    Prompts without placeholders are kept unchanged. Otherwise one placeholder
    is filled per recursion step via ``fill_variable_in_prompt``. Placeholders
    other than ``{text}`` are resolved first, and ``{text}`` is only filled
    once no other placeholder is left.

    Returns:
        A flat list with all fully expanded prompts.
    """
    expanded = []
    for template in prompts:
        placeholders = re.findall(r'\{.*?\}', template)
        if placeholders:
            non_text = [name for name in placeholders if name != '{text}']
            chosen = non_text[0] if non_text else placeholders[0]
            expanded.extend(format_prompts(fill_variable_in_prompt(template, chosen)))
        else:
            expanded.append(template)
    return expanded

def fill_variable_in_prompt(prompt, key):
    """Replace one placeholder in ``prompt`` with its candidate values.

    Args:
        prompt: Prompt text containing ``key``.
        key: Placeholder including its braces, e.g. ``'{tone}'``.

    Returns:
        For ``'{text}'``: a one-element list in which only the first occurrence
        is replaced by a randomly sampled value, preferring one that is not
        already a substring of the prompt. For any other placeholder: one
        prompt per candidate value, with every occurrence replaced.

    Raises:
        KeyError: If the placeholder name is not in ``key_to_variables``.
    """
    naked_key = key[1:-1]
    values = key_to_variables[naked_key]
    if naked_key != 'text':
        return [prompt.replace(key, value) for value in values]

    if all(value in prompt for value in values):
        # Every candidate already occurs in the prompt, so rejection sampling
        # would never terminate; accept a repeated value instead.
        sampled_variable = random.choice(values)
    else:
        # Rejection sampling is kept as is, so seeded runs draw the same values.
        while True:
            sampled_variable = random.choice(values)
            if sampled_variable not in prompt:
                break
    return [prompt.replace(key, sampled_variable, 1)]

In [None]:
# Load the tokenizer stored in the local directory `model_path`.
model_path = '/home/gbarbadillo/data/mixtral-8x7b-instruct-v0.1-hf/'
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

In [None]:
def create_dataset(random_seed):
    """Generate one dataset version from ``prompts_v1.csv`` and save it as CSV.

    The seed is applied before the prompts are expanded, because expanding the
    ``{text}`` placeholder samples values with the ``random`` module. The
    result is written to ``high_quality_dataset_v{random_seed}.csv``.

    Args:
        random_seed: Seed for ``random``; also used as the version number in
            the output file name.

    Returns:
        The generated dataset as a DataFrame, so the caller can inspect it
        (for example with ``.head()``).
    """
    df = pd.read_csv('/mnt/hdd0/Kaggle/llm_prompt_recovery/data/prompts/prompts_v1.csv')
    random.seed(random_seed)
    prompts = prepare_prompts(df.prompt)
    rows = generate_samples(prompts)
    dataset = pd.DataFrame(rows)
    dataset.to_csv(f'/mnt/hdd0/Kaggle/llm_prompt_recovery/data/high_quality_dataset_v{random_seed}.csv', index=False)
    return dataset

def prepare_prompts(raw_prompts):
    """Expand the placeholders of each raw prompt and return one flat list."""
    return [
        expanded
        for raw_prompt in raw_prompts
        for expanded in format_prompts([raw_prompt])
    ]

def generate_samples(prompts):
    """Ask the model for an original/rewritten text pair for every prompt.

    Each prompt is inserted into ``prompt_template`` and sent with
    ``chat_with_gpt4``; the reply is parsed as JSON. Prompts whose request or
    parsing fails are reported with ``print`` and skipped, so one failure does
    not abort the whole run.

    Returns:
        A list of dicts with the keys ``original_text``, ``rewritten_text``
        and ``rewrite_prompt``.
    """
    samples = []
    for prompt in tqdm(prompts, desc='Creating data'):
        try:
            raw_reply = chat_with_gpt4(prompt_template.replace('{prompt}', prompt))
            parsed = json.loads(raw_reply)
            samples.append({
                'original_text': parsed['original_text'],
                'rewritten_text': parsed['rewritten_text'],
                'rewrite_prompt': prompt,
            })
        except Exception as e:
            print(f'Error with prompt: {prompt}')
            print(e)
    return samples

In [None]:
# A bare `raise` with no active exception fails with RuntimeError.
# NOTE(review): presumably a deliberate stop so that "Run All" does not reach
# the generation cells below - confirm.
raise

## Create dataset with high quality prompts

### High quality prompts

As a first step, we have to format the prompts, because many of them contain placeholders for different variables.

In [None]:
# Generate one dataset version per seed; each call queries the API and writes a CSV.
for random_seed in (3, 4):
    create_dataset(random_seed)

Each generation of around 275 prompts costs around $2.

- v1 did not use any random seed
- v2, v3, v4 use the seed in their number (2, 3, 4...)

### Create dataset with Newtonbaba prompts

The Newtonbaba gemma dataset produced the best results when using few-shot prompts. Let's see what the prompts look like.

In [None]:
# Load the curated Newtonbaba CSV and preview its first rows.
df = pd.read_csv('/mnt/hdd0/Kaggle/llm_prompt_recovery/data/newtonbaba/gemma_data_set_prompt_recover_1_curated.csv')
df.head()

In [None]:
# Distinct values of the `rewrite_prompt` column: print how many there are,
# then display them as the cell output.
unique_prompts = df.rewrite_prompt.unique()
print(len(unique_prompts))
unique_prompts

I don't like the prompts. I prefer to generate more data using my own prompts.

### Create dataset with prompts that imitate the leaked ones

In [None]:
# Load the prompts that imitate the leaked ones and preview the first rows.
df = pd.read_csv('/mnt/hdd0/Kaggle/llm_prompt_recovery/data/prompts/imitation_of_leaked_v1.csv')
df.head()

In [None]:
# Generate one sample per prompt and save the result. The prompts are used as
# they are in the CSV: unlike create_dataset, no placeholder expansion is done.
rows = generate_samples(df.prompt)
dataset = pd.DataFrame(rows)
dataset.to_csv('/mnt/hdd0/Kaggle/llm_prompt_recovery/data/high_quality_dataset_imitation_of_leaked_v1.csv', index=False)
dataset.head()

63 samples, would they be helpful?

### Multi-instruction prompts

In [None]:
# Load the multi-instruction prompts and preview the first rows.
df = pd.read_csv('/mnt/hdd0/Kaggle/llm_prompt_recovery/data/prompts/multi-instruction-prompts-v1.csv')
df.head()

In [None]:
# Generate one sample per prompt and save the result. The prompts are used as
# they are in the CSV: unlike create_dataset, no placeholder expansion is done.
rows = generate_samples(df.prompt)
dataset = pd.DataFrame(rows)
dataset.to_csv('/mnt/hdd0/Kaggle/llm_prompt_recovery/data/high_quality_multi_instruction_v1.csv', index=False)
dataset.head()

### Add more prompts

In [None]:
# Load the second version of the prompts and preview the first rows.
df = pd.read_csv('/mnt/hdd0/Kaggle/llm_prompt_recovery/data/prompts/prompts_v2.csv')
df.head()

In [None]:
# Generate one sample per prompt and save the result. The prompts are used as
# they are in the CSV: unlike create_dataset, no placeholder expansion is done.
rows = generate_samples(df.prompt)
dataset = pd.DataFrame(rows)
dataset.to_csv('/mnt/hdd0/Kaggle/llm_prompt_recovery/data/high_quality_dataset_v5_gpt4.csv', index=False)
dataset.head()