# Create dataset with GPT4

## Goal

Use the GPT4 API to create a diverse, high-quality dataset.

## Imports

In [None]:
import os
from openai import OpenAI
import pandas as pd
from tqdm.auto import tqdm
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import re
import random

from prometeo.evaluation import get_sharpened_cosine_similarity, estimate_mean

# Draw and immediately discard an empty figure before changing the defaults
# (presumably so the notebook backend is initialised and does not reset
# rcParams afterwards -- TODO confirm).
plt.plot()
plt.close('all')
# Notebook-wide plotting defaults: wide figures, thick lines, larger font.
mpl.rcParams.update({
    'figure.figsize': (20, 5),
    'lines.linewidth': 3,
    'font.size': 16,
})

## Definitions

In [None]:
# Candidate values for the `{text}` placeholder of the prompt templates.
text_types = [
    'text',
    'email',
    'letter',
    'memo',
    'dialogue',
    'news',
    'poem',
    'essay',
    'report',
    'academic paper',
    'story',
    'play',
    'blog',
    'advertisement',
    'brochure',
    'social media post',
    'manual',
    'guide',
    'legal document',
    'diary',
    'speech',
    'lecture',
    'review',
]
print(text_types)

In [None]:
# Candidate values for the `{poem}`, `{language}`, `{programming_language}`,
# `{code}` and `{encryption}` placeholders of the prompt templates.
poem_types = [
    'haiku',
    'sonnet',
    'limerick',
]
languages = [
    'chinese',
    'spanish',
    'hindi',
    'portuguese',
    'russian',
    'french',
    'german',
]
programming_languages = [
    'javascript',
    'python',
    'java',
    'C++',
    'Ruby',
    'PHP',
]
codes = [
    'morse code',
    'base64',
]
# One entry per Caesar shift from 1 to 5 (inclusive).
encryptions = [
    f'Caesar cipher shift {shift}'
    for shift in range(1, 6)
]

In [None]:
# Candidate values for the `{writing_style}` placeholder.
writing_styles = [
    'academic',
    'action-packed',
    'advertising copy',
    'blog writing',
    'business',
    'descriptive',
    'fantasy writing',
    'fiction',
    'non-fiction',
    'formal',
    'informal',
    'horror writing',
    'humorous',
    'imaginative',
    'journalistic',
    'legal writing',
    'medical writing',
    'mystery writing',
    'narrative style',
    'scientific report',
    'white paper',
]
print(writing_styles)

# Candidate values for the `{author}` placeholder.
authors = [
    'Edgar Allan Poe',
    'Emily Dickinson',
    'Ernest Hemingway',
    'F. Scott Fitzgerald',
    'Jane Austen',
    'J.D. Salinger',
    'Langston Hughes',
    'Mark Twain',
    'Maya Angelou',
    'Toni Morrison',
    'Dr. Seuss',
    'Tupac Shakur',
    'William Shakespeare',
]
print(authors)

In [None]:
# Candidate values for the `{tone}` placeholder.
tones = [
    'formal',
    'informal',
    'humorous',
    'persuasive',
    'objective',
    'subjective',
    'optimistic',
    'pessimistic',
    'sarcastic',
    'sincere',
    'inspirational',
    'critical',
    'descriptive',
    'narrative',
    'didactic',
]

In [None]:
# Candidate values for the `{person}` placeholder.
persons = [
    'first',
    'third',
]

In [None]:
# Candidate values for the `{text_format}` placeholder.
text_formats = [
    'Markdown',
    'HTML',
    'RST (Restructured Text)',
]
# Candidate values for the `{data_format}` placeholder.
data_formats = [
    'CSV',
    'JSON',
    'YAML',
]

In [None]:
# Candidate values for the `{song}` placeholder.
songs = [
    'country',
    'rock',
    'pop',
    'rap',
    'reggae',
    'metal',
    'folk',
]

## Code

In [None]:
# Shared OpenAI client. The API key and organization are read from the
# OPENAI_API_KEY and OPENAI_API_ORG environment variables; a missing variable
# raises KeyError here.
client = OpenAI(api_key=os.environ['OPENAI_API_KEY'], organization=os.environ['OPENAI_API_ORG'])

In [None]:
def chat_with_gpt4(prompt, temperature=0, model="gpt-4-0125-preview",
                   system_prompt="You are a helpful assistant."):
    """Send a single-turn chat request and return the text of the reply.

    Args:
        prompt: Content of the user message.
        temperature: Sampling temperature forwarded to the API.
        model: Name of the chat model to query. Defaults to the model the
            notebook was originally run with.
        system_prompt: Content of the system message that precedes the user
            message.

    Returns:
        The ``message.content`` of the first choice in the completion.
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]
    completion = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature,
    )
    return completion.choices[0].message.content

In [None]:
# Instruction sent to GPT-4 for each (original, rewritten) pair. The template
# is filled with `str.format`, which substitutes `{original_text}` and
# `{rewritten_text}`; any other literal brace added to this text would have to
# be doubled (`{{` / `}}`) to survive formatting.
prompt_template = """
Analyze the original and rewritten text and answer with the most likely text prompt that was given to rewrite or make stylistic changes to the original text.

- The text prompt should be a single sentence. Reply just with a short sentence and do not add any notes or comments.
- Sometimes the rewritten text will have hints about the text prompt. For example if it starts by
  Reworded, Rephrased, Translated, etc. you should include that word in the text prompt.
- Unless necessary do not make reference to details in the original text and keep the text prompt abstract and generic.

## Original text

{original_text}

## Rewritten text

{rewritten_text}

## Output format

Let's do the task step by step:

1. On a first step analyze the differences of the texts in less than 30 words.
2. On a second step write the most likely prompt using json format
"""

In [None]:
def parse_prompt(response):
    """Extract the recovered prompt from a model response.

    The text between the first ``{`` and the next ``}`` is parsed as JSON.
    If that fails it is parsed as a Python literal, so single-quoted dicts and
    one-element sets such as ``{"some prompt"}`` are still accepted. The
    response is never executed as code.

    Args:
        response: Raw text returned by the model.

    Returns:
        The first value of the parsed dict, or an element of the parsed set.
        If the response contains no ``{`` a message is printed and the
        response is returned unchanged.

    Raises:
        ValueError: If the braces cannot be parsed, or parse to something
            other than a non-empty dict or set.
    """
    import ast
    import json

    if '{' not in response:
        print(f'Response does not contain json: {response}')
        return response

    # Keep only the first {...} group; a missing closing brace is tolerated.
    json_text = '{' + response.split('{')[1].split('}')[0] + '}'
    try:
        # strict=False lets raw newlines/tabs appear inside string values.
        data = json.loads(json_text, strict=False)
    except json.JSONDecodeError:
        try:
            # literal_eval only builds literals, unlike eval it cannot call code.
            data = ast.literal_eval(json_text)
        except (ValueError, SyntaxError, TypeError) as error:
            raise ValueError(f'Could not parse prompt from {response}') from error

    if isinstance(data, dict) and data:
        return next(iter(data.values()))
    if isinstance(data, set) and data:
        return next(iter(data))
    raise ValueError(f'Could not parse prompt from {response}')

In [None]:
def normalize_response(response, prompt):
    """Rewrite a model response so it always ends with ``{"prompt": ...}``.

    Everything before the first ``{`` of ``response`` is kept, with double
    newlines collapsed into single ones, and followed by a JSON object holding
    ``prompt``. The object is produced by ``json.dumps`` so that quotes,
    backslashes and newlines inside ``prompt`` are escaped and the result
    remains valid JSON.

    Args:
        response: Raw text returned by the model.
        prompt: Prompt previously extracted from ``response``.

    Returns:
        The normalized response text.
    """
    import json

    analysis = response.split('{')[0].replace('\n\n', '\n')
    # ensure_ascii=False keeps non-ASCII characters readable instead of \uXXXX.
    return analysis + json.dumps({"prompt": prompt}, ensure_ascii=False)

In [None]:
# Maps each placeholder name (without the surrounding braces) to the list of
# values that can be substituted for it in a prompt template.
key_to_variables = dict(
    text=text_types,
    poem=poem_types,
    language=languages,
    programming_language=programming_languages,
    code=codes,
    encryption=encryptions,
    writing_style=writing_styles,
    author=authors,
    tone=tones,
    person=persons,
    text_format=text_formats,
    data_format=data_formats,
    song=songs,
)

def format_prompts(prompts):
    """Expand every ``{placeholder}`` found in the given prompts.

    Prompts without placeholders are kept as they are. Otherwise one
    placeholder is filled through ``fill_variable_in_prompt`` and the
    resulting prompts are expanded recursively until none is left.
    Placeholders other than ``{text}`` are filled first; ``{text}`` is only
    filled once it is the sole kind of placeholder remaining.

    Args:
        prompts: Iterable of prompt strings, possibly with placeholders.

    Returns:
        A flat list with all the fully formatted prompts.
    """
    expanded = []
    for template in prompts:
        placeholders = re.findall(r'\{.*?\}', template)
        if placeholders:
            non_text = [name for name in placeholders if name != '{text}']
            chosen = non_text[0] if non_text else placeholders[0]
            expanded += format_prompts(fill_variable_in_prompt(template, chosen))
        else:
            expanded.append(template)
    return expanded

def fill_variable_in_prompt(prompt, key):
    """Substitute one placeholder of ``prompt`` with its candidate values.

    For every placeholder except ``{text}`` one prompt is produced per
    candidate value, replacing all occurrences of the placeholder.

    For ``{text}`` a single prompt is produced: only the first occurrence is
    replaced, by a value drawn at random among the candidates that do not
    already occur in the prompt. Because the prompt still holds the
    placeholder while candidates are checked, a candidate that is a substring
    of the placeholder itself (``'text'`` inside ``'{text}'``) is never chosen.

    Args:
        prompt: Prompt string containing ``key``.
        key: Placeholder including its braces, e.g. ``'{tone}'``.

    Returns:
        List of prompts with the placeholder substituted.

    Raises:
        KeyError: If the placeholder name is not in ``key_to_variables``.
        ValueError: If every ``{text}`` candidate already occurs in the
            prompt (previously this case looped forever).
    """
    naked_key = key[1:-1]
    values = key_to_variables[naked_key]
    if naked_key != 'text':
        return [prompt.replace(key, value) for value in values]

    # Filter first and sample once: same uniform choice as retrying until a
    # fresh value is drawn, but it terminates when no candidate is left.
    candidates = [value for value in values if value not in prompt]
    if not candidates:
        raise ValueError(f'No unused value left for {key} in prompt: {prompt}')
    return [prompt.replace(key, random.choice(candidates), 1)]

In [None]:
# A bare `raise` with no active exception fails with RuntimeError. Presumably
# a deliberate stop so that running the whole notebook halts before the cells
# below, which call the API -- TODO confirm.
raise

## Format the prompts

As a first step we have to format the prompts, because many of them contain placeholders for different variables.

In [None]:
# Load the raw prompt templates; the `prompt` column is expanded in the next cell.
df = pd.read_csv('/mnt/hdd0/Kaggle/llm_prompt_recovery/data/prompts/prompts_v1.csv')
df.head()

In [None]:
# Expand the placeholders of every raw prompt, then list the results.
prompts = format_prompts(list(df.prompt))
print(len(prompts))
for formatted_prompt in prompts:
    print(formatted_prompt)

## Calling the GPT4 API

- https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo
- https://platform.openai.com/docs/api-reference/chat/create?lang=python
- https://platform.openai.com/docs/api-reference/introduction

The most recent model is `gpt-4-0125-preview`

In [None]:
# Load the evaluation pairs; the `original_text` and `rewritten_text` columns
# are sent to GPT-4 in the next cell.
df = pd.read_csv('/mnt/hdd0/Kaggle/llm_prompt_recovery/data/mooney_test.csv')
df.head()

In [None]:
# Query GPT-4 once per (original, rewritten) pair. The answers are appended to
# a plain list as they arrive, so earlier ones are kept if a later call fails.
responses = []
text_pairs = zip(df['original_text'].values, df['rewritten_text'].values)
for source_text, rewritten in tqdm(text_pairs, total=len(df)):
    filled_prompt = prompt_template.format(original_text=source_text, rewritten_text=rewritten)
    responses.append(chat_with_gpt4(filled_prompt))

It took 20 minutes to compute all the responses.

In [None]:
# Checkpoint the raw GPT-4 responses before any post-processing. They are
# stored under `gpt4_response`, the column the post-processing cells use for
# raw responses; `gpt4_prompt` is reserved for the parsed prompts.
df['gpt4_response'] = responses
df.to_csv('/mnt/hdd0/Kaggle/llm_prompt_recovery/data/mooney_test_with_gpt4.csv', index=False)

### Postprocess the responses

In [None]:
# Extract the recovered prompt from each raw response.
prompts = list(map(parse_prompt, responses))

GPT4 works very well and I had no problem in parsing all the responses.

We might want to train on this data later, so let's normalize the responses so that they always use the same format.

In [None]:
# Attach the raw response, the parsed prompt and the normalized response to
# the dataframe (in that column order) and overwrite the saved csv.
normalized_responses = list(map(normalize_response, responses, prompts))
for column_name, column_values in (
    ('gpt4_response', responses),
    ('gpt4_prompt', prompts),
    ('gpt4_normalized_response', normalized_responses),
):
    df[column_name] = column_values
df.to_csv('/mnt/hdd0/Kaggle/llm_prompt_recovery/data/mooney_test_with_gpt4.csv', index=False)

Study the number of tokens of the responses.

In [None]:
# Load the tokenizer stored in the local model directory; it is used below to
# count the tokens of the normalized responses.
from transformers import AutoTokenizer
model_path = '/home/gbarbadillo/data/mixtral-8x7b-instruct-v0.1-hf/'
tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    trust_remote_code=True)

In [None]:
# Preview the dataframe, which by now carries the GPT-4 columns added above.
df.head()

In [None]:
# Reload the saved results and plot how many tokens each normalized response takes.
df = pd.read_csv('/mnt/hdd0/Kaggle/llm_prompt_recovery/data/mooney_test_with_gpt4.csv')
token_counts = [len(tokenizer.tokenize(text)) for text in df.gpt4_normalized_response]
plt.hist(token_counts, bins=50)
plt.xlabel('Number of tokens')
plt.title('Distribution of GPT4 responses token length')

The mean number of tokens is around 50, and the max is below 75. Thus we should be able to generate text of that length on submission.

## Evaluate GPT4 on rewritten supplementary material

### Compute GPT4 responses

In [None]:
# Load the curated rewrites; the cells below read the `original_text`,
# `rewritten_text` and `rewrite_prompt` columns.
df = pd.read_csv('/mnt/hdd0/Kaggle/llm_prompt_recovery/data/gemma_suppl_rewrite_curated.csv')
df.head()

In [None]:
# Query GPT-4 once per (original, rewritten) pair. The answers are appended to
# a plain list as they arrive, so earlier ones are kept if a later call fails.
responses = []
text_pairs = zip(df['original_text'].values, df['rewritten_text'].values)
for source_text, rewritten in tqdm(text_pairs, total=len(df)):
    filled_prompt = prompt_template.format(original_text=source_text, rewritten_text=rewritten)
    responses.append(chat_with_gpt4(filled_prompt))

In [None]:
# Parse and normalize the responses, then attach the raw response, the parsed
# prompt and the normalized response to the dataframe (in that column order).
prompts = list(map(parse_prompt, responses))
normalized_responses = list(map(normalize_response, responses, prompts))
for column_name, column_values in (
    ('gpt4_response', responses),
    ('gpt4_prompt', prompts),
    ('gpt4_normalized_response', normalized_responses),
):
    df[column_name] = column_values

In [None]:
# Save the curated data together with the GPT-4 columns to a new csv.
df.to_csv('/mnt/hdd0/Kaggle/llm_prompt_recovery/data/gemma_suppl_rewrite_curated_with_gpt4.csv', index=False)

One of the responses did not have json format.

### Evaluate

In [None]:
# Score the GPT-4 prompts against the reference `rewrite_prompt` column.
# NOTE: for a response without any `{`, parse_prompt returns the whole response,
# so that full text is what gets scored as `gpt4_prompt` for such rows.
scs = get_sharpened_cosine_similarity(df['rewrite_prompt'].values, df['gpt4_prompt'].values)
estimate_mean(scs)

In [None]:
# Keep the per-row scores next to the prompts and overwrite the saved csv.
df['scs'] = scs
df.to_csv('/mnt/hdd0/Kaggle/llm_prompt_recovery/data/gemma_suppl_rewrite_curated_with_gpt4.csv', index=False)

GPT4 only scores 0.7036 +- 0.014 on the rewritten supplementary material. My current best score is 0.724

However if I look at GPT4 "errors" they are not errors at all:

- Many times the prompt given by GPT4 is better than the original prompt, the problem is that Gemma does not follow the given instruction
- The other big problem is that Gemma is not an [injective function](https://en.wikipedia.org/wiki/Injective_function): many different prompts can lead to the same answer. How do we choose among the space of possible prompts? We could inject some bias if we knew the style of the host.

I judge that GPT4's answers are very good. I believe that a combination of GPT4 analysis and Gemma could lead to a very strong model.

What is the similarity I get if I compare my best model with GPT4?

In [None]:
# Load a saved Mixtral evaluation and print `estimate_mean` of the
# `sharpened_cosine_similarity` column stored in it.
mixtral = pd.read_csv('/mnt/hdd0/Kaggle/llm_prompt_recovery/evaluations/mixtral_prompt_engineering/e1bcc54b4fcff31eeafed9bc0a0933a0a8d7b3c1955c55baf3d57c045bb433b3.csv')
print(estimate_mean(mixtral['sharpened_cosine_similarity'].values))
mixtral.head()

In [None]:
# Compare Mixtral's predicted prompts with the GPT-4 prompts.
# NOTE(review): this presumably pairs the two arrays by position, which assumes
# `mixtral` and `df` hold the same samples in the same order -- confirm.
scs = get_sharpened_cosine_similarity(mixtral['predicted_prompt'].values, df['gpt4_prompt'].values)
estimate_mean(scs)

The score rises to 0.755