# Prompt engineering

## Goal

First steps with prompt engineering:

- Choose a validation dataset
- Implement a validation pipeline
- Try different prompt strategies

## Imports

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
import torch
import gc
import time
import re
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
from tqdm.auto import tqdm

import tensorflow_hub as hub
import tensorflow as tf
import tensorflow_text as text  # Registers the ops.

# Global matplotlib defaults used by every figure in this notebook.
# NOTE(review): the empty plot/close pair presumably forces the backend to
# initialise before rcParams are changed — confirm it is still needed.
plt.plot()
plt.close('all')
plt.rcParams["figure.figsize"] = (20, 5)
mpl.rcParams['lines.linewidth'] = 3
mpl.rcParams['font.size'] = 16

## Code

In [None]:
# Quantization settings: 4-bit NF4 weights, float16 compute, double
# quantization, and the fp32 CPU offload flag switched on.
bnb_config = BitsAndBytesConfig(
    load_in_4bit= True,
    bnb_4bit_quant_type= "nf4",
    bnb_4bit_compute_dtype= torch.float16,
    bnb_4bit_use_double_quant= True,
    llm_int8_enable_fp32_cpu_offload= True)

# Release cached CUDA memory and run the garbage collector before the model is loaded.
torch.cuda.empty_cache()
gc.collect()

In [None]:
# Reference placement of the model's modules on two GPUs: the embedding and
# decoder layers 0-13 on GPU 0, layers 14-31, the final norm and the LM head
# on GPU 1. Insertion order matters: the helper functions below enumerate
# this dict and rely on the position of each module.
auto_device_map = {'model.embed_tokens': 0}
auto_device_map.update(
    {f'model.layers.{layer}': 0 if layer < 14 else 1 for layer in range(32)}
)
auto_device_map.update({'model.norm': 1, 'lm_head': 1})

def create_shared_device_map(transition_layer):
    """Build a two-GPU device map that is split at a single position.

    Modules are visited in the insertion order of ``auto_device_map``. Every
    module whose position in that order is ``<= transition_layer`` is assigned
    to GPU 0 and all later modules to GPU 1. Position 0 is
    ``model.embed_tokens``, so decoder layer ``k`` sits at position ``k + 1``.

    Args:
        transition_layer: Last enumeration position that stays on GPU 0.

    Returns:
        dict mapping module name to GPU index (0 or 1).
    """
    return {
        module_name: 0 if position <= transition_layer else 1
        for position, module_name in enumerate(auto_device_map)
    }

def create_intertwined_device_map():
    """Build a device map that alternates consecutive modules between two GPUs.

    Modules are visited in the insertion order of ``auto_device_map``:
    position 0 goes to GPU 1, positions 33 and above go to GPU 0, and every
    position in between goes to GPU ``position % 2``.

    Returns:
        dict mapping module name to GPU index (0 or 1).
    """
    def _device_for(position):
        # First entry is pinned to GPU 1, the trailing entries to GPU 0.
        if position == 0:
            return 1
        if position >= 33:
            return 0
        return position % 2

    return {
        module_name: _device_for(position)
        for position, module_name in enumerate(auto_device_map)
    }

In [None]:
# Two candidate checkpoint locations; the second assignment overrides the first.
model_path = '/mnt/hdd0/Kaggle/llm_prompt_recovery/models/mixtral-8x7b-instruct-v0.1-hf'
model_path = '/home/gbarbadillo/data/mixtral-8x7b-instruct-v0.1-hf/'
# Load the model with the 4-bit quantization config and the intertwined device
# map; the commented-out arguments are alternative configurations.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    quantization_config=bnb_config,
    #device_map="auto",
    device_map=create_intertwined_device_map(),
    #device_map=create_shared_device_map(16),
    trust_remote_code=True,
    #attn_implementation="flash_attention_2",
    )

In [None]:
# Load the tokenizer from the same directory as the model weights and reuse
# the EOS token as padding token.
tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    trust_remote_code=True)
tokenizer.pad_token_id = tokenizer.eos_token_id # this is needed to do batch inference
gc.collect()

In [None]:
# Text-generation pipeline shared by chat_with_mixtral and the batch experiments.
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

def chat_with_mixtral(prompt, max_new_tokens=200, verbose=True, do_sample=False, temperature=0.7, top_p=0.95):
    """Generate a completion for ``prompt`` with the module-level ``pipe``.

    A prompt that does not start with ``<s>[INST]`` is wrapped as
    ``<s>[INST] {prompt} [/INST]`` (a notice is printed when that happens).
    Single and double quote characters are removed from the generated text
    before it is returned.

    Args:
        prompt: Raw text or an already formatted instruct prompt.
        max_new_tokens: Generation budget forwarded to the pipeline.
        verbose: When true, print the elapsed time and tokens per second; the
            token count is obtained by re-tokenizing the cleaned response.
        do_sample: Sample instead of decoding greedily.
        temperature: Sampling temperature, only forwarded when ``do_sample``.
        top_p: Nucleus sampling threshold, only forwarded when ``do_sample``.

    Returns:
        The generated text without the prompt and without quote characters.
    """
    # NOTE(review): the pipeline's tokenizer may also prepend a BOS token, which
    # would duplicate the literal `<s>` written here — confirm.
    already_formatted = prompt.startswith('<s>[INST]')
    if not already_formatted:
        print('Formatting the prompt to Mixtral needs.')
        prompt = f'<s>[INST] {prompt} [/INST]'
    t0 = time.time()

    generation_kwargs = {'do_sample': False}
    if do_sample:
        generation_kwargs = {'do_sample': True, 'temperature': temperature, 'top_p': top_p}

    outputs = pipe(
        prompt,
        max_new_tokens=max_new_tokens,
        # Padding with EOS, see:
        # https://www.reddit.com/r/LocalLLaMA/comments/184g120/mistral_fine_tuning_eos_and_padding/
        # https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1/discussions/106
        pad_token_id=tokenizer.eos_token_id,
        return_full_text=False,
        **generation_kwargs,
    )
    answer = re.sub(r'[\'"]', '', outputs[0]['generated_text'])
    if not verbose:
        return answer

    elapsed = time.time() - t0
    token_count = len(tokenizer.tokenize(answer))
    print(f"Execution Time : {elapsed:.1f} s, tokens per second: {token_count/elapsed:.1f}")
    return answer

In [None]:
def print_gpu_memory():
    """Print the current and peak allocated CUDA memory of every visible GPU.

    Values are reported by ``torch.cuda`` and converted by dividing the byte
    counts by ``1024**3``. Nothing is printed when no CUDA device is visible.
    """
    bytes_per_unit = 1024**3
    for device in range(torch.cuda.device_count()):
        allocated = torch.cuda.memory_allocated(device)/bytes_per_unit
        peak = torch.cuda.max_memory_allocated(device)/bytes_per_unit
        print(f'GPU {device} memory allocated: {allocated:.1f} GB, max memory allocated: {peak:.1f} GB')

In [None]:
# A bare `raise` with no active exception fails with RuntimeError; presumably
# used on purpose to stop a "run all" before the experiment cells below.
raise

## Flash attention speedup

https://huggingface.co/docs/transformers/en/model_doc/mixtral#speeding-up-mixtral-by-using-flash-attention

```bash
export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE  
pip install -U flash-attn --no-build-isolation
```

In [None]:
# Run the same short generation twice (timings are printed by the helper),
# then report GPU memory usage.
for _ in range(2):
    print(chat_with_mixtral('write a poem about real madrid', max_new_tokens=50))
print_gpu_memory()

In [None]:
# Long-input timing: use the first 24000 characters of the text file as the prompt.
with open('/mnt/hdd0/Kaggle/llm_prompt_recovery/data/bible.txt', 'r') as f:
    bible = f.read()
print(chat_with_mixtral(bible[:24000], max_new_tokens=50))

```
# flash-attention
Execution Time : 5.4 s, tokens per second: 9.4
Execution Time : 9.1 s, tokens per second: 5.6

# baseline
Execution Time : 5.1 s, tokens per second: 10.0
Execution Time : 9.2 s, tokens per second: 5.5
```

No clear difference between flash attention and the baseline.

## Verify that batch prediction works

In [None]:
# Four prompts already wrapped in the instruct format, passed to the pipeline as one list.
prompts = [f'<s>[INST] What is the history of {country}? [/INST]' for country in ['Spain', 'France', 'Germany', 'Italy']]
pipe(prompts, do_sample=False, return_full_text=False, pad_token_id=tokenizer.eos_token_id, max_new_tokens=50)

In [None]:
# Same prompts, generated one call at a time through the helper.
[chat_with_mixtral(prompt, max_new_tokens=50) for prompt in prompts]

In [None]:
# Second pipeline over the same model and tokenizer, configured with batch_size=4.
batch_size = 4
pipe_bs = pipeline(task="text-generation", model=model, tokenizer=tokenizer, batch_size=batch_size)
pipe_bs(prompts, do_sample=False, return_full_text=False, pad_token_id=tokenizer.eos_token_id, max_new_tokens=50)

In [None]:
# Mix three short prompts with a 24000-character excerpt of `bible` in the same batch.
pipe_bs(prompts[:3] + [bible[:24000]], do_sample=False, return_full_text=False, pad_token_id=tokenizer.eos_token_id, max_new_tokens=50)

It seems the duration when batching inputs is determined by the longest input and output. Thus it might make sense to sort the inputs by length.

All the predictions are the same, regardless of the method used.

## Study public datasets

https://www.kaggle.com/competitions/llm-prompt-recovery/discussion/481811

How can I study the datasets? I have to read them.

I could read the prompts, but this takes time.

## Evaluation

### First steps

In [None]:
# Keep only the first 64 rows of the CSV for these first experiments.
test = pd.read_csv('/mnt/hdd0/Kaggle/llm_prompt_recovery/data/nbroad-v1.csv').head(64)
test.head()

In [None]:
# Zero-shot template wrapped in [INST] tags; the `{original_text}` and
# `{rewritten_text}` placeholders are filled with str.format.
MODEL_PROMPT = """<s>[INST]
# Guess the prompt

Your task is to guess which prompt was given to transform the original text into the rewritten text.
Read the two texts carefully and answer with the most likely prompt that was used to modify the original text.

- The response should be as short and concise as possible. Whenever possible use a single short sentence prompt.
- The prompt should be as generic as possible and content agnostic unless it is absolutely necessary to make reference to the content of the original text

## Original text

{original_text}

## Rewritten text

{rewritten_text}

[/INST]

## Prompt

"""

def truncate_txt(text, max_words=2000):
    """Keep at most ``max_words`` whitespace-separated words of ``text``.

    The kept words are re-joined with single spaces, so every run of
    whitespace (including newlines) is collapsed to one space.
    """
    kept_words = text.split()[:max_words]
    return " ".join(kept_words)

# Fill the template once per row of the evaluation set.
prompts = [
    MODEL_PROMPT.format(original_text=truncate_txt(row['original_text']),
                        rewritten_text=truncate_txt(row['rewritten_text']))
    for _, row in test.iterrows()
]

# Histogram of the prompt sizes, measured in tokenizer tokens.
token_counts = [len(tokenizer.tokenize(prompt)) for prompt in prompts]
plt.hist(token_counts, bins=20);
plt.title('Distribution of prompt token length')
plt.xlabel('Prompt token length');

In [None]:
# Sequential baseline: one generation call per prompt, with a progress bar.
responses = [chat_with_mixtral(prompt, max_new_tokens=25, verbose=False) for prompt in tqdm(prompts, smoothing=0)]

Making the predictions one by one takes 3m2s

In [None]:
# Same prompts through the pipeline, asking it to batch 8 inputs at a time.
batch_size = 8
responses_bs = pipe(prompts, do_sample=False, return_full_text=False,
                    pad_token_id=tokenizer.eos_token_id, max_new_tokens=25,
                    batch_size=batch_size)

In [None]:
# Order the prompts by token count (ties are broken by the prompt text, since
# the (length, prompt) tuples are compared element by element).
# Note that the sorted order no longer matches the row order of `test`.
prompt_lengths = [len(tokenizer.tokenize(prompt)) for prompt in prompts]
sorted_prompts = [prompt for _, prompt in sorted(zip(prompt_lengths, prompts))]

In [None]:
# Batched generation again, this time over the length-sorted prompts.
batch_size = 8
responses_bs = pipe(sorted_prompts, do_sample=False, return_full_text=False,
                    pad_token_id=tokenizer.eos_token_id, max_new_tokens=25,
                    batch_size=batch_size)

In [None]:
from torch.utils.data import Dataset
class PromptsDataset(Dataset):
    """Minimal torch ``Dataset`` that exposes a sequence of prompts by index."""

    def __init__(self, prompts):
        # Keep a reference to the caller's sequence; nothing is copied.
        self.prompts = prompts

    def __len__(self):
        """Return the number of stored prompts."""
        n_prompts = len(self.prompts)
        return n_prompts

    def __getitem__(self, idx):
        """Return the prompt stored at position ``idx``."""
        prompt = self.prompts[idx]
        return prompt

In [None]:
# Feed the sorted prompts through the torch Dataset wrapper; the pipeline
# result is materialised with list().
batch_size = 8
responses_bs = pipe(PromptsDataset(sorted_prompts), do_sample=False, return_full_text=False,
                    pad_token_id=tokenizer.eos_token_id, max_new_tokens=25,
                    batch_size=batch_size)
responses_bs = list(responses_bs)

In [None]:
# Retry the length-sorted prompts with a batch size of 16.
batch_size = 16
responses_bs = pipe(sorted_prompts, do_sample=False, return_full_text=False,
                    pad_token_id=tokenizer.eos_token_id, max_new_tokens=25,
                    batch_size=batch_size)

In [None]:
# Display the most recent batched responses.
responses_bs

In [None]:
# Show the prompt template currently in use.
print(MODEL_PROMPT)

- I cannot use a batch size higher than 8, which results in a very modest speedup of 2 min vs 3 min. And that is with the current input token lengths, which may vary.
- Batching affects some of the responses; this might be due to numerical errors.
- Using a torch dataset does not speed up the inference.

Thus I believe the best option is to use simple sequential inference. To be fast I should use a small evaluation dataset.

### Some responses change when using batches

In [None]:
# Generate for prompts[idx] alone and then as the first element of a batch of
# four, printing the first generation each time so the two can be compared.
idx = 4
batch_size = 1
print(pipe(prompts[idx: idx + batch_size], do_sample=False, return_full_text=False, pad_token_id=tokenizer.eos_token_id, max_new_tokens=25, batch_size=batch_size)[0][0]['generated_text'])

batch_size = 4
print(pipe(prompts[idx: idx + batch_size], do_sample=False, return_full_text=False, pad_token_id=tokenizer.eos_token_id, max_new_tokens=25, batch_size=batch_size)[0][0]['generated_text'])

In [None]:
# Rebuild the tokenizer requesting left padding, then rebuild the pipeline with it.
# NOTE(review): `padding` is normally an argument of the tokenizer call rather
# than of from_pretrained — confirm that 'do_not_pad' has any effect here.
tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    trust_remote_code=True,
    padding_side="left",
    padding='do_not_pad')
tokenizer.pad_token_id = tokenizer.eos_token_id # this is needed to do batch inference
tokenizer.pad_token = tokenizer.eos_token

gc.collect()
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer)


# Smoke test: generate for the first two prompts as a single batch.
batch_size = 2
pipe(prompts[:batch_size], do_sample=False, return_full_text=False, pad_token_id=tokenizer.eos_token_id, max_new_tokens=25, batch_size=batch_size)

### Evaluation

In [None]:
def evaluate_prompt_template(prompt_template, max_new_tokens, test_filepath, n_samples, random_seed=7):
    """Score a prompt template on a random sample of a labelled dataset.

    Steps: sample ``n_samples`` rows with ``load_test_data``, fill the template
    for each row, plot the histogram of prompt token lengths, generate one
    prediction per prompt with ``chat_with_mixtral`` and compare each
    prediction with the row's ``rewrite_prompt`` through sentence-t5
    embeddings. The mean cubed ("sharpened") similarity is printed together
    with a 1.96 x standard-error half-width, and the score histogram is
    plotted. Nothing is returned.

    Args:
        prompt_template: Template with ``{original_text}`` and
            ``{rewritten_text}`` placeholders, filled through ``str.format``.
        max_new_tokens: Generation budget forwarded to ``chat_with_mixtral``.
        test_filepath: CSV path forwarded to ``load_test_data``.
        n_samples: Number of rows to evaluate.
        random_seed: Seed forwarded to ``load_test_data``.
    """
    samples = load_test_data(test_filepath, n_samples, random_seed)

    filled_prompts = [
        prompt_template.format(original_text=truncate_txt(row['original_text']),
                               rewritten_text=truncate_txt(row['rewritten_text']))
        for _, row in samples.iterrows()
    ]

    token_counts = [len(tokenizer.tokenize(filled)) for filled in filled_prompts]
    plt.hist(token_counts, bins=20)
    plt.title('Distribution of prompt token length')
    plt.xlabel('Prompt token length')
    plt.show()

    predictions = []
    for filled in tqdm(filled_prompts, smoothing=0, desc='Inference'):
        predictions.append(chat_with_mixtral(filled, max_new_tokens=max_new_tokens, verbose=False))
    samples['predicted_prompt'] = predictions

    hub_url = "https://www.kaggle.com/models/google/sentence-t5/frameworks/TensorFlow2/variations/st5-base/versions/1"
    encoder = hub.KerasLayer(hub_url)
    predicted_vectors = encoder(tf.constant(predictions))[0].numpy()
    target_vectors = encoder(tf.constant(samples['rewrite_prompt']))[0].numpy()
    # NOTE(review): a plain dot product equals the cosine similarity only if the
    # encoder returns unit-norm vectors — presumably true for sentence-t5; confirm.
    similarity = (predicted_vectors * target_vectors).sum(axis=1)
    samples['cosine_similarity'] = similarity
    samples['sharpened_cosine_similarity'] = similarity**3

    sharpened = samples['sharpened_cosine_similarity']
    mean_score = sharpened.mean()
    half_width = sharpened.std()/np.sqrt(len(samples))*1.96
    print(f'Mean sharpened cosine similarity: {mean_score:.3f} +- {half_width:.3f}')
    plt.hist(sharpened, bins=20)
    plt.title('Distribution of sharpened cosine similarity')
    plt.show()
    # TODO: save results

def load_test_data(test_filepath, n_samples, random_seed=7):
    """Load a CSV file and return a reproducible random subset of its rows.

    Args:
        test_filepath: Path of the CSV file, read with ``pd.read_csv``.
        n_samples: Number of distinct rows to draw (sampling without
            replacement).
        random_seed: Seed of the private random generator used for the draw.

    Returns:
        DataFrame with the ``n_samples`` selected rows, in the order they were
        drawn and keeping their original index labels.

    Raises:
        ValueError: If ``n_samples`` is larger than the number of rows
            (raised by the sampling call).
    """
    test = pd.read_csv(test_filepath)
    # A dedicated RandomState produces the same draw as seeding the global
    # generator, without resetting NumPy's global random state as a side effect.
    rng = np.random.RandomState(random_seed)
    keep_indices = rng.choice(test.index, n_samples, replace=False)
    return test.loc[keep_indices]

def truncate_txt(text, max_words=2000):
    """Keep at most ``max_words`` whitespace-separated words of ``text``.

    The kept words are re-joined with single spaces, so every run of
    whitespace (including newlines) is collapsed to one space.
    """
    kept_words = text.split()[:max_words]
    return " ".join(kept_words)

In [None]:
# Evaluate MODEL_PROMPT on 32 sampled rows of nbroad-v1.
evaluate_prompt_template(
    MODEL_PROMPT, max_new_tokens=75,
    test_filepath='/mnt/hdd0/Kaggle/llm_prompt_recovery/data/nbroad-v1.csv', n_samples=32)

In [None]:
# Evaluate MODEL_PROMPT on 32 sampled rows of nbroad-v2.
evaluate_prompt_template(
    MODEL_PROMPT, max_new_tokens=75,
    test_filepath='/mnt/hdd0/Kaggle/llm_prompt_recovery/data/nbroad-v2.csv', n_samples=32)

In [None]:
# Few-shot template written as a multi-turn conversation: two worked examples
# followed by the `{original_text}` / `{rewritten_text}` pair to be solved.
forum_prompt = """<s>[INST] Original Text: Hey there! Just a heads up: our friendly dog may bark a bit, but don't worry, he's all bark and no bite![/INST]
Provide the modified text and I'll tell you a something general that changed about it.  I'll avoid any specifics though.  My tone will be neutral.</s>
[INST] Re-written Text: Warning: Protective dog on premises. May exhibit aggressive behavior. Ensure personal safety by maintaining distance and avoiding direct contact.[/INST]
The request was: Shift the narrative from offering an informal, comforting assurance to providing a more structured advisory note that emphasizes mindfulness and preparedness. This modification should mark a clear change in tone and purpose, aiming to responsibly inform and guide, while maintaining a positive and constructive approach.</s>
<s>[INST] Original Text: A lunar eclipse happens when Earth casts its shadow on the moon during a full moon. The moon appears reddish because Earth's atmosphere scatters sunlight, some of which refracts onto the moon's surface. Total eclipses see the moon entirely in Earth's shadow; partial ones occur when only part of the moon is shadowed.[/INST]
Provide the modified text and I'll tell you a something general that changed about it.  I'll avoid any specifics though.  My tone will be neutral.</s>
[INST] Re-written Text: Yo check it, when the Earth steps in, takes its place, casting shadows on the moon's face. It's a full moon night, the scene's set right, for a lunar eclipse, a celestial sight. The moon turns red, ain't no dread, it's just Earth's atmosphere playing with sunlight's thread, scattering colors, bending light, onto the moon's surface, making the night bright. Total eclipse, the moon's fully in the dark, covered by Earth's shadow, making its mark. But when it's partial, not all is shadowed, just a piece of the moon, slightly furrowed. So that's the rap, the lunar eclipse track, a dance of shadows, with no slack. Earth, moon, and sun, in a cosmic play, creating the spectacle we see today.[/INST]
The request was: Transform your communication from an academic delivery to a dynamic, rhythm-infused presentation. Keep the essence of the information intact but weave in artistic elements, utilizing rhythm, rhyme, and a conversational style. This approach should make the content more relatable and enjoyable, aiming to both educate and entertain your audience.</s>
<s>[INST] Original Text: {original_text} [/INST]
Provide the modified text and I'll tell you a something general that changed about it.  I'll avoid any specifics though.  My tone will be neutral.</s>
[INST] Re-written Text: {rewritten_text} [/INST]
The request was:
"""
# Evaluate the few-shot template on 32 sampled rows of nbroad-v2.
evaluate_prompt_template(
    forum_prompt, max_new_tokens=75,
    test_filepath='/mnt/hdd0/Kaggle/llm_prompt_recovery/data/nbroad-v2.csv', n_samples=32)

In [None]:
# Evaluate the few-shot template on 32 sampled rows of nbroad-v1.
evaluate_prompt_template(
    forum_prompt, max_new_tokens=75,
    test_filepath='/mnt/hdd0/Kaggle/llm_prompt_recovery/data/nbroad-v1.csv', n_samples=32)

In [None]:
# Reload the full nbroad-v1 dataset and show its shape and first rows.
test = pd.read_csv('/mnt/hdd0/Kaggle/llm_prompt_recovery/data/nbroad-v1.csv')
print(test.shape)
test.head()

In [None]:
# Seeded draw of 256 rows without replacement; note that np.random.seed
# reseeds NumPy's global generator.
np.random.seed(7)
keep_indices = np.random.choice(test.index, 256, replace=False)
test = test.loc[keep_indices]
print(test.shape)

In [None]:
# Zero-shot template wrapped in [INST] tags; the `{original_text}` and
# `{rewritten_text}` placeholders are filled with str.format.
MODEL_PROMPT = """<s>[INST]
# Guess the prompt

Your task is to guess which prompt was given to transform the original text into the rewritten text.
Read the two texts carefully and answer with the most likely prompt that was used to modify the original text.

- The response should be as short and concise as possible. Whenever possible use a single short sentence prompt.
- The prompt should be as generic as possible and content agnostic unless it is absolutely necessary to make reference to the content of the original text

## Original text

{original_text}

## Rewritten text

{rewritten_text}

[/INST]

## Prompt

"""

def truncate_txt(text, max_words=2000):
    """Keep at most ``max_words`` whitespace-separated words of ``text``.

    The kept words are re-joined with single spaces, so every run of
    whitespace (including newlines) is collapsed to one space.
    """
    kept_words = text.split()[:max_words]
    return " ".join(kept_words)

# Fill the template once per row of the sampled evaluation set.
prompts = [
    MODEL_PROMPT.format(original_text=truncate_txt(row['original_text']),
                        rewritten_text=truncate_txt(row['rewritten_text']))
    for _, row in test.iterrows()
]

# Histogram of the prompt sizes, measured in tokenizer tokens.
token_counts = [len(tokenizer.tokenize(prompt)) for prompt in prompts]
plt.hist(token_counts, bins=20);
plt.title('Distribution of prompt token length')
plt.xlabel('Prompt token length');

In [None]:
# Predict with a budget of 25 new tokens per prompt and store the predictions.
responses = [chat_with_mixtral(prompt, max_new_tokens=25, verbose=False) for prompt in tqdm(prompts, smoothing=0)]
test['predicted_prompt'] = responses

# Embed predictions and ground-truth prompts with sentence-t5 and score each pair.
hub_url = "https://www.kaggle.com/models/google/sentence-t5/frameworks/TensorFlow2/variations/st5-base/versions/1"
encoder = hub.KerasLayer(hub_url)
responses_embeddings = encoder(tf.constant(responses))[0].numpy()
ground_truth_embeddings = encoder(tf.constant(test['rewrite_prompt']))[0].numpy()
# NOTE(review): the row-wise dot product is a cosine similarity only if the
# encoder returns unit-norm vectors — presumably true for sentence-t5; confirm.
similarity = np.sum(responses_embeddings * ground_truth_embeddings, axis=1)
test['cosine_similarity'] = similarity
test['sharpened_cosine_similarity'] = similarity**3
# Mean score with a 1.96 x standard-error half-width.
print(f'Mean sharpened cosine similarity: {test["sharpened_cosine_similarity"].mean():.3f} +- {test["sharpened_cosine_similarity"].std()/np.sqrt(len(test))*1.96:.3f}')
plt.hist(test['sharpened_cosine_similarity'], bins=20);
plt.title('Distribution of sharpened cosine similarity');

In [None]:
# Same evaluation with a budget of 50 new tokens per prompt; the result
# columns of `test` are overwritten.
responses = [chat_with_mixtral(prompt, max_new_tokens=50, verbose=False) for prompt in tqdm(prompts, smoothing=0)]
test['predicted_prompt'] = responses

# Embed predictions and ground-truth prompts with sentence-t5 and score each pair.
hub_url = "https://www.kaggle.com/models/google/sentence-t5/frameworks/TensorFlow2/variations/st5-base/versions/1"
encoder = hub.KerasLayer(hub_url)
responses_embeddings = encoder(tf.constant(responses))[0].numpy()
ground_truth_embeddings = encoder(tf.constant(test['rewrite_prompt']))[0].numpy()
# NOTE(review): the row-wise dot product is a cosine similarity only if the
# encoder returns unit-norm vectors — presumably true for sentence-t5; confirm.
similarity = np.sum(responses_embeddings * ground_truth_embeddings, axis=1)
test['cosine_similarity'] = similarity
test['sharpened_cosine_similarity'] = similarity**3
# Mean score with a 1.96 x standard-error half-width.
print(f'Mean sharpened cosine similarity: {test["sharpened_cosine_similarity"].mean():.3f} +- {test["sharpened_cosine_similarity"].std()/np.sqrt(len(test))*1.96:.3f}')
plt.hist(test['sharpened_cosine_similarity'], bins=20);
plt.title('Distribution of sharpened cosine similarity');

In [None]:
# Mean sharpened score with a 1.96 x standard-error half-width.
print(f'Mean sharpened cosine similarity: {test["sharpened_cosine_similarity"].mean():.3f} +- {test["sharpened_cosine_similarity"].std()/np.sqrt(len(test))*1.96:.3f}')

In [None]:
# Mean of the sharpened scores at full precision.
test["sharpened_cosine_similarity"].mean()

In [None]:
# Half-width only: 1.96 x standard error of the mean.
test["sharpened_cosine_similarity"].std()/np.sqrt(len(test))*1.96

In [None]:
# Five best-scoring rows: ground-truth prompt next to the prediction and its scores.
test.sort_values('sharpened_cosine_similarity', ascending=False).head()[['rewrite_prompt', 'predicted_prompt', 'cosine_similarity', 'sharpened_cosine_similarity']]

In [None]:
# Histogram of the per-sample sharpened scores.
plt.hist(test['sharpened_cosine_similarity'], bins=20);

## TODO

- [x] Evaluate with T5 transformer
- [ ] Select the best dataset for testing
- [ ] Estimate uncertainty on evaluation
- [ ] Save evaluation results to disk for traceability
- [ ] Pairwise comparison to have more power to take decisions
- [ ] Encapsulate the code to do a prompt evaluation