In [1]:
import kagglehub
import csv
import os

import numpy as np
import pandas as pd
import torch
from torch.nn import functional
from tqdm import tqdm

from transformers import AutoTokenizer, T5Tokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

assert torch.cuda.is_available()

torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_flash_sdp(False)

In [2]:
EPOCHS = 1
BATCH_SIZE = 32
LEARNING_RATE = 1e-5

max_new_tokens = 30
max_sentences_in_response = 1

# Dataset

In [3]:
# Prompt_dataset
path = kagglehub.dataset_download("what5up/concat-prompts")
files = os.listdir(path)
csv_file = [file for file in files if file.endswith('.csv')][0]
data = pd.read_csv(os.path.join(path, csv_file))
prompt_dataset = data['prompt'].tolist()

Downloading from https://www.kaggle.com/api/v1/datasets/download/what5up/concat-prompts?dataset_version_number=1...


100%|██████████| 157k/157k [00:00<00:00, 348kB/s]

Extracting files...





In [4]:
# Vocabulary dataset
tokenizer = T5Tokenizer.from_pretrained("sentence-transformers/sentence-t5-base")
vocabulary_dataset = tokenizer.get_vocab()
vocabulary_dataset.pop('<pad>', None)
vocabulary_dataset.pop('</s>', None)
vocabulary_dataset.pop('<unk>', None)
vocabulary_dataset = list(vocabulary_dataset.keys())

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [5]:
print(prompt_dataset[:5])
print(vocabulary_dataset[:5])

['Convert the text into a vintage circus poster announcement', "Convert the text into a social media platform's community guidelines", 'Rewrite this as a college course description.', 'Rephrase this as a debate on furniture rights, featuring chairs.', "Make the text into a home improvement expert's tips for a bathroom remodel"]
['▁', 'X', '.', ',', 's']


# Mean Prompt

In [16]:
def hotflip(prompt_tokens, token_index, vocabulary_dataset, model, target_embeddings):
    possible_prompts = [' '.join(prompt_tokens[:token_index] + [vocab] + prompt_tokens[token_index + 1:]) for vocab in vocabulary_dataset]
    prompt_embeddings = torch.tensor(np.array([model.encode(prompt) for prompt in possible_prompts]), dtype=torch.float32)

    simularities = cosine_similarity(target_embeddings, prompt_embeddings)

    vocab_index = simularities.argmax().item()
    prompt_tokens[token_index] = vocabulary_dataset[vocab_index]

    return prompt_tokens

In [7]:
# Model
model_name = "sentence-transformers/sentence-t5-base"
model = SentenceTransformer(model_name)
for param in model.parameters():
    param.requires_grad = False

modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.98k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/219M [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

rust_model.ot:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

2_Dense/config.json:   0%|          | 0.00/115 [00:00<?, ?B/s]

In [10]:
# Initial prompt
prompt = "Rewrite this text to make it more helpful"
prompt_embeddings = model.encode(prompt)
prompt_embeddings = torch.tensor(prompt_embeddings, dtype=torch.float32).unsqueeze(0)

# Prompt tokens
prompt_tokens = prompt.split(' ')

# Target
target_embeddings = model.encode(prompt_dataset)
target_embeddings = torch.tensor(target_embeddings, dtype=torch.float32).mean(dim=0).unsqueeze(0)

print(prompt_embeddings.shape)
print(target_embeddings.shape)

torch.Size([1, 768])
torch.Size([1, 768])


In [17]:
for epoch in range(EPOCHS):
    for token_index in range(len(prompt_tokens)):
        loss =  1 - cosine_similarity(target_embeddings, prompt_embeddings)
        prompt_tokens = hotflip(prompt_tokens, token_index, vocabulary_dataset, model, target_embeddings)
        prompt = ' '.join(prompt_tokens)
        prompt_embeddings = model.encode(prompt)
        prompt_embeddings = torch.tensor(prompt_embeddings, dtype=torch.float32).unsqueeze(0)

    print(f"Epoch: {epoch}, Loss: {loss.item()}")
    print(prompt)

Epoch: 0, Loss: 0.041653215885162354
<extra_id_20> ▁Alter ▁sentences lucrarea ▁Make ▁fiction ▁proposal ▁vibe


# Submission

In [None]:
test = pd.read_csv("../input/llm-prompt-recovery/test.csv")
!cp ../input/llm-prompt-recovery/test.csv

In [None]:
preds = []
for i in tqdm(range(len(test_df))):
    row = test_df.iloc[i]

    # Generate Prompt using template
    prompt = template.format(
        original_text=row.original_text,
        rewritten_text=row.rewritten_text,
        rewrite_prompt=""
    )

    # Infer
    output = gemma_lm.generate(prompt, max_length=512)
    pred = output.replace(prompt, "") # remove the prompt from output

    # Store predictions
    preds.append([row.id, pred])