In [None]:
import json
import random
import re

import numpy as np
import pandas as pd
import shutup
from datasets import Dataset
from evaluate import load as load_metric
from tabulate import tabulate
from tqdm import tqdm
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

# Seed the stdlib RNG (it drives the random.shuffle used when building the
# classifier dataset), then let shutup silence warnings.
SEED = 1234
random.seed(SEED)
shutup.please()

# Setup

## OpenAI API (GPT-4o mini)

In [None]:
from openai import OpenAI

# Do not keep API keys in the notebook: with no explicit api_key the OpenAI
# client reads the OPENAI_API_KEY environment variable.
client = OpenAI()

def chatGPT(prompt):
    """Send ``prompt`` as a single user message and return the reply text.

    Uses the module-level ``client`` with model "gpt-4o-mini" and
    temperature 0.0, and returns the content of the first choice.
    """
    user_message = {"role": "user", "content": prompt}
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[user_message],
        temperature=0.0,
    )
    first_choice = response.choices[0]
    return first_choice.message.content


## LOPSIDED Model Setup

In [None]:
# Local checkpoint directories: two causal LMs that share a single tokenizer.
PRIVACY_MODEL_DIR = './privacymodel/'
REPLACEMENT_MODEL_DIR = './replacementmodel/'
TOKENIZER_DIR = './gemma-2-2b-it/'

pmodel = AutoModelForCausalLM.from_pretrained(PRIVACY_MODEL_DIR)
rmodel = AutoModelForCausalLM.from_pretrained(REPLACEMENT_MODEL_DIR)
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR)

In [None]:
# Two text-generation pipelines over the shared tokenizer. The privacy
# pipeline samples at temperature 0.1; the replacement pipeline is created
# without explicit generation settings.
privacy = pipeline('text-generation', model=pmodel, tokenizer=tokenizer, do_sample=True, temperature=0.1)
replacement = pipeline('text-generation', model=rmodel, tokenizer=tokenizer)


## System Prompts

In [None]:
# Instruction text for the replacement model. run() and run_replacement()
# prepend it to the list of entity changes and the text to modify.
# NOTE(review): the wording (typos included) is sent verbatim to the model;
# presumably the replacement checkpoint was tuned on this exact text, so
# confirm before editing it.
replacement_prompt = """You are ReplaceGPT, an entity replacement model. Your task is to take an input, and output a transformed response that replaces all of the entities specified.

The goal is to minimize the impact of changing the entities. The user should not be able to tell this transformation happened.

The user will provide input of the original text, and a list of the entities that must be changed.

You will output the modified text.

Do not make any unnecesary changes that effect the semantic quality of the text, the meaning should stay the same.

Only the entities themselves should change, not the meaning."""

In [None]:
# Template for the privacy model. The `{prompt}` placeholder is filled with
# str.format(prompt=...) in run() and run_privacy().
# NOTE(review): the wording (typos included) is sent verbatim to the model;
# presumably the privacy checkpoint was tuned on this exact text, so confirm
# before editing it.
privacy_prompt = """
You are PrivacyGPT. You will anonymize the user's prompt while maintaining the meaning whenever possible.

Your task is to revise the user's prompt. Your goal is to reword and change all private entities that are not strictly relevant to the text. You can change any names, places, organizations, etc as long as they wont effect the response when changed back.

Remember the following criteria:
* The meaning of the prompt **must not change**.
* Any replacement of names, places, organizations, etc must be semantically similar to their original values. 
    * Don't use placeholders like [NAME]. Opt for similar entities, such as names with the same gender, organizations in the same field, etc.
    * We will replace these replacements again when their response is completed, so the user does not notice the effect.
* If the private entities are crucial to the meaning of the prompt then they must stay as they appear.
    * For example, a location may remain in the prompt if it is absolutely needed to create a response and a replacement would not work.
    * Works of literature often do not rely on the entity remaining the same, but there are exceptions, for example if the user requests a rhyming poem or song.
* You are **maximizing the privacy** of the user, and **minimizing the effect on their request's reponse**.

You will return your reasoning for each change alongside the change itself. At the end, provide the fully modified prompt as well as the original prompt.

**REMEMBER: ONLY REPLACE THE WORD/TOKEN IF IT WILL NOT CHANGE THE ANSWER OR RESPONSE OF THE QUESTION OR TASK.**

Here is the prompt:

{prompt}
"""

## Helper Methods

In [None]:
def extract_reply(api_response):
    """Return the message content of the first choice in a parsed response record.

    ``api_response`` is indexed as
    ``['response']['body']['choices'][0]['message']['content']``.
    """
    body = api_response['response']['body']
    first_choice = body['choices'][0]
    return first_choice['message']['content']

In [None]:
def extract_changes(r):
    """Turn the privacy model's "# Changes" section into a replacement list.

    The text between ``# Changes`` and ``# New Prompt`` is scanned for
    ``**bold**`` spans, which are paired up in order of appearance. Each pair
    (first, second) becomes one line `` - second -> first``; pairs whose two
    entries are identical are skipped.

    Args:
        r: Full text output of the privacy model.

    Returns:
        The concatenated replacement lines. Returns "" when ``r`` is not a
        string, has no ``# Changes`` section, or contains an odd number of
        bold spans (in that last case ``r`` is printed for inspection).
    """
    if not isinstance(r, str):
        return ""

    sections = r.split('# Changes')
    if len(sections) < 2:
        return ""

    # '****' means two bold spans touch; separate them so the regex sees both.
    changes_section = sections[1].split('# New Prompt')[0].replace('****', '** **')
    entities = [m.group()[2:-2] for m in re.finditer(r'\*\*.+?\*\*', changes_section)]

    if len(entities) % 2 != 0:
        # Spans cannot be paired reliably; show the offending output and give up.
        print(r)
        return ""

    rep_text = ""
    for r2, r1 in zip(entities[0::2], entities[1::2]):
        if r2 == r1:
            continue
        rep_text += " - " + r1 + " -> " + r2 + "\n"
    return rep_text

def extract_prompt(r):
    """Return whatever follows the last "# New Prompt:" heading in ``r``.

    If the heading (followed by a blank line) is absent, ``r`` is returned
    unchanged.
    """
    _, _, tail = r.rpartition("# New Prompt:\n\n")
    return tail

In [None]:
def run(prompt, verbose=False):
    """Answer one prompt through the full three-stage pipeline.

    The privacy model rewrites ``prompt``, ``chatGPT`` answers the rewritten
    prompt, and the replacement model is asked to apply the extracted entity
    changes to that answer.

    Args:
        prompt: Text substituted into ``privacy_prompt``.
        verbose: When true, print the output of each stage.

    Returns:
        The content of the last message returned by the replacement pipeline.
    """
    privacy_messages = [{'role': 'user', 'content': privacy_prompt.format(prompt=prompt)}]
    privacy_result = privacy(privacy_messages, max_new_tokens=1024)
    pseudonym_output = privacy_result[0]['generated_text'][-1]['content']

    privatized_prompt = extract_prompt(pseudonym_output)
    changes = extract_changes(pseudonym_output)

    if verbose:
        print('[PSEUDONYMIZATION OUTPUT]')
        print(pseudonym_output, '\n\n')

    model_response = chatGPT(privatized_prompt)

    if verbose:
        print('[INITIAL OUTPUT]')
        print(model_response, '\n\n')

    replacement_input = ''.join([
        replacement_prompt,
        '\n\n',
        "# Entities to be replaced: \n\n",
        changes,
        "\n\n# Text To Modify:\n\n",
        model_response,
    ])
    replacement_messages = [{'role': 'user', 'content': replacement_input}]
    replacement_result = replacement(replacement_messages, max_new_tokens=2048)
    final_output = replacement_result[0]['generated_text'][-1]['content']

    if verbose:
        print('[FINAL OUTPUT]')
        print(final_output)
    return final_output

In [None]:
def run_privacy(inputs):
    """Run the privacy pipeline over a collection of prompts.

    Each prompt is substituted into ``privacy_prompt`` and wrapped as a
    one-message user conversation; the pipeline's return value is passed
    through as-is.
    """
    conversations = []
    for raw_prompt in inputs:
        filled_prompt = privacy_prompt.format(prompt=raw_prompt)
        conversations.append([{'role': 'user', 'content': filled_prompt}])
    return privacy(conversations, max_new_tokens=1024)
def run_replacement(privacy_output, model_responses):
    """Run the replacement pipeline over paired privacy outputs and model responses.

    ``extract_changes`` is applied to every element of ``privacy_output``;
    each resulting change list is combined with the matching entry of
    ``model_responses`` (paired positionally, stopping at the shorter input)
    into a one-message user conversation. The pipeline's return value is
    passed through as-is.
    """
    privacy_changes = list(map(extract_changes, privacy_output))

    conversations = []
    for model_response, changes in zip(model_responses, privacy_changes):
        content = ''.join([
            replacement_prompt,
            '\n\n',
            "# Entities to be replaced: \n\n",
            changes,
            "\n\n# Text To Modify:\n\n",
            model_response,
        ])
        conversations.append([{'role': 'user', 'content': content}])
    return replacement(conversations, max_new_tokens=2048)

## Processing the Test Set

In [None]:
# The test split is the last 25% of the prompts stored in prompts.json.
with open('prompts.json') as f:
    all_prompts = json.load(f)
test_start = int(len(all_prompts) * 0.75)
test_data = all_prompts[test_start:]
len(test_data)


In [None]:
# Run the privacy pipeline over every test prompt in a single call.
test_data_privacy = run_privacy(test_data)

In [None]:
# For each pipeline result, keep the content of the last generated message.
test_data_privacy_responses = []
for result in test_data_privacy:
    test_data_privacy_responses.append(result[0]['generated_text'][-1]['content'])

In [None]:
# Keep only the rewritten prompt from each privacy-model output.
test_data_privacy_modprompts = list(map(extract_prompt, test_data_privacy_responses))

In [None]:
# One Batch API request per privatized prompt, keyed by its position.
test_processed = [
    {
        "custom_id": "prompt" + str(i),
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {"model": "gpt-4o-mini", "messages": [{'role': 'user', 'content': prompt}], "temperature": 0.0}
    }
    for i, prompt in enumerate(test_data_privacy_modprompts)
]

# JSONL with every line newline-terminated, including the last one. The
# readers further down discard the final element of split('\n'), which would
# silently drop the last request if the file did not end with a newline.
jsonl = ''.join(json.dumps(line) + '\n' for line in test_processed)
with open('test_prompt_batch.jsonl', 'w') as f:
    f.write(jsonl)

# Upload inside a context manager so the file handle is closed afterwards.
with open("test_prompt_batch.jsonl", "rb") as batch_file:
    batch_input_file = client.files.create(
        file=batch_file,
        purpose="batch"
    )

batch_input_file_id = batch_input_file.id

In [None]:
# Submit the uploaded request file as a batch job; the returned batch object
# is shown as the cell output.
batch_metadata = {"description": "Test set responses."}
client.batches.create(
    completion_window="24h",
    endpoint="/v1/chat/completions",
    input_file_id=batch_input_file_id,
    metadata=batch_metadata,
)

In [None]:
# Reload the privatized prompts from the batch request file. Blank lines are
# skipped rather than discarding the last split element: the file is written
# with '\n'.join(...), so it has no trailing newline and dropping the final
# element would lose the last prompt.
with open('test_prompt_batch.jsonl') as f:
    test_data_privacy = [
        json.loads(line)['body']['messages'][0]['content']
        for line in f.read().split('\n')
        if line.strip()
    ]
test_data_privacy[-1]  # Privatized prompts

In [None]:
# Responses. The final element of split('\n') is discarded, so the file is
# expected to end with a newline.
with open('test_set_responses.jsonl') as f:
    response_lines = f.read().split('\n')[:-1]
test_data_responses = [extract_reply(json.loads(line)) for line in response_lines]
test_data_responses[-3]

In [None]:
# run_replacement() extracts the entity changes from the privacy model's full
# text output (the part containing the "# Changes" section). test_data_privacy
# has been rebound to the bare privatized prompts by this point, which would
# yield an empty change list for every example, so pass the saved model
# outputs instead.
test_data_results = run_replacement(test_data_privacy_responses, test_data_responses)

In [None]:
# For each replacement-pipeline result, keep the content of the last generated message.
test_data_results_response = []
for result in test_data_results:
    test_data_results_response.append(result[0]['generated_text'][-1]['content'])

In [None]:
# Persist the raw replacement-pipeline outputs as JSON; the next cell reads
# this file back to rebuild the final responses.
with open('test_data_results.json', 'w') as f:
    json.dump(test_data_results, f)

In [None]:
# Rebuild the final responses from the saved pipeline outputs.
with open('test_data_results.json') as f:
    saved_results = json.load(f)
test_data_results_response = [
    conversation[0]['generated_text'][-1]['content'] for conversation in saved_results
]

# Classifier Based Evaluation

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy and F1 for an evaluation batch.

    ``eval_pred`` unpacks into ``(logits, labels)``. Both are reduced with
    argmax over the last axis before scoring, so labels are expected in the
    same per-class layout as the logits.

    Returns:
        ``{"accuracy": ..., "f1": ...}``.
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)
    true_classes = np.argmax(labels, axis=-1)

    results = {}
    for metric_name in ("accuracy", "f1"):
        metric = load_metric(metric_name)
        computed = metric.compute(predictions=predicted_classes, references=true_classes)
        results[metric_name] = computed[metric_name]
    return results

In [None]:
# Two-label sequence classifier and its tokenizer, from the same checkpoint.
BERT_CHECKPOINT = 'google-bert/bert-base-uncased'
bert_tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)
bert = AutoModelForSequenceClassification.from_pretrained(BERT_CHECKPOINT, num_labels=2)

In [None]:
# Gold replies: the final element of split('\n') is discarded, so the file is
# expected to end with a newline.
with open('gold_replies.jsonl') as f:
    gold_lines = f.read().split('\n')[:-1]
gold_replies = [extract_reply(json.loads(line)) for line in gold_lines]

# Last 25% of the gold replies, matching the slice used for test_data.
gold_test = gold_replies[int(len(gold_replies)*.75):]

# Classifier inputs are "<prompt>\n\n<reply>" for both gold and pipeline replies.
gold_test = ['\n\n'.join((p, g)) for p, g in zip(test_data, gold_test)]
bert_test = ['\n\n'.join((p, g)) for p, g in zip(test_data, test_data_results_response)]
len(gold_test)

In [None]:
# Tokenize both corpora, truncated and padded to the tokenizer's max length.
tokenized_gold = bert_tokenizer(gold_test, truncation=True, padding="max_length")
tokenized_test = bert_tokenizer(bert_test, truncation=True, padding="max_length")

# Keep each example's attention mask with its ids. Without it the model gets
# an all-ones mask and attends to the padding tokens added above.
# Labels are one-hot floats: [1, 0] marks a gold reply, [0, 1] a pipeline reply.
bert_examples = [
    (ids, mask, [1.0, 0.0])
    for ids, mask in zip(tokenized_gold['input_ids'], tokenized_gold['attention_mask'])
] + [
    (ids, mask, [0.0, 1.0])
    for ids, mask in zip(tokenized_test['input_ids'], tokenized_test['attention_mask'])
]

# Shuffle ids, masks and labels together so they stay aligned.
random.shuffle(bert_examples)
bert_data, bert_masks, bert_labels = (list(column) for column in zip(*bert_examples))

# 60% of the shuffled examples train the classifier; the rest evaluate it.
bert_split = int(len(bert_data)*0.6)

bert_train_data = Dataset.from_dict({
    "input_ids": bert_data[:bert_split],
    "attention_mask": bert_masks[:bert_split],
    "labels": bert_labels[:bert_split]
})
bert_eval_data = Dataset.from_dict({
    "input_ids": bert_data[bert_split:],
    "attention_mask": bert_masks[bert_split:],
    "labels": bert_labels[bert_split:]
})




In [None]:
# Training configuration: evaluate every 10 steps and log every 5.
training_args = TrainingArguments(
    output_dir='./bertresults/',
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    do_eval=True,
    eval_strategy="steps",
    eval_steps=10,
    logging_steps=5,
)

trainer = Trainer(
    model=bert,
    args=training_args,
    train_dataset=bert_train_data,
    eval_dataset=bert_eval_data,
    tokenizer=bert_tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Train the gold-vs-pipeline classifier; evaluation metrics come from compute_metrics.
trainer.train()

# Human Evaluation

In [None]:
import json

In [None]:
# Use this cell if you don't need the saved data: it takes the privatized
# prompts straight from memory (the following cell reloads them from disk).
private_prompts = test_data_privacy_modprompts

In [None]:
# Reload the privatized prompts from the batch request file. Blank lines are
# skipped rather than discarding the last split element: the file is written
# with '\n'.join(...), so it has no trailing newline and dropping the final
# element would lose the last prompt and shorten the pairing with human_eval.
with open('test_prompt_batch.jsonl') as f:
    private_prompts = [
        json.loads(line)['body']['messages'][0]['content']
        for line in f.read().split('\n')
        if line.strip()
    ]

# Human annotations; score() and score2() pair them positionally with private_prompts.
with open('aggregate.json') as f:
    human_eval = json.load(f)


In [None]:
def score(prompt, evaluation):
    """Flag privacy and meaning errors for one privatized prompt.

    ``evaluation['tags']`` and ``evaluation['rel']`` are walked in parallel:
      * a tag still present in ``prompt`` whose ``rel`` flag is falsy is a
        privacy error;
      * a tag absent from ``prompt`` whose ``rel`` flag is truthy is a
        meaning error.

    Returns:
        ``(privacy_error, meaning_error, error)`` where ``error`` is true if
        either of the other two is. All three are ``False`` when
        ``evaluation['reject']`` is truthy.
    """
    if evaluation['reject']:
        return False, False, False

    checks = [(tag in prompt, rel) for tag, rel in zip(evaluation['tags'], evaluation['rel'])]
    privacy_error = any(present and not rel for present, rel in checks)
    meaning_error = any(rel and not present for present, rel in checks)
    return privacy_error, meaning_error, privacy_error or meaning_error
            
        
    

In [None]:
# Show the first human-evaluation record.
human_eval[0]

In [None]:
# Score each privatized prompt against its human evaluation (paired by position).
scores = list(map(score, private_prompts, human_eval))

In [None]:
# Transpose the per-prompt (privacy, meaning, error) triples into three parallel tuples.
privacy_scores, meaning_scores, errors = (zip(*scores))

In [None]:
# Header row followed by one row per error type: label, count of flagged
# prompts, and that count divided by the number of prompts.
human_eval_table = [["Error Type", "# Errored Prompts", "% Errored Prompts"]]
for label, flags in (("Privacy", privacy_scores), ("Meaning", meaning_scores), ("Errors", errors)):
    human_eval_table.append([label, sum(flags), sum(flags)/len(flags)])

In [None]:
# Render the table as a DataFrame, using its first row as the column names.
pd.DataFrame(human_eval_table[1:], columns=human_eval_table[0])

In [None]:
def score2(prompts, evaluations):
    """Compute tag-level privacy and meaning error rates, broken down by type.

    Prompts and evaluations are paired positionally; evaluations with a
    truthy ``'reject'`` are skipped. Within each remaining evaluation,
    ``'tags'``, ``'rel'`` and ``'types'`` are walked in parallel:
      * a tag still present in the prompt with a falsy ``rel`` flag counts as
        a privacy error;
      * a tag absent from the prompt with a truthy ``rel`` flag counts as a
        meaning error.

    Returns:
        ``(privacy_rate, meaning_rate, privacy_errored_tags, meaning_errored_tags)``.
        The rates are error counts divided by the number of tags examined
        (0.0 when no tag was examined); the two dicts map each type to its
        error count.
    """
    privacy_errored_tags = {}
    meaning_errored_tags = {}
    privacy_errors = 0
    meaning_errors = 0
    total = 0
    for prompt, evaluation in zip(prompts, evaluations):
        if evaluation['reject']:
            continue
        for tag, rel, t in zip(evaluation['tags'], evaluation['rel'], evaluation['types']):
            tag_present = tag in prompt
            if tag_present and not rel:
                privacy_errors += 1
                privacy_errored_tags[t] = privacy_errored_tags.get(t, 0) + 1
            elif not tag_present and rel:
                meaning_errors += 1
                meaning_errored_tags[t] = meaning_errored_tags.get(t, 0) + 1
            total += 1

    if total == 0:
        # Nothing was examined (empty input or everything rejected): report
        # zero rates instead of dividing by zero.
        return 0.0, 0.0, privacy_errored_tags, meaning_errored_tags
    return privacy_errors / total, meaning_errors / total, privacy_errored_tags, meaning_errored_tags

In [None]:
# Tag-level error rates and per-type error counts over the whole evaluated set.
score2(private_prompts, human_eval)

## ROUGE and BLEU

In [None]:
# Gold replies: the final element of split('\n') is discarded, so the file is
# expected to end with a newline.
with open('gold_replies.jsonl') as f:
    gold_lines = f.read().split('\n')[:-1]
gold_replies = [extract_reply(json.loads(line)) for line in gold_lines]

# Last 25% of the gold replies, cut to at most len(test_data) items. From here
# on gold_test holds the replies alone, without the prompt prefix.
gold_test = gold_replies[int(len(gold_replies)*.75):]
gold_test = gold_test[:len(test_data)]

# NOTE(review): this uses test_data_responses (the replies loaded from
# test_set_responses.jsonl), not test_data_results_response (the
# replacement-model output) - confirm which one ROUGE should score.
predicted_test = test_data_responses[:len(test_data)]

In [None]:
# Number of gold replies being scored.
len(gold_test)

In [None]:
# Number of loaded model responses, for comparison with the gold count above.
len(test_data_responses)

In [None]:
from rouge_score import rouge_scorer

In [None]:
# ROUGE-1, ROUGE-2 and ROUGE-L scorer with stemming enabled.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

In [None]:
# Score every (gold, predicted) pair; the gold reply is passed first.
rouge_scores = []
for gold_reply, predicted_reply in zip(gold_test, predicted_test):
    rouge_scores.append(scorer.score(gold_reply, predicted_reply))

In [None]:
# Show the second gold reply.
gold_test[1]

In [None]:
# ROUGE-1 F-measure of the first pair.
rouge_scores[0]['rouge1'].fmeasure

In [None]:
# Mean F-measure across all pairs, per ROUGE variant.
avg_f_scores = {
    rouge_type: np.mean([pair_scores[rouge_type].fmeasure for pair_scores in rouge_scores])
    for rouge_type in ('rouge1', 'rouge2', 'rougeL')
}

In [None]:
# Display the averaged ROUGE F-measures.
avg_f_scores

## BLEU Scores

In [None]:
from nltk.translate.bleu_score import sentence_bleu

In [None]:
# Whitespace-tokenized inputs for sentence_bleu: one reference per sample.
references = [[x.split()] for x in gold_test]

# gold_test holds replies only at this point (it was rebound without the
# prompt prefix for the ROUGE section), so score the final pipeline replies on
# their own. bert_test would prepend the prompt to every sample and compare
# unlike texts.
samples = [reply.split() for _, reply in zip(test_data, test_data_results_response)]

In [None]:
# Mean sentence-level BLEU across all pairs for cumulative 1- to 4-gram weights.
bleu_settings = [
    ('Cumulative 1-gram: %f', (1, 0, 0, 0)),
    ('Cumulative 2-gram: %f', (0.5, 0.5, 0, 0)),
    ('Cumulative 3-gram: %f', (0.333, 0.333, 0.333, 0)),
    ('Cumulative 4-gram: %f', (0.25, 0.25, 0.25, 0.25)),
]
for template, weights in bleu_settings:
    per_pair = [
        sentence_bleu(reference, sample, weights=weights)
        for reference, sample in zip(references, samples)
    ]
    print(template % np.mean(per_pair))