# Evolving Generations
This notebook describes a process for generating a large dataset of adversarial paraphrases.
1. Generate paraphrases of a question.
2. Split each generated text at a random word position and keep the prefix.
3. Regenerate the completion.
4. Re-rank the outputs using a ranking function.
5. Repeat the process from (2) for `num_epochs`.

Score_Func = https://huggingface.co/domenicrosati/deberta-v3-large-finetuned-paws-paraphrase-detector

Ranking Function  = Minimize Score_Func for `(output_original, output_pp)`, Maximize Score_Func for `(input_original, input_pp)`

In [None]:
!pip install openai
!pip install transformers
!pip install datasets

In [None]:
import random, json, os, time
from tqdm.auto import tqdm
import pandas as pd
import openai
import requests
import torch
from datasets import load_dataset
from transformers import (AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification)

## LLM

In [None]:
# SECURITY: never hard-code an API key in the notebook. Export OPENAI_API_KEY in
# the environment before starting the kernel. Any key that was ever committed
# to this file must be treated as leaked and revoked.
openai.api_key = os.getenv("OPENAI_API_KEY")
if not openai.api_key:
    # Fail fast with a clear message instead of looping on authentication errors later.
    raise RuntimeError("OPENAI_API_KEY is not set; export it before running this notebook.")
class LLM(object):
    """Small wrapper around ``openai.Completion.create`` that retries until it succeeds."""

    def __init__(self, model='text-davinci-003'):
        """Store the name of the completion model to query.

        Args:
            model: Value passed as ``engine`` to ``openai.Completion.create``.
        """
        super(LLM, self).__init__()
        self.model = model

    def generate(self, prompt, topk=1, stop="\n"):
        """Return ``topk`` completions for ``prompt``.

        The request is retried every 30 seconds until it succeeds.

        Args:
            prompt: Text to complete.
            topk: Number of completions to request (sent as ``n``).
            stop: Stop sequence forwarded to the API.

        Returns:
            A list with the ``text`` field of the first ``topk`` choices.
        """
        response = None
        while response is None:
            try:
                response = openai.Completion.create(
                    engine=self.model,
                    prompt=prompt,
                    max_tokens=256,
                    n=topk,
                    stop=stop,
                )
            except Exception as err:
                # Catch `Exception` rather than using a bare `except` so that
                # KeyboardInterrupt / SystemExit can still break out of the
                # retry loop; show the error so permanent failures are visible.
                print("sleeping...", repr(err))
                time.sleep(30)

        return [response['choices'][i]['text'] for i in range(topk)]

env: OPENAI_API_KEY=<redacted>


## Paraphrase Detector / Consistency Scorer

In [None]:
class PP_Detector():
    """Paraphrase detector backed by a sequence-classification checkpoint.

    Relies on a module-level ``device`` being defined before instantiation.
    """

    def __init__(self, tok_path="domenicrosati/deberta-v3-large-finetuned-paws-paraphrase-detector",
                 model_path="domenicrosati/deberta-v3-large-finetuned-paws-paraphrase-detector", max_len=30):
        """Load the tokenizer and classifier and move the model to ``device``.

        Args:
            tok_path: Tokenizer identifier or path for ``AutoTokenizer``.
            model_path: Model identifier or path for
                ``AutoModelForSequenceClassification``.
            max_len: Accepted for backward compatibility; currently unused.
        """
        super(PP_Detector, self).__init__()
        self.detection_tokenizer = AutoTokenizer.from_pretrained(tok_path)
        self.detection_model = AutoModelForSequenceClassification.from_pretrained(model_path)
        self.detection_model.to(device)

    def score_binary(self, y_1, y_2):
        """Score a single text pair.

        Args:
            y_1: First text of the pair.
            y_2: Second text of the pair.

        Returns:
            A tuple ``(p_not_paraphrase, p_paraphrase)`` of Python floats taken
            from the softmax over the model logits. ``.item()`` is used, so the
            call is only valid for one pair at a time.
        """
        inputs = self.detection_tokenizer(y_1, y_2, return_tensors="pt", padding=True).to(device)
        # Pure inference: disable autograd so no graph is built or kept alive.
        with torch.no_grad():
            outputs = self.detection_model(**inputs)
        scores = outputs.logits.softmax(dim=-1)
        # Column 0 = not paraphrase, column 1 = paraphrase.
        return scores.T[0].item(), scores.T[1].item()

## Evolve
generate --> rerank --> mutate

In [None]:
class Evolve(object):
    """Generate -> rerank -> mutate loop over paraphrase candidates."""

    def __init__(self, llm=None):
        """Create the evolver.

        Args:
            llm: Optional object exposing ``generate(prompt, topk, stop=...)``.
                Defaults to a fresh ``LLM()``.
        """
        super(Evolve, self).__init__()
        self.llm = LLM() if llm is None else llm

    def generate(self, prompt, topk=6, stop="-----"):
        """Return ``topk`` completions of ``prompt`` ending at ``stop``."""
        # Forward the caller's stop sequence instead of a hard-coded one.
        return self.llm.generate(prompt, topk, stop=stop)

    def mutate(self, prompt):
        """Cut ``prompt`` at a random word boundary and return the prefix.

        The cut index is drawn from ``[0, number_of_words)``, so at least the
        last word is always dropped and the result is a strict prefix (an
        empty string when the cut falls at position 0).
        """
        words = prompt.split(' ')
        # randrange excludes the upper bound; randint(0, len(words)) could
        # return len(words) and leave the text unchanged (no mutation).
        cut = random.randrange(len(words))
        return ' '.join(words[:cut])

    def rerank(self, orig_in, pp_ins, alpha=0.5, detector=None):
        """Pick the paraphrase whose LLM output diverges most from the original's.

        Candidates are ordered by ascending output-consistency score (the
        paraphrase probability between the LLM output for ``orig_in`` and the
        LLM output for the candidate). The first candidate whose
        input-consistency score with ``orig_in`` is at least ``alpha`` wins.

        Args:
            orig_in: Original input sentence.
            pp_ins: Candidate paraphrases of ``orig_in``.
            alpha: Minimum input-consistency score for a candidate to be accepted.
            detector: Optional scorer exposing ``score_binary(a, b)``; defaults
                to the module-level ``pp_detector``.

        Returns:
            ``(paraphrase, output_score, input_score)`` for the selected
            candidate, or ``(orig_in, 1.0, 1.0)`` when no candidate passes
            the ``alpha`` threshold.
        """
        if detector is None:
            detector = pp_detector

        orig_out = self.llm.generate(orig_in)[0]
        pp_outs = [self.llm.generate(pp_in)[0] for pp_in in pp_ins]
        scores_out = [detector.score_binary(orig_out, pp_out)[1] for pp_out in pp_outs]

        # Sort the (score, candidate) pairs ONCE so each candidate keeps its own
        # score; sorting the two lists separately mispairs them.
        ranked = sorted(zip(scores_out, pp_ins), key=lambda pair: pair[0])

        for score_out, pp_in in ranked:
            score_in = detector.score_binary(orig_in, pp_in)[1]
            if score_in >= alpha:
                return pp_in, score_out, score_in
        return orig_in, 1.0, 1.0

# Prompt

In [None]:
def get_exemplars(data_df, idx_to_drop, k_shots=11):
    """Build a few-shot paraphrasing prompt from randomly sampled rows.

    Args:
        data_df: DataFrame with ``sentence1`` and ``sentence2`` columns.
        idx_to_drop: Index label(s) excluded before sampling, so the sentence
            being paraphrased is not used as its own exemplar.
        k_shots: Number of exemplar pairs to sample.

    Returns:
        The instruction line followed by ``k_shots`` formatted
        Sentence/Paraphrase blocks.
    """
    header = """Generate diverse paraphrases taking motivation from the examples given below."""
    exemplar_template = """
    Sentence :{}
    Paraphrase :{}
    -----
    """
    candidates = data_df.drop(index=idx_to_drop).reset_index(drop=True)
    shots = candidates.sample(n=k_shots).reset_index(drop=True)
    exemplar_blocks = [
        exemplar_template.format(source, paraphrase)
        for source, paraphrase in zip(shots['sentence1'], shots['sentence2'])
    ]
    return header + ''.join(exemplar_blocks)

# Run

In [None]:
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Number of generate -> rerank -> mutate rounds run for every source sentence.
num_epochs = 25
# PP_Detector reads the module-level `device`, so it must be built after it.
pp_detector = PP_Detector()
evolve = Evolve()

In [None]:
# Load the PAWS "labeled_final" training split into a DataFrame.
data_df = pd.DataFrame(load_dataset('paws', 'labeled_final')['train'])
# Keep only rows labelled 1, then shuffle and renumber the rows.
data_df = data_df.loc[data_df['label'] == 1]
data_df = data_df.sample(frac=1)
data_df = data_df.reset_index(drop=True)

In [None]:
# Results are appended to this CSV after every sentence so a crash loses at
# most one sentence's worth of work.
save_path = 'gen_pp_data-evolve.csv'
if os.path.exists(save_path):
    result_df = pd.read_csv(save_path)
else:
    result_df = pd.DataFrame()

# Sentences already stored by a previous run. data_df is reshuffled on every
# run, so resuming has to key on the sentence text rather than the row index.
# Without this check a rerun reprocesses everything and appends duplicates.
done_sentences = set(result_df['sentence']) if 'sentence' in result_df.columns else set()

print('Length of data =', len(data_df))
for i in tqdm(range(len(data_df))):
    sentence = data_df['sentence1'][i]
    if sentence in done_sentences:
        continue

    # Few-shot prompt (excluding row i itself) followed by the open slot to complete.
    prompt = get_exemplars(data_df, i)
    prompt += f"""Sentence :{sentence}
    Paraphrase :"""
    all_pps, all_scores_out, all_scores_in = [], [], []
    for _ in tqdm(range(num_epochs)):
        pps = evolve.generate(prompt, 10, "-----")
        # Full candidate = any prefix already in the prompt + the new completion.
        pps = [(prompt+pp).split('Paraphrase :')[-1] for pp in pps]
        pp, score_out, score_in = evolve.rerank(sentence, pps)
        all_pps.append(pp)
        all_scores_out.append(score_out)
        all_scores_in.append(score_in)
        # Truncate the winner and seed the next round with that prefix.
        pp = evolve.mutate(pp)
        prompt = prompt.rpartition('Paraphrase :')[0] + 'Paraphrase :' + pp

    tmp_df = pd.DataFrame({
        "sentence": [sentence]*num_epochs,
        "sentence-pp": all_pps,
        "output-consistency_score": all_scores_out,
        "input-consistency_score": all_scores_in,
    })
    result_df = pd.concat([result_df, tmp_df], axis=0)
    result_df.to_csv(save_path, index=False)
    done_sentences.add(sentence)