# Define model

In [None]:
import os.path
from transformers import GPT2TokenizerFast, GPT2LMHeadModel
from fuzzywuzzy.fuzz import token_sort_ratio
import torch
import argparse
from typing import List, Dict, Tuple

class ScribendiScore:
    """Score corrected sentences against their sources without references.

    Every (source, prediction) pair contributes to the total:

    * ``0``  when the two strings are identical,
    * ``+1`` when the prediction has strictly lower language-model
      perplexity than the source AND stays close to it, i.e.
      ``max(token sort ratio, Levenshtein distance ratio) >= threshold``,
    * ``-1`` otherwise.
    """

    def __init__(self, 
        threshold: float=0.8,
        model_id: str='gpt2',
        no_cuda: bool=False
    ) -> None:
        """Store the settings and load the tokenizer/model.

        Args:
            threshold: Minimum similarity (0..1) a prediction needs to earn +1.
            model_id: Model name or local directory handed to
                ``from_pretrained``.
            no_cuda: When True neither the model nor the inputs are moved to
                the GPU.
        """
        self.threshold = threshold
        self.model_id = model_id
        self.no_cuda = no_cuda
        self.tokenizer, self.model = self.load_model(model_id)
    
    def score(self,
        src_sents: List[str],
        pred_sents: List[str],
        batch_size: int=32,
        verbose: bool=False
    ) -> int:
        """Return the summed score of ``pred_sents`` against ``src_sents``.

        The two lists are paired element-wise with ``zip``; surplus items in
        the longer list are ignored.

        Args:
            src_sents: Source sentences.
            pred_sents: Corrected sentences, aligned with ``src_sents``.
            batch_size: Number of sentences per perplexity batch.
            verbose: When True, print the -1/0/+1 frequency table and the
                resulting score.

        Returns:
            Number of +1 pairs minus number of -1 pairs.
        """
        src_sents, pred_sents, count = self.remove_eq_sents(src_sents, pred_sents)
        src_ppls = self.ppl(src_sents, batch_size)
        pred_ppls = self.ppl(pred_sents, batch_size)
        # Identical pairs were filtered out above and count as 0.
        score2freq = {-1: 0, 0: count, 1: 0}
        for src, pred, src_ppl, pred_ppl in zip(src_sents, pred_sents, src_ppls, pred_ppls):
            # A prediction that is not more fluent than its source is penalised
            # without looking at similarity.
            if src_ppl <= pred_ppl:
                score2freq[-1] += 1
                continue
            tsr = self.token_sort_ratio(src, pred)
            ldr = self.levenshtein_distance_ratio(src, pred)
            if max(tsr, ldr) >= self.threshold:
                score2freq[1] += 1
            else:
                score2freq[-1] += 1
        score = score2freq[1] - score2freq[-1]
        if verbose:
            print('score2freq ->', score2freq, ', score ->', score)
        return score
                
    def ppl(self, sents: List[str], batch_size: int=32) -> List[float]:
        """Return the per-sentence perplexity of ``sents`` under the model.

        Each sentence is prefixed with the tokenizer's BOS token so that its
        first real token is predicted too. Padded positions are excluded via
        the attention mask.

        NOTE: a sentence that tokenizes to the BOS token alone has no scored
        position, so its mean loss is 0/0 and the result is ``nan``.

        Args:
            sents: Sentences to evaluate.
            batch_size: Number of sentences per forward pass.

        Returns:
            One perplexity value per input sentence, in input order.
        """
        ppls = []
        sents = [self.tokenizer.bos_token + sent for sent in sents]
        loss_fn = torch.nn.CrossEntropyLoss(reduction='none')
        for start in range(0, len(sents), batch_size):
            batch = sents[start:start + batch_size]
            inputs = self.tokenizer(batch, return_tensors='pt', padding=True)
            if not self.no_cuda:
                inputs = {k: v.cuda() for k, v in inputs.items()}
            with torch.no_grad():
                # No `labels` are passed: the per-sentence loss is computed
                # explicitly below from the logits.
                outputs = self.model(
                    inputs['input_ids'],
                    attention_mask=inputs['attention_mask']
                )
                # Position t predicts token t+1, hence the one-step shift.
                shift_logits = outputs.logits[:, :-1, :].contiguous()
                shift_labels = inputs['input_ids'][:, 1:].contiguous()
                shift_mask = inputs['attention_mask'][:, 1:].contiguous()
                # Separate names so the `batch_size` argument is not clobbered
                # by the shape of the current batch.
                n_rows, seq_len = shift_labels.shape
                loss = loss_fn(
                    shift_logits.view(-1, shift_logits.size(-1)),
                    shift_labels.view(-1)
                ).view(n_rows, seq_len)
                # Mean token loss over non-padded positions only.
                loss = (loss * shift_mask).sum(dim=1) / shift_mask.sum(dim=1)
                ppls += torch.exp(loss).tolist()
        return ppls

    @staticmethod
    def token_sort_ratio(src: str, pred: str) -> float:
        """Return fuzzywuzzy's token sort ratio rescaled from 0..100 to 0..1."""
        return token_sort_ratio(src, pred) / 100
    
    @staticmethod
    def levenshtein_distance_ratio(src: str, pred: str) -> float:
        """Return a character-level similarity in the range 0..1.

        Uses an edit distance where insertion and deletion cost 1 and
        replacement costs 2, normalised by the combined length:
        ``1 - distance / (len(src) + len(pred))``.

        Two empty strings are identical and yield ``1.0``.
        """
        len_src = len(src)
        len_pred = len(pred)
        total = len_src + len_pred
        if total == 0:
            # Nothing to compare; avoids dividing by zero below.
            return 1.0
        # Only the previous row of the DP table is needed at any time.
        prev_row = list(range(len_pred + 1))
        for i, src_char in enumerate(src, start=1):
            cur_row = [i]
            for j, pred_char in enumerate(pred, start=1):
                cost = 0 if src_char == pred_char else 2  # Replacement cost is 2
                cur_row.append(min(
                    prev_row[j-1] + cost,
                    prev_row[j] + 1,
                    cur_row[j-1] + 1
                ))
            prev_row = cur_row
        return 1 - prev_row[len_pred] / total

    def load_model(self, 
        model_id: str
    ) -> "Tuple[GPT2TokenizerFast, GPT2LMHeadModel]":
        """Load the GPT-2 tokenizer and LM head model for ``model_id``.

        When ``model_id`` is an existing path, loading is restricted to local
        files. The model is moved to CUDA unless ``self.no_cuda`` is set.

        Returns:
            ``(tokenizer, model)``.
        """
        local = os.path.exists(model_id)
        tokenizer = GPT2TokenizerFast.from_pretrained(model_id,
                local_files_only=local)
        model = GPT2LMHeadModel.from_pretrained(model_id,
                local_files_only=local)
        # Reuse EOS as the padding token so batches can be padded; padded
        # positions are masked out in `ppl`.
        tokenizer.pad_token = tokenizer.eos_token
        if not self.no_cuda:
            model.to('cuda')
        return tokenizer, model
        
    @staticmethod
    def remove_eq_sents(
        src_sents: List[str],
        pred_sents: List[str]
    )-> Tuple[List[str], List[str], int]:
        """Drop pairs whose source and prediction are identical.

        Returns:
            ``(sources, predictions, count)`` where the two lists hold the
            remaining pairs in order and ``count`` is the number of identical
            pairs that were removed.
        """
        new_src_sents = []
        new_pred_sents = []
        count = 0
        for src, pred in zip(src_sents, pred_sents):
            if src == pred:
                count += 1
                continue
            new_src_sents.append(src)
            new_pred_sents.append(pred)
        return new_src_sents, new_pred_sents, count

## Test inference

In [None]:
# Settings for a quick check of the scorer; no_cuda=True keeps everything on CPU.
model_id = "gpt2"
threshold = 0.8
no_cuda = True

scorer = ScribendiScore(threshold=threshold, model_id=model_id, no_cuda=no_cuda)

# One sentence pair that differs only in the order of two adjacent words.
src = ["Once the test is done , whether the results should be open to his or her relatives has caused social extensive controversy."]
pred = ["Once the test is done , whether the results should be open to his or her relatives has caused extensive social controversy."]
first_src, first_pred = src[0], pred[0]

print('src:', src)
print('pred:', pred)

# Each metric is computed and printed in turn; the trailing comments record
# previously observed values.
src_ppl = scorer.ppl(src)
print('ppl of src:', src_ppl) # [198.90069580078125] Note: Cannot be reproduced
pred_ppl = scorer.ppl(pred)
print('ppl of pred:', pred_ppl) # [119.57299041748047] Note: Cannot be reproduced
ldr = scorer.levenshtein_distance_ratio(first_src, first_pred)
print('levenshtein distance ratio:', ldr) # 0.94308
tsr = scorer.token_sort_ratio(first_src, first_pred)
print('token sort ratio:', tsr) # 1.0
total_score = scorer.score(src, pred)
print('scribendi score:', total_score) # 1



# Generate Preference dataset

In [None]:
import pandas as pd
from tqdm import tqdm
# Register tqdm's progress_* methods on pandas objects (progress_map is used
# further down when generating predictions).
tqdm.pandas()

In [None]:
# Draw a reproducible random subset of the training data to generate
# predictions for.
# NOTE(review): the output files written later are named "...20k-sample"
# while 50000 rows are requested here — confirm the intended size.
SAMPLE_SIZE = 50000
SAMPLE_SEED = 42
df = pd.read_csv('troy-blogs.train.tokenized.csv')
# Cap at len(df): DataFrame.sample raises ValueError when asked for more rows
# than exist (sampling is without replacement). A fixed seed makes reruns
# select the same rows.
df_sample = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=SAMPLE_SEED)

In [None]:
from openai import OpenAI
# Client pointed at an OpenAI-compatible endpoint on this machine (port 8000).
# NOTE(review): presumably a locally hosted inference server for the GEC
# model, with a placeholder api_key — confirm before reusing elsewhere.
client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="token-abc123",
)


In [None]:
# Substrings after which the actual correction is taken to start; presumably
# they end a preamble such as "Here is the corrected text:". Checked in this
# order, first match wins.
_PREAMBLE_MARKERS = ("text:", "text is:", ":\n\n", ":\n")


def clean_prediction(pred: str) -> str:
    """Strip a leading preamble from one model completion.

    If any marker occurs in ``pred``, everything up to and including its first
    occurrence is dropped, leading whitespace is removed and only the first
    remaining line is kept. Without a marker ``pred`` is returned unchanged.
    """
    for marker in _PREAMBLE_MARKERS:
        _, found, tail = pred.partition(marker)
        if found:
            # Cut exactly at the end of the marker: skipping an extra character
            # would eat the first letter when the text follows immediately;
            # lstrip() already removes any separating whitespace.
            return tail.lstrip().split("\n", 1)[0]
    return pred


def infer(text: str, n: int = 5, temperature: float = 1) -> List[str]:
    """Request ``n`` sampled corrections of ``text`` from the chat endpoint.

    Args:
        text: Sentence to correct; sent as the user message.
        n: Number of completions to sample.
        temperature: Sampling temperature.

    Returns:
        The cleaned text of every returned choice, in response order.
    """
    completion = client.chat.completions.create(
      # model="mistralai/Mistral-7B-Instruct-v0.2",
        model="gec-llama2-7b-public/",
        temperature=temperature,
        n=n,
        messages=[
            {"role": "system", "content": "Rewrite this text to make it grammatically correct ."},
            {"role": "user", "content": text}
        ]
    )
    # `content` may be None on a choice; treat that as an empty prediction
    # instead of failing on the substring checks.
    return [clean_prediction(choice.message.content or "") for choice in completion.choices]

# Manual check on a deliberately misspelled sentence; needs the endpoint that
# `client` points at to be reachable.
infer("I hop ths mesage find u ..")

In [None]:
# Call `infer` once per sampled source sentence (with a tqdm progress bar) and
# store the returned list of candidates in a new column.
df_sample['pred-llama2-sample'] = df_sample.src.progress_map(infer)

In [None]:
# Persist the sampled sources together with their generated candidates.
# NOTE(review): the file name says "20k" although 50000 rows are sampled
# earlier in this notebook — confirm.
df_sample.to_json('troy-llama7-gec-20k-sample.json')

# Score

In [None]:
# `pred-llama2-sample` holds a list of candidate corrections per row, while
# `src` is a single string. `scorer.score` pairs its two arguments
# element-wise, so the source is repeated once per candidate; passing the bare
# string would pair individual characters of the source with the candidates.
# The stored value is the summed score over a row's candidates.
df_sample['score'] = df_sample.apply(
    lambda x: scorer.score(
        [x.src] * len(x['pred-llama2-sample']),
        x['pred-llama2-sample']
    ),
    axis=1
)

In [None]:
# Build one preference record per row: the best-scored output becomes the
# preferred text and the worst-scored output the rejected one.
# NOTE(review): `df_samples` and `results` are not defined in the cells shown
# in this notebook (earlier cells define `df_sample`) — confirm where they
# come from before running.
data_dpo = []

for i, row in df_samples.iterrows():
    scores = results[i]
    # max()/min() return the first entry reaching the extreme score, so ties
    # resolve to the earliest key in `scores`.
    max_model = max(scores.items(), key=lambda entry: entry[1]['score'])[0]
    min_model = min(scores.items(), key=lambda entry: entry[1]['score'])[0]

    record = {
        'src': row.src,
        'choosen': row[max_model],
        'rejected': row[min_model]
    }
    data_dpo.append(record)

In [None]:
# Write the preference records collected in `data_dpo` to a JSON file.
pd.DataFrame(data_dpo).to_json('troy-llama7-gec-20k-sample-scribendi.json')