In [None]:
# Put ../scripts on the import path; presumably this is where the local modules
# imported further down (evaluation, dataset, generative) live — verify against the repo layout.
import sys
sys.path.append('../scripts')

In [None]:
from pathlib import Path

import openai
from transformers import AutoModelForSeq2SeqLM,Text2TextGenerationPipeline

from evaluation import error_analysis, get_scores
from evaluation import relative_edit_distance
from dataset import load_data, get_dataloader
from generative.transformers_util import get_training_args, get_trainer, get_tokenizer

In [None]:
import hydra
from hydra import compose, initialize

# Reset the global Hydra singleton before initialising, presumably so this cell
# can be re-run in the same kernel without Hydra complaining that it is already initialised.
hydra.core.global_hydra.GlobalHydra.instance().clear()
# NOTE(review): config_path is given as a pathlib.Path here — confirm the installed
# Hydra version accepts that rather than requiring a plain str.
initialize(config_path=Path('..'), job_name='foo', version_base='1.1')
# Compose the experiment configuration; later cells read config.api_key,
# config.data.* and config.generation_max_length from it.
config = compose(config_name='experiment.yaml')

In [None]:
# Configure the openai client with the API key taken from the composed config
# (the key itself is not written in this notebook).
openai.api_key = config.api_key

In [None]:
# Training arguments (created with report_to="none") and tokenizer, both derived from the config.
training_args = get_training_args(config, report_to="none")
tokenizer = get_tokenizer(config)

# The TSV paths in the config are resolved relative to the parent directory of this notebook.
base_path = Path('..')
train_df, val_df, test_df = load_data(base_path / config.data.cnf_tsv_path, base_path / config.data.controls_tsv_path)
train_dataset, val_dataset, test_dataset = get_dataloader(train_df, val_df, test_df, tokenizer)

# Load the seq2seq model from the local ../data/model directory.
model = AutoModelForSeq2SeqLM.from_pretrained("../data/model")

# Evaluation of taking the top 1 result from our LLM and comparing it to the resolution

In [None]:
# Generation pipeline that returns a single candidate per input.
# device=0 places the model on the first GPU, so this cell needs a CUDA device.
pipeline = Text2TextGenerationPipeline(model=model, tokenizer=tokenizer, max_length=config.generation_max_length, device=0)

# Gold resolutions and raw input sentences of the validation split, in matching order.
resolutions = list(val_df.full_resolution)
samples = list(val_df.raw_sentence)
predictions = pipeline(samples)

# Compare the single generated text per sentence against its gold resolution and report the scores.
errors = error_analysis([prediction["generated_text"] for prediction in predictions], resolutions, samples)
get_scores(errors, "eval")

# Evaluating the improvement produced by taking the top k generations and comparing all of them to the resolution

In [None]:
def evaluate_top_k(model, tokenizer, data, beams, generation_max_length=config.generation_max_length):
    """Score the model when the best of its top-`beams` beam-search candidates is picked per sentence.

    For every sentence in `data`, `beams` candidates are generated and the one with the highest
    `relative_edit_distance` score against the gold resolution is kept. This uses the gold
    resolution for selection, so the result is an upper bound for any re-ranking strategy.

    Args:
        model: Seq2seq model handed to the generation pipeline.
        tokenizer: Tokenizer matching `model`.
        data: Frame-like object exposing `raw_sentence` and `full_resolution` columns.
        beams: Number of beams, which is also the number of candidates returned per sentence.
        generation_max_length: Maximum generation length passed to the pipeline.

    Returns:
        Whatever `get_scores(errors, "eval")` returns for the selected predictions.
    """
    generator = Text2TextGenerationPipeline(model=model, tokenizer=tokenizer, max_length=generation_max_length, num_beams=beams, num_return_sequences=beams, device=0)

    originals = list(data.raw_sentence)
    resolutions = list(data.full_resolution)
    outputs = generator(originals)

    predictions = []
    for original, resolution, candidates in zip(originals, resolutions, outputs):
        generations = [entry['generated_text'] for entry in candidates]
        candidate_scores = [relative_edit_distance(gen, resolution, original) for gen in generations]
        best_score = max(candidate_scores)
        best_index = candidate_scores.index(best_score)
        # Diagnostic output: a candidate other than the top beam reached the score 1.
        if best_score == 1 and best_index != 0:
            print(generations)
            print(best_index)
            print(resolution)
        predictions.append(generations[best_index])

    # Use the sentences of `data` itself (not the notebook-global validation frame),
    # so the function is also correct for other splits.
    errors = error_analysis(predictions, resolutions, originals)
    return get_scores(errors, "eval")

In [None]:
# Number of beams, i.e. candidates generated per sentence.
k = 5
# Scores on the validation split when the best of the k candidates is chosen using the gold resolution.
evaluate_top_k(model, tokenizer, val_df, k)

# Using ChatGPT/GPT to determine the best fit of the top k options

In [None]:
def generate_prompt(original, predictions):
    """Build the German multiple-choice prompt asking the model to pick the correct resolution.

    Args:
        original: The sentence containing the coordination ellipsis.
        predictions: Candidate resolutions; they are listed as options numbered from 1.

    Returns:
        The full prompt: task description with one worked example, the sentence,
        the numbered options, and the instruction to answer with the number only.
    """
    instructions = (
        "Ich werde dir im Folgenden einen Satz zeigen, welcher sogennannte Koordinationsellipsen enthält. "
        "Das Ziel ist es diese zu aufzulösen. "
        "Ein Beispiel wäre 'Ibrutinib, ein Inhibitor der Bruton-Tyrosinkinase (BTK), ist in Deutschland als Erstlinien- und Rezidivtherapiee in der CLL zugelassen.' "
        "Die richtige Auflösung wäre 'Ibrutinib, ein Inhibitor der Bruton-Tyrosinkinase (BTK), ist in Deutschland als Erstlinientherapie und Rezidivtherapiee in der CLL zugelassen.' "
        "Ich werde dir für meine Beispiele Antwortmöglichkeiten geben und du sollst dann entscheiden, welche dieser Optionen die Koordinationsellipsen korrekt auflöst.\n\n"
    )
    sentence_part = f"Mein Satz: '{original}'\n\n"
    option_lines = [
        f"{number}) '{candidate}'\n"
        for number, candidate in enumerate(predictions, start=1)
    ]
    options_part = "Deine Antwortmöglichkeiten:\n" + "".join(option_lines)
    question = "\nWelche Antwort ist die richtige? Antworte nur mit der Zahl und keiner Erklärung"

    return "".join((instructions, sentence_part, options_part, question))

In [None]:
def get_openai_response_chatgpt(prompt):
    """Send `prompt` as a single user message to the chat completion endpoint.

    Uses temperature 0 and at most 100 generated tokens.

    Returns:
        The message content of the first returned choice.
    """
    request = dict(
        model="gpt-3.5-turbo-0301",
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
        max_tokens=100,
        top_p=1.0,
        frequency_penalty=0.0,
        presence_penalty=0.0,
    )
    response = openai.ChatCompletion.create(**request)
    first_choice = response["choices"][0]
    return first_choice["message"]["content"]

In [None]:
def get_openai_response_gpt3(prompt):
    """Send `prompt` to the text completion endpoint using text-davinci-003.

    Uses temperature 0 and at most 100 generated tokens.

    Returns:
        The text of the first returned choice.
    """
    request = dict(
        model="text-davinci-003",
        prompt=prompt,
        temperature=0,
        max_tokens=100,
        top_p=1.0,
        frequency_penalty=0.0,
        presence_penalty=0.0,
    )
    response = openai.Completion.create(**request)
    first_choice = response["choices"][0]
    return first_choice["text"]

In [None]:
from transformers import Text2TextGenerationPipeline
from evaluation import relative_edit_distance, error_analysis, get_scores
import re

def _parse_choice_index(answer, option_count):
    """Turn the model's free-text reply into a 0-based index into the candidate list.

    The prompt numbers the options starting at 1, so the first integer found in
    `answer` is reduced by one. Index 0 is returned when the reply contains no
    integer or when the integer does not refer to one of the `option_count` options.
    """
    numbers = re.findall(r'\d+', answer)
    if len(numbers) > 1:
        print(f'more numbers than expected {numbers}')
    if not numbers:
        print('no numbers found')
        return 0

    index = int(numbers[0]) - 1
    # Check against the real number of candidates; this also rejects a reply of "0",
    # which would otherwise become index -1 and silently select the last candidate.
    if not 0 <= index < option_count:
        print('Index is out of bounds. Something went wrong with the API Answer. Defaulting to 0')
        return 0
    return index


def generate_best_fit(samples, resolutions, outputs):
    """Let ChatGPT choose, for every sample, one of the generated candidate resolutions.

    Args:
        samples: Original sentences; `samples[i]` is shown to the model in the prompt.
        resolutions: Gold resolutions; only their count is used, to drive the iteration.
        outputs: Per-sample lists of pipeline results, each entry holding a 'generated_text'.

    Returns:
        A list with the chosen candidate text for each sample. The first candidate is
        used whenever the model's reply cannot be mapped to a valid option.
    """
    predictions = []
    for i, _ in enumerate(resolutions):
        generations = [entry['generated_text'] for entry in outputs[i]]
        answer = get_openai_response_chatgpt(generate_prompt(samples[i], generations))
        index = _parse_choice_index(answer, len(generations))

        print(f'{i}) answer: {index}')
        print('--------------------------------')
        predictions.append(generations[index])

    return predictions

In [None]:
# Number of beams, i.e. candidates generated per sentence.
k=5
# Beam-search pipeline that returns all k beams per input; device=0 places the model on the first GPU.
pipeline = Text2TextGenerationPipeline(model=model, tokenizer=tokenizer, max_length=config.generation_max_length, num_beams=k, num_return_sequences=k, device=0)

# Raw sentences and gold resolutions of the validation split, in matching order.
samples = list(val_df.raw_sentence)
resolutions = list(val_df.full_resolution)
# One list of k candidate generations per sentence.
outputs = pipeline(samples)

In [None]:
# Ask ChatGPT to choose one candidate per sentence (one API request per sample).
predictions = generate_best_fit(samples, resolutions, outputs)

# Score the chosen candidates against the gold resolutions.
errors = error_analysis(predictions, resolutions, samples)
scores = get_scores(errors, "eval")