In [1]:
import sys
sys.path.append('../scripts')

In [2]:
import re
from pathlib import Path

import openai
from transformers import AutoModelForSeq2SeqLM,Text2TextGenerationPipeline

from dataset import load_data, get_dataloader
from evaluation import error_analysis, get_scores
from evaluation import relative_edit_distance
from generative.transformers_util import get_training_args, get_tokenizer

In [3]:
import hydra
from hydra import compose, initialize

# Reset Hydra's global singleton before initialising it again.
# NOTE(review): presumably this is what allows the cell to be re-run in the same kernel — confirm.
hydra.core.global_hydra.GlobalHydra.instance().clear()
# Initialise Hydra with the parent directory as config path, then compose `experiment.yaml`.
initialize(config_path=Path('..'), job_name='foo', version_base='1.1')
config = compose(config_name='experiment.yaml')

In [4]:
# The OpenAI key is read from the composed Hydra config rather than hard-coded in the notebook.
openai.api_key = config.api_key

In [6]:
# Training arguments and tokenizer are both derived from the Hydra config.
training_args = get_training_args(config, report_to="none")
tokenizer = get_tokenizer(config)

# Data files are addressed relative to the repository root (one level above this notebook).
base_path = Path('..')
train_df, val_df, test_df = load_data(
    base_path / config.data.cnf_tsv_path,
    base_path / config.data.controls_tsv_path,
)
train_dataset, val_dataset, test_dataset = get_dataloader(
    train_df,
    val_df,
    test_df,
    tokenizer,
)

# Load the seq2seq model stored under ../data/model.
model = AutoModelForSeq2SeqLM.from_pretrained("../data/model")



# Evaluation of taking the top 1 result from our LLM and comparing it to the resolution

In [6]:
# One generation per validation sentence (no beam / return-sequence arguments are passed).
pipeline = Text2TextGenerationPipeline(
    model=model,
    tokenizer=tokenizer,
    max_length=config.generation_max_length,
    device=0,
)

resolutions = list(val_df.full_resolution)
samples = list(val_df.raw_sentence)
predictions = pipeline(samples)

# Compare each generated text with its resolution and report the "eval" scores.
errors = error_analysis(
    [prediction["generated_text"] for prediction in predictions],
    resolutions,
    samples,
)
get_scores(errors, "eval")

KeyboardInterrupt: 

# Evaluating the improvement produced by taking the top k generations and comparing all of them to the resolution

In [None]:
def evaluate_top_k(model, tokenizer, data, beams, generation_max_length=config.generation_max_length):
    """Score the model when the best of its top-``beams`` generations is picked per sentence.

    For every sentence in ``data`` the pipeline returns ``beams`` candidates. The
    candidate with the highest ``relative_edit_distance`` against the reference
    resolution is kept, so the result is an upper bound for any re-ranking
    strategy that selects among these candidates.

    Args:
        model: Model handed to ``Text2TextGenerationPipeline``.
        tokenizer: Tokenizer handed to ``Text2TextGenerationPipeline``.
        data: Object exposing ``raw_sentence`` (inputs) and ``full_resolution``
            (references) as iterables of equal length.
        beams: Used both as ``num_beams`` and ``num_return_sequences``.
        generation_max_length: ``max_length`` passed to the pipeline. The default
            is read from ``config`` once, when this function is defined.

    Returns:
        Whatever ``get_scores(errors, "eval")`` returns for the selected predictions.
    """
    pipeline = Text2TextGenerationPipeline(
        model=model,
        tokenizer=tokenizer,
        max_length=generation_max_length,
        num_beams=beams,
        num_return_sequences=beams,
        device=0,
    )

    originals = list(data.raw_sentence)
    resolutions = list(data.full_resolution)
    outputs = pipeline(originals)

    predictions = []
    for original, resolution, candidates in zip(originals, resolutions, outputs):
        # With a single returned sequence the pipeline yields one dict per input
        # (the top-1 cell of this notebook indexes it that way), so wrap it to
        # keep the loop below uniform.
        if isinstance(candidates, dict):
            candidates = [candidates]

        generations = [entry['generated_text'] for entry in candidates]
        scores = [relative_edit_distance(gen, resolution, original) for gen in generations]
        best_score = max(scores)
        best_index = scores.index(best_score)

        # Surface the cases where a lower-ranked candidate reaches a score of 1
        # while the top-ranked one does not.
        if best_score == 1 and best_index != 0:
            print(generations)
            print(best_index)
            print(resolution)
        predictions.append(generations[best_index])

    # Use the sentences of `data` itself, not the global validation frame, so
    # the function is correct for any split that is passed in.
    errors = error_analysis(predictions, resolutions, originals)
    scores = get_scores(errors, "eval")

    return scores

In [None]:
# `k` is passed as `beams`, i.e. the number of candidates generated per validation sentence.
k = 2
evaluate_top_k(model, tokenizer, val_df, k)

# Using ChatGPT/GPT to determine the best fit of the top k options

## Prompts

In [7]:
def generate_prompt(original, predictions):
    """Build the German multiple-choice prompt that asks for the correct resolution.

    Args:
        original: The sentence containing the coordination ellipsis.
        predictions: Candidate resolutions; they are listed as ``1) '...'``,
            ``2) '...'`` and so on, one per line.

    Returns:
        The full prompt text: task description, original sentence, numbered
        options, and the instruction to answer with the number only.
    """
    task_description = "Das folgende Problem wurde von Yann LeCun gestellt, der sehr an der Kompetenz von Künstlicher Intelligenz, wie dir, zweifelt: Dir werden im Folgenden ein Satz gezeigt, welcher sogennannte Koordinationsellipsen enthält. Das Ziel ist es diese zu aufzulösen. Ein Beispiel wäre 'Ibrutinib, ein Inhibitor der Bruton-Tyrosinkinase (BTK), ist in Deutschland als Erstlinien- und Rezidivtherapiee in der CLL zugelassen.' Die richtige Auflösung wäre 'Ibrutinib, ein Inhibitor der Bruton-Tyrosinkinase (BTK), ist in Deutschland als Erstlinientherapie und Rezidivtherapiee in der CLL zugelassen.' Dir werden zu den Beispielen Antwortmöglichkeiten gegeben und du sollst dann entscheiden, welche dieser Optionen die Koordinationsellipsen korrekt auflöst.\n\n"
    sentence_part = f"Der originale Satz: '{original}'\n\n"

    # Options are numbered from 1; callers subtract 1 from the model's reply.
    option_lines = [
        f"{number}) '{prediction}'\n"
        for number, prediction in enumerate(predictions, start=1)
    ]
    options_part = "Deine Antwortmöglichkeiten:\n" + "".join(option_lines)
    closing = "\nWelche Antwort ist die richtige? Antworte nur mit der Zahl und keiner Erklärung"

    return "".join((task_description, sentence_part, options_part, closing))

In [9]:
def generate_prompt_true_false(original, prediction):
    """Build the German yes/no prompt asking whether the first candidate is correct.

    Args:
        original: The sentence containing the coordination ellipsis.
        prediction: The top-ranked candidate resolution to be judged.

    Returns:
        The prompt text; the model is instructed to reply only 'Ja' or 'Nein'.
    """
    # NOTE: the instruction text hard-codes "fünf" (five) candidate versions.
    instructions = "Das folgende Problem wurde von Yann LeCun gestellt, der sehr an der Kompetenz von Künstlicher Intelligenz, wie dir, zweifelt: Sie haben ein Modell entwickelt, das Koordinationsellipsen in Sätzen erkennt und auflöst. Das Modell gibt fünf verschiedene Versionen des ursprünglichen Satzes zurück, wobei die erste Version die wahrscheinlichste ist. Bitte lesen Sie sich die erste Version des Satzes sorgfältig durch und entscheiden Sie, ob diese Version korrekt ist und den ursprünglichen Satz mit aufgelösten Koordinationsellipsen wiedergibt. Bitte antworten Sie mir nur 'Ja' oder 'Nein' und keiner Erklärung!\n\n"
    sentence_part = f"Ursprünglicher Satz: '{original}'\n\n"
    candidate_part = f"Erste Version: '{prediction}'\n\n"

    return "".join((instructions, sentence_part, candidate_part))

In [10]:
def generate_prompt_other_options(predictions):
    """Build the German follow-up prompt listing the remaining candidates.

    Args:
        predictions: The candidates after the rejected first one. They are
            numbered starting at 2, so the numbers line up with positions in
            the full candidate list.

    Returns:
        The prompt text asking for the number of the best remaining version.
    """
    # NOTE: the instruction text hard-codes "vier" (four) further versions.
    instructions = "Sie haben entschieden, dass die erste Version des Satzes, die vom Modell als die wahrscheinlichste ausgewählt wurde, nicht korrekt ist und den ursprünglichen Satz mit aufgelösten Koordinationsellipsen nicht vollständig wiedergibt. Das Modell gibt vier weitere Versionen des Satzes zurück, die als die nächst wahrscheinlichsten Versionen ausgewählt wurden. Bitte lesen Sie sich diese vier Versionen sorgfältig durch und wählen Sie die Version aus, die Ihrer Meinung nach am besten den ursprünglichen Satz mit aufgelösten Koordinationsellipsen wiedergibt.\n\n"
    option_lines = [
        f"{number} - '{prediction}'\n"
        for number, prediction in enumerate(predictions, start=2)
    ]
    closing = "\nBitte antworten Sie nur mit der richtigen Nummer und ohne Erklärung."

    return instructions + "".join(option_lines) + closing

## OpenAI API calls

In [12]:
def get_openai_response_chatgpt(messages, max_retries=3, retry_delay=2.0):
    """Send a chat conversation to ``gpt-3.5-turbo-0301`` and return the reply text.

    Transient API failures (such as the HTTP 502 ``APIError`` recorded further
    down in this notebook) are retried with exponential backoff instead of
    aborting a long evaluation loop on the first hiccup.

    Args:
        messages: Iterable of ``(role, text)`` pairs, oldest first.
        max_retries: Number of additional attempts after the first failure.
            Values below 0 are treated as 0.
        retry_delay: Base delay in seconds; it doubles after every failed attempt.

    Returns:
        The ``content`` string of the first choice in the API response.

    Raises:
        openai.error.APIError, openai.error.APIConnectionError,
        openai.error.RateLimitError: if the last attempt still fails.
        Any other exception from the client is propagated immediately.
    """
    import time

    payload = [{"role": role, "content": text} for (role, text) in messages]
    transient_errors = (
        openai.error.APIError,
        openai.error.APIConnectionError,
        openai.error.RateLimitError,
    )

    attempts = max(max_retries, 0) + 1
    for attempt in range(attempts):
        try:
            response = openai.ChatCompletion.create(
                model="gpt-3.5-turbo-0301",
                messages=payload,
                temperature=0,
                max_tokens=100,
                top_p=1.0,
                frequency_penalty=0.0,
                presence_penalty=0.0
            )
        except transient_errors:
            # Give up only after the final attempt; otherwise back off and retry.
            if attempt == attempts - 1:
                raise
            time.sleep(retry_delay * (2 ** attempt))
        else:
            return response["choices"][0]["message"]["content"]

In [13]:
def get_openai_response_gpt3(prompt):
    """Send a single prompt to ``text-davinci-003`` and return the completion text.

    Args:
        prompt: The prompt string passed to the completion endpoint.

    Returns:
        The ``text`` field of the first choice in the API response.
    """
    request = {
        "model": "text-davinci-003",
        "prompt": prompt,
        "temperature": 0,
        "max_tokens": 100,
        "top_p": 1.0,
        "frequency_penalty": 0.0,
        "presence_penalty": 0.0,
    }
    response = openai.Completion.create(**request)
    first_choice = response["choices"][0]
    return first_choice["text"]

## Directly infer the best fit by giving GPT all of the options

In [14]:
from transformers import Text2TextGenerationPipeline
from evaluation import error_analysis, get_scores

def generate_best_fit(samples, outputs):
    """Let ChatGPT choose one of the generated candidates for every sample.

    If the top candidate is identical to the input sentence it is accepted
    without an API call. Otherwise all candidates are shown to ChatGPT and the
    first number in its reply (1-based) selects the candidate. Missing or
    out-of-range numbers fall back to the top candidate.

    Args:
        samples: Input sentences.
        outputs: For each sample, a sequence of dicts with a
            ``'generated_text'`` entry, best candidate first.

    Returns:
        A list with the selected candidate text for every sample.
    """
    predictions = []
    for i, sample in enumerate(samples):
        generations = [entry['generated_text'] for entry in outputs[i]]

        if generations[0] == sample:
            # The model left the sentence unchanged; nothing to choose between.
            index = 0
        else:
            message = [("user", generate_prompt(sample, generations))]
            answer = get_openai_response_chatgpt(message)

            numbers = re.findall(r'\d+', answer)
            if len(numbers) > 1:
                print(f'more numbers than expected {numbers}')
            if len(numbers) == 0:
                print(f'no numbers found')
                index = 0
            else:
                index = int(numbers[0]) - 1
                # Validate against the real number of candidates. A reply of
                # "0" would otherwise become -1 and silently pick the last
                # candidate, and a too-large number would raise IndexError.
                if not 0 <= index < len(generations):
                    print(f'Index is out of bounds. Something went wrong with the API Answer. Defaulting to 0')
                    index = 0

        print(f'{i}) answer: {index}')
        print('--------------------------------')
        predictions.append(generations[index])

    return predictions

In [15]:
# Generate k beam-search candidates per validation sentence; the GPT-based
# selection cells below consume `samples`, `resolutions` and `outputs`.
k = 5
pipeline = Text2TextGenerationPipeline(
    model=model,
    tokenizer=tokenizer,
    max_length=config.generation_max_length,
    num_beams=k,
    num_return_sequences=k,
    device=0,
)

samples = list(val_df.raw_sentence)
resolutions = list(val_df.full_resolution)
outputs = pipeline(samples)

In [17]:
# Let ChatGPT pick one candidate per sample, then score the picks against `resolutions`.
# Depends on `samples`, `resolutions` and `outputs` from the generation cell above.
predictions = generate_best_fit(samples, outputs)

errors = error_analysis(predictions, resolutions, samples)
scores = get_scores(errors, "eval")

0) answer: 2
--------------------------------
1) answer: 0
--------------------------------
2) answer: 4
--------------------------------
3) answer: 1
--------------------------------
4) answer: 3
--------------------------------
5) answer: 0
--------------------------------
6) answer: 3
--------------------------------
7) answer: 3
--------------------------------
8) answer: 2
--------------------------------
9) answer: 3
--------------------------------
10) answer: 0
--------------------------------
11) answer: 2
--------------------------------
12) answer: 0
--------------------------------
13) answer: 2
--------------------------------
14) answer: 0
--------------------------------
15) answer: 0
--------------------------------
16) answer: 1
--------------------------------
17) answer: 0
--------------------------------
18) answer: 0
--------------------------------
19) answer: 1
--------------------------------
20) answer: 2
--------------------------------
21) answer: 1
---------

In [18]:
scores

{'eval/tp': 0.648397976391231,
 'eval/tp_abs': 769,
 'eval/fn': 0.05733558178752108,
 'eval/fn_abs': 68,
 'eval/fp': 0.01433389544688027,
 'eval/fp_abs': 17,
 'eval/replace': 0.07588532883642496,
 'eval/replace_abs': 90,
 'eval/insert': 0.12141652613827993,
 'eval/insert_abs': 144,
 'eval/delete': 0.06323777403035413,
 'eval/delete_abs': 75,
 'eval/complex': 0.0193929173693086,
 'eval/complex_abs': 23,
 'eval/edit_distance_rel': 0.8884194921216337,
 'eval/exact_match': 0.648397976391231,
 'eval/gleu': 0.9693054237177179}

## Ask GPT whether the first generation is correct, and offer the other options only if it answers no

In [19]:
from transformers import Text2TextGenerationPipeline
from evaluation import relative_edit_distance, error_analysis, get_scores
import re

def generate_best_fit2(samples, outputs):
    """Two-step selection: confirm the top candidate, otherwise ask for an alternative.

    If the top candidate is identical to the input sentence it is accepted
    without an API call. Otherwise ChatGPT is asked whether the top candidate
    is correct ('Ja'/'Nein'). Only on 'Nein' is it shown the remaining
    candidates (numbered from 2) and asked to pick one. Ambiguous, missing or
    out-of-range replies fall back to the top candidate.

    Args:
        samples: Input sentences.
        outputs: For each sample, a sequence of dicts with a
            ``'generated_text'`` entry, best candidate first.

    Returns:
        A list with the selected candidate text for every sample.
    """
    predictions = []
    for i, sample in enumerate(samples):
        generations = [entry['generated_text'] for entry in outputs[i]]

        # The top candidate is the answer unless ChatGPT rejects it below.
        index = 0

        if generations[0] != sample:
            prompt = generate_prompt_true_false(sample, generations[0])
            answer = get_openai_response_chatgpt([("user", prompt)])
            extracted_answer = re.findall(r'Ja|Nein', answer)

            if len(extracted_answer) > 1:
                print(f'more answers than expected: {answer}. Defaulting to 0')
            elif len(extracted_answer) == 0:
                print(f'No answer provided: {answer}. Defaulting to 0')
            elif extracted_answer[0] == 'Nein':
                prompt2 = generate_prompt_other_options(generations[1:])
                # Replay the first exchange so the follow-up question has context.
                messages = [("user", prompt), ("assistant", answer), ("user", prompt2)]
                answer2 = get_openai_response_chatgpt(messages)

                numbers = re.findall(r'\d+', answer2)
                if len(numbers) > 1:
                    print(f'more numbers than expected {numbers}')
                if len(numbers) == 0:
                    print(f'no numbers found')
                else:
                    index = int(numbers[0]) - 1
                    # Validate against the real number of candidates. A reply
                    # of "0" would otherwise become -1 and silently pick the
                    # last candidate, and a too-large number would raise
                    # IndexError.
                    if not 0 <= index < len(generations):
                        print(f'Index is out of bounds. Something went wrong with the API Answer. Defaulting to 0')
                        index = 0

        print(f'{i}) answer: {index}')
        print('--------------------------------')
        predictions.append(generations[index])

    return predictions

In [20]:
# Two-step selection (confirm the top candidate, else choose an alternative), scored against `resolutions`.
# Reuses `samples`, `resolutions` and `outputs` from the top-k generation cell.
predictions = generate_best_fit2(samples, outputs)

errors = error_analysis(predictions, resolutions, samples)
scores = get_scores(errors, "eval")

0) answer: 0
--------------------------------
1) answer: 0
--------------------------------
2) answer: 0
--------------------------------
3) answer: 0
--------------------------------
4) answer: 0
--------------------------------
5) answer: 0
--------------------------------
6) answer: 0
--------------------------------
7) answer: 0
--------------------------------
8) answer: 0
--------------------------------
9) answer: 0
--------------------------------
10) answer: 0
--------------------------------
11) answer: 2
--------------------------------
12) answer: 0
--------------------------------
13) answer: 1
--------------------------------
14) answer: 0
--------------------------------
15) answer: 0
--------------------------------
16) answer: 0
--------------------------------
17) answer: 0
--------------------------------
18) answer: 0
--------------------------------
19) answer: 0
--------------------------------
20) answer: 0
--------------------------------
21) answer: 0
---------

APIError: HTTP code 502 from API (<!DOCTYPE html>
<!--[if lt IE 7]> <html class="no-js ie6 oldie" lang="en-US"> <![endif]-->
<!--[if IE 7]>    <html class="no-js ie7 oldie" lang="en-US"> <![endif]-->
<!--[if IE 8]>    <html class="no-js ie8 oldie" lang="en-US"> <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js" lang="en-US"> <!--<![endif]-->
<head>


<title>api.openai.com | 502: Bad gateway</title>
<meta charset="UTF-8" />
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<meta http-equiv="X-UA-Compatible" content="IE=Edge" />
<meta name="robots" content="noindex, nofollow" />
<meta name="viewport" content="width=device-width,initial-scale=1" />
<link rel="stylesheet" id="cf_styles-css" href="/cdn-cgi/styles/main.css" />


</head>
<body>
<div id="cf-wrapper">

    

    <div id="cf-error-details" class="p-0">
        <header class="mx-auto pt-10 lg:pt-6 lg:px-8 w-240 lg:w-full mb-8">
            <h1 class="inline-block sm:block sm:mb-2 font-light text-60 lg:text-4xl text-black-dark leading-tight mr-2">
              
              <span class="inline-block">Bad gateway</span>
              <span class="code-label">Error code 502</span>
            </h1>
            <div>
               Visit <a href="https://www.cloudflare.com/5xx-error-landing?utm_source=errorcode_502&utm_campaign=api.openai.com" target="_blank" rel="noopener noreferrer">cloudflare.com</a> for more information.
            </div>
            <div class="mt-3">2023-04-03 12:01:02 UTC</div>
        </header>
        
        <div class="my-8 bg-gradient-gray">
            <div class="w-240 lg:w-full mx-auto">
                <div class="clearfix md:px-8">
                  
<div id="cf-browser-status" class=" relative w-1/3 md:w-full py-15 md:p-0 md:py-8 md:text-left md:border-solid md:border-0 md:border-b md:border-gray-400 overflow-hidden float-left md:float-none text-center">
  <div class="relative mb-10 md:m-0">
    
    <span class="cf-icon-browser block md:hidden h-20 bg-center bg-no-repeat"></span>
    <span class="cf-icon-ok w-12 h-12 absolute left-1/2 md:left-auto md:right-0 md:top-0 -ml-6 -bottom-4"></span>
    
  </div>
  <span class="md:block w-full truncate">You</span>
  <h3 class="md:inline-block mt-3 md:mt-0 text-2xl text-gray-600 font-light leading-1.3">
    
    Browser
    
  </h3>
  <span class="leading-1.3 text-2xl text-green-success">Working</span>
</div>

<div id="cf-cloudflare-status" class=" relative w-1/3 md:w-full py-15 md:p-0 md:py-8 md:text-left md:border-solid md:border-0 md:border-b md:border-gray-400 overflow-hidden float-left md:float-none text-center">
  <div class="relative mb-10 md:m-0">
    <a href="https://www.cloudflare.com/5xx-error-landing?utm_source=errorcode_502&utm_campaign=api.openai.com" target="_blank" rel="noopener noreferrer">
    <span class="cf-icon-cloud block md:hidden h-20 bg-center bg-no-repeat"></span>
    <span class="cf-icon-ok w-12 h-12 absolute left-1/2 md:left-auto md:right-0 md:top-0 -ml-6 -bottom-4"></span>
    </a>
  </div>
  <span class="md:block w-full truncate">Berlin</span>
  <h3 class="md:inline-block mt-3 md:mt-0 text-2xl text-gray-600 font-light leading-1.3">
    <a href="https://www.cloudflare.com/5xx-error-landing?utm_source=errorcode_502&utm_campaign=api.openai.com" target="_blank" rel="noopener noreferrer">
    Cloudflare
    </a>
  </h3>
  <span class="leading-1.3 text-2xl text-green-success">Working</span>
</div>

<div id="cf-host-status" class="cf-error-source relative w-1/3 md:w-full py-15 md:p-0 md:py-8 md:text-left md:border-solid md:border-0 md:border-b md:border-gray-400 overflow-hidden float-left md:float-none text-center">
  <div class="relative mb-10 md:m-0">
    
    <span class="cf-icon-server block md:hidden h-20 bg-center bg-no-repeat"></span>
    <span class="cf-icon-error w-12 h-12 absolute left-1/2 md:left-auto md:right-0 md:top-0 -ml-6 -bottom-4"></span>
    
  </div>
  <span class="md:block w-full truncate">api.openai.com</span>
  <h3 class="md:inline-block mt-3 md:mt-0 text-2xl text-gray-600 font-light leading-1.3">
    
    Host
    
  </h3>
  <span class="leading-1.3 text-2xl text-red-error">Error</span>
</div>

                </div>
              
            </div>
        </div>

        <div class="w-240 lg:w-full mx-auto mb-8 lg:px-8">
            <div class="clearfix">
                <div class="w-1/2 md:w-full float-left pr-6 md:pb-10 md:pr-0 leading-relaxed">
                    <h2 class="text-3xl font-normal leading-1.3 mb-4">What happened?</h2>
                    <p>The web server reported a bad gateway error.</p>
                </div>
              
                <div class="w-1/2 md:w-full float-left leading-relaxed">
                    <h2 class="text-3xl font-normal leading-1.3 mb-4">What can I do?</h2>
                    <p class="mb-6">Please try again in a few minutes.</p>
                </div>
            </div>
              
        </div>

        <div class="cf-error-footer cf-wrapper w-240 lg:w-full py-10 sm:py-4 sm:px-8 mx-auto text-center sm:text-left border-solid border-0 border-t border-gray-300">
  <p class="text-13">
    <span class="cf-footer-item sm:block sm:mb-1">Cloudflare Ray ID: <strong class="font-semibold">7b2120020a7558de</strong></span>
    <span class="cf-footer-separator sm:hidden">&bull;</span>
    <span id="cf-footer-item-ip" class="cf-footer-item hidden sm:block sm:mb-1">
      Your IP:
      <button type="button" id="cf-footer-ip-reveal" class="cf-footer-ip-reveal-btn">Click to reveal</button>
      <span class="hidden" id="cf-footer-ip">89.14.36.143</span>
      <span class="cf-footer-separator sm:hidden">&bull;</span>
    </span>
    <span class="cf-footer-item sm:block sm:mb-1"><span>Performance &amp; security by</span> <a rel="noopener noreferrer" href="https://www.cloudflare.com/5xx-error-landing?utm_source=errorcode_502&utm_campaign=api.openai.com" id="brand_link" target="_blank">Cloudflare</a></span>
    
  </p>
  <script>(function(){function d(){var b=a.getElementById("cf-footer-item-ip"),c=a.getElementById("cf-footer-ip-reveal");b&&"classList"in b&&(b.classList.remove("hidden"),c.addEventListener("click",function(){c.classList.add("hidden");a.getElementById("cf-footer-ip").classList.remove("hidden")}))}var a=document;document.addEventListener&&a.addEventListener("DOMContentLoaded",d)})();</script>
</div><!-- /.error-footer -->


    </div>
</div>
</body>
</html>

)

In [None]:
scores