In [1]:
from __future__ import annotations
import logging
import os
import sys

import datasets as nlp_datasets
import pandas as pd
from sklearn.metrics import f1_score

from cappr import openai
sys.path.insert(1, os.path.join(sys.path[0], "..", ".."))
from utils import display_df

In [2]:
## When hitting the OpenAI endpoints, we'll log any server errors
# Route INFO-and-above records from the root logger to stdout so they show up
# in the cell output.
logging.basicConfig(
    format="%(asctime)s :: %(name)s :: %(levelname)s :: %(message)s",
    handlers=[logging.StreamHandler(stream=sys.stdout)],
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

In [3]:
# Fetch the train split of the `ade_corpus_v2` config of `ought/raft`, then
# materialize it as a DataFrame.
raft_train = nlp_datasets.load_dataset('ought/raft', 'ade_corpus_v2', split='train')
df = pd.DataFrame(raft_train)



In [4]:
# Number of examples in the loaded train split
len(df)

50

In [5]:
# Peek at the first few rows
df.head()

Unnamed: 0,Sentence,ID,Label
0,No regional side effects were noted.,0,2
1,We describe the case of a 10-year-old girl wit...,1,2
2,The INR should be monitored more frequently wh...,2,2
3,"After the first oral dose of propranolol, sync...",3,1
4,As termination was not an option for the famil...,4,2


In [6]:
def prompt(text: str) -> str:
    """Wrap one case-report sentence in the Yes/No classification prompt.

    Args:
        text: the sentence to classify; it is inserted verbatim between
            double quotes on the first line of the prompt.

    Returns:
        A three-line prompt: the quoted sentence, the question, and the
        answer cue. There is no trailing newline or space after the cue.
    """
    lines = (
        f'The following sentence was taken from a medical case report: "{text}"',
        'Does the sentence describe an adverse effect of a pharmaceutical drug or '
        'substance?',
        'Answer Yes or No:',
    )
    return '\n'.join(lines)

In [7]:
# Build one prompt per sentence, row for row
df['prompt'] = df['Sentence'].map(prompt)

In [8]:
# Show each prompt next to its label (display_df is the helper imported from utils)
display_df(df, columns=['prompt', 'Label'])

Unnamed: 0,prompt,Label
0,"The following sentence was taken from a medical case report: ""No regional side effects were noted."" Does the sentence describe an adverse effect of a pharmaceutical drug or substance? Answer Yes or No:",2
1,"The following sentence was taken from a medical case report: ""We describe the case of a 10-year-old girl with two epileptic seizures and subcontinuous spike-waves during sleep, who presented unusual side-effects related to clobazam (CLB) monotherapy."" Does the sentence describe an adverse effect of a pharmaceutical drug or substance? Answer Yes or No:",2
2,"The following sentence was taken from a medical case report: ""The INR should be monitored more frequently when bosentan is initiated, adjusted, or discontinued in patients taking warfarin."" Does the sentence describe an adverse effect of a pharmaceutical drug or substance? Answer Yes or No:",2


In [9]:
# Share of each label in the data, ordered by ascending label value so that
# position 0 belongs to the smallest label.
label_shares = df['Label'].value_counts(normalize=True)
prior = label_shares.sort_index().to_numpy()
prior

array([0.3, 0.7])

In [10]:
## $0.13
# NOTE(review): the figure above is presumably the approximate API cost of
# running this cell — confirm before re-running.
# Assumes `prior` lines up positionally with `completions`, i.e. the smaller
# label pairs with 'Yes' and the larger with 'No' — consistent with how the
# labels are shifted when scoring.
pred_probs = openai.classify.predict_proba(
    df['prompt'].tolist(),
    completions=('Yes', 'No'),
    model='text-davinci-003',
    prior=prior,
    ask_if_ok=True,
)

log-probs:   0%|          | 0/100 [00:00<?, ?it/s]

In [11]:
# The labels shown above are 1/2, while argmax over the two completion columns
# yields 0/1, so shift the labels down by one before comparing.
y_true = df['Label'] - 1
y_pred = pred_probs.argmax(axis=1)
f1_score(y_true, y_pred, average='macro')

0.8869289914066034