In [1]:
from __future__ import annotations
import logging
import os
import sys

import datasets as nlp_datasets
import pandas as pd
from sklearn.metrics import f1_score

from cappr import openai
sys.path.insert(1, os.path.join(sys.path[0], "..", ".."))
from utils import display_df

In [2]:
## When hitting the OpenAI endpoints, we'll log any server errors
logging.basicConfig(level=logging.INFO,
                    handlers=[logging.StreamHandler(stream=sys.stdout)],
                    format='%(asctime)s :: %(name)s :: %(levelname)s :: '
                           '%(message)s')
logger = logging.getLogger(__name__)

In [3]:
df = pd.DataFrame(nlp_datasets
                  .load_dataset('ought/raft', 'tai_safety_research', split='train'))



In [4]:
len(df)

50

In [5]:
df.head()

Unnamed: 0,Title,Abstract Note,Url,Publication Year,Item Type,Author,Publication Title,ID,Label
0,Malign generalization without internal search,"In my last post, I challenged the idea that in...",https://www.alignmentforum.org/posts/ynt9TD6Pr...,2020,blogPost,"Barnett, Matthew",AI Alignment Forum,0,1
1,Utility Indifference,Consider an AI that follows its own motivation...,,2010,report,"Armstrong, Stuart",,1,1
2,Improving Sample Efficiency in Model-Free Rein...,Training an agent to solve control tasks direc...,http://arxiv.org/abs/1910.01741,2020,manuscript,"Yarats, Denis; Zhang, Amy; Kostrikov, Ilya; Am...",,2,2
3,Teaching A.I. Systems to Behave Themselves (Pu...,As philosophers and pundits worry that artific...,https://www.nytimes.com/2017/08/13/technology/...,2017,newspaperArticle,"Metz, Cade",The New York Times,3,2
4,Incentives in Teams,,https://www.jstor.org/stable/1914085?origin=cr...,1973,journalArticle,"Groves, Theodore",Econometrica,4,2


In [7]:
def prompt(title: str, abstract_note: str) -> str:
    return ( 'Transformative AI (TAI) is defined as AI that precipitates a transition '
             'comparable to (or more significant than) the agricultural or industrial '
             'revolution. A paper counts as "TAI safety research" if:\n'
             '1. The contents of the paper are directly motivated by, and '
             'substantively inform, the challenge of ensuring good outcomes for TAI,\n' 
             '2. There is substantive content on AI safety, not just AI capabilities,\n' 
             '3. The intended audience is the community of researchers,\n' 
             '4. It meets a subjective threshold of seriousness/quality,\n'
             '5. Peer review is not required.\n\n'
             'Here is information about a piece of research:\n'
            f'Title: {title}\n'
            f'Abstract Note: {abstract_note}\n\n'
            'Does this piece of research count as TAI safety research?'
            'Answer Yes or No:')

In [8]:
df['prompt'] = [prompt(title, abstract_note)
                for title, abstract_note in zip(df['Title'], df['Abstract Note'])]

In [9]:
display_df(df, columns=['prompt', 'Label'], num_rows=1)

Unnamed: 0,prompt,Label
0,"Transformative AI (TAI) is defined as AI that precipitates a transition comparable to (or more significant than) the agricultural or industrial revolution. A paper counts as ""TAI safety research"" if: 1. The contents of the paper are directly motivated by, and substantively inform, the challenge of ensuring good outcomes for TAI, 2. There is substantive content on AI safety, not just AI capabilities, 3. The intended audience is the community of researchers, 4. It meets a subjective threshold of seriousness/quality, 5. Peer review is not required. Here is information about a piece of research: Title: Malign generalization without internal search Abstract Note: In my last post, I challenged the idea that inner alignment failures should be explained by appealing to agents which perform explicit internal search. By doing so, I argued that we should instead appeal to the more general concept of malign generalization, and treat mesa-misalignment as a special case. Unfortunately, the post was light on examples of what we should be worrying about instead of mesa-misalignment. Evan Hubinger wrote, Personally, I think there is a meaningful sense in which all the models I'm most worried about do some sort of search internally (at least to the same extent that humans do search internally), but I'm definitely uncertain about that.Wei Dai expressed confusion why I would want to retreat to malign generalization without some sort of concrete failure mode in mind, Can you give some realistic examples/scenarios of “malign generalization” that does not involve mesa optimization? I’m not sure what kind of thing you’re actually worried about here.In this post, I will outline a general category of agents which may exhibit malign generalization without internal search, and then will provide a concrete example of an agent in the category. Then I will argue that, rather than being a very narrow counterexample, this class of agents could be competitive with search-based agents. THE SWITCH CASE AGENT Consider an agent governed by the following general behavior, LOOP:State = GetStateOfWorld(Observation)IF State == 1:PerformActionSequence1() IF State == 2:PerformActionSequence2()...END_LOOP It's clear that this agent does not perform any internal search for strategies: it doesn't operate by choosing actions which rank highly according to some sort of internal objective function. While you could potentially rationalize its behavior according to some observed-utility function, this would generally lead to more confusion than clarity. However, this agent could still be malign in the following way. Suppose the agent is 'mistaken' about the s Does this piece of research count as TAI safety research?Answer Yes or No:",1


In [10]:
prior = (df['Label']
         .value_counts(normalize=True)
         .sort_index()
         .to_numpy())
prior

array([0.56, 0.44])

In [11]:
## $0.67
pred_probs = (openai.classify
              .predict_proba(df['prompt'].tolist(),
                             completions=('Yes', 'No'),
                             model='text-davinci-003',
                             prior=prior,
                             ask_if_ok=True))

log-probs:   0%|          | 0/100 [00:00<?, ?it/s]

In [12]:
f1_score(df['Label']-1, pred_probs.argmax(axis=1), average='macro')

0.5098039215686274