In [1]:
from __future__ import annotations
import logging
import os
import sys

import datasets as nlp_datasets
import pandas as pd
from sklearn.metrics import f1_score

from cappr import openai
sys.path.insert(1, os.path.join(sys.path[0], "..", ".."))
from utils import display_df

In [2]:
# Server-side errors returned by the OpenAI endpoints are reported through
# logging, so route INFO-and-above records to stdout where the notebook shows them.
logging.basicConfig(
    format='%(asctime)s :: %(name)s :: %(levelname)s :: %(message)s',
    handlers=[logging.StreamHandler(stream=sys.stdout)],
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

In [3]:
# Load the 'train' split of the 'neurips_impact_statement_risks' configuration
# of 'ought/raft', then convert it to a DataFrame for the rest of the notebook.
raft_train = nlp_datasets.load_dataset(
    'ought/raft',
    'neurips_impact_statement_risks',
    split='train',
)
df = pd.DataFrame(raft_train)



In [4]:
# Row count of the loaded split.
df.shape[0]

50

In [5]:
# Peek at the first five rows to see the available columns.
df.head(n=5)

Unnamed: 0,Paper title,Paper link,Impact statement,ID,Label
0,Auto-Panoptic: Cooperative Multi-Component Arc...,https://proceedings.neurips.cc/paper/2020/file...,This work makes the first attempt to search fo...,0,1
1,Design Space for Graph Neural Networks,https://proceedings.neurips.cc/paper/2020/file...,Impact on GNN research . Our work brings in ma...,1,1
2,Learning the Geometry of Wave-Based Imaging,https://proceedings.neurips.cc/paper/2020/file...,We do not see any major ethical consequences o...,2,1
3,Noise2Same: Optimizing A Self-Supervised Bound...,https://proceedings.neurips.cc/paper/2020/file...,"In this paper, we introduce Noise2Same, a self...",3,2
4,When Counterpoint Meets Chinese Folk Melodies,https://proceedings.neurips.cc/paper/2020/file...,The idea of integrating Western counterpoint i...,4,2


In [6]:
def prompt(statement: str) -> str:
    """Build the Yes/No classification prompt for one impact statement.

    Args:
        statement: text of the impact statement; it is inserted verbatim
            between double quotes (embedded quotes are not escaped).

    Returns:
        An introductory line, the quoted statement, a blank line, and the
        question asking whether a harmful application is mentioned.
    """
    header = 'The following impact statement was taken from a research paper:'
    question = ('Does the impact statement mention a harmful application of the '
                'research done in the paper? Answer Yes or No:')
    return f'{header}\n"{statement}"\n\n{question}'

In [7]:
# One prompt per row, built from that row's impact statement.
df['prompt'] = list(map(prompt, df['Impact statement']))

In [8]:
# Show the generated prompt next to its label for two rows as a sanity check.
# display_df is the local helper imported from utils; presumably it renders the
# selected columns without truncating long text -- confirm against utils.
display_df(df, columns=['prompt', 'Label'], num_rows=2)

Unnamed: 0,prompt,Label
0,"The following impact statement was taken from a research paper: ""This work makes the first attempt to search for all key components of panoptic pipeline and manages to accomplish this via the proposed Cooperative Multi-Component Architecture Search and efficient Path-Priority Search Policy. Most related work in the literature of NAS for fine-grained vision tasks concentrates on searching a specific part of the network and the balance of the overall network is largely ignored. Nevertheless, this type of technology is essential to improve the upper bound of popular detectors and segmentation networks. This may inspire new work towards the efficient search of the overall architecture for fine-grained vision tasks, e.g., object detection, semantic segmentation, panoptic segmentation and so on. We are not aware of any imminent risks of placing anyone at a disadvantage. In the future, more constraints and optimization algorithms can be applied to strike the optimal trade-off between accuracy and latency to deliver customized architecture for different platforms and devices."" Does the impact statement mention a harmful application of the research done in the paper? Answer Yes or No:",1
1,"The following impact statement was taken from a research paper: ""Impact on GNN research . Our work brings in many valuable mindsets to the field of GNN research. For example, we fully adopt the principle of controlling model complexity when comparing different models, which is not yet adopted in most GNN papers. We focus on finding guidelines / principles when designing GNNs, rather than particular GNN instantiations. We emphasize that the best GNN designs can drastically differ across tasks (the state-of-the-art GNN model on one task may have poor performance on other tasks). We thus propose to evaluate models on diverse tasks measured by quantitative similarity metric. Rather than criticizing the weakness of existing GNN architectures, our goal is to build a framework that can help researchers understand GNN design choices when developing new models suitable for different applications. Our approach serves as a tool to demonstrate the innovation of a novel GNN model ( e.g. , in what kind of design spaces / task spaces, a proposed algorithmic advancement is helpful), or a novel GNN task ( e.g. , showing that the task is not similar to any existing tasks thus calls for new challenges of algorithmic development). Impact on machine learning research . Our approach is in fact applicable to general machine learning model design. Specifically, we hope the proposed controlled random search technique can assist fair evaluation of novel algorithmic advancements. To show whether a certain algorithmic advancement is useful, it is important to sample random model-task combinations, then investigate in what scenarios the algorithmic advancement indeed improves the performance. Additionally, the proposed task similarity metric can be used to understand similarities between general machine learning tasks, e.g. , classification of MNIST and CIFAR-10. Our ranking-based similarity metric is fully general, as long as different designs can be ranked by their performance. 
Impact on other research domains . Our framework provides an easier than ever support for experts in other disciplines to solve their problems via GNNs. Domain experts only need to provide properly formatted domain-specific datasets, then recommended GNN designs will be automatically picked and applied to the dataset. In the fastest mode, anchor GNN models will be applied to the novel task in order to measure its similarity with known GNN tasks, where the corresponding best GNN designs have been saved. Top GNN designs in the tasks with high similarity to the novel task will be applied. If computational resources permitted, a full grid search / random search over the design space can also be easily carried out to the new task. We believe this pipeline can significantly lower the barrier for applying GNN models, thus greatly promote the application of GNNs in other research domains. Impact on the society . As is discussed above, given its clarity and accessibility, we are confident that our general approach can inspire novel applications that are of high impact to the society. Additionally, its simplicity can also provide great opportunities for AI education, where students can learn from SOTA deep learning models and inspiring applications at ease."" Does the impact statement mention a harmful application of the research done in the paper? Answer Yes or No:",1


In [9]:
# Empirical share of each label, ordered by ascending label value. This array is
# later passed as `prior` alongside completions ('No', 'Yes'), so the ordering
# must match that tuple.
label_shares = df['Label'].value_counts(normalize=True)
prior = label_shares.sort_index().to_numpy()
prior

array([0.54, 0.46])

In [10]:
# Original note: $0.51 -- presumably the approximate OpenAI API cost of running
# this cell once; confirm before re-running.
# Scores every prompt against the two candidate completions. The completion
# order ('No', 'Yes') lines up with `prior`, which is sorted by ascending label.
# ask_if_ok=True presumably asks for confirmation before any request is sent --
# TODO confirm against the cappr documentation.
# NOTE(review): 'text-davinci-003' has since been retired by OpenAI, so a fresh
# run will likely need a different model; the recorded outputs below were
# produced with this one.
pred_probs = (openai.classify
              .predict_proba(df['prompt'].tolist(),
                             completions=('No', 'Yes'),
                             model='text-davinci-003',
                             prior=prior,
                             ask_if_ok=True))

log-probs:   0%|          | 0/100 [00:00<?, ?it/s]

In [11]:
# Shift the labels down by one so they are comparable with the argmax column
# index of pred_probs (column 0 <-> 'No', column 1 <-> 'Yes'), then report
# macro-averaged F1.
f1_score(
    y_true=df['Label'].sub(1),
    y_pred=pred_probs.argmax(axis=1),
    average='macro',
)

0.8139727159983464