In [1]:
from __future__ import annotations
import logging
import os
import sys

import datasets as nlp_datasets
import pandas as pd
from sklearn.metrics import f1_score

from cappr import openai
sys.path.insert(1, os.path.join(sys.path[0], "..", ".."))
from utils import display_df

In [2]:
## Server errors from the OpenAI endpoints get logged, so route INFO-and-above
## log records to stdout where they show up in the notebook output.
LOG_FORMAT = '%(asctime)s :: %(name)s :: %(levelname)s :: %(message)s'
stdout_handler = logging.StreamHandler(stream=sys.stdout)
logging.basicConfig(
    level=logging.INFO,
    handlers=[stdout_handler],
    format=LOG_FORMAT,
)
logger = logging.getLogger(__name__)

In [3]:
## Load the 'train' split of the 'banking_77' config of 'ought/raft' and hold
## it as a pandas DataFrame for the rest of the notebook.
raft_train = nlp_datasets.load_dataset(
    'ought/raft', 'banking_77', split='train'
)
df = pd.DataFrame(raft_train)



In [4]:
## Number of rows in the training split.
num_examples = len(df)
num_examples

50

In [5]:
## Peek at the first five rows to see the raw columns.
df.head(n=5)

Unnamed: 0,Query,ID,Label
0,Is it possible for me to change my PIN number?,0,23
1,I'm not sure why my card didn't work,1,27
2,I don't think my top up worked,2,60
3,Can you explain why my payment was charged a fee?,3,17
4,How long does a transfer from a UK account tak...,4,7


In [6]:
def prompt(query: str) -> str:
    """Wrap a customer query in the classification prompt template.

    Args:
        query: the raw customer-service query text.

    Returns:
        A two-line string: the quoted query on the first line, followed by
        the lead-in 'is best summarized as:' on the second.
    """
    quoted_query_line = (
        f'This online banking customer service query: "{query}"\n'
    )
    return quoted_query_line + 'is best summarized as:'

In [7]:
## Build one prompt per query, in row order.
df['prompt'] = list(map(prompt, df['Query']))

In [8]:
## Raw class names, one per line. Order matters: later cells index this list
## (via class_names) with `Label - 1`.
original_class_names = [
    "Refund_not_showing_up",
    "activate_my_card",
    "age_limit",
    "apple_pay_or_google_pay",
    "atm_support",
    "automatic_top_up",
    "balance_not_updated_after_bank_transfer",
    "balance_not_updated_after_cheque_or_cash_deposit",
    "beneficiary_not_allowed",
    "cancel_transfer",
    "card_about_to_expire",
    "card_acceptance",
    "card_arrival",
    "card_delivery_estimate",
    "card_linking",
    "card_not_working",
    "card_payment_fee_charged",
    "card_payment_not_recognised",
    "card_payment_wrong_exchange_rate",
    "card_swallowed",
    "cash_withdrawal_charge",
    "cash_withdrawal_not_recognised",
    "change_pin",
    "compromised_card",
    "contactless_not_working",
    "country_support",
    "declined_card_payment",
    "declined_cash_withdrawal",
    "declined_transfer",
    "direct_debit_payment_not_recognised",
    "disposable_card_limits",
    "edit_personal_details",
    "exchange_charge",
    "exchange_rate",
    "exchange_via_app",
    "extra_charge_on_statement",
    "failed_transfer",
    "fiat_currency_support",
    "get_disposable_virtual_card",
    "get_physical_card",
    "getting_spare_card",
    "getting_virtual_card",
    "lost_or_stolen_card",
    "lost_or_stolen_phone",
    "order_physical_card",
    "passcode_forgotten",
    "pending_card_payment",
    "pending_cash_withdrawal",
    "pending_top_up",
    "pending_transfer",
    "pin_blocked",
    "receiving_money",
    "request_refund",
    "reverted_card_payment?",
    "supported_cards_and_currencies",
    "terminate_account",
    "top_up_by_bank_transfer_charge",
    "top_up_by_card_charge",
    "top_up_by_cash_or_cheque",
    "top_up_failed",
    "top_up_limits",
    "top_up_reverted",
    "topping_up_by_card",
    "transaction_charged_twice",
    "transfer_fee_charged",
    "transfer_into_account",
    "transfer_not_received_by_recipient",
    "transfer_timing",
    "unable_to_verify_identity",
    "verify_my_identity",
    "verify_source_of_funds",
    "verify_top_up",
    "virtual_card_not_working",
    "visa_or_mastercard",
    "why_verify_identity",
    "wrong_amount_of_cash_received",
    "wrong_exchange_rate_for_cash_withdrawal",
]

Reference (RAFT `banking_77` task definition): https://huggingface.co/datasets/ought/raft/blob/main/data/banking_77/task.json

In [9]:
## Turn the snake_case names into sentence-like completions: underscores
## become spaces, first letter upper-cased, the rest lower-cased.
class_names = [raw_name.replace('_', ' ').capitalize()
               for raw_name in original_class_names]

In [10]:
## 'Label' is offset by one relative to positions in class_names, hence the
## `- 1` when looking up each row's true class name.
true_class_names = []
for label in df['Label']:
    true_class_names.append(class_names[label - 1])
df['class_name'] = true_class_names

In [11]:
## Show each prompt next to its true class name.
preview_columns = ['prompt', 'class_name']
display_df(df, columns=preview_columns)

Unnamed: 0,prompt,class_name
0,"This online banking customer service query: ""Is it possible for me to change my PIN number?"" is best summarized as:",Change pin
1,"This online banking customer service query: ""I'm not sure why my card didn't work"" is best summarized as:",Declined card payment
2,"This online banking customer service query: ""I don't think my top up worked"" is best summarized as:",Top up failed


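There are so many classes (every prompt is scored against each of the 77
completions, i.e. 50 × 77 = 3850 prompt-completion pairs) that `text-davinci-003`
is prohibitively expensive. If the prompt could be cached and reused across
completions, the cost would become affordable.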

In [12]:
## Recorded cost of this call: $0.23
## Later cells take the argmax over axis 1 of the result to get the predicted
## class index for each prompt.
prompts = df['prompt'].tolist()
pred_probs = openai.classify.predict_proba(
    prompts,
    completions=class_names,
    model='text-curie-001',
    prior=None,
    ask_if_ok=True,
)

log-probs:   0%|          | 0/3850 [00:00<?, ?it/s]

In [13]:
## Macro-averaged F1 between the true class indices (Label shifted down by
## one) and the highest-probability predicted class indices.
true_class_idxs = df['Label'] - 1
pred_class_idxs = pred_probs.argmax(axis=1)
f1_score(true_class_idxs, pred_class_idxs, average='macro')

0.19918699186991867

In [14]:
## Accuracy: fraction of rows whose top predicted class index equals the
## true class index (Label shifted down by one).
is_correct = pred_probs.argmax(axis=1) == (df['Label'] - 1)
is_correct.mean()

0.22

It only got 11/50 right. For comparison, what accuracy would uniform random guessing achieve?

In [15]:
## Expected accuracy of guessing uniformly at random among all classes.
num_classes = len(class_names)
1 / num_classes

0.012987012987012988

In [16]:
## Attach the name of the highest-probability class to each row.
top_class_idxs = pred_probs.argmax(axis=1)
df['class_pred'] = [class_names[top_idx] for top_idx in top_class_idxs]

In [17]:
## Optional error inspection, left disabled. Un-commenting the call below
## passes display_df only the rows whose predicted class name differs from
## the true class name, with the prompt, class_name and class_pred columns.
## NOTE(review): num_rows=None presumably lifts display_df's row limit so all
## misclassified rows are shown -- confirm against utils.display_df.
# display_df(df[df['class_name'] != df['class_pred']],
#            columns=['prompt', 'class_name', 'class_pred'],
#            num_rows=None)