In [None]:
import copy
import os
import time

import numpy as np
import openai
import pandas as pd
import sklearn.datasets as datasets
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm

# Read the key from the OPENAI_API_KEY environment variable so that it never
# has to be saved inside the notebook; the literal is only a fallback placeholder.
openai.api_key = os.environ.get("OPENAI_API_KEY", "xxx") ### Or enter your openai key here

In [None]:
# Fetch OpenML dataset 1511 and work on its DataFrame with incomplete rows removed.
dataset = datasets.fetch_openml(data_id=1511)
df = dataset['frame'].dropna()
df.head()

In [None]:
# Feature matrix: every column except 'Channel' and 'Region', min-max scaled.
# NOTE(review): the scaler is fit on all rows of df, i.e. before test_idx is
# applied, so the test rows contribute to the min/max statistics.
scaler = MinMaxScaler()
x = scaler.fit_transform(df.drop(columns=['Channel', 'Region']).to_numpy())

# Target column as a plain array.
y = df['Channel'].to_numpy()

# Index arrays stored on disk; they are used below to select train / test rows.
train_idx = np.load('data/train_idx.npy')
test_idx = np.load('data/test_idx.npy')
print(len(train_idx), len(test_idx))

In [None]:
# Unscaled rows (used to print values into prompts) and scaled features,
# each split with the same train / test index arrays.
df_npy = df.to_numpy()
df_train, df_test = df_npy[train_idx], df_npy[test_idx]
xtrain, xtest = x[train_idx], x[test_idx]

In [None]:
def question(shot_idx, train_rows=None):
    """Build the prompt that asks the model for the most important feature.

    The prompt is a fixed instruction and dataset description, followed by one
    solved ``Question: ... Answer: class<k>`` example per entry of
    ``shot_idx`` (in random order), followed by the final request.

    Parameters
    ----------
    shot_idx : array-like of int
        Positions into ``train_rows`` of the labelled examples to include.
        May be empty, in which case no examples are added.
    train_rows : indexable of rows, optional
        Rows to read values from. Positions 1-6 and 8 of a row are printed as
        the feature values and position 7 as the class number. Defaults to the
        notebook-level ``df_train``.

    Returns
    -------
    str
        The complete prompt, ending with ``"Answer:"``.
    """
    if train_rows is None:
        train_rows = df_train

    text = "Read a given information and questions. Think step by step, and then choose the most important feature to predict whether its value is class1 or class2. You must choose in [annual spending on fresh product, annual spending on milk products, annual spending on grocery products, annual spending on frozen products, annual spending on detergents and paper products, annual spending on delicatessen products, and customer’s region]."
    text += "\nThe dataset consists of 7 input variables: annual spending on fresh product, annual spending on milk products, annual spending on grocery products, annual spending on frozen products, annual spending on detergents and paper products, annual spending on delicatessen products, and customer’s region. The output variable is: Class1 indicates Horeca (Hotel, Restaurant, Cafe) channel, and class2 indicates Retail channel.\n"

    # Accept plain lists as well as arrays, then shuffle the example order.
    shot_idx = np.asarray(shot_idx)
    shot_idx = shot_idx[np.random.permutation(len(shot_idx))]

    for i in shot_idx:
        row = train_rows[i]
        # The trailing space matters: without it the prompt reads "If theannual ...".
        text += "Question: If the "
        text += f"annual spending on fresh product is {row[1]}, "
        text += f"annual spending on milk products is {row[2]}, "
        text += f"annual spending on grocery products is {row[3]}, "
        text += f"annual spending on frozen products is {row[4]}, "
        text += f"annual spending on detergents and paper products is {row[5]}, "
        text += f"annual spending on delicatessen products is {row[6]}, "
        text += f"customer’s region (1 indicates Lisbon, 2 indicates Porto, and 3 indicates Other) is {row[8]}, "
        text += f"then what is the customer’s channel? Choose between [class1, class2]. Class1 indicates Horeca (Hotel, Restaurant, Cafe) channel, and class2 indicates Retail channel. Answer: class{row[7]}\n"

    text += "Choose the most important feature to predict whether its value is class1 or class2. Answer:"
    return text

In [None]:
def use_api(prompt):
    """Send ``prompt`` as a single user message and return the reply text.

    Transient failures are retried indefinitely. Between attempts the function
    sleeps for ``waiting_time`` seconds, which starts at 0.5 and grows by 0.5
    after each failure until it reaches 5.

    Parameters
    ----------
    prompt : str
        Content of the single ``user`` message sent to the chat model.

    Returns
    -------
    str
        ``response["choices"][0]["message"]["content"]`` of the first
        successful response.

    Raises
    ------
    openai.error.AuthenticationError, openai.error.InvalidRequestError
        Re-raised immediately: retrying cannot fix a bad key or a rejected
        request, and the loop would otherwise never terminate.
    """
    waiting_time = 0.5
    response = None
    while response is None:
        try:
            response = openai.ChatCompletion.create(
                model = 'model_name', ### Enter your model name
                messages = [{"role":"user", "content":prompt}]
            )
        except (openai.error.AuthenticationError, openai.error.InvalidRequestError):
            # Permanent failures: surface them instead of retrying forever.
            raise
        except Exception:
            # `Exception` (not a bare except) so that KeyboardInterrupt and
            # SystemExit still stop the loop when the kernel is interrupted.
            time.sleep(waiting_time)
            if waiting_time < 5:
                waiting_time += 0.5
    return response["choices"][0]['message']['content']

In [None]:
def _feature_clauses(row, include_grocery=True):
    """Render one row as the comma-separated feature text shared by every question."""
    text = f"annual spending on fresh product is {row[1]}, "
    text += f"annual spending on milk products is {row[2]}, "
    if include_grocery:
        text += f"annual spending on grocery products is {row[3]}, "
    text += f"annual spending on frozen products is {row[4]}, "
    text += f"annual spending on detergents and paper products is {row[5]}, "
    text += f"annual spending on delicatessen products is {row[6]}, "
    text += f"customer’s region (1 indicates Lisbon, 2 indicates Porto, and 3 indicates Other) is {row[8]}, "
    return text


def ours(test_idx, shot_idx, n_unlabeled=30, train_features=None,
         train_rows=None, test_rows=None):
    """Build the classification prompt for one test row.

    The prompt contains, in order:

    1. the task instruction and dataset description;
    2. "unlabelled" examples: for every shot, the ``n_unlabeled // len(shot_idx)``
       non-shot training rows closest to it (squared Euclidean distance in
       ``train_features``), each phrased as a question whose answer is the
       row's grocery spending value (position 3) rather than its class;
    3. the labelled shots, each answered with ``class<row[7]>``;
    4. the test row's question, left open after ``"Answer:"``.

    Shots and unlabelled examples are each shuffled with ``np.random``.

    Parameters
    ----------
    test_idx : int
        Position of the query row in ``test_rows``.
    shot_idx : array-like of int
        Positions of the labelled examples in the training data. May be empty,
        in which case sections 2 and 3 are omitted.
    n_unlabeled : int, optional
        Total budget of unlabelled examples, divided evenly between the shots.
    train_features : ndarray, optional
        Feature matrix used for the neighbour search; defaults to the
        notebook-level ``xtrain``.
    train_rows, test_rows : indexable of rows, optional
        Rows whose values are printed into the prompt; default to the
        notebook-level ``df_train`` and ``df_test``.

    Returns
    -------
    str
        The complete prompt.
    """
    if train_features is None:
        train_features = xtrain
    if train_rows is None:
        train_rows = df_train
    if test_rows is None:
        test_rows = df_test

    shot_idx = np.asarray(shot_idx)
    if len(shot_idx) > 0:
        shot_idx = shot_idx[np.random.permutation(len(shot_idx))]

        # Candidate pool: every training position that is not itself a shot.
        unlabeled_idx = np.setdiff1d(np.arange(len(train_features)), shot_idx)
        u_x = train_features[unlabeled_idx]

        per_shot = int(n_unlabeled / len(shot_idx))
        nearest = []
        for idx in shot_idx:
            dist = np.sum((u_x - train_features[idx]) ** 2, axis=1)
            nearest.append(np.argsort(dist)[:per_shot])
        # Neighbour lists are simply concatenated, so a row that is close to
        # several shots appears once per shot.
        u_idx = unlabeled_idx[np.concatenate(nearest)]
        u_idx = u_idx[np.random.permutation(len(u_idx))]
    else:
        # No shots means no neighbours to look up; np.concatenate([]) would raise.
        u_idx = np.array([], dtype=int)

    text = "Read a given information and questions. Think step by step, and then predict whether its value is class1 or class2. You must choose in [class1, class2]. Class1 indicates Horeca (Hotel, Restaurant, Cafe) channel, and class2 indicates Retail channel."
    text += "\nThe dataset consists of 7 input variables: annual spending on fresh product, annual spending on milk products, annual spending on grocery products, annual spending on frozen products, annual spending on detergents and paper products, annual spending on delicatessen products, and customer’s region. The output variable is the customer’s channel.\n"

    channel_question = "then what is the customer’s channel? Choose between [class1, class2]. Class1 indicates Horeca (Hotel, Restaurant, Cafe) channel, and class2 indicates Retail channel. Answer:"

    # The trailing space in "If the " matters: without it the prompt reads "If theannual ...".
    for i in u_idx:
        row = train_rows[i]
        text += "Question: If the "
        text += _feature_clauses(row, include_grocery=False)
        text += f"then what is the annual spending on grocery products. Answer: {row[3]}\n"

    for i in shot_idx:
        row = train_rows[i]
        text += "Question: If the "
        text += _feature_clauses(row)
        text += f"{channel_question} class{row[7]}\n"

    text += "Question: If the "
    text += _feature_clauses(test_rows[test_idx])
    text += channel_question
    return text

In [None]:
# Index array of the labelled few-shot examples; passed to ours() below, where
# its entries are used to index the training split (df_train / xtrain).
shot_idx = np.load('./data/labeled_idx.npy')

### Ours

In [None]:
# Query the model once per test row, collecting the raw reply strings in order.
response_list = []
for i in tqdm(range(len(df_test))):
    response = use_api(ours(test_idx=i, shot_idx=shot_idx))
    response_list.append(response)