In [1]:
import numpy as np
import pandas as pd

In [2]:
DATA  = 'incidents'
LABEL = 'hazard-category'
SHOTS = 2

# Load data:

In [3]:
prompts = pd.read_csv(f'prompts_{LABEL}_{SHOTS:d}-shot.csv', index_col=0).fillna('')

In [4]:
prompt_cols = [col for col in prompts.columns if col.startswith('prompt_')]
prompts.head()

Unnamed: 0,cv_split,prompt_all,prompt_conformal_5%,prompt_max-5,prompt_max-10,label,prompt_sim-5,prompt_sim-10,prompt_sim-20,output_raw_all,...,output_min_sim-5,output_min_sim-10,output_min_sim-20,output_max_all,output_max_conformal_5%,output_max_max-5,output_max_max-10,output_max_sim-5,output_max_sim-10,output_max_sim-20
0,0,Context start:\nWe are looking for food hazard...,Context start:\nWe are looking for food hazard...,Context start:\nWe are looking for food hazard...,Context start:\nWe are looking for food hazard...,chemical,Context start:\nWe are looking for food hazard...,Context start:\nWe are looking for food hazard...,Context start:\nWe are looking for food hazard...,,...,chemical,allergens,organoleptic aspects,chemical,chemical,chemical,chemical,chemical,chemical,chemical
1,0,Context start:\nWe are looking for food hazard...,Context start:\nWe are looking for food hazard...,Context start:\nWe are looking for food hazard...,Context start:\nWe are looking for food hazard...,fraud,Context start:\nWe are looking for food hazard...,Context start:\nWe are looking for food hazard...,Context start:\nWe are looking for food hazard...,,...,biological,foreign bodies,food additives and flavourings,fraud,fraud,fraud,fraud,allergens,fraud,fraud
2,0,Context start:\nWe are looking for food hazard...,Context start:\nWe are looking for food hazard...,Context start:\nWe are looking for food hazard...,Context start:\nWe are looking for food hazard...,fraud,Context start:\nWe are looking for food hazard...,Context start:\nWe are looking for food hazard...,Context start:\nWe are looking for food hazard...,,...,biological,foreign bodies,biological,fraud,fraud,fraud,fraud,fraud,fraud,fraud
3,0,Context start:\nWe are looking for food hazard...,Context start:\nWe are looking for food hazard...,Context start:\nWe are looking for food hazard...,Context start:\nWe are looking for food hazard...,fraud,Context start:\nWe are looking for food hazard...,Context start:\nWe are looking for food hazard...,Context start:\nWe are looking for food hazard...,,...,allergens,foreign bodies,fraud,fraud,fraud,fraud,fraud,fraud,fraud,fraud
4,0,Context start:\nWe are looking for food hazard...,Context start:\nWe are looking for food hazard...,Context start:\nWe are looking for food hazard...,Context start:\nWe are looking for food hazard...,fraud,Context start:\nWe are looking for food hazard...,Context start:\nWe are looking for food hazard...,Context start:\nWe are looking for food hazard...,,...,fraud,biological,allergens,fraud,fraud,fraud,fraud,fraud,fraud,fraud


# Sanity Checks:

In [5]:
for row in prompts[prompt_cols].values:
    last_text = None
    for text in row:
        if text != '':
            text = text.split('\nContext end:\n')[1]
            text = text.split('\n')[1]
            text = text[1:-5]
            
            if last_text is not None: assert text == last_text
            
            last_text = text

# Count unique classes per prompt:

In [6]:
class_count = pd.DataFrame(columns = ['cv_split'] + prompt_cols)
class_count['cv_split'] = np.arange(5)

for cv_split in class_count.cv_split:
    for col in prompt_cols:
        counts = []
        for prompt in prompts[prompts.cv_split==cv_split][col].values:
            prompt = prompt.split('\nContext end:')[0]
            prompt = prompt.split('\n')[2:]
            labels = set([sample.split(' -> ')[1] for sample in prompt])
            
            counts.append(len(labels))

        class_count.loc[class_count.cv_split == cv_split, col] = np.mean(counts)

class_count.set_index('cv_split', inplace=True)
class_count

Unnamed: 0_level_0,prompt_all,prompt_conformal_5%,prompt_max-5,prompt_max-10,prompt_sim-5,prompt_sim-10,prompt_sim-20
cv_split,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,10.0,2.658278,5.0,10.0,3.225828,5.797351,10.0
1,10.0,2.658278,5.0,10.0,3.239073,5.765563,10.0
2,10.0,2.726309,5.0,10.0,3.273028,5.749503,10.0
3,10.0,2.241882,5.0,10.0,3.269052,5.741551,10.0
4,10.0,2.76607,5.0,10.0,3.279655,5.782638,10.0


In [7]:
class_count.mean()

prompt_all                 10.0
prompt_conformal_5%    2.610163
prompt_max-5                5.0
prompt_max-10              10.0
prompt_sim-5           3.257327
prompt_sim-10          5.767321
prompt_sim-20              10.0
dtype: object

# Count Characters per prompt:

In [8]:
char_count = pd.DataFrame(columns = ['cv_split'] + prompt_cols)
char_count['cv_split'] = np.arange(5)

for cv_split in char_count.cv_split:
    for col in prompt_cols:
        counts = []
        for prompt in prompts[prompts.cv_split==cv_split][col].values:
            counts.append(len(prompt))

        char_count.loc[char_count.cv_split == cv_split, col] = np.mean(counts)

char_count.set_index('cv_split', inplace=True)
char_count

Unnamed: 0_level_0,prompt_all,prompt_conformal_5%,prompt_max-5,prompt_max-10,prompt_sim-5,prompt_sim-10,prompt_sim-20
cv_split,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,2414.680132,820.537748,1327.852318,2414.680132,802.308609,1319.338411,2414.680132
1,2350.619868,803.393377,1293.668212,2350.619868,789.42649,1285.884106,2350.619868
2,2379.183565,822.552021,1294.286945,2379.183565,790.63552,1288.659377,2379.183565
3,2362.444665,732.048376,1297.467197,2362.444665,788.41882,1294.402916,2362.444665
4,2351.83499,826.249172,1290.622266,2351.83499,787.037111,1283.026508,2351.83499


In [9]:
char_count.mean()

prompt_all             2371.752644
prompt_conformal_5%     800.956139
prompt_max-5           1300.779388
prompt_max-10          2371.752644
prompt_sim-5             791.56531
prompt_sim-10          1294.262263
prompt_sim-20          2371.752644
dtype: object