## Setup

In [None]:
# %%bash
# !(stat -t /usr/local/lib/*/dist-packages/google/colab > /dev/null 2>&1) && exit 
# rm -rf jem-3-bert
# git clone https://github.com/jfanghrv/jem-3-bert.git

In [28]:
# ! pip install datasets
# ! pip install transformers

In [2]:
import json 
import os
from tqdm import tqdm
import csv

from datasets import load_dataset
import torch
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM

import pandas as pd
import numpy as np
import re

import matplotlib
import matplotlib.pyplot as plt

## Data Preprocessing

For both training and inference, PICARD expects each question to be serialized together with its database id and the table and column names of that database, in the following format:

`[question] | [db_id] | [table] : [column] ( [content] , [content] ) , [column] ( ... ) , [...] | [table] : ... | ...`

An example input looks like this:

`How many singers do we have? | concert_singer | stadium : stadium_id, location, name, capacity, highest, lowest, average | singer : singer_id, name, country, song_name, song_release_year, age, is_male | concert : concert_id, concert_name, theme, stadium_id, year | singer_in_concert : concert_id, singer_id`


In [3]:
# Schema description file: a JSON list with one entry per database
# (the next cell reads its db_id, table names and column names).
data_path = './data/tables.json'
with open(data_path, encoding='utf-8') as f:
    # Parse straight from the file handle; no need to keep the raw text around.
    data = json.load(f)

In [4]:
# Map every db_id to its serialized schema:
#   "table : col, col, ... | table : col, ... | ..."
dbs = dict()
for db in data:
    # Use the original identifiers (e.g. "singer_in_concert"), lower-cased like the
    # columns below, rather than the display names in db['table_names']
    # ("singer in concert"), so the schema matches the documented input example.
    table_names = [name.lower() for name in db['table_names_original']]
    table_cols = [[] for _ in table_names]
    for table_idx, colname in db['column_names_original']:
        # Entries with a negative table index belong to no table
        # (presumably the schema-wide "*" column) and are skipped.
        if table_idx >= 0:
            table_cols[table_idx].append(colname.lower())
    dbs[db['db_id']] = ' | '.join(
        table + ' : ' + ', '.join(cols)
        for table, cols in zip(table_names, table_cols)
    )

In [5]:
# load dataset
train_ds, val_ds, test_ds = load_dataset('spider', split=['train[:5000]', 'train[5000:7000]','validation[:]'])

Found cached dataset spider (/Users/mengyuanli/.cache/huggingface/datasets/spider/spider/1.0.0/4e5143d825a3895451569c8b9b55432b91a4bc2d04d390376c950837f4680daa)


  0%|          | 0/3 [00:00<?, ?it/s]

## PICARD Model

The base model we use is PICARD, pretrained on the Spider dataset.

In [5]:
tokenizer = AutoTokenizer.from_pretrained("tscholak/1zha5ono")

In [6]:
# model = AutoModelForSeq2SeqLM.from_pretrained("tscholak/cxmefzzi")
model = AutoModelForSeq2SeqLM.from_pretrained("tscholak/1zha5ono")

In [None]:
# model.eval()
# for i in tqdm(range(10)):
#     input_ = test_ds[i]
#     input_str = f"{input_['question']} | {input_['db_id']} | {dbs[input_['db_id']]}"
#     token_out = tokenizer([input_str], return_tensors='pt')
#     model_out = model.generate(token_out['input_ids'])

#     print(input_['question'])
#     print('Truth:', ' '.join(input_['query_toks_no_value']))
#     print('Model Output:',tokenizer.batch_decode(model_out, skip_special_tokens=True)[0].split(' | ')[1])
#     print()
    
    

## Run Inference

In [None]:
# Generate a SQL prediction for every example of `eval_df` and record the
# per-token entropy of the output distribution alongside it.
model.eval()
out_eval = {}
eval_df = val_ds
write_to_file = True # whether to write resulting inference dataframe to file
inference_data_path = 'data/spider_val_ds_eval.csv' # if writing df to file, this is the file path
for i in tqdm(range(len(eval_df))):
    # Read from the split being iterated over. Indexing test_ds here would run
    # inference on a different split than the one named in the output file.
    input_ = eval_df[i]
    input_str = f"{input_['question']} | {input_['db_id']} | {dbs[input_['db_id']]}"
    token_out = tokenizer([input_str], return_tensors='pt')
    generation_output = model.generate(
        input_ids=token_out['input_ids'],
        return_dict_in_generate=True,
        output_scores=True
    )
    preds = generation_output['sequences'].cpu()

    # `scores` holds one tensor per generation step; stack them along a new
    # step axis and turn them into a per-step entropy: -sum(p * log p).
    # NOTE(review): if generation uses beam search, row 0 of the stacked scores
    # is beam slot 0, which is not guaranteed to be the returned hypothesis -
    # confirm, or select rows via `beam_indices`.
    logits = torch.stack(generation_output['scores'], dim=1).cpu()
    output_prob = torch.softmax(logits, dim=2)
    log_prob = torch.log_softmax(logits, dim=2)
    sequences_entropy = ( torch.sum(output_prob * log_prob, dim=2) * (-1) ).numpy()

    # The model emits "<db_id> | <sql>"; keep everything after the first
    # separator, and fall back to the whole string if the prefix is missing.
    decoded = tokenizer.decode(preds[0], skip_special_tokens=True)
    _, separator, sql = decoded.partition(' | ')
    pred = sql if separator else decoded

    # Drop the leading decoder start token of the single returned sequence so
    # token positions line up with the score steps. (`preds[1:]` sliced the
    # batch axis instead, giving an empty tensor, so EOS was never found.)
    pred_tensor = preds[0][1:]
    entropy = sequences_entropy[0].tolist()
    if tokenizer.eos_token_id in pred_tensor:
        # Discard entropy steps recorded after the sequence ended.
        pred_eos_idx = torch.nonzero(pred_tensor==tokenizer.eos_token_id)[0].item()
        entropy = entropy[:pred_eos_idx+1]

    out_eval[i] = {
        'question': input_['question'],
        'real': input_['query'],
        'pred': pred,
        # .tolist() stores plain Python floats, so the CSV text stays
        # "[[0.0, ...]]" (the form the entropy parsers expect) on every NumPy version.
        'sequence_entropy': [np.round(entropy, 6).tolist()],
    }

inference_df = pd.DataFrame(
    list(out_eval.values()),
    columns=['question', 'real', 'pred', 'sequence_entropy'],
)

if write_to_file:
    inference_df.to_csv(inference_data_path,index=False)

## Create Evaluation Files

In [6]:
#inference_df = pd.read_csv('data/spider_val_ds_eval.csv')
inference_data = 'data/spider_val_ds_eval.csv' # if writing df to file, this is the file path
eval_df = val_ds
dest_path = 'evaluation/spider_val' # do not include file type
def write_eval_files(inference_data, db_ids, dest_path):
    '''
    Creates the gold and predict txt files in the format that the spider evaluation suite wants.

    Two files are written:
      <dest_path>_gold.txt     one "<gold query><TAB><db_id>" line per row
      <dest_path>_predict.txt  one "<row index> <predicted query>" line per row

    inference_data: path to the CSV containing results from running inference (see Run Inference
        section above), or an already loaded DataFrame with `real` and `pred` columns.
        A DataFrame is copied, never modified.
    db_ids: the db_ids, which are needed to create the txt files. Example: eval_df['db_id'].
        A list must have one entry per row; a pandas Series is aligned on the row index,
        so it may cover more rows than a filtered `inference_data`.
    dest_path: output path prefix; do not include file type. Missing parent folders are created.
    '''
    def _write_lines(path, values):
        # Written line by line instead of to_csv(sep='\n'): a newline is not a
        # reliable csv delimiter (newer csv implementations reject it).
        with open(path, 'w') as out_file:
            for value in values:
                text = '' if pd.isna(value) else str(value)
                # one record per line: embedded newlines would break the line pairing
                out_file.write(text.replace('\n', ' ') + '\n')

    if isinstance(inference_data, str):
        eval_df = pd.read_csv(inference_data)
    else:
        eval_df = inference_data.copy()

    # A tab inside the gold query would be mistaken for the query/db_id separator.
    eval_df['real'] = eval_df['real'].apply(lambda sql: sql.replace("\t", ''))
    eval_df['real_with_db_id'] = eval_df.real + "\t" + db_ids
    # The row index is kept in front of each prediction so rows of a filtered
    # subset can still be identified afterwards.
    predict_lines = eval_df.index.astype(str) + ' ' + eval_df.pred

    out_dir = os.path.dirname(dest_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    _write_lines(dest_path + '_gold.txt', eval_df['real_with_db_id'])
    _write_lines(dest_path + '_predict.txt', predict_lines)



In [64]:
t = [['0 sd bfb'],['1 gdg fdgs'],['12 adggf nf']]
[int(i[0][:i[0].find(' ')]) for i in t]

[0, 1, 12]

In [63]:
[[i[0][i[0].find(' ')+1:]] for i in t]

[['sd bfb'], ['gdg fdgs'], ['adggf nf']]

In [None]:
write_eval_files(inference_data, eval_df['db_id'], dest_path)

inference_data = 'data/spider_test_ds_eval.csv' 
eval_df = test_ds
dest_path = 'evaluation/spider_test' # do not include file type

write_eval_files(inference_data, eval_df['db_id'], dest_path)

## Entropy Thresholding

In [7]:
val_inference_df = pd.read_csv('data/spider_val_ds_eval.csv')
test_inference_df = pd.read_csv('data/spider_test_ds_eval.csv')

### Different representations of the sequence entropy


#### max

In [8]:
def calc_max_entropy(sequence_entropy):
    """Return the largest per-token entropy of each serialized sequence.

    sequence_entropy: iterable of strings, each a bracketed ', '-separated list
        of floats such as "[[0.0, 0.5]]" (the text form of the
        `sequence_entropy` CSV column).
    Returns a list with one maximum per input string, in input order.
    """
    def _to_floats(serialized):
        # strip every '[' and ']' and parse the remaining numbers
        flat_text = re.sub(r"(\[|\])", r"", serialized)
        return np.array(flat_text.split(', ')).astype(float)

    return [max(_to_floats(serialized)) for serialized in sequence_entropy]

#### l2 norm

In [9]:
def calc_l2norm_entropy(sequence_entropy):
    """Return the Euclidean (L2) norm of each serialized entropy sequence.

    sequence_entropy: iterable of strings, each a bracketed ', '-separated list
        of floats such as "[[3.0, 4.0]]".
    Returns a list with one norm per input string, in input order.
    """
    norms = []
    for serialized in sequence_entropy:
        # remove all brackets, then read the numbers into a float vector
        number_tokens = re.sub(r"(\[|\])", r"", serialized).split(', ')
        vector = np.array(number_tokens).astype(float)
        norms.append(np.linalg.norm(vector))
    return norms

In [37]:
# val_inference_df['max_entropy'] = calc_max_entropy(val_inference_df.sequence_entropy)
# test_inference_df['max_entropy'] = calc_max_entropy(test_inference_df.sequence_entropy)
# val_inference_df['max_entropy_pct_rank'] = val_inference_df.max_entropy.rank(pct=True)
# test_inference_df['max_entropy_pct_rank'] = test_inference_df.max_entropy.rank(pct=True)
# Example of thresholding entropy at 75th percentile
# val_inference_df[val_inference_df['max_entropy_pct_rank']>=0.75].min()

question                        Compute the average score of submissions.
real                    SELECT  T1.fname ,  T1.lname FROM Faculty AS T...
pred                    select * from products as t1 join manufacturer...
sequence_entropy        [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...
max_entropy                                                      0.604665
max_entropy_pct_rank                                                 0.75
dtype: object

In [38]:
# dest_path = 'evaluation/input/spider_test_75pct_thresh' # do not include file type
# eval_df = test_inference_df.merge(pd.DataFrame({'db_id': test_ds['db_id'], 'question': test_ds['question']}), on='question')

# write_eval_files(test_inference_df[test_inference_df.max_entropy <= 0.604665], eval_df['db_id'], dest_path)

### change different percentile

In [10]:
percentile = 0.5
dest_path = f'evaluation/input/spider_test_{percentile*100:.0f}pct_thresh'
dest_path

'evaluation/input/spider_test_50pct_thresh'

In [11]:
percentiles = [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.75, 0.8]
def thred_experiments(percentiles, entropy_func, func_name, val_inference_df = val_inference_df, test_inference_df = test_inference_df):
    """Write gold/predict files for the test set at several entropy thresholds.

    For each percentile, the threshold is the smallest `func_name` value among
    validation rows whose percentile rank is >= that percentile. Test rows with
    a value <= the threshold are kept and written with `write_eval_files` to
    `evaluation/input/spider_test_<func_name>_<pct>pct_{gold,predict}.txt`.

    percentiles: iterable of fractions in [0, 1]
    entropy_func: function mapping the `sequence_entropy` column to one number per row
    func_name: name of the column that stores the result of `entropy_func`
    val_inference_df, test_inference_df: inference results; both are modified
        in place (columns `func_name` and `func_name + '_pct_rank'` are added).
        The defaults are the frames that existed when this function was defined.
    """
    print(f"generate gold/predict txt for {func_name}")
    rank_col = func_name + '_pct_rank'
    for frame in (val_inference_df, test_inference_df):
        frame[func_name] = entropy_func(frame.sequence_entropy)
        frame[rank_col] = frame[func_name].rank(pct=True)

    # Look the db_id up per question once, keeping the index of test_inference_df.
    # An inner merge on `question` renumbers the rows, so a repeated or unmatched
    # question would shift every following db_id onto the wrong row.
    question_to_db = dict(zip(test_ds['question'], test_ds['db_id']))
    db_ids = test_inference_df['question'].map(question_to_db)

    for percentile in percentiles:
        at_or_above = val_inference_df[rank_col] >= percentile
        # take the minimum of the score column only, not of every column
        threshold = val_inference_df.loc[at_or_above, func_name].min()
        print(f"percentile: {percentile}  threshold: {threshold}")
        dest_path = f'evaluation/input/spider_test_{func_name}_{percentile*100:.0f}pct' # do not include file type
        kept = test_inference_df[test_inference_df[func_name] <= threshold]
        write_eval_files(kept, db_ids, dest_path)


In [12]:
thred_experiments(percentiles, calc_max_entropy, "max_entropy")

generate gold/predict txt for max_entropy
percentile: 0.2  threshold: 3.1e-05
percentile: 0.3  threshold: 0.000715
percentile: 0.4  threshold: 0.010854
percentile: 0.5  threshold: 0.067097
percentile: 0.6  threshold: 0.212395
percentile: 0.7  threshold: 0.463692
percentile: 0.75  threshold: 0.604665
percentile: 0.8  threshold: 0.692826


In [13]:
thred_experiments(percentiles, calc_l2norm_entropy, "l2norm_entropy")

generate gold/predict txt for l2norm_entropy
percentile: 0.2  threshold: 3.330165161069343e-05
percentile: 0.3  threshold: 0.000736
percentile: 0.4  threshold: 0.011135004535248292
percentile: 0.5  threshold: 0.06945788969584378
percentile: 0.6  threshold: 0.223931
percentile: 0.7  threshold: 0.48318811482485785
percentile: 0.75  threshold: 0.6182586698607629
percentile: 0.8  threshold: 0.757731


Then run the following script from the `evaluation` folder to compute the exact-match and execution accuracy:

`./run_experiments.sh`

### K-means boundary

#### on max entropy

In [14]:
from sklearn.cluster import KMeans
val_max_entropy = calc_max_entropy(val_inference_df.sequence_entropy)
val_max_entropy = np.array(val_max_entropy).reshape(-1, 1)
kmeans = KMeans(n_clusters=2, random_state=0).fit(val_max_entropy)

In [22]:
# Threshold = mean of the two cluster centres fitted on the validation max entropy.
boundary = kmeans.cluster_centers_.mean()
print(f"Kmeans boundary is: {boundary}")
dest_path = 'evaluation/input/spider_test_max_entropy_Kmeans' # do not include file type
test_inference_df['max_entropy'] = calc_max_entropy(test_inference_df.sequence_entropy)
# Attach db_id by question lookup so the row index of test_inference_df is kept;
# an inner merge renumbers rows and misaligns db_ids if a question repeats.
question_to_db = dict(zip(test_ds['question'], test_ds['db_id']))
eval_df = test_inference_df.assign(db_id=test_inference_df['question'].map(question_to_db))
write_eval_files(test_inference_df[test_inference_df['max_entropy'] <= boundary], eval_df['db_id'], dest_path)

Kmeans boundary is: 0.5599353501091002


`python3 evaluation.py --gold ./input/spider_test_max_entropy_Kmeans_gold.txt --pred ./input/spider_test_max_entropy_Kmeans_predict.txt --db database --table tables.json --etype all --output spider_test_max_entropy_Kmeans`

#### on l2 norm entropy

In [23]:
# Fit two clusters on the validation L2-norm entropy (one feature per row).
val_l2norm_entropy = calc_l2norm_entropy(val_inference_df.sequence_entropy)
val_l2norm_entropy = np.array(val_l2norm_entropy).reshape(-1, 1)
kmeans = KMeans(n_clusters=2, random_state=0).fit(val_l2norm_entropy)
# Threshold = mean of the two cluster centres.
boundary = kmeans.cluster_centers_.mean()
print(f"Kmeans boundary is: {boundary}")
dest_path = 'evaluation/input/spider_test_l2norm_entropy_Kmeans' # do not include file type
test_inference_df['l2norm_entropy'] = calc_l2norm_entropy(test_inference_df.sequence_entropy)
# Attach db_id by question lookup so the row index of test_inference_df is kept;
# an inner merge renumbers rows and misaligns db_ids if a question repeats.
question_to_db = dict(zip(test_ds['question'], test_ds['db_id']))
eval_df = test_inference_df.assign(db_id=test_inference_df['question'].map(question_to_db))
write_eval_files(test_inference_df[test_inference_df['l2norm_entropy'] <= boundary], eval_df['db_id'], dest_path)

Kmeans boundary is: 0.6648939268506927


`python3 evaluation.py --gold ./input/spider_test_l2norm_entropy_Kmeans_gold.txt --pred ./input/spider_test_l2norm_entropy_Kmeans_predict.txt --db database --table tables.json --etype all --output spider_test_l2norm_entropy_Kmeans`

## Evaluate Error Detection

In [25]:
def print_metrics(df_full, df_err_detect, eval_type):
    """Print and return error-detection metrics for one thresholding run.

    A row of `df_full` counts as *flagged* (predicted to be an error) when its
    `exec_idx` does not appear in `df_err_detect`, i.e. it was filtered out.
    A row is an actual error when its `eval_type` column equals 0 and correct
    when it equals 1.

    df_full: evaluation results of the unfiltered run; needs `exec_idx` and `eval_type` columns
    df_err_detect: evaluation results of the filtered run; needs an `exec_idx` column
    eval_type: name of the correctness column, e.g. 'exec' or 'exact'

    Returns (precision, recall, f1, confusion_mat). In the confusion matrix the
    rows are the actual class and the columns the predicted class, laid out as
    [[TP, FN], [FP, TN]]. A ratio with an empty denominator is reported as 1.
    """
    kept = df_full.exec_idx.isin(df_err_detect.exec_idx)
    flagged = ~kept
    is_error = df_full[eval_type] == 0
    is_correct = df_full[eval_type] == 1

    true_pos = int((flagged & is_error).sum())
    false_pos = int((flagged & is_correct).sum())
    true_neg = int((kept & is_correct).sum())
    false_neg = int((kept & is_error).sum())

    precision = true_pos / (true_pos + false_pos) if false_pos else 1.0
    recall = true_pos / (true_pos + false_neg) if false_neg else 1.0
    # Nothing to detect and nothing flagged: avoid 0/0 and stay consistent with
    # the precision/recall convention above.
    f1_denominator = 2 * true_pos + false_pos + false_neg
    f1 = 2 * true_pos / f1_denominator if f1_denominator else 1.0
    confusion_mat = pd.DataFrame(data=np.array([[true_pos, false_neg], [false_pos, true_neg]]), columns=['positive', 'negative'], index=['positive', 'negative'])

    print(f"Error detection precision {eval_type} = {precision:.4f}")
    print(f"Error detection recall {eval_type} = {recall:.4f}")
    print(f"Error detection F1 {eval_type} = {f1:.4f}")
    print(confusion_mat)

    return precision, recall, f1, confusion_mat


In [120]:
# Load the per-example results of the 75th-percentile run and of the unfiltered
# run; `with` closes each file as soon as it has been parsed.
with open('evaluation/output/result/spider_test_max_entropy_75pct.json') as fh:
    thresh_eval_results = pd.DataFrame(json.load(fh))
with open('evaluation/output/result/spider_test.json') as fh:
    eval_results = pd.DataFrame(json.load(fh))

print_metrics(eval_results,thresh_eval_results, 'exec')
print_metrics(eval_results,thresh_eval_results, 'exact')
print("")

Error detection precision exec = 0.7579
Error detection recall exec = 0.5797
Error detection F1 exec = 0.6569
          positive  negative
positive       360       261
negative       115       298
Error detection precision exact = 0.7832
Error detection recall exact = 0.5776
Error detection F1 exact = 0.6649
          positive  negative
positive       372       272
negative       103       287



In [26]:
def print_formated_s(row_name, l, element_format):
    """Print one table row.

    The row starts with `row_name` padded to 20 characters, followed by every
    element of `l` rendered with `element_format`, separated by single spaces.
    """
    value_cells = ' '.join(element_format for _ in l)
    row_template = "{:20} " + value_cells
    print(row_template.format(row_name, *l))

def print_scores(scores, etype, include_turn_acc=True, include_partial = False):
    """Print the evaluation scores as fixed-width tables.

    scores: nested mapping read as scores[level][field], where field is
        'count', 'exec', 'exact' or 'partial' (the latter read as
        scores[level]['partial'][type][metric]); when `include_turn_acc` is set
        it is also read as scores[turn][field]
    etype: "all", "exec" or "match"; selects the execution and/or exact-match tables
    include_turn_acc: add the 'joint_all' column and the per-turn tables
    include_partial: add the partial-matching accuracy / recall / F1 tables
        (only printed together with the exact-match table)
    """
    turns = ['turn 1', 'turn 2', 'turn 3', 'turn 4', 'turn > 4']
    levels = ['easy', 'medium', 'hard', 'extra', 'all']
    if include_turn_acc:
        levels.append('joint_all')
    partial_types = ['select', 'select(no AGG)', 'where', 'where(no OP)', 'group(no Having)',
                     'group', 'order', 'and/or', 'IUEN', 'keywords']
    # banner and metric key of each partial-matching table, in print order
    partial_sections = [
        ('\n---------------------PARTIAL MATCHING ACCURACY----------------------', 'acc'),
        ('---------------------- PARTIAL MATCHING RECALL ----------------------', 'rec'),
        ('---------------------- PARTIAL MATCHING F1 --------------------------', 'f1'),
    ]
    show_exec = etype in ["all", "exec"]
    show_match = etype in ["all", "match"]

    def print_row(label, keys, field, element_format):
        # one row holding scores[key][field] for every key
        print_formated_s(label, [scores[key][field] for key in keys], element_format)

    print_formated_s("", levels, '{:20}')
    print_row("count", levels, 'count', '{:<20d}')

    if show_exec:
        print ('=====================   EXECUTION ACCURACY     =====================')
        print_row("execution", levels, 'exec', '{:<20.3f}')

    if show_match:
        print ('\n====================== EXACT MATCHING ACCURACY =====================')
        print_row("exact match", levels, 'exact', '{:<20.3f}')
        if include_partial:
            for banner, metric in partial_sections:
                print (banner)
                for type_ in partial_types:
                    this_scores = [scores[level]['partial'][type_][metric] for level in levels]
                    print_formated_s(type_, this_scores, '{:<20.3f}')

    if not include_turn_acc:
        return

    print()
    print()
    print_formated_s("", turns, '{:20}')
    print_row("count", turns, 'count', "{:<20f}")

    if show_exec:
        print ('=====================   TURN EXECUTION ACCURACY     =====================')
        print_row("execution", turns, 'exec', '{:<20.3f}')

    if show_match:
        print ('\n====================== TURN EXACT MATCHING ACCURACY =====================')
        print_row("exact match", turns, 'exact', '{:<20.3f}')


In [119]:
# Aggregate scores of the 75th-percentile run (plain dict) and of the unfiltered
# run (DataFrame); `with` closes each file as soon as it has been parsed.
with open('evaluation/output/scores/spider_test_max_entropy_75pct_score.json') as fh:
    thresh_eval_scores = json.load(fh)
with open('evaluation/output/scores/spider_test_score.json') as fh:
    eval_scores = pd.DataFrame(json.load(fh))


In [110]:
print_scores(eval_scores, "all", False, True)

                     easy                 medium               hard                 extra                all                 
count                248                  446                  174                  166                  1034                
execution            0.540                0.422                0.322                0.211                0.399               

exact match          0.528                0.392                0.299                0.193                0.377               

---------------------PARTIAL MATCHING ACCURACY----------------------
select               0.941                0.898                0.986                0.952                0.929               
select(no AGG)       0.961                0.919                0.986                0.952                0.945               
where                0.875                0.819                0.634                0.556                0.754               
where(no OP)         0.950                0.831

In [111]:
print_scores(thresh_eval_scores, "all", False)

                     easy                 medium               hard                 extra                all                 
count                161                  252                  82                   64                   559                 
execution            0.634                0.560                0.439                0.297                0.533               

exact match          0.634                0.540                0.390                0.266                0.513               


##### experiment pipeline

In [171]:
import os
res_path = r'evaluation/output/result'
score_path = r'evaluation/output/scores'
result = os.listdir(res_path)
scores = os.listdir(score_path)

In [172]:
# Error-detection metrics of every result file against the unfiltered test run.
with open('evaluation/output/result/spider_test.json') as fh:
    eval_results = pd.DataFrame(json.load(fh))

exec_rows = []
exact_rows = []
for res in result:
    # Skip the validation-split run and any directory entry that is not a
    # JSON result file (hidden files such as .DS_Store would fail to parse).
    if res == 'spider_val.json' or not res.endswith('.json'): continue
    with open(os.path.join(res_path, res)) as fh:
        thresh_eval_results = pd.DataFrame(json.load(fh))
    p1, r1, f1, _ = print_metrics(eval_results,thresh_eval_results, 'exec')
    p2, r2, f2, _ = print_metrics(eval_results,thresh_eval_results, 'exact')
    exec_rows.append({'name': res, 'precision': p1, 'recall': r1, 'F1': f1})
    exact_rows.append({'name': res, 'precision': p2, 'recall': r2, 'F1': f2})

# Build each frame once from the collected records; the explicit column list
# keeps the layout even when no file was processed.
metric_columns = ['name', 'precision', 'recall', 'F1']
exec_experiment = pd.DataFrame(exec_rows, columns=metric_columns)
exact_experiment = pd.DataFrame(exact_rows, columns=metric_columns)
    

Error detection precision exec = 0.7890
Error detection recall exec = 0.5298
Error detection F1 exec = 0.6339
          positive  negative
positive       329       292
negative        88       325
Error detection precision exact = 0.8082
Error detection recall exact = 0.5233
Error detection F1 exact = 0.6352
          positive  negative
positive       337       307
negative        80       310
Error detection precision exec = 0.6161
Error detection recall exec = 0.9871
Error detection F1 exec = 0.7587
          positive  negative
positive       613         8
negative       382        31
Error detection precision exact = 0.6392
Error detection recall exact = 0.9876
Error detection F1 exact = 0.7761
          positive  negative
positive       636         8
negative       359        31
Error detection precision exec = 0.6021
Error detection recall exec = 0.9968
Error detection F1 exec = 0.7508
          positive  negative
positive       619         2
negative       409         4
Error det

In [173]:
exec_experiment.sort_values(by=['name'])

Unnamed: 0,name,precision,recall,F1
8,spider_test.json,1.0,0.0,0.0
5,spider_test_l2norm_entropy_20pct.json,0.60214,0.996779,0.750758
4,spider_test_l2norm_entropy_30pct.json,0.615848,0.988728,0.758962
9,spider_test_l2norm_entropy_40pct.json,0.655134,0.94525,0.773896
10,spider_test_l2norm_entropy_50pct.json,0.678887,0.864734,0.760623
12,spider_test_l2norm_entropy_60pct.json,0.716763,0.798712,0.755522
11,spider_test_l2norm_entropy_70pct.json,0.758681,0.703704,0.730159
0,spider_test_l2norm_entropy_80pct.json,0.788969,0.529791,0.633911
2,spider_test_max_entropy_20pct.json,0.60214,0.996779,0.750758
1,spider_test_max_entropy_30pct.json,0.61608,0.987118,0.758663


In [174]:
exact_experiment.sort_values(by=['name'])

Unnamed: 0,name,precision,recall,F1
8,spider_test.json,1.0,0.0,0.0
5,spider_test_l2norm_entropy_20pct.json,0.624514,0.996894,0.767943
4,spider_test_l2norm_entropy_30pct.json,0.638917,0.98913,0.776356
9,spider_test_l2norm_entropy_40pct.json,0.678571,0.944099,0.78961
10,spider_test_l2norm_entropy_50pct.json,0.702908,0.863354,0.774913
12,spider_test_l2norm_entropy_60pct.json,0.742775,0.798137,0.769461
11,spider_test_l2norm_entropy_70pct.json,0.782986,0.700311,0.739344
0,spider_test_l2norm_entropy_80pct.json,0.808153,0.523292,0.63525
2,spider_test_max_entropy_20pct.json,0.624514,0.996894,0.767943
1,spider_test_max_entropy_30pct.json,0.639196,0.987578,0.776083


count & accuracy

In [175]:
# eval_scores =  pd.DataFrame(json.load(open('evaluation/output/scores/spider_test_score.json')))
# Example count and overall accuracies of every score file.
experiment_rows = []
for score in scores:
    # Skip the validation-split run and any directory entry that is not a
    # JSON score file (hidden files such as .DS_Store would fail to parse).
    if score == 'spider_val_score.json' or not score.endswith('.json'): continue
    with open(os.path.join(score_path, score)) as fh:
        final_scores = json.load(fh)
    overall = final_scores['all']
    experiment_rows.append({
        'name': score,
        'count': overall['count'],
        'exec_accuracy': overall['exec'],
        'exact_accuracy': overall['exact'],
    })
experiment = pd.DataFrame(experiment_rows, columns=['name', 'count', 'exec_accuracy', 'exact_accuracy'])


In [176]:
experiment.sort_values(by=['name'])

Unnamed: 0,name,count,exec_accuracy,exact_accuracy
6,spider_test_l2norm_entropy_20pct_score.json,6,0.666667,0.666667
3,spider_test_l2norm_entropy_30pct_score.json,37,0.810811,0.810811
4,spider_test_l2norm_entropy_40pct_score.json,138,0.753623,0.73913
5,spider_test_l2norm_entropy_50pct_score.json,243,0.654321,0.63786
2,spider_test_l2norm_entropy_60pct_score.json,342,0.634503,0.619883
1,spider_test_l2norm_entropy_70pct_score.json,458,0.598253,0.578603
0,spider_test_l2norm_entropy_80pct_score.json,617,0.526742,0.502431
10,spider_test_max_entropy_20pct_score.json,6,0.666667,0.666667
7,spider_test_max_entropy_30pct_score.json,39,0.794872,0.794872
8,spider_test_max_entropy_40pct_score.json,141,0.751773,0.737589


#### Kmeans experiment

max

In [27]:
# Per-example results of the K-means (max entropy) run and of the unfiltered
# run; `with` closes each file as soon as it has been parsed.
with open('evaluation/output/result/spider_test_max_entropy_Kmeans.json') as fh:
    thresh_eval_results = pd.DataFrame(json.load(fh))
with open('evaluation/output/result/spider_test.json') as fh:
    eval_results = pd.DataFrame(json.load(fh))

print_metrics(eval_results,thresh_eval_results, 'exec')
print_metrics(eval_results,thresh_eval_results, 'exact')
print("")

Error detection precision exec = 0.7624
Error detection recall exec = 0.6200
Error detection F1 exec = 0.6838
          positive  negative
positive       385       236
negative       120       293
Error detection precision exact = 0.7822
Error detection recall exact = 0.6134
Error detection F1 exact = 0.6876
          positive  negative
positive       395       249
negative       110       280



In [30]:
# Aggregate scores of the K-means (max entropy) run; the file is closed after parsing.
with open('evaluation/output/scores/spider_test_max_entropy_Kmeans_score.json') as fh:
    thresh_eval_scores = json.load(fh)
print_scores(thresh_eval_scores, "all", False)

                     easy                 medium               hard                 extra                all                 
count                156                  236                  80                   57                   529                 
execution            0.654                0.581                0.450                0.316                0.554               

exact match          0.647                0.555                0.400                0.281                0.529               


norm

In [31]:
# Per-example results of the K-means (L2-norm entropy) run and of the unfiltered
# run; `with` closes each file as soon as it has been parsed.
with open('evaluation/output/result/spider_test_l2norm_entropy_Kmeans.json') as fh:
    thresh_eval_results = pd.DataFrame(json.load(fh))
with open('evaluation/output/result/spider_test.json') as fh:
    eval_results = pd.DataFrame(json.load(fh))

print_metrics(eval_results,thresh_eval_results, 'exec')
print_metrics(eval_results,thresh_eval_results, 'exact')
print("")

Error detection precision exec = 0.7823
Error detection recall exec = 0.6135
Error detection F1 exec = 0.6877
          positive  negative
positive       381       240
negative       106       307
Error detection precision exact = 0.8008
Error detection recall exact = 0.6056
Error detection F1 exact = 0.6897
          positive  negative
positive       390       254
negative        97       293



In [32]:
# Aggregate scores of the K-means (L2-norm entropy) run; the file is closed after parsing.
with open('evaluation/output/scores/spider_test_l2norm_entropy_Kmeans_score.json') as fh:
    thresh_eval_scores = json.load(fh)
print_scores(thresh_eval_scores, "all", False)

                     easy                 medium               hard                 extra                all                 
count                163                  247                  81                   56                   547                 
execution            0.638                0.587                0.469                0.357                0.561               

exact match          0.638                0.559                0.420                0.304                0.536               
