# Miscellaneous
Code for all sorts of things that may be required again afterwards

In [14]:
import os
import sys
import pandas as pd
import numpy as np

import pprint
# Short alias: pp(obj) pretty-prints obj using a 2-space indent.
pp = pprint.PrettyPrinter(indent=2).pprint

# Append the parent directory to the import path (only if it is not already
# there) so that modules located in it can be imported from this notebook.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

# Helper functions
def report_overlap(**kwargs):
    """Print size statistics comparing two named collections.

    Exactly two keyword arguments are expected, each mapping a display name
    to an iterable of hashable items. Both iterables are converted to sets,
    and two lines are printed: the sizes of each set, their union and their
    intersection, followed by the sizes of the two set differences.

    Raises:
        AssertionError: if the number of keyword arguments is not 2.
    """
    assert len(kwargs)==2
    first, second = kwargs.items()
    name_a, name_b = first[0], second[0]
    items_a = set(first[1])
    items_b = set(second[1])

    n_a, n_b = len(items_a), len(items_b)
    n_union = len(items_a | items_b)
    n_common = len(items_a & items_b)
    n_only_a = len(items_a - items_b)
    n_only_b = len(items_b - items_a)

    print(f"|{name_a}|={n_a}, |{name_b}|={n_b};   Union: {n_union}  Intersection: {n_common} ")
    print(f"|{name_a}-{name_b}|={n_only_a}, |{name_b}-{name_a}|={n_only_b};   ")

## Evaluate nrl-parser output

We took the nrl-parser output on the qasrl-gs test set (jsonl), converted it to CSV using the `qasrl-state-machine` docker environment, and here we run the "qanom evaluation protocol" on it.

In [10]:
from datasets import load_dataset

# Gold data: the "test" split of the biu-nlp/qa_srl2020 dataset, as a DataFrame.
qasrl_gs = load_dataset("biu-nlp/qa_srl2020")
test_set = qasrl_gs["test"]
df_gold = pd.DataFrame(test_set)
# Running index (0, 1, ...) of each row within its (sent_id, predicate_idx) group.
df_gold['qa_position'] = df_gold.groupby(["sent_id", "predicate_idx"]).cumcount()

# Predictions: the parser output that was already converted to CSV.
df_parsed_predictions = pd.read_csv("../nrl_parser_output_qasrl_gs_test.csv")
# Same running index per (qasrl_id, verb_idx) group. This must run BEFORE the
# next statement, which overwrites the `qasrl_id` column used for grouping.
df_parsed_predictions['qa_position'] = df_parsed_predictions.groupby(["qasrl_id", "verb_idx"]).cumcount()
# Placeholder values; `qasrl_id` is filled in with gold sentence ids in a later cell.
df_parsed_predictions['qasrl_id'] = '---'
df_parsed_predictions['verb_form'] = '---'

# we have no alignment on the sentence or qasrl_id level,
# so we will sort by sentence string which should be sufficiently aligned
df_gold_sorted = df_gold.sort_values(by=['sentence', 'predicate_idx'])  
df_predictions_sorted = df_parsed_predictions.sort_values(by=['sentence', 'verb_idx'])  


No config specified, defaulting to: qa_srl2020/default
Reusing dataset qa_srl2020 (/home/nlp/kleinay/.cache/huggingface/datasets/biu-nlp___qa_srl2020/default/1.1.0/3b94572da7f8e6e95317517c89e9a702ba59775614c59c12d5d5cd3f6532b33a)


  0%|          | 0/2 [00:00<?, ?it/s]

In [11]:
# report_overlap(pred=set(df_parsed_predictions.sentence), gold=set(df_gold.sentence))
# Pair the unique sentences of the two sorted frames position by position and
# collect every pair whose strings differ (gold side and prediction side kept
# in parallel lists). Note that zip() stops at the shorter of the two sequences.
missed_sents = []
missed_sents_pred = []
for gs, ps in zip(df_gold_sorted.sentence.unique(), df_predictions_sorted.sentence.unique()):
    if gs != ps:
        # print(gs, "\n"+ ps, "\n")
        missed_sents.append(gs)
        missed_sents_pred.append(ps)
# Positions within `missed_sents` (not DataFrame row labels) of the mismatches
# that are treated as real misses rather than string differences.
real_misses = [89, 110, 505] # indexes found manually in `sent_mismatch.txt`, verified
real_missed_sents = [missed_sents[i] for i in real_misses]
# Drop the gold rows of those sentences before pairing gold with predictions again.
df_gold_sorted_aligned = df_gold_sorted.loc[~df_gold_sorted.sentence.isin(real_missed_sents)]
# get qasrl_id
# Map each prediction sentence string to the gold sent_id found at the same position.
# NOTE(review): assumes the unique sentences and unique sent_ids of the aligned
# gold frame line up one-to-one, in the same order as the unique prediction
# sentences -- confirm; zip() silently truncates if the lengths differ.
sent2id = {}
for gs, ps, qid in zip(df_gold_sorted_aligned.sentence.unique(), df_predictions_sorted.sentence.unique(), df_gold_sorted_aligned.sent_id.unique()):
    # Print each gold/prediction pair so the alignment can be inspected by eye.
    print(gs, "\n"+ ps, "\n")
    sent2id[ps] = qid
# set the right qasrl_id in predictions 
# (a prediction sentence without an entry in `sent2id` raises KeyError here)
df_parsed_predictions["qasrl_id"] = df_parsed_predictions.apply(lambda r: sent2id[r.sentence], axis=1)


# NOTE(review): `evaluation` is presumably a project-local module made importable
# by the sys.path tweak in the setup cell -- confirm.
import evaluation
# Copies are passed, so the evaluation cannot modify the two frames used in this notebook.
eval_measures = evaluation.run_qanom_evaluation(df_parsed_predictions.copy(), df_gold.copy())
# print(eval_measures[:3])
# Results recorded from a previous run of this cell:
#** Evaluation Results:
# UA: P: 0.585   R: 0.445   F1: 0.505 
# LA: P: 0.387   R: 0.294   F1: 0.334


, alluding to the Shakespearean phrase `` Et tu , Brute ? '' 
, alluding to the Shakespearean phrase `` Et tu , Brute ? '' 

-LRB- -LRB- WN -RRB- -RRB- Do you study the periodical cicadas with anyone else ? 
-LRB- -LRB- WN -RRB- -RRB- Do you study the periodical cicadas with anyone else ? 

-LRB- -LRB- WN -RRB- -RRB- How often do the Gliders get together ? 
-LRB- -LRB- WN -RRB- -RRB- How often do the Gliders get together ? 

-LRB- -LRB- WN -RRB- -RRB- I was looking around the North Greenwich Arena ... And that arena ! 
-LRB- -LRB- WN -RRB- -RRB- I was looking around the North Greenwich Arena ... And that arena ! 

-LRB- -LRB- WN -RRB- -RRB- That brings us to the 2000 Paralympics . 
-LRB- -LRB- WN -RRB- -RRB- That brings us to the 2000 Paralympics . 

-LRB- -LRB- WN -RRB- -RRB- You study the emergence of the periodical cicadas . 
-LRB- -LRB- WN -RRB- -RRB- You study the emergence of the periodical cicadas . 

-LRB- -LRB- Wikinews -RRB- -RRB- When and how did DuckDuckGo start using Perl 

9it [00:00, 85.46it/s]ungrammatical question: Who was something opposed by?
19it [00:00, 88.49it/s]ungrammatical question: When did something fell?
ungrammatical question: What felled?
ungrammatical question: What did something fell into?
39it [00:00, 91.58it/s]ungrammatical question: When isn't something be delayed?
ungrammatical question: What did someone found?
ungrammatical question: When did someone found something?
ungrammatical question: Who founded something?
49it [00:00, 91.20it/s]ungrammatical question: What did something remain into?
ungrammatical question: What wasn't something be recovered from?
ungrammatical question: What did someone carry up?
ungrammatical question: What did someone arrive at?
59it [00:00, 90.67it/s]ungrammatical question: Who was someone killed by?
ungrammatical question: What was something followed by?
69it [00:00, 89.99it/s]ungrammatical question: What did someone die from?
ungrammatical question: What did someone see?
79it [00:00, 89.38it/s]ungramma

The results above are rather low and unlikely to be correct.
So I suspect there is a bug in all 150 non-matching sentences where tokenization differs.

Here let's try excluding all of these and see whether we get a more reasonable figure.  


In [19]:
# Keep only the rows whose sentence was NOT collected as a mismatch by the
# position-wise comparison in the alignment cell (prediction side / gold side).
df_predictions_aligned = df_parsed_predictions.loc[~df_parsed_predictions.sentence.isin(missed_sents_pred)]
df_gold_aligned = df_gold.loc[~df_gold.sentence.isin(missed_sents)]

# Evaluation on the filtered frames (currently disabled):
# eval_measures = evaluation.run_qanom_evaluation(df_predictions_aligned, df_gold_aligned)
# print(eval_measures[:2])
# Display the `answer_range` column of the (unfiltered) predictions for inspection.
df_parsed_predictions.answer_range
# Disabled shape / column diagnostics:
# print(df_gold.shape)
# print(df_predictions_aligned.shape)
# print(df_gold_aligned.shape)
# print(df_parsed_predictions.columns)
# print(df_gold.columns)
# print(df_predictions_aligned.columns)
# print(df_gold_aligned.columns)


0                 NaN
1                 0:4
2                 6:7
3         8:13~!~8:25
4           0:4~!~8:8
            ...      
5906            13:13
5907            14:18
5908            13:13
5909            18:18
5910    22:22~!~25:26
Name: answer_range, Length: 5911, dtype: object

# Wandb API 

### Retrieve Aggregated Measures of sweep experiment



In [22]:
import pandas as pd
from collections import defaultdict
import wandb
api = wandb.Api()

# The runs are taken from a single sweep. (An alternative is to iterate over
# api.runs("<entity/project-name>") and filter on run.name.)
#   run.summary holds the output keys/values for metrics like accuracy,
#   run.config holds the hyperparameters,
#   run.name is the human-readable name of the run.
sweep_id = "pe0nvjzp"   # go to sweep overview, ID
exp_sweep = api.sweep(f"kleinay/qasrl/{sweep_id}")
runs = exp_sweep.runs

variable = "preprocess_output_func" # the independent variable whose effect we want to assess
measures = ['Unlabled Arg f1', 'Unlabled Arg precision', 'Unlabled Arg recall',
            'Labled Arg f1', 'Labled Arg precision', 'Labled Arg recall']
agg_func = np.max   # how the values of one measure are aggregated across runs
# measure -> value of `variable` -> list of that measure's values, one per run reporting it
collected = {m: defaultdict(list) for m in measures}

# value of `variable` -> all runs configured with that value
linearization2runs = defaultdict(list)
for run in runs:
    # Read the grouping key once, via `variable`, so that `collected` and
    # `linearization2runs` are always keyed by the same config entry.
    variable_value = run.config[variable]
    for measure in measures:
        if measure in run.summary:
            # get aggregated evaluations
            collected[measure][variable_value].append(run.summary[measure])
    linearization2runs[variable_value].append(run)

aggregated = {measure: {variable_value: agg_func(values)
                        for variable_value, values in collected[measure].items()}
              for measure in measures}

print(exp_sweep.name)

# get hyperparameters of best run per linearization method
hyperparamters = ("learning_rate", "gradient_accumulation_steps", "dropout_rate")
linearization2best_run = {}
# The loop variable is deliberately NOT called `runs`, so the sweep's `runs`
# defined above is not overwritten by the last group's list.
for lin, lin_runs in linearization2runs.items():
    # Runs that did not report 'Labled Arg f1' get -1 so they are never picked
    # over a run that did report it.
    results = [run.summary['Labled Arg f1'] if 'Labled Arg f1' in run.summary else -1
               for run in lin_runs]
    linearization2best_run[lin] = lin_runs[np.argmax(results)]
linearization2best_hparams = {lin: {hparam: run.config[hparam]
                                    for hparam in hyperparamters}
                              for lin, run in linearization2best_run.items()}

pp(linearization2best_hparams)

    


basic grid for linearization - qasrl-small dev (v2)
{ 'all_by_answer_ordering': { 'dropout_rate': 0.1,
                              'gradient_accumulation_steps': 8,
                              'learning_rate': 0.001},
  'all_by_role_ordering': { 'dropout_rate': 0.1,
                            'gradient_accumulation_steps': 14,
                            'learning_rate': 0.001},
  'all_shuffled': { 'dropout_rate': 0.15,
                    'gradient_accumulation_steps': 8,
                    'learning_rate': 0.001},
  'permutate_all': { 'dropout_rate': 0.15,
                     'gradient_accumulation_steps': 8,
                     'learning_rate': 0.001},
  'permutate_sample_fixed': { 'dropout_rate': 0.1,
                              'gradient_accumulation_steps': 14,
                              'learning_rate': 0.001},
  'permutate_sample_num_of_qas': { 'dropout_rate': 0.1,
                                   'gradient_accumulation_steps': 14,
                               

In [23]:
# Pretty-print the per-measure aggregates (measure -> variable value -> aggregated score)
# computed in the previous cell.
pp(aggregated)


{ 'Labled Arg f1': { 'all_by_answer_ordering': 0.4821536600120992,
                     'all_by_role_ordering': 0.4877375843041079,
                     'all_shuffled': 0.4808003530969545,
                     'permutate_all': 0.4998511461744567,
                     'permutate_sample_fixed': 0.47554957379991025,
                     'permutate_sample_num_of_qas': 0.487527352297593},
  'Labled Arg precision': { 'all_by_answer_ordering': 0.5198956294846706,
                            'all_by_role_ordering': 0.5342511752854264,
                            'all_shuffled': 0.513677811550152,
                            'permutate_all': 0.5293190416141236,
                            'permutate_sample_fixed': 0.5062082139446036,
                            'permutate_sample_num_of_qas': 0.5299055613850997},
  'Labled Arg recall': { 'all_by_answer_ordering': 0.4540327129159616,
                         'all_by_role_ordering': 0.45346869712351945,
                         'all_shuffled': 0.4