# Data Analyses

In [8]:
from typing import Iterable, List, Tuple, Any, Optional, Dict, Union, Callable
import json 
import pandas as pd
import numpy as np
import torch
import itertools
from collections import defaultdict, Counter, OrderedDict
%matplotlib inline
import matplotlib.pyplot as plt
import scipy.stats as stats

import datasets
import qanom
from qanom.annotations.common import read_annot_csv
import qanom.evaluation.roles as qanom_roles # import SemanticRole, question_to_sem_role

# Helper functions
def report_overlap(**kwargs):
    assert len(kwargs)==2
    (nm1, s1), (nm2, s2) = tuple(kwargs.items())
    s1, s2 = set(s1), set(s2)
    print(f"|{nm1}|={len(s1)}, |{nm2}|={len(s2)};   Union: {len(s1|s2)}  Intersection: {len(s1&s2)} ")
    print(f"|{nm1}-{nm2}|={len(s1-s2)}, |{nm2}-{nm1}|={len(s2-s1)};   ")

def plot_counter_as_pie_chart(counter, title=None):
    items = list(counter.items())
    items = sorted(items, key=lambda kv: kv[0]) # sort by key
    labels, sizes = zip(*items)

    fig1, ax1 = plt.subplots()
    ax1.pie(sizes, labels=labels, autopct='%1.1f%%',
            shadow=True, startangle=90, textprops={"color":"orange"})
    if title:
        fig1.suptitle(title, fontsize=14, color="orange")
    

def set_key_column(df, sent_id_lbl, pred_idx_lbl):
    df['key'] = df.apply(lambda r: f"{r[sent_id_lbl]}_{r[pred_idx_lbl]}", axis=1)


### Prepare datasets

In [2]:
# Datasets
qanom_dataset = datasets.load_dataset("biu-nlp/qanom")
qasrl_dataset = datasets.load_dataset("kleinay/qa_srl")

qanom_test_df = qanom_dataset['test'].to_pandas()
qanom_dev_df = qanom_dataset['validation'].to_pandas()
qasrl_dev_df = qasrl_dataset['validation'].to_pandas()
qanom_train_df = qanom_dataset['train'].to_pandas()

for df in (qanom_test_df, qanom_dev_df, qanom_train_df, qasrl_dev_df):
    set_key_column(df, 'sent_id', 'predicate_idx')

No config specified, defaulting to: qanom/default
Reusing dataset qanom (/home/nlp/kleinay/.cache/huggingface/datasets/biu-nlp___qanom/default/1.1.0/44d54349c6d3f70e326208bf63485003c5410d38a6aae87eb80d74cf887627d0)


  0%|          | 0/3 [00:00<?, ?it/s]

No config specified, defaulting to: qa_srl/plain_text
Reusing dataset qa_srl (/home/nlp/kleinay/.cache/huggingface/datasets/kleinay___qa_srl/plain_text/1.0.0/9aaf099b628da9c576ebbc49bd242c93d0e6cc79ffdb2e0e1d3daf409f696820)


  0%|          | 0/3 [00:00<?, ?it/s]

## Argument Position Flexibility - verbs vs. nominals

An interesting subject is argument order in verbs vs. nominals; 
This is part of analyzing why permutations help qanom and not qasrl - whether in is a matter of data-augmentation (qanom scarcity) or a matter of **more flexible argument position in nominals**.

Here we first empirically check in qasrl and qanom data whether nominalizations have more "flexible" argument positions than verbs.

We will quantify the variance of argument position for each Role (WH-word, conflating Who+What). 
To do that, we map every argument within an instance to an integer stating its relative position, where negative number indicate left of predicate and positive number indicate right of predicate. Then we compute variance of relative positions per Role. 

Total *flexibility* is computed as the weighted avreage of the variances of all roles (weighted by role frequency).
 


In [17]:

def compute_position_variances(df):
    df = df[df.question.str.len()>0]    # filter out non-QA rows
    df['role'] = df.question.map(lambda a: a.item(0)) 
    df['role'] = df.role.map(lambda r: "what-who" if r in ("what", "who") else r)  
    def get_position(ranges):
        # if len(ranges)==0: return np.nan
        span=min([tuple(sp) for sp in ranges])
        return span[0]
    df['arg_idx'] = df.answer_ranges.map(get_position) 
    df.loc[:, 'position'] = 0
    # assign relative 'position' per argument (== row, QA; we take the first token in first answer span as position)
    for key,df_pred in df.groupby('key'):
        idxs = df_pred.arg_idx.to_numpy()
        pred_idx: int = df_pred.predicate_idx.iloc[0]
        # * First Approach: Ordered relative positions, w.r.t. predicate 
        # positive_positions = np.append(idxs, pred_idx).argsort()    # adding predicate_idx to argsort for setting it as anchor
        # positions = (positive_positions - positive_positions[-1])[:-1] # predicate_idx position is 0, the rest are relative to it
        # *** but this yield greater flexibility to qasrl, probably since it has higher variance of #-QAs . Trying another approach: **
        # * Second Approach: taking relative position in fractions
        positive_positions = idxs.argsort()
        positions = (positive_positions + 1) / (max(positive_positions) + 1)
        # * Third Approach: simple relative position (token distance from predicate)
        # positions = idxs - pred_idx
        
        df.loc[df.key==key, 'position'] = positions 
    role2position_variance = OrderedDict()
    role2position_mean = OrderedDict()
    role2freq = OrderedDict()
    # compute position variance per role
    for role,df_role in df.groupby('role'):
        # role2position_variance[role] = df_role.position.var()
        # * Instead of taking variance, another option is to take the entropy of the relative position disribution (regarded as categorical).
        # *  Higher entropy means position is closer to uniform, meaning higher variance and flexibility of argument position.
        role2position_variance[role] = data_entropy(df_role.position)  
        role2position_mean[role] = df_role.position.mean()
        role2freq[role] = len(df_role)
    # compute total flexibity 
    flexibility = np.average(list(role2position_variance.values()), weights=list(role2freq.values()))  
    print(f"flexibility={flexibility}")  
    print("variance of arg position per role: ", role2position_variance)
    return df

nom_df= compute_position_variances(qanom_dev_df)
verb_df = compute_position_variances(qasrl_dev_df)
print("variances (noms, verbs):\n", [np.var(df.position) for df in (nom_df, verb_df)])
print("Levenve test: ", stats.levene(nom_df.position, verb_df.position, center='trimmed'))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['role'] = df.question.map(lambda a: a.item(0))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['role'] = df.role.map(lambda r: "what-who" if r in ("what", "who") else r)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['arg_idx'] = df.answer_ranges.map(get_position)


flexibility=2.1711872689262437
variance of arg position per role:  OrderedDict([('how', 2.04507877843706), ('how long', 2.2186885704510977), ('how much', 2.1730341663990758), ('what-who', 2.1364742590779513), ('when', 2.509126170949904), ('where', 2.2836285616867835), ('why', 2.0863635102597624)])
flexibility=2.619280126596887
variance of arg position per role:  OrderedDict([('how', 2.4705889654883046), ('how long', 2.077819531114783), ('how much', 2.4024164119159215), ('what-who', 2.5963652010369778), ('when', 2.8215299427492924), ('where', 2.751353303164841), ('why', 2.5658670472860936)])
variances (noms, verbs):
 [0.07400759036500482, 0.07587787094458415]
Levenve test:  LeveneResult(statistic=18.918426914713965, pvalue=1.3822956657254566e-05)


In [14]:
def data_entropy(lst: Iterable[Any], base=2):
    ser = pd.Series(lst) if not isinstance(lst, pd.Series) else lst 
    p = ser.value_counts() / ser.sum()
    return stats.entropy(p, base=base)

df.question

0           [how, did, someone, say, something, _, _, ?]
1                  [what, did, someone, say, _, _, _, ?]
2          [when, did, someone, say, something, _, _, ?]
3                  [who, _, _, said, something, _, _, ?]
4           [why, did, someone, say, something, _, _, ?]
                              ...                       
2890            [what, should, _, continued, _, _, _, ?]
2891    [when, should, something, continued, _, _, _, ?]
2892     [why, should, something, continued, _, _, _, ?]
2893      [what, is, _, attributed, _, to, something, ?]
2894      [what, is, something, attributed, _, to, _, ?]
Name: question, Length: 2895, dtype: object