# Annotating pivots

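This notebook uses the pigeon library to hand-label ('good' / 'bad') the pivots that the unpivot heuristics detect in table headers, and then computes precision and recall from those labels.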

In [1]:
%%capture --no-display
from dask.distributed import Client
# Connect to the Dask scheduler at its fixed address; the bare `client`
# expression on the last line shows the cluster summary as the cell output.
client = Client(
    address='tcp://192.168.62.207:8686',
)
client

0,1
Client  Scheduler: tcp://192.168.62.207:8686  Dashboard: http://192.168.62.207:8687/status,Cluster  Workers: 7  Cores: 56  Memory: 471.41 GB


In [36]:
%%time
import dask.bag as db
from takco.util import robust_json_loads_lines
from takco.table import from_tabel
import takco

# Build the 'step' configuration from the Wikidata graph and TabEL pipeline files.
steps = takco.config.build(
    'step',
    load=[
        'resources/graphs/wikidata.toml',
        'resources/pipelines/TabEL.toml',
    ],
)
# Only the first step's reshaping settings are used in this notebook.
prefix_header_rules = steps[0]['prefix_header_rules']
unpivot_heuristics = dict(
    (heuristic.name, heuristic) for heuristic in steps[0]['unpivot_heuristics']
)

# Alternative input location on HDFS:
# fnames = 'hdfs://bricks07:9000/user/kruit/tabel/2-*'
fnames = '/export/scratch1/home/kruit/scratch/tabel/*'

# Read the text files, run robust_json_loads_lines and then
# takco.reshape.restructure over each partition, and persist the result.
bag = (
    db.read_text(fnames)
    .map_partitions(robust_json_loads_lines)
    .map_partitions(
        lambda tables: list(
            takco.reshape.restructure(tables, prefix_header_rules=prefix_header_rules)
        )
    )
    .persist()
)
# Number of items in the persisted bag.
bag.count().compute()

CPU times: user 53.7 ms, sys: 5.87 ms, total: 59.5 ms
Wall time: 2.09 s


5621

In [27]:
# Keep one item per header id, reduce it to its 'tableHeaders' value, and
# persist the resulting bag for reuse in later cells.
headers = (
    bag.distinct(key=takco.reshape.table_get_headerId)
    .map(lambda table: table.get('tableHeaders'))
    .persist()
)
# Number of distinct headers.
headers.count().compute()

2041

In [10]:
from dask.bag.random import sample

# Build heuristics per partition, then merge each partial result into the
# local heuristic that has the same name.
partial_heuristics = bag.map_partitions(
    lambda tables: list(
        takco.reshape.build_heuristics(tables, unpivot_heuristics.values())
    )
)
for partial in partial_heuristics:
    unpivot_heuristics[partial.name].merge(partial)

# Draw 100 headers and index a headers-only table dict for each by header id.
hsample = sample(headers, 100).compute()
header_tables = {}
for header in hsample:
    table = {'tableHeaders': header}
    header_tables[takco.reshape.table_get_headerId(table)] = table

# Attach every pivot the heuristics yield to the table with the matching header id.
for pivot in takco.reshape.yield_pivots(hsample, unpivot_heuristics.values()):
    header_tables[pivot['headerId']]['pivot'] = pivot


def show_pivot(h):
    """Render one annotation example, highlighting its pivot when it has one.

    Parameters
    ----------
    h : dict
        Table dict passed to ``takco.table.from_tabel``. When it has a truthy
        ``'pivot'`` entry whose ``'discard'`` value is falsy, that entry is
        printed and forwarded as keyword arguments to
        ``df.takco.highlight_pivot``; otherwise ``df.takco`` is displayed.

    Returns
    -------
    None
        All output goes through ``print`` and IPython's ``display``.
    """
    # Imported locally so the function works on a fresh kernel: no cell of the
    # notebook imports IPython's display helper, and the kernel's built-in
    # `display` is a function without a `.display` attribute.
    from IPython.display import display

    df = takco.table.from_tabel(h)
    pivot = h.get('pivot')
    if pivot and not pivot.get('discard'):
        print(pivot)
        display(df.takco.highlight_pivot(**pivot))
    else:
        display(df.takco)

from pigeon import annotate
# Start the pigeon widget: each header table is rendered by show_pivot and
# labelled either 'good' or 'bad'.
annotations = annotate(header_tables.values(), options=['good', 'bad'], display_fn=show_pivot)

HTML(value='0 examples annotated, 101 examples left')

HBox(children=(Button(description='good', style=ButtonStyle()), Button(description='bad', style=ButtonStyle())…

Output()

Annotation done.


In [24]:
import json

# Save the raw annotation results as JSON.
with open('pivot-annotation-recall.json', mode='w') as out_file:
    json.dump(annotations, out_file)
    
import pandas as pd
# p: the table carries a pivot that was not discarded.
p = pd.Series([
    bool(h.get('pivot') and not h['pivot'].get('discard'))
    for h, _ in annotations
])
# ok: the annotator labelled the example 'good'.
ok = pd.Series([label == 'good' for _, label in annotations])

# tp: pivot and 'good'; fp: pivot and not 'good'; fn: no pivot and not 'good'.
tp = sum(p & ok)
fp = sum(p & ~ok)
fn = sum(~p & ~ok)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
print(f"Precision: {precision:.2f}, Recall: {recall:.2f}")

Precision: 1.00, Recall: 0.68


In [37]:
from dask.bag.random import sample
import tqdm

# Materialise all distinct headers as a plain list. The original wrapped this
# list in a single tqdm instance and iterated it twice; a tqdm bar is meant
# for one pass, so the progress bar is now created only around the pivot
# detection pass below.
hsample = headers.compute()

# Index a headers-only table dict for each header by its header id.
header_tables = {}
for header in hsample:
    table = {'tableHeaders': header}
    header_tables[takco.reshape.table_get_headerId(table)] = table

# Attach every yielded pivot to its table and group the tables whose pivot
# was not discarded by the heuristic recorded on the pivot.
heuristic_pivots = {}
pivots = takco.reshape.yield_pivots(
    tqdm.tqdm(hsample, desc='pivots'),
    unpivot_heuristics.values(),
)
for p in pivots:
    h = header_tables[p['headerId']]
    h['pivot'] = p
    if not p.get('discard'):
        heuristic_pivots.setdefault(p.get('heuristic'), []).append(h)

# Number of tables with a non-discarded pivot, per heuristic.
{name: len(tables) for name, tables in heuristic_pivots.items()}

 21%|██▏       | 436/2041 [01:41<01:56, 13.80it/s]
  0%|          | 0/2041 [00:00<?, ?it/s][A
  1%|          | 13/2041 [00:00<00:21, 95.16it/s][A
  2%|▏         | 41/2041 [00:00<00:17, 114.62it/s][A
  3%|▎         | 63/2041 [00:00<00:14, 132.83it/s][A
  4%|▍         | 88/2041 [00:00<00:13, 149.96it/s][A
  5%|▌         | 104/2041 [00:00<00:14, 129.87it/s][A
  6%|▋         | 128/2041 [00:00<00:12, 149.35it/s][A
  7%|▋         | 145/2041 [00:00<00:12, 153.65it/s][A
  8%|▊         | 162/2041 [00:00<00:12, 148.10it/s][A
  9%|▉         | 186/2041 [00:01<00:11, 157.36it/s][A
 10%|▉         | 203/2041 [00:01<00:11, 156.97it/s][A
 11%|█         | 220/2041 [00:01<00:11, 152.78it/s][A
 12%|█▏        | 248/2041 [00:01<00:10, 176.12it/s][A
 13%|█▎        | 275/2041 [00:01<00:09, 188.90it/s][A
 15%|█▌        | 308/2041 [00:01<00:08, 215.35it/s][A
 16%|█▋        | 335/2041 [00:01<00:07, 225.74it/s][A
 18%|█▊        | 360/2041 [00:01<00:08, 195.97it/s][A
 19%|█▊        | 382/2041 [00:

{'year-suffix': 79,
 'NumSuffix': 117,
 'SpannedRepeat': 83,
 'sports-round': 27,
 'crater-template': 8,
 'train-template': 13,
 'short-uppercase-acronyms': 41,
 'year-prefix': 14,
 'NumPrefix': 8}

In [39]:
def show_pivot(h):
    """Display a header table for annotation, with its pivot highlighted if present.

    Parameters
    ----------
    h : dict
        Table dict handed to ``takco.table.from_tabel``. If ``h['pivot']`` is
        truthy and its ``'discard'`` value is falsy, the pivot is printed and
        unpacked into ``df.takco.highlight_pivot``; otherwise ``df.takco`` is
        displayed as-is.

    Returns
    -------
    None
        The function only produces printed and displayed output.
    """
    # Local import keeps this cell runnable after a kernel restart: the
    # notebook has no top-level import of IPython's display helper, and the
    # built-in `display` function has no `.display` attribute.
    from IPython.display import display

    df = takco.table.from_tabel(h)
    pivot = h.get('pivot')
    if pivot and not pivot.get('discard'):
        print(pivot)
        display(df.takco.highlight_pivot(**pivot))
    else:
        display(df.takco)

# Take at most the first 50 tables of every heuristic for annotation.
per_heuristic_tables = [
    table
    for tables in heuristic_pivots.values()
    for table in tables[:50]
]
        
from pigeon import annotate
# Start the pigeon widget over the per-heuristic selection; each table is
# rendered by show_pivot and labelled either 'good' or 'bad'.
per_heuristic_annotations = annotate(per_heuristic_tables, options=['good', 'bad'], display_fn=show_pivot)

HTML(value='0 examples annotated, 262 examples left')

HBox(children=(Button(description='good', style=ButtonStyle()), Button(description='bad', style=ButtonStyle())…

Output()

Annotation done.


In [46]:
import pandas as pd
import json

# Save the per-heuristic annotation results as JSON.
with open('pivot-annotation-precision.json', mode='w') as out_file:
    json.dump(per_heuristic_annotations, out_file)

# Group the (table, label) annotation pairs by the heuristic recorded on each
# table's pivot.
heuristic_annotation = {}
for h, a in per_heuristic_annotations:
    hname = h['pivot']['heuristic']
    heuristic_annotation.setdefault(hname, []).append((h, a))

# Report precision per heuristic.
for hname, anns in heuristic_annotation.items():
    # p: the table has a pivot; ok: the annotator labelled it 'good'.
    p = pd.Series([bool(h.get('pivot')) for h, a in anns])
    ok = pd.Series([a == 'good' for h, a in anns])

    tp = sum(p & ok)
    fp = sum(p & ~ok)
    # Only precision is printed here, so no false-negative count is computed.
    print(f"{hname:>30s}  Precision: {(tp/(tp+fp)):.2f}")

                   year-suffix  Precision: 1.00
                     NumSuffix  Precision: 0.92
                 SpannedRepeat  Precision: 0.52
                  sports-round  Precision: 0.26
               crater-template  Precision: 1.00
                train-template  Precision: 1.00
      short-uppercase-acronyms  Precision: 0.76
                   year-prefix  Precision: 0.93
                     NumPrefix  Precision: 0.38
