# Comparison to SNoW

> Given a set of web tables and a target knowledge base, the SNoW method extends each web table with additional context columns, stitches matching web tables into larger tables, and applies functional dependency discovery to identify the relations that are represented in the web tables. Further, it normalises the stitched tables, guided by the schema of the knowledge base, to create an integrated schema.

We assume that the tables are already context-enriched and schema-unioned per Pay-Level Domain. Our task is now to match and stitch these supertables into universal tables, and to decompose them into normalised relations.

In [9]:
import takco
from pathlib import Path

# Directory whose sub-directories are the individual benchmark datasets.
ROOT = Path('~/snow/datasets/').expanduser().absolute()

def get_snow_datasets():
    """Yield ``(name, (input_dataset, output_dataset))`` per dataset directory in ROOT.

    ``name`` is the directory name. ``input_dataset`` wraps the JSON files in
    ``union_dedup_json``; ``output_dataset`` wraps the JSON files in
    ``evaluation/normalised_fd_relations``. Entries of ROOT that are not
    directories are skipped.
    """
    for d in ROOT.iterdir():
        if not d.is_dir():
            continue

        union_path = d.joinpath('union_dedup_json')
        input_dataset = takco.evaluate.dataset.WebDataCommons(fnames=list(union_path.glob("*.json")))

        # Glob the reference directory (not union_path), otherwise the output
        # dataset is just a second copy of the input tables.
        reference_path = d.joinpath('evaluation/normalised_fd_relations')
        output_dataset = takco.evaluate.dataset.WebDataCommons(fnames=list(reference_path.glob("*.json")))

        yield d.name, (input_dataset, output_dataset)
            
# Collect the generator into a {dataset name: (input_dataset, output_dataset)} dict.
benchmark_datasets = dict(get_snow_datasets())
print(f"Got {len(benchmark_datasets)} benchmark datasets")
# Bare expression so the notebook displays the dataset names.
list(benchmark_datasets)

Got 9 benchmark datasets


['d3football.com',
 'www.vgchartz.com',
 'www.cia.gov',
 'www.nndb.com',
 'flightaware.com',
 'itunes.apple.com',
 'seatgeek.com',
 'www.amoeba.com',
 'data.bls.gov']

In [7]:
from collections import Counter
# Tables of the www.vgchartz.com input dataset (element 0 of the (input, output) pair).
ts = list(takco.TableSet.dataset(benchmark_datasets['www.vgchartz.com'][0]))

def get_context_cols(df):
    """Return the column labels of ``df`` that refer to page context.

    Each label is iterated as a sequence of header strings; a label is kept
    when any of its strings starts with 'page title', 'table heading' or
    'uri'. The order of ``df.columns`` is preserved.
    """
    prefixes = ['page title', 'table heading','uri']

    def is_context(label):
        for prefix in prefixes:
            for header in label:
                if header.startswith(prefix):
                    return True
        return False

    return list(filter(is_context, df.columns))

import re
# Matches one parenthesised span and captures the text between the brackets.
re_bracket = re.compile(r"\(([^\)]*)\)")
def extract_bracket_disambiguation(df):
    """Move parenthesised text from string columns into new columns.

    Works on a copy of ``df``. For every column that contains a '(' in at
    least one cell, the text inside the first bracket pair of each cell is
    stored in a new column labelled ``("disambiguation of <first header>",)``
    and all bracketed spans are removed from the original column.

    Columns are accessed through ``.str``, so they are expected to hold
    strings; labels are expected to be tuples (``col[0]`` is used).
    """
    df = df.copy()
    # Snapshot the labels: new columns are inserted while looping and must
    # not be visited themselves.
    for col in list(df.columns):
        if df[col].str.contains(r'\(').any():
            df[(f"disambiguation of {col[0]}",)] = df[col].str.extract(re_bracket, expand=False)
            # A compiled pattern is only accepted with regex=True (the default
            # became regex=False in pandas 2.0).
            df[col] = df[col].str.replace(re_bracket, '', regex=True)
    return df

# FD discovery wrapper; presumably points at a TANE installation next to the
# datasets directory -- TODO confirm.
tane = takco.link.Tane(ROOT.parent.joinpath('tane'))
# Counts, over all tables, the determinants whose dependency covers every
# non-context column.
det_count = Counter()
for t in ts:
    print(t._id)
    df = t.df
    print(*map('|'.join, zip(*df.columns)))
    # Columns that must be covered: everything that is not page context.
    cols = set(df.columns) - set(get_context_cols(df))
    for det, dep in tane.rundf(df, level=6).items():
        missing = cols - (set(det)|set(dep))
        if not missing:
            print(set(det), '->', dep)
            # Sort to get a canonical key: the iteration order of a set is
            # arbitrary, so tuple(set(det)) could split one determinant over
            # several Counter entries.
            det_count[tuple(sorted(set(det)))] += 1
        elif len(missing) < 3:
            # Near misses (at most two uncovered columns) are only printed.
            print(set(det), '->', dep, 'missing:', missing)
    print()

# Determinants that cover all content columns in more than one table.
print({det:c for det,c in det_count.items() if c>1})

2.json
page title|table heading|uri 0|uri 1|uri 2|game|namerica|europe
{('game',)} -> {('namerica',), ('europe',)}

7.json
page title|table heading|uri 0|uri 1|uri 2|uri 3|uri 4|uri 5|uri 6|title|publisher|region|date|distribution
{('publisher',)} -> {('title',), ('uri 1',), ('uri 2',), ('distribution',), ('page title',)} missing: {('region',), ('date',)}
{('date',)} -> {('title',), ('uri 1',), ('publisher',), ('uri 2',), ('distribution',), ('page title',)} missing: {('region',)}
{('region',), ('page title',)} -> {('date',), ('publisher',)} missing: {('title',), ('distribution',)}
{('uri 1',), ('region',)} -> {('date',), ('publisher',)} missing: {('title',), ('distribution',)}
{('uri 2',), ('region',)} -> {('date',), ('publisher',)} missing: {('title',), ('distribution',)}
{('title',), ('region',)} -> {('date',), ('publisher',)} missing: {('distribution',)}
{('region',), ('publisher',)} -> {('date',)} missing: {('title',), ('distribution',)}
{('distribution',), ('region',)} -> {('date'

In [391]:
# make our own Normalised FD tables from the Union tables
import pandas as pd
def indexed(t, indexes=()):
    """Return ``t.df`` with its context columns (and extra ones) as the index.

    A column becomes part of the index when any header string in its label
    starts with 'page title', 'table heading', 'uri' or one of the prefixes
    given in ``indexes``.
    """
    prefixes = ['page title', 'table heading','uri', *indexes]

    def is_index_col(label):
        for prefix in prefixes:
            for header in label:
                if header.startswith(prefix):
                    return True
        return False

    selected = [label for label in t.df.columns if is_index_col(label)]
    return t.df.set_index(selected)

import re
# Matches one parenthesised span and captures the text between the brackets.
re_bracket = re.compile(r"\(([^\)]*)\)")
def extract_bracket_disambiguation(df):
    """Move parenthesised text from string columns into new columns.

    Works on a copy of ``df``. For every column that contains a '(' in at
    least one cell, the text inside the first bracket pair of each cell is
    stored in a new column labelled ``("disambiguation of <first header>",)``
    and all bracketed spans are removed from the original column.

    Columns are accessed through ``.str``, so they are expected to hold
    strings; labels are expected to be tuples (``col[0]`` is used).
    """
    df = df.copy()
    # Snapshot the labels: new columns are inserted while looping and must
    # not be visited themselves.
    for col in list(df.columns):
        if df[col].str.contains(r'\(').any():
            df[(f"disambiguation of {col[0]}",)] = df[col].str.extract(re_bracket, expand=False)
            # A compiled pattern is only accepted with regex=True (the default
            # became regex=False in pandas 2.0).
            df[col] = df[col].str.replace(re_bracket, '', regex=True)
    return df

def looks_numeric(col, threshold = .5):
    """Return whether the strings in ``col`` consist mostly of digits and dots.

    For each cell the share of characters that are digits or '.' is computed;
    the result is true when the mean share over the column exceeds
    ``threshold``.
    """
    # Raw string: '\d' and '\.' are invalid escapes in a normal string literal.
    numeric_share = col.str.count(r'[\d\.]') / col.str.len()
    return numeric_share.mean() > threshold

def make_numeric(col):
    """Convert a column of strings to numbers.

    Every character that is not a digit or '.' is removed first (this
    includes signs and thousands separators); values that still cannot be
    parsed become NaN.
    """
    # Raw string: '\d' and '\.' are invalid escapes in a normal string literal.
    digits_only = col.replace(r'[^\d\.]', '', regex=True)
    return pd.to_numeric(digits_only, errors='coerce')

def split_fk(df, fk):
    """Replace column ``(fk,)`` by a foreign key and return the lookup table.

    Returns ``(facts, lookup)``: ``facts`` is ``df`` without the ``(fk,)``
    column and with an added 'FK' column holding the category code of each
    row as a string; ``lookup`` has the columns ``('PK',)`` (the code as a
    string) and ``('rdf-schema#label',)`` (the distinct values of the
    column).
    """
    fk_label = (fk,)
    categorical = df[fk_label].astype('category').cat

    # One row per distinct value; the positional index becomes the key.
    lookup = pd.DataFrame(categorical.categories).reset_index()
    lookup['index'] = lookup['index'].astype('str')
    lookup.columns = [('PK',), ('rdf-schema#label',)]

    facts = df.drop(columns=fk_label)
    facts['FK'] = categorical.codes.astype('str')
    return facts, lookup

def decompose_fd_tables(df, keys):
    """Yield one ``takco.Table`` per non-key column of ``df``.

    Columns listed in ``keys`` and columns with at most one distinct value
    are skipped. Each yielded table holds the column followed by the key
    columns, restricted to rows in which every cell is truthy after nulls
    are replaced by False, with duplicate rows removed. The table id is
    ``fd_<first header>`` with spaces replaced by underscores.
    """
    for col in df.columns:
        if col in keys:
            continue
        if len(set(df[col])) <= 1:
            continue

        pair = df[[col] + list(keys)]

        # Keep only rows where both the attribute and the keys are filled in.
        complete = pair.fillna(False).applymap(bool).all(axis=1)
        pair = pair[complete].drop_duplicates(ignore_index=True)

        table_id = f"fd_{col[0].replace(' ','_')}"
        yield takco.Table(head=zip(*pair.columns), body=pair.values, _id=table_id)

def binary_stitch(ts, key):
    """Concatenate all tables that have a ``key`` header into one frame.

    Tables whose first header row does not contain ``key`` are ignored. The
    remaining tables are indexed on their context columns plus ``key``,
    concatenated, and flattened again; missing cells become ''. Columns that
    look numeric are parsed and written back as strings ('' where parsing
    failed).
    """
    frames = []
    for t in ts:
        if key in t.head[0]:
            frames.append(indexed(t, [key]))

    stitched = pd.concat(frames).reset_index().fillna('')
    for att in stitched:
        if not looks_numeric(stitched[att]):
            continue
        stitched[att] = make_numeric(stitched[att]).fillna('').astype('str')
    return stitched
        
def binary_fds(dfs, key):
    """Yield ``(table, is_fd)`` pairs for the given data frames.

    A frame that has an ``('FK',)`` column is decomposed with
    ``decompose_fd_tables`` and each resulting table is yielded with
    ``is_fd=True``. Any other frame is converted to a single ``takco.Table``
    and yielded with ``is_fd=False``.

    ``key`` is not used; it is kept so existing calls keep working.
    """
    # Loop-invariant: the key columns that mark a frame as decomposable.
    keys = [('FK',)]
    for df in dfs:
        if all(k in df.columns for k in keys):
            for fdt in decompose_fd_tables(df, keys):
                yield fdt, True
        else:
            # Build an id from all headers of the frame.
            _id = '_'.join('-'.join(c) for c in df.columns)
            fdt = takco.Table(head=zip(*df.columns), body=df.values, _id=_id)
            yield fdt, False

def write_snow(t, name, fd_path):
    """Write table ``t`` as JSON to the file ``name`` inside ``fd_path``.

    The table is converted with ``WebDataCommons.convert_back(t, snow=True)``
    and dumped without ASCII escaping. An existing file is overwritten.
    """
    # Local import: no visible notebook cell imports json.
    import json

    doc = takco.evaluate.dataset.WebDataCommons.convert_back(t, snow=True)
    fname = Path(fd_path).joinpath(name)
    # ensure_ascii=False writes raw non-ASCII characters, so fix the encoding
    # instead of relying on the platform default.
    with open(fname, 'w', encoding='utf-8') as fw:
        json.dump(doc, fw, ensure_ascii=False)


# Directory that receives the FD relation files written by this cell.
fd_path = ROOT.joinpath('flightaware.com/normalised_X_fd_relations')
# IPython shell escape: delete the directory (and results of earlier runs) before recreating it.
!rm -r $fd_path
Path(fd_path).mkdir(parents=True,exist_ok=True)

# benchmark_datasets is a dict keyed by dataset name, so it has to be indexed
# by name; element 0 of the value is the input dataset.
ts = list(takco.TableSet.dataset(benchmark_datasets['flightaware.com'][0]))
# Header of the column on which tables are stitched.
key = 'carrier'
stitched = extract_bracket_disambiguation(binary_stitch(ts, key))
print('Stitched:', stitched.shape)
print(' ', '|'.join(c for cs in stitched.columns for c in cs) )

# Common file-name prefix of all tables written below.
prefix = 'Airline_flightaware.com'
# Running number used in the file names of the FD tables.
n_fds = 0
# split_fk returns two frames; binary_fds flags every yielded table as FD table or not.
for t, is_fd in binary_fds(split_fk(stitched, key), key):
    if is_fd:
        name = f"{prefix}_fd_{n_fds}.json"
        n_fds += 1
    else:
        # Tables that are not FD tables are written under the bare prefix.
        name = f"{prefix}.json"
    print(name)
    write_snow(t, name, fd_path)
    
# Read the files written above back in and display a preview of every table.
fd_dataset = takco.evaluate.dataset.WebDataCommons(fnames=list(fd_path.glob("*.json")))
takco.TableSet.dataset(fd_dataset).preview(ntables=None)

Stitched: (19603, 24)
  page title|table heading|uri 0|uri 1|uri 2|uri 3|uri 4|carrier|average per flight|cargo weight (lbs)|flights performed|flights scheduled|mail transport (lbs)|maximum|median|minimum|passengers|percentage flown|percentage of seats filled|popularity|routing|total seats|disambiguation of page title|disambiguation of carrier
Airline_flightaware.com_fd_0.json
Airline_flightaware.com_fd_1.json
Airline_flightaware.com_fd_2.json
Airline_flightaware.com_fd_3.json
Airline_flightaware.com_fd_4.json
Airline_flightaware.com_fd_5.json
Airline_flightaware.com_fd_6.json
Airline_flightaware.com_fd_7.json
Airline_flightaware.com_fd_8.json
Airline_flightaware.com_fd_9.json
Airline_flightaware.com_fd_10.json
Airline_flightaware.com_fd_11.json
Airline_flightaware.com_fd_12.json
Airline_flightaware.com_fd_13.json
Airline_flightaware.com_fd_14.json
Airline_flightaware.com_fd_15.json
Airline_flightaware.com_fd_16.json
Airline_flightaware.com_fd_17.json
Airline_flightaware.com_fd_18.json

?,0,1
Unnamed: 0_level_1,total seats,FK
,100.0,38
,97910.0,90
,100.0,3
,9632.0,56
,12202.0,59

?,0,1
Unnamed: 0_level_1,uri 4,FK
,JIA,90
,JIA,59
,JIA,72
,JIA,56
,JIA,71

?,0,1
Unnamed: 0_level_1,flights scheduled,FK
,37.0,56
,14.0,71
,1.0,3
,579.0,59
,1232.0,90

?,0,1
Unnamed: 0_level_1,passengers,FK
,100.0,38
,84630.0,90
,77.0,3
,7264.0,56
,8565.0,59

?,0,1
Unnamed: 0_level_1,uri 3,FK
,KMSY,90
,KMSY,59
,KMSY,72
,KMSY,56
,KMSY,71

?,0,1
Unnamed: 0_level_1,mail transport (lbs),FK
,1109.0,90
,0.0,59
,0.0,72
,0.0,56
,0.0,71

?,0,1
Unnamed: 0_level_1,average per flight,FK
,50.0,38
,117.0,90
,39.0,3
,65.0,56
,60.0,59

?,0,1
Unnamed: 0_level_1,page title,FK
,Airline Statistics ✈ KCLT to KMSY ✈ FlightAware,90
,Airline Statistics ✈ KCLT to KMSY ✈ FlightAware,59
,Airline Statistics ✈ KCLT to KMSY ✈ FlightAware,72
,Airline Statistics ✈ KCLT to KMSY ✈ FlightAware,56
,Airline Statistics ✈ KCLT to KMSY ✈ FlightAware,71

?,0,1
Unnamed: 0_level_1,cargo weight (lbs),FK
,42544000.0,90
,11536960.0,59
,1639176.0,72
,832500.0,56
,229820.0,71

?,0,1
Unnamed: 0_level_1,routing,FK
,non-stop,91
,via klga,3
,via katl,27
,non-stop,88
,non-stop,64

?,0,1
Unnamed: 0_level_1,disambiguation of carrier,FK
,operated by air wisconsin,91
,operated by psa airlines,91
,operated by piedmont,91
,operated by psa airlines,88
,operated by piedmont,88

?,0,1
Unnamed: 0_level_1,maximum,FK
,792.0,91
,523.69,91
,288.38,91
,298.55,3
,1297.99,91

?,0,1
Unnamed: 0_level_1,uri 2,FK
,KCLT,90
,KCLT,59
,KCLT,72
,KCLT,56
,KCLT,71

?,0,1
Unnamed: 0_level_1,disambiguation of page title,FK
,PSA Airlines,90
,PSA Airlines,59
,PSA Airlines,72
,PSA Airlines,56
,PSA Airlines,71

?,0,1
Unnamed: 0_level_1,median,FK
,170.49,91
,179.15,91
,186.5,91
,180.08,3
,138.5,91

?,0,1
Unnamed: 0_level_1,percentage flown,FK
,100.0,56
,100.0,71
,100.0,3
,99.0,59
,99.0,90

?,0,1
Unnamed: 0_level_1,popularity,FK
,69.0,91
,22.0,91
,5.0,91
,3.0,3
,57.0,91

?,0,1
Unnamed: 0_level_1,PK,rdf-schema#label
,0,abx air
,1,aero-micronesia
,2,air transport international llc
,3,air wisconsin
,4,airtran

?,0,1
Unnamed: 0_level_1,minimum,FK
,69.45,91
,75.75,91
,110.0,91
,92.19,3
,53.51,91

?,0,1
Unnamed: 0_level_1,flights performed,FK
,37.0,56
,14.0,71
,1.0,3
,576.0,59
,1222.0,90

?,0,1
Unnamed: 0_level_1,percentage of seats filled,FK
,100.0,38
,87.0,90
,77.0,3
,76.0,56
,70.0,59


In [392]:
def read_eval_scores(fname):
    """Read the overall scores from an evaluation log.

    Looks for the line 'Overall Performance:' and parses the second
    space-separated field of each of the following three lines as a float.

    Returns a dict with the keys 'p', 'r' and 'f' in that line order. When
    the header line is missing or a score line cannot be parsed, all three
    values are 0.
    """
    with open(fname) as fo:
        lines = fo.readlines()

    try:
        ind = lines.index('Overall Performance:\n')
        scores = [float(line.split(' ')[1]) for line in lines[ind+1:ind+4]]
    except (ValueError, IndexError):
        # ValueError: header not found or field is not a number;
        # IndexError: score line has no second field.
        scores = (0,0,0)
    return dict(zip('prf', scores))
    
import subprocess
# Run the evaluation script from ROOT.parent; check=True raises if it exits non-zero.
cmd = 'bash', 'evaluate_containment_specific', 'datasets/flightaware.com', 'X'
eval_out = subprocess.run(cmd, cwd=ROOT.parent, check=True, capture_output=True)
print(eval_out.stderr.decode())
# Presumably the script writes this log file -- TODO confirm.
print(read_eval_scores(ROOT.joinpath('flightaware.com/evaluate_containment_X.log')))

/export/scratch1/home/kruit/snow/target/snow-1.0.jar
de.uni_mannheim.informatik.dws.tnt.match.cli.EvaluateEntityStitchedUnionTables version 2021-02-15 22:36:40
		 __      __.___        __                     
		 /  \    /  \   | _____/  |_  ____     _______ 
		 \   \/\/   /   |/    \   __\/ __ \    \_  __ \
		  \        /|   |   |  \  | \  ___/     |  | \/
		   \__/\  / |___|___|  /__|  \___  > /\ |__|   
		        \/           \/          \/  \/        

{'p': 0.875573, 'r': 0.349084, 'f': 0.499158}


In [373]:
# Preview every table found in flightaware.com/normalised_S
# (presumably the reference relations to compare against -- TODO confirm).
gold_fd_path = ROOT.joinpath('flightaware.com/normalised_S')
gold_fd_dataset = takco.evaluate.dataset.WebDataCommons(fnames=list(gold_fd_path.glob("*.json")))
takco.TableSet.dataset(gold_fd_dataset).preview(ntables=None)

?,0,1,2,3
Unnamed: 0_level_1,uri 2,uri 3,average per flight,FK
,kcmh,kjfk,41.0,Airline_flightaware.com~Row2
,kcmh,kjfk,37.0,Airline_flightaware.com~Row14
,kdfw,kdtw,111.0,Airline_flightaware.com~Row20
,kavp,kord,42.0,Airline_flightaware.com~Row1
,kdfw,kdtw,86.0,Airline_flightaware.com~Row10

?,0,1,2,3
Unnamed: 0_level_1,uri 2,uri 3,flights scheduled,FK
,krdu,ktpa,0.0,Airline_flightaware.com~Row10
,kelp,ksat,2.0,Airline_flightaware.com~Row1
,kelp,ksat,1.0,Airline_flightaware.com~Row22
,kelp,ksat,1159.0,Airline_flightaware.com~Row28
,krdu,ktpa,0.0,Airline_flightaware.com~Row8

?,0,1,2,3,4
Unnamed: 0_level_1,page title,routing,minimum,Disambiguation of carrier,FK
,airline statistics ✈ kcmh to kjfk ✈ flightaware,via klga,92.02,,Airline_flightaware.com~Row14
,airline statistics ✈ kdfw to kdtw ✈ flightaware,non-stop,50.04,,Airline_flightaware.com~Row20
,airline statistics ✈ kdfw to kdtw ✈ flightaware,non-stop,55.27,,Airline_flightaware.com~Row10
,airline statistics ✈ kclt to kilm ✈ flightaware,non-stop,73.46,operated by republic,Airline_flightaware.com~Row23
,airline statistics ✈ kcmh to kjfk ✈ flightaware,non-stop,69.99,operated by chautauqua,Airline_flightaware.com~Row4

?,0,1,2,3
Unnamed: 0_level_1,uri 2,uri 3,passengers,FK
,kcmh,kjfk,162.0,Airline_flightaware.com~Row2
,kcmh,kjfk,9536.0,Airline_flightaware.com~Row14
,kdfw,kdtw,199779.0,Airline_flightaware.com~Row20
,kavp,kord,22449.0,Airline_flightaware.com~Row1
,kdfw,kdtw,35523.0,Airline_flightaware.com~Row10

?,0,1,2,3
Unnamed: 0_level_1,uri 2,uri 3,flights performed,FK
,krdu,ktpa,1.0,Airline_flightaware.com~Row10
,kelp,ksat,2.0,Airline_flightaware.com~Row1
,kelp,ksat,1.0,Airline_flightaware.com~Row22
,kelp,ksat,1162.0,Airline_flightaware.com~Row28
,krdu,ktpa,2.0,Airline_flightaware.com~Row8

?,0,1,2,3
Unnamed: 0_level_1,uri 2,uri 3,percentage of seats filled,FK
,kcmh,kjfk,78.0,Airline_flightaware.com~Row2
,kcmh,kjfk,76.0,Airline_flightaware.com~Row14
,kdfw,kdtw,76.0,Airline_flightaware.com~Row20
,kavp,kord,78.0,Airline_flightaware.com~Row1
,kdfw,kdtw,80.0,Airline_flightaware.com~Row10

?,0,1,2
Unnamed: 0_level_1,page title,cargo weight (lbs),FK
,airline statistics ✈ kelp to ksat ✈ flightaware,22500.0,Airline_flightaware.com~Row22
,airline statistics ✈ kelp to ksat ✈ flightaware,42842800.0,Airline_flightaware.com~Row28
,airline statistics ✈ krdu to ktpa ✈ flightaware,72200.0,Airline_flightaware.com~Row8
,airline statistics ✈ kelp to ksat ✈ flightaware,55500.0,Airline_flightaware.com~Row19
,airline statistics ✈ kelp to ksat ✈ flightaware,35000.0,Airline_flightaware.com~Row38

?,0,1,2,3
Unnamed: 0_level_1,uri 2,uri 3,total seats,FK
,kcmh,kjfk,200.0,Airline_flightaware.com~Row2
,kcmh,kjfk,12970.0,Airline_flightaware.com~Row14
,kdfw,kdtw,253875.0,Airline_flightaware.com~Row20
,kavp,kord,26598.0,Airline_flightaware.com~Row1
,kdfw,kdtw,43229.0,Airline_flightaware.com~Row10

?,0,1,2,3,4
Unnamed: 0_level_1,page title,routing,median,Disambiguation of carrier,FK
,airline statistics ✈ kcmh to kjfk ✈ flightaware,via klga,116.39,,Airline_flightaware.com~Row14
,airline statistics ✈ kdfw to kdtw ✈ flightaware,non-stop,214.97,,Airline_flightaware.com~Row20
,airline statistics ✈ kdfw to kdtw ✈ flightaware,non-stop,176.97,,Airline_flightaware.com~Row10
,airline statistics ✈ kclt to kilm ✈ flightaware,non-stop,144.02,operated by republic,Airline_flightaware.com~Row23
,airline statistics ✈ kcmh to kjfk ✈ flightaware,non-stop,122.49,operated by chautauqua,Airline_flightaware.com~Row4

?,0,1,2
Unnamed: 0_level_1,page title,mail transport (lbs),FK
,airline statistics ✈ kelp to ksat ✈ flightaware,0.0,Airline_flightaware.com~Row22
,airline statistics ✈ kelp to ksat ✈ flightaware,0.0,Airline_flightaware.com~Row28
,airline statistics ✈ krdu to ktpa ✈ flightaware,0.0,Airline_flightaware.com~Row8
,airline statistics ✈ kelp to ksat ✈ flightaware,0.0,Airline_flightaware.com~Row19
,airline statistics ✈ kelp to ksat ✈ flightaware,0.0,Airline_flightaware.com~Row38

?,0,1,2,3,4
Unnamed: 0_level_1,page title,routing,popularity,Disambiguation of carrier,FK
,airline statistics ✈ kcmh to kjfk ✈ flightaware,via klga,2.0,,Airline_flightaware.com~Row14
,airline statistics ✈ kdfw to kdtw ✈ flightaware,non-stop,37.0,,Airline_flightaware.com~Row20
,airline statistics ✈ kdfw to kdtw ✈ flightaware,non-stop,6.0,,Airline_flightaware.com~Row10
,airline statistics ✈ kclt to kilm ✈ flightaware,non-stop,3.0,operated by republic,Airline_flightaware.com~Row23
,airline statistics ✈ kcmh to kjfk ✈ flightaware,non-stop,13.0,operated by chautauqua,Airline_flightaware.com~Row4

?,0,1
Unnamed: 0_level_1,PK,rdf-schema#label
,Airline_flightaware.com~Row0,chautauqua
,Airline_flightaware.com~Row1,skywest
,Airline_flightaware.com~Row2,endeavor air
,Airline_flightaware.com~Row3,american eagle
,Airline_flightaware.com~Row4,delta

?,0,1,2,3,4
Unnamed: 0_level_1,page title,routing,Disambiguation of carrier,maximum,FK
,airline statistics ✈ kcmh to kjfk ✈ flightaware,via klga,,206.4,Airline_flightaware.com~Row14
,airline statistics ✈ kdfw to kdtw ✈ flightaware,non-stop,,2166.96,Airline_flightaware.com~Row20
,airline statistics ✈ kdfw to kdtw ✈ flightaware,non-stop,,786.94,Airline_flightaware.com~Row10
,airline statistics ✈ kclt to kilm ✈ flightaware,non-stop,operated by republic,296.5,Airline_flightaware.com~Row23
,airline statistics ✈ kcmh to kjfk ✈ flightaware,non-stop,operated by chautauqua,442.48,Airline_flightaware.com~Row4

?,0,1,2,3
Unnamed: 0_level_1,uri 2,uri 3,percentage flown,FK
,krdu,ktpa,0.0,Airline_flightaware.com~Row10
,kelp,ksat,100.0,Airline_flightaware.com~Row1
,kelp,ksat,100.0,Airline_flightaware.com~Row22
,kelp,ksat,100.0,Airline_flightaware.com~Row28
,krdu,ktpa,0.0,Airline_flightaware.com~Row8


## Annotation analysis

In [58]:
# what is the difference between clusters in entity_structure and union_goldstandard?

import pandas as pd
from pathlib import Path

root = Path('~/snow/datasets/d3football.com/').expanduser().absolute()

ev = root.joinpath('evaluation/')
# Column clusters from entity_structure.tsv: third field, comma-separated.
entity_structure = pd.read_csv(ev.joinpath('entity_structure.tsv'), sep='\t', header=None)
# NOTE(review): tuple(set(...)) has no canonical element order, so two equal
# clusters may end up as different tuples -- keep in mind when comparing.
es = entity_structure[2].map(lambda x:tuple(set(x.split(','))))

# Column clusters from union_goldstandard.tsv: second field, comma-separated.
union_goldstandard = pd.read_csv(ev.joinpath('union_goldstandard.tsv'), sep='\t', header=None)
ug = union_goldstandard[1].map(lambda x:tuple(set(x.split(','))))
# Maps each gold cluster to the value in the first field of its row.
ug_name = dict(zip(ug, union_goldstandard[0]))

def make_gs(root):
    """Load ``evaluation/union_goldstandard.tsv`` below ``root``.

    The file is read as tab-separated values without a header row. Column 1
    is converted from a comma-separated string into a set of its items; all
    other columns are returned as read.
    """
    def split_items(cell):
        return set(cell.split(','))

    gold_file = root.joinpath('evaluation/union_goldstandard.tsv')
    clusters = pd.read_csv(gold_file, sep='\t', header=None)
    clusters[1] = clusters[1].map(split_items)
    return clusters

# Gold standard clusters of the dataset selected by `root`.
gs = make_gs(root)
# for c in (set(ug) - set(es)):
#     print(c, ug_name[c])
    
# set(es) & set(ug)
# Bare expression so the notebook displays the frame.
gs

Unnamed: 0,0,1
0,-20: [-20],{7.json~Col12}
1,all: [all],{0.json~Col8}
2,att: [att],{4.json~Col7}
3,avg: [avg],"{7.json~Col9, 1.json~Col9, 9.json~Col9, 4.json..."
4,c-a: [c-a],{2.json~Col7}
5,collin janssen jr (baseball): [collin janssen ...,{10.json~Col7}
6,conf: [conf],{0.json~Col7}
7,email: [email],{11.json~Col7}
8,fg: [fg],{8.json~Col7}
9,int: [int],{2.json~Col10}


In [57]:
# if we cluster on attribute name only, is that similar to some annotations?

from takco.evaluate.dataset import WebDataCommons
import takco
from pathlib import Path

# Dataset to analyse; switch by swapping the commented line.
# root = Path('~/snow/datasets/data.bls.gov/').expanduser().absolute()
root = Path('~/snow/datasets/d3football.com/').expanduser().absolute()

# Load all JSON tables from the dataset's union_dedup_json directory.
fnames = root.joinpath('union_dedup_json/')
inp = WebDataCommons(fnames=list(fnames.glob("*.json")))
tableset = takco.TableSet.dataset(inp)

# Get columns per header value
head_cols = {}
for t in tableset.tables:
    fname = Path(t['fname']).name
    hs = [c.get('text') for cs in t['tableHeaders'] for c in cs]
    for hi, h in enumerate(hs):
        # Column ids have the form "<file name>~Col<position>".
        cid = f"{fname}~Col{hi}"
        head_cols.setdefault(h, set()).add(cid)

trivial = set(map(tuple, head_cols.values()))

# Compare clusters as frozensets: the element order of a tuple built from a
# set is arbitrary, so equal clusters could otherwise be counted as different.
trivial_clusters = set(map(frozenset, head_cols.values()))
es_clusters = set(map(frozenset, es))
ug_clusters = set(map(frozenset, ug))
print(len(es_clusters & ug_clusters), 'gold and entity_struct', )
print(len(trivial_clusters & ug_clusters), 'trivial and gold')
print(len(es_clusters & trivial_clusters), 'trivial and entity_struct')

20 gold and entity_struct
20 trivial and gold
16 trivial and entity_struct
