# Comparison to SNoW

> Given a set of web tables and a target knowledge base, the SNoW method extends each web table with additional context columns, stitches matching web tables into larger tables, and applies functional dependency discovery to identify the relations that are represented in the web tables. Further, it normalises the stitched tables, guided by the schema of the knowledge base, to create an integrated schema.

We assume that the tables are already context-enriched, and schema-unioned per Pay-Level Domain. Our task is now to match and stitch these supertables into universal tables, and decompose them into normalised relations.

## Matching without FDs

In [248]:
%%time
from snow_pipeline import *
log.getLogger().setLevel(log.DEBUG)

kb = KB(snow_root)

Loading KB classes: 100%|██████████| 20/20 [00:02<00:00,  7.33it/s]
DEBUG:root:Made KB feature matrix of shape (20, 1857134)


CPU times: user 16.9 s, sys: 559 ms, total: 17.5 s
Wall time: 17.6 s


In [322]:
%%time
from snow_pipeline import *
log.getLogger().setLevel(log.DEBUG)

dataset_name = 'flightaware.com'
snow_root = Path('~/snow/').expanduser().absolute()
benchmark_datasets = dict(get_snow_datasets(snow_root))
tabid_table = {t._id: t for t in takco.TableSet.dataset(benchmark_datasets[dataset_name][0])}

tane = takco.link.Tane(snow_root.joinpath('tane'))

tabid_df = {tid:extract_bracket_disambiguation(t.df) for tid, t in tabid_table.items()}
tabid_df = {tid:extract_named_columns(t.df) for tid, t in tabid_table.items()}

CPU times: user 597 ms, sys: 7.01 ms, total: 604 ms
Wall time: 607 ms


In [311]:
%%time
# colid_to_partcolid, colid_to_tabid_and_colnr = match_columns(tabid_df, agg_threshold_col = 0.05)
colid_to_partcolid, colid_to_tabid_and_colnr = load_gold_colmatches(snow_root, dataset_name)

partcolid_to_colids = aggr_by_val(colid_to_partcolid.items())
display('partcolid_colids', partcolid_to_colids)

partitions = stitch_colclustered_tables(
    tabid_df,
    colid_to_partcolid, 
    colid_to_tabid_and_colnr
)
partid_df = {t._id:t.df for t in partitions}
decomposed = list(iter_fkclass_binary_decomposed(partid_df, dataset_name, kb))
takco.TableSet(decomposed).preview(ntables=None)

'partcolid_colids'

{0: {'4.json~Col0', '5.json~Col0'},
 1: {'0.json~Col0', '1.json~Col0', '2.json~Col0', '3.json~Col0'},
 2: {'4.json~Col1', '5.json~Col1'},
 3: {'0.json~Col1', '1.json~Col1', '2.json~Col1', '3.json~Col1'},
 4: {'4.json~Col2', '5.json~Col2'},
 5: {'0.json~Col2', '1.json~Col2', '2.json~Col2', '3.json~Col2'},
 6: {'4.json~Col3', '5.json~Col3'},
 7: {'0.json~Col3', '1.json~Col3', '2.json~Col3', '3.json~Col3'},
 8: {'0.json~Col4', '1.json~Col4', '2.json~Col4', '3.json~Col4'},
 9: {'4.json~Col4', '5.json~Col4'},
 10: {'0.json~Col5', '1.json~Col5', '2.json~Col5', '3.json~Col5'},
 11: {'4.json~Col5', '5.json~Col5'},
 12: {'0.json~Col6', '1.json~Col6', '2.json~Col6', '3.json~Col6'},
 13: {'0.json~Col11'},
 14: {'2.json~Col8'},
 15: {'0.json~Col7', '1.json~Col8', '2.json~Col7', '3.json~Col7'},
 16: {'5.json~Col6'},
 17: {'3.json~Col8'},
 18: {'3.json~Col9'},
 19: {'5.json~Col7'},
 20: {'2.json~Col9'},
 21: {'1.json~Col12'},
 22: {'1.json~Col11'},
 23: {'1.json~Col10'},
 24: {'0.json~Col10'},
 25: 

DEBUG:root:Stitching 4 aligned tables
DEBUG:root:Stitching 2 aligned tables
DEBUG:root:[flightaware.com] [part-0] Class predictions {('uri 0',): 'TelevisionShow', ('uri 1',): 'TelevisionShow', ('uri 2',): 'VideoGame', ('uri 3',): 'VideoGame', ('uri 4',): 'Company', ('carrier',): 'Airline', ('routing',): 'Film'}
DEBUG:root:[flightaware.com] [part-0] Predicted class Airline for col 7 (('carrier',))
DEBUG:root:[flightaware.com] [part-1] Class predictions {('uri 1',): 'Film', ('uri 2',): 'Album', ('remark',): 'Film', ('facility',): 'Band'}
DEBUG:root:[flightaware.com] [part-1] Predicted class Film for col 6 (('remark',))
INFO:root:[flightaware.com] Created binary tables for classes: {'Airline': 14, 'Film': 3}


CPU times: user 2.52 s, sys: 18.8 ms, total: 2.54 s
Wall time: 2.56 s


?,0,1
Unnamed: 0_level_1,percentage of seats filled,FK
,100%,Airline_flightaware.com~Row67
,87%,Airline_flightaware.com~Row17
,77%,Airline_flightaware.com~Row3
,76%,Airline_flightaware.com~Row87
,70%,Airline_flightaware.com~Row62

?,0,1
Unnamed: 0_level_1,passengers,FK
,100,Airline_flightaware.com~Row67
,84630,Airline_flightaware.com~Row17
,77,Airline_flightaware.com~Row3
,7264,Airline_flightaware.com~Row87
,8565,Airline_flightaware.com~Row62

?,0,1
Unnamed: 0_level_1,total seats,FK
,100,Airline_flightaware.com~Row67
,97910,Airline_flightaware.com~Row17
,100,Airline_flightaware.com~Row3
,9632,Airline_flightaware.com~Row87
,12202,Airline_flightaware.com~Row62

?,0,1
Unnamed: 0_level_1,average per flight,FK
,50,Airline_flightaware.com~Row67
,117,Airline_flightaware.com~Row17
,39,Airline_flightaware.com~Row3
,65,Airline_flightaware.com~Row87
,60,Airline_flightaware.com~Row62

?,0,1
Unnamed: 0_level_1,maximum,FK
,$792.00,Airline_flightaware.com~Row0
,$523.69,Airline_flightaware.com~Row1
,$288.38,Airline_flightaware.com~Row2
,$298.55,Airline_flightaware.com~Row3
,"$1,297.99",Airline_flightaware.com~Row1

?,0,1
Unnamed: 0_level_1,median,FK
,$170.49,Airline_flightaware.com~Row0
,$179.15,Airline_flightaware.com~Row1
,$186.50,Airline_flightaware.com~Row2
,$180.08,Airline_flightaware.com~Row3
,$138.50,Airline_flightaware.com~Row1

?,0,1
Unnamed: 0_level_1,minimum,FK
,$69.45,Airline_flightaware.com~Row0
,$75.75,Airline_flightaware.com~Row1
,$110.00,Airline_flightaware.com~Row2
,$92.19,Airline_flightaware.com~Row3
,$53.51,Airline_flightaware.com~Row1

?,0,1
Unnamed: 0_level_1,popularity,FK
,69%,Airline_flightaware.com~Row0
,22%,Airline_flightaware.com~Row1
,5%,Airline_flightaware.com~Row2
,3%,Airline_flightaware.com~Row3
,57%,Airline_flightaware.com~Row1

?,0,1
Unnamed: 0_level_1,routing,FK
,non-stop,Airline_flightaware.com~Row0
,non-stop,Airline_flightaware.com~Row1
,non-stop,Airline_flightaware.com~Row2
,via klga,Airline_flightaware.com~Row3
,via katl,Airline_flightaware.com~Row4

?,0,1
Unnamed: 0_level_1,mail transport (lbs),FK
,1109,Airline_flightaware.com~Row17
,0,Airline_flightaware.com~Row62
,0,Airline_flightaware.com~Row102
,0,Airline_flightaware.com~Row87
,0,Airline_flightaware.com~Row81

?,0,1
Unnamed: 0_level_1,cargo weight (lbs),FK
,42544000,Airline_flightaware.com~Row17
,11536960,Airline_flightaware.com~Row62
,1639176,Airline_flightaware.com~Row102
,832500,Airline_flightaware.com~Row87
,229820,Airline_flightaware.com~Row81

?,0,1
Unnamed: 0_level_1,flights performed,FK
,37,Airline_flightaware.com~Row87
,14,Airline_flightaware.com~Row81
,1,Airline_flightaware.com~Row3
,576,Airline_flightaware.com~Row62
,1222,Airline_flightaware.com~Row17

?,0,1
Unnamed: 0_level_1,percentage flown,FK
,100%,Airline_flightaware.com~Row87
,100%,Airline_flightaware.com~Row81
,100%,Airline_flightaware.com~Row3
,99%,Airline_flightaware.com~Row62
,99%,Airline_flightaware.com~Row17

?,0,1
Unnamed: 0_level_1,flights scheduled,FK
,37,Airline_flightaware.com~Row87
,14,Airline_flightaware.com~Row81
,1,Airline_flightaware.com~Row3
,579,Airline_flightaware.com~Row62
,1232,Airline_flightaware.com~Row17

?,0,1
Unnamed: 0_level_1,type,FK
,a110-1,Film_flightaware.com~Row1
,a81-apt,Film_flightaware.com~Row2
,e40-13,Film_flightaware.com~Row3
,e40-31,Film_flightaware.com~Row4
,a14,Film_flightaware.com~Row5

?,0,1
Unnamed: 0_level_1,facility,FK
,ground,Film_flightaware.com~Row0
,lc,Film_flightaware.com~Row0

?,0,1
Unnamed: 0_level_1,frequency,FK
,121.7,Film_flightaware.com~Row0
,128.4,Film_flightaware.com~Row0

?,0,1
Unnamed: 0_level_1,PK,rdf-schema#label
,Airline_flightaware.com~Row0,us airways (operated by air wisconsin)
,Airline_flightaware.com~Row1,us airways (operated by psa airlines)
,Airline_flightaware.com~Row2,us airways (operated by piedmont)
,Airline_flightaware.com~Row3,air wisconsin
,Airline_flightaware.com~Row4,delta

?,0,1
Unnamed: 0_level_1,PK,rdf-schema#label
,Film_flightaware.com~Row0,
,Film_flightaware.com~Row1,deer on and in vicinity of airport.
,Film_flightaware.com~Row2,"dusk-dawn. activate hirl runway 13/31, mirl runway (...)"
,Film_flightaware.com~Row3,southeast.
,Film_flightaware.com~Row4,northwest.


In [281]:
df = partid_df['part-0']
for ci, p in kb.predict_classes(df, 0).items():
    print(ci, f"{str(df.columns[ci]):30s}", p)

0 ('country',)                   {'class': 'Country', 'score': 0.0004304543061848038}
2 ('page title',)                {'class': 'Album', 'score': 3.1402682732709e-07}
3 ('date of information',)       {'class': 'Album', 'score': 0.0001533198919196577}
14 ('irrigated land(sq km)',)     {'class': 'Film', 'score': 1.009142759278311e-06}
70 ('fiscal year',)               {'class': 'Artist', 'score': 3.393720467862174e-08}
104 ('country',)                   {'class': 'Country', 'score': 6.604557695985393e-05}
215 ('country',)                   {'class': 'Country', 'score': 2.066512438916895e-06}
223 ('dan',)                       {'class': 'Album', 'score': 3.930035500312785e-07}
225 ('levi',)                      {'class': 'Album', 'score': 4.2873379941568706e-09}
226 ('zebulun',)                   {'class': 'Album', 'score': 1.208242213626726e-06}
227 ('reuben',)                    {'class': 'Album', 'score': 6.04121106813363e-07}
228 ('manessah',)                  {'class': 'Album', 'sco

In [282]:
df.head()

Unnamed: 0,country,uri 4,page title,date of information,current account balance,(%),rank,(bbl/day),rank.1,carbon dioxide emissions from consumption of energy(mt),...,ephraim,length,tribe,cruise altitudes,range (total),issachar,simeon,gad,gross weight,speed
0,,2137.html,The World Factbook,,,,,,,,...,,,,,,,,,,
1,,2137.html,The World Factbook,,,,,,,,...,,,,,,,,,,
2,,2137.html,The World Factbook,,,,,,,,...,,,,,,,,,,
3,,2137.html,The World Factbook,,,,,,,,...,,,,,,,,,,
4,,2137.html,The World Factbook,,,,,,,,...,,,,,,,,,,


### FD-based decomposition

In [None]:
## Decomposing stitched tables
partid_keys = {}
partid_colnr_fk = {}
for partid, tabids in partid_tabids.items():
    # Predict classes
    colnr_fk = predict_classes(df.set_axis(range(len(df.columns)), axis=1))
    partid_colnr_fk[partid] = colnr_fk
    
    # Get partition keyset
    colnames = list(zip(*head))
    context_cols = set(colnames.index(c) for c in get_context_headers(colnames))
    keys = set()
    allow_missing = 1
    main_cols = set(df.columns) - set(context_cols) - set(get_singleton_cols(df))
    try:
        for det, dep in tane.rundf(df, stoplevel=6, g3_threshold=.05).items():
            missing = main_cols - (set(det)|set(dep))
            if len(missing) <= allow_missing:
                if len(set(det) - set(context_cols)) - len(missing) > len(keys):
                    if any(c in colnr_fk for c in det):
                        keys = det
    except Exception as e:
        print('Tane error:', e)
    partid_keys[partid] = keys
    
    sch = lambda cs: '[%s]'%(' '.join(map('|'.join, zip(*(colnames[c] for c in cs)))))
    print(partid, tabids)
    print(' ', sch(set(keys)), '->', sch(set(df.columns)-set(keys)))
    print(' ', colnr_fk)
print()


# Decompose FDs
def decompose_fd_tables(df, keys, header=None):
    for c in df.columns:
        if (c not in keys) and (len(set(df[c])) > 1):
            fd_df = df[[c] + list(keys)]
            
            # get filled unique rows
            filled_mask = fd_df.fillna(False).applymap(bool).any(axis=1)
            fd_df = fd_df[filled_mask].drop_duplicates(ignore_index=True)
            
            if header is not None:
                head = list(zip(*header[[c] + list(keys)]))
                _id = f"fd_{header[c][0].replace(' ','_')}"
            else:
                head = list(zip(*fd_df.columns))
                _id = f"fd_{fd_df.columns[c][0].replace(' ','_')}"
                
            yield takco.Table(head=head, body=fd_df.values, _id=_id)

def write_snow(t, name, fd_path):
    doc = takco.evaluate.dataset.WebDataCommons.convert_back(t, snow=True)
    fname = Path(fd_path).joinpath(name)
    with open(fname, 'w') as fw:
        json.dump(doc, fw, ensure_ascii=False)
            
fd_path = ROOT.joinpath(f'{dataset_name}/normalised_X_fd_relations')
!rm -r $fd_path
Path(fd_path).mkdir(parents=True,exist_ok=True)

class_value_fk = {}
from collections import Counter
class_nfds = Counter()
for partid, df in partid_df.items():
    keys = partid_keys[partid]
    columns = list(zip(*partid_head[partid]))
    colnr_fk = partid_colnr_fk[partid]
    if any(k in colnr_fk for k in keys):
        # split off top-scoring FK in keys, add its values to FK table
        fkcolnr = max(keys, key=lambda k: colnr_fk.get(k, {}).get('score', 0))
        fkclass = colnr_fk.get(fkcolnr, {}).get('class')
        value_fk = class_value_fk.setdefault(fkclass, {})
        prefix = f"{fkclass}_{dataset_name}"
        fks = [f"{prefix}~Row{value_fk.setdefault(v, len(value_fk))}" for v in df[fkcolnr]]
        df = df.fillna('')
        df[fkcolnr] = fks
        columns[fkcolnr] = ('FK',)
        
        for c in df:
            if looks_numeric(df[c]):
                df[c] = make_numeric(df[c]).fillna('').astype('str')
    
        # for each non-key column, make a table object and write it out
        for t in decompose_fd_tables(df, keys, pd.Series(columns)):
            class_nfds[fkclass] += 1
            name = f"{prefix}_fd_{class_nfds[fkclass]}.json"
            print(name)
            write_snow(t, name, fd_path)
        
    else:
        print(f"Partition {partid} has no fk in keys {keys}!")

for fkclass, value_fk in class_value_fk.items():
    value_fk = class_value_fk['VideoGame']
    body = [(f"{fkclass}_{dataset_name}~Row{i}", val) for val, i in value_fk.items()]
    head = [('PK', 'rdf-schema#label')]
    fktable = takco.Table(head=head, body=body)
    name = f"{fkclass}_{dataset_name}.json"
    print(name)
    write_snow(t, name, fd_path)
        
fd_dataset = takco.evaluate.dataset.WebDataCommons(fnames=list(fd_path.glob("*.json")))
takco.TableSet.dataset(fd_dataset).preview(ntables=None)

## Finding FDs

In [38]:
# Find FDs in gold tables
from snow_pipeline import *

dataset_name = 'www.cia.gov'
# dataset_name = 'itunes.apple.com'
# dataset_name = 'www.amoeba.com'
# dataset_name = 'seatgeek.com'

snow_root = Path('~/snow/').expanduser().absolute()

import pandas as pd
gold_fdeps_path = snow_root.joinpath(f'datasets/{dataset_name}/evaluation/functional_dependencies.tsv')
for row in pd.read_csv(gold_fdeps_path, sep='\t', header=None).values:
    print(row[1], '->', len(row[2].split(',')))
print()

# Load gold tables
tane = takco.link.Tane(snow_root.joinpath('tane'))
fnames = list(snow_root.joinpath(f'datasets/{dataset_name}/evaluation/normalised').glob('*'))
dataset = takco.evaluate.dataset.WebDataCommons(fnames=fnames)
tabid_df = {t._id:t.df for t in takco.TableSet.dataset(dataset)}

show = lambda cs: '[%s]'%(' '.join(map('|'.join, zip(*cs))))



for tabid, df in tabid_df.items():
    if any('PK' in h for h in df.columns):
        continue
    print(tabid, df.shape)
    dfi = pd.DataFrame(df.values)
    
    print(get_keylike_columns(dfi) )
    print([d for ds in df.columns for d in ds if 'date' in d])
    
    fkcolnr = list(df.columns).index( ('FK',) )
    fds = get_pervalue_pdfs(dfi, fkcolnr, stoplevel=2, minp=.95)

    for det, dep in sorted(fds.items()):
        print(' ', show(df.columns[list(det)]), '->', dep)
    print()

FK,date of information -> 109
FK -> 54

Country_www.cia.gov_rel_421.json (297, 55)
[22, 26, 46, 54]
[]
  [FK] -> {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53}

Country_www.cia.gov_rel_423.json (8000, 111)
[101, 104, 110]
['date of information']
  [FK] -> {0, 2, 4, 5, 7, 9, 11, 12, 13, 14, 15, 16, 17, 19, 20, 21, 22, 23, 25, 26, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 45, 46, 48, 49, 51, 52, 53, 54, 56, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84, 86, 88, 89, 90, 91, 93, 94, 96, 98, 100, 101, 102, 103, 105, 106, 107, 108, 109}
  [FK|manpower reaching militarily significant age annually] -> {1, 97, 6, 10, 110, 47, 83, 28}
  [FK|date of information] -> {1, 3, 6, 10, 18, 83, 50, 87, 55, 57, 92}



In [15]:
# Find FDs in tables, find frequent determiners
from snow_pipeline import *

dataset_name = 'itunes.apple.com'
snow_root = Path('~/snow/').expanduser().absolute()
benchmark_datasets = dict(get_snow_datasets(snow_root))
tabid_table = {t._id: t for t in takco.TableSet.dataset(benchmark_datasets[dataset_name][0])}

import pandas as pd
gold_fdeps_path = snow_root.joinpath(f'datasets/{dataset_name}/evaluation/functional_dependencies.tsv')
for row in pd.read_csv(gold_fdeps_path, sep='\t', header=None).values:
    print(row[1], '->', row[2])
print()

tane = takco.link.Tane(snow_root.joinpath('tane'))
from collections import Counter
det_count = Counter()
for tabid, t in tabid_table.items():
    print(tabid)
    
    df = t.df
    df = df.iloc[:, [ci for ci, c in enumerate(df.columns) if 'NULL' not in c]]
    df = extract_bracket_disambiguation(df)
    show = lambda cs: '[%s]'%(' '.join(map('|'.join, zip(*cs))))
    print(show(df.columns))
    
    allow_missing = 1
    main_cols = set(df.columns) - set(get_context_headers(df.columns)) - set(get_singleton_cols(df))
    try:
        for det, dep in tane.rundf(df, stoplevel=6, g3_threshold=.05).items():
            missing = main_cols - (set(det)|set(dep))
            if not missing:
                print(' ', show(set(det)), '->', show(set(dep)))
                det_count[tuple(set(det))] += 1
            elif len(missing) <= allow_missing:
                det_count[tuple(set(det))] += 1
                print(' ', show(det), '->', show(dep), 'missing:', show(missing))
    except tane.TaneException:
        pass
    print()
    
print({show(det):c for det,c in det_count.items() if c>1})

FK,page title,Disambiguation of name,album,region -> row nr
FK,Disambiguation of name,album,Disambiguation of album -> time
FK,region,time -> price
FK,Disambiguation of name,album -> track nr
FK,time,price -> Disambiguation of name,Disambiguation of album

75.json
[page title|table heading|uri 0|uri 1|uri 2|uri 3|name|description|price]

61.json
[page title|table heading|uri 0|uri 1|name|album|artist|time|price]
  [name] -> [price|artist|time|album]
  [page title|time] -> [name|price|artist|album]
  [uri 1|time] -> [name|price|artist|album]
  [price|time] -> [name|artist|album]

69.json
[page title|table heading|uri 0|uri 1|uri 2|namn|album|tid|pris]
  [namn] -> [tid|album]
  [tid] -> [namn|album]

72.json
[page title|table heading|uri 0|uri 1|uri 2|name|album|time|price|disambiguation of page title]

44.json
[page title|table heading|uri 0|uri 1|uri 2|uri 3|naam|album|lengte|prijs]
  [naam] -> [uri 3|page title|lengte|prijs|uri 2|album]
  [lengte|album] -> [naam|prijs]

68.json
[page 

  [name] -> [uri 0|page title|artist|time|uri 2|price]
  [page title|time] -> [price|artist] missing: [name]
  [uri 2|time] -> [price|artist] missing: [name]
  [artist|time] -> [price] missing: [name]

2.json
[page title|table heading|uri 0|uri 1|uri 2|uri 3|nombre|artista|duración|precio]
  [nombre|uri 2] -> [uri 3|page title|artista|precio|uri 0|duración]
  [page title|nombre] -> [duración|artista] missing: [precio]
  [nombre|duración] -> [uri 3|page title|artista] missing: [precio]
  [page title|duración] -> [artista|precio] missing: [nombre]
  [uri 2|duración] -> [artista|precio] missing: [nombre]
  [uri 3|nombre] -> [duración|artista] missing: [precio]
  [uri 3|duración] -> [artista|precio] missing: [nombre]
  [nombre|artista] -> [duración] missing: [precio]
  [page title|nombre|uri 0] -> [uri 3|artista|precio|uri 2|duración]
  [uri 3|nombre|uri 0] -> [duración|artista|precio|uri 2]
  [artista|nombre|uri 0] -> [uri 3|page title|precio|uri 2|duración]
  [duración|nombre|uri 0] -> [

  [name|page title|album] -> [price|time|uri 2|uri 0]
  [name|page title|time] -> [price|album|uri 2|uri 0]
  [page title|time|album] -> [name|price|uri 2|uri 0]
  [name|album|uri 2] -> [price|time]
  [name|time|uri 2] -> [price|album]
  [album|time|uri 2] -> [name|price]

40.json
[page title|table heading|uri 0|uri 1|uri 2|uri 3|nombre|álbum|artista|duración|precio]
  [nombre] -> [duración|artista|precio|álbum]
  [page title|nombre] -> [uri 0|artista|duración|precio|álbum|uri 2]
  [duración|page title] -> [uri 0|artista|nombre|precio|álbum|uri 2]
  [nombre|uri 0] -> [uri 3|page title|artista|duración|precio|álbum|uri 2]
  [duración|uri 0] -> [uri 3|page title|artista|nombre|precio|álbum|uri 2]
  [nombre|uri 2] -> [duración|artista|precio|álbum]
  [duración|uri 2] -> [artista|nombre|precio|álbum]
  [uri 3|nombre] -> [uri 0|artista|duración|precio|álbum|uri 2]
  [uri 3|duración] -> [uri 0|artista|nombre|precio|álbum|uri 2]
  [álbum|duración] -> [artista|nombre|uri 2|uri 3|page title|uri

  [name] -> [uri 0|page title|artist|time|uri 2|price|album]
  [page title|time] -> [name|price|artist|album]
  [time|uri 2] -> [name|price|artist|album]
  [artist|time] -> [name|album] missing: [price]

34.json
[page title|table heading|uri 0|uri 1|uri 2|uri 3|naam|album|lengte|prijs]
  [naam|album] -> [lengte|prijs|uri 2]
  [lengte|album] -> [naam|prijs|uri 2]

4.json
[page title|table heading|uri 0|uri 1|uri 2|uri 3|name|album|time|price]
  [name|album] -> [uri 3|uri 0|page title|time|uri 2|price]
  [name|time] -> [uri 2|uri 3|page title|uri 0|price] missing: [album]
  [album|time] -> [price] missing: [name]

49.json
[page title|table heading|uri 0|uri 1|uri 2|name|album|artist|time|price]
  [name] -> [price|artist|time|album]
  [page title|time] -> [name|price|artist|album]
  [uri 1|time] -> [name|price|artist|album]
  [time|uri 2] -> [name|price|artist|album]

32.json
[page title|table heading|uri 0|uri 1|uri 2|uri 3|name|album|länge|preis]
  [name|album] -> [preis|länge]
  [name|