# Comparison to SNoW

> Given a set of web tables and a target knowledge base, the SNoW method extends each web table with additional context columns, stitches matching web tables into larger tables, and applies functional dependency discovery to identify the relations that are represented in the web tables. Further, it normalises the stitched tables, guided by the schema of the knowledge base, to create an integrated schema.

We assume that the tables are already context-enriched and schema-unioned per pay-level domain (PLD). Our task is now to match and stitch these per-domain supertables into universal tables, and then to decompose those into normalised relations.

## Matching without FDs

In [6]:
%%time
from snow_pipeline import *
log.getLogger().setLevel(log.DEBUG)

snow_root = Path('~/snow/').expanduser().absolute()
kb = KB(snow_root)

Loading KB classes: 100%|██████████| 20/20 [00:03<00:00,  5.35it/s]
DEBUG:root:Made KB feature matrix of shape (20, 2336698)


CPU times: user 35.4 s, sys: 1.34 s, total: 36.7 s
Wall time: 37.7 s


In [275]:
%%time
from snow_pipeline import *
log.getLogger().setLevel(log.DEBUG)

snow_root = Path('~/snow/').expanduser().absolute()
benchmark_datasets = dict(get_snow_datasets(snow_root))
for name, ds in benchmark_datasets.items():
    print(f'{len(ds[0].fnames):3d}', name)

dataset_name = 'www.nndb.com'
ts = list(takco.TableSet.dataset(benchmark_datasets[dataset_name][0]))#[-50:]
print(f"Loaded {dataset_name},  {len(ts)} tables")

 12 d3football.com
  8 www.vgchartz.com
213 www.cia.gov
 29 www.nndb.com
  6 flightaware.com
 76 itunes.apple.com
 74 seatgeek.com
 65 www.amoeba.com
 13 data.bls.gov
Loaded www.nndb.com,  29 tables
CPU times: user 4.15 s, sys: 95.5 ms, total: 4.24 s
Wall time: 4.27 s


In [276]:
%%time
# Preprocess the loaded tables before matching. The progress bars emitted by
# this cell name three steps: extracting disambiguation columns, renaming
# temporal columns and renaming URI prefix columns.
# NOTE(review): `tabid_df` is presumably a dict of table id -> DataFrame (it is
# passed to the same functions as `stitched_df`) -- confirm in snow_pipeline.
tabid_df = preprocess_tables(ts)

Extracting disambguation columns: 100%|██████████| 29/29 [00:01<00:00, 23.57it/s]
Renaming temporal columns: 100%|██████████| 29/29 [00:01<00:00, 19.28it/s]
Renaming URI prefix columns: 1it [00:02,  2.20s/it]

CPU times: user 20.6 s, sys: 36 ms, total: 20.6 s
Wall time: 20.7 s





In [302]:
# Which features of these tables can we use to match them?
from snow_pipeline import *

# Predict KB classes for the columns of every preprocessed table. This call is
# the cell's only expression, so its return value is what gets displayed:
# judging by that output, a dict of table id -> (column number, class name).
predict_fkclasses(tabid_df, dataset_name, kb)
# Disabled exploration snippet: for every column of every table, print the
# three most frequent values produced by `TfidfMatcher._analyzer`.
# NOTE(review): `DataFrame.iteritems` was removed in pandas 2.0 -- switch to
# `.items()` if this is ever re-enabled.
# for tabid, df in tabid_df.items():
# #     print(tabid, *('|'.join(c) for c in zip(*df.columns)))
#     for c, s in df.iteritems():
#         top = pd.Series(TfidfMatcher._analyzer(s)).value_counts()[:3]
# #         if c[0].startswith('pr'):
#         print(tabid, c, dict(top))

DEBUG:root:[www.nndb.com] [0.json] Class predictions: ['page title:Film/1.24e-01', 'uri 0:Company/3.95e-08', 'name:Artist/6.70e-02', 'occupation:Film/3.94e-07', 'known for:Film/6.37e-02']
DEBUG:root:[www.nndb.com] [1.json] Class predictions: ['page title:Single/6.90e-04', 'uri 0:Single/3.65e-10', 'name:Person/4.75e-01', 'occupation:Country/3.49e-05', 'known for:Film/3.52e-01']
DEBUG:root:[www.nndb.com] [10.json] Class predictions: ['page title:VideoGame/1.45e-03', 'uri 0:TelevisionShow/2.45e-07', 'name:Person/3.30e-01', 'occupation:Country/1.05e-03', 'known for:VideoGame/4.02e-01']
DEBUG:root:[www.nndb.com] [11.json] Class predictions: ['page title:SportsTeam/2.76e-03', 'uri 0:VideoGame/1.34e-06', 'name:Athlete/2.73e-01', 'occupation:SportsTeam/6.26e-04', 'known for:Country/1.69e-01']
DEBUG:root:[www.nndb.com] [12.json] Class predictions: ['uri 0:Building/3.36e-06', 'name:Person/6.59e-01', 'occupation:Country/4.05e-04', 'known for:Film/8.29e-01']
DEBUG:root:[www.nndb.com] [13.json] Cla

{'0.json': (0, 'Film'),
 '1.json': (5, 'Person'),
 '10.json': (9, 'VideoGame'),
 '11.json': (5, 'Athlete'),
 '12.json': (9, 'Film'),
 '13.json': (5, 'Artist'),
 '14.json': (5, 'OfficeHolder'),
 '15.json': (9, 'VideoGame'),
 '16.json': (5, 'Company'),
 '17.json': (5, 'Company'),
 '18.json': (5, 'EducationalInstitution'),
 '19.json': (9, 'Film'),
 '2.json': (5, 'OfficeHolder'),
 '20.json': (9, 'Film'),
 '21.json': (5, 'Museum'),
 '22.json': (5, 'AdministrativeRegion'),
 '23.json': (9, 'TelevisionShow'),
 '24.json': (6, 'Athlete'),
 '25.json': (9, 'Country'),
 '26.json': (5, 'Museum'),
 '27.json': (5, 'Band'),
 '28.json': (5, 'Building'),
 '3.json': (5, 'OfficeHolder'),
 '4.json': (5, 'Person'),
 '5.json': (5, 'Artist'),
 '6.json': (5, 'OfficeHolder'),
 '7.json': (5, 'OfficeHolder'),
 '8.json': (9, 'Film'),
 '9.json': (5, 'OfficeHolder')}

In [297]:
from snow_pipeline import *

# The three column matchers; their class names are referenced by `agg_func`.
kb_class_matcher = KBClassMatcher(kb, include_context=False, pred_max_threshold=0.9)
exact_head_matcher = ExactHeadMatcher(include_context=False)
tfidf_matcher = TfidfMatcher(num_threshold=0.75, min_df=2)
matchers = [kb_class_matcher, exact_head_matcher, tfidf_matcher]

# Aggregation expression over the matcher scores: the KB-class score is
# multiplied by the larger of the header score and the TF-IDF score.
agg_func = "KBClassMatcher * @max(ExactHeadMatcher, TfidfMatcher)"
agg_threshold_col = 0

# Compute column similarities, then cluster the columns on them.
colsim, idpairs = get_colsim_and_codes(
    tabid_df,
    matchers,
    agg_func=agg_func,
    agg_threshold_col=agg_threshold_col,
)
partcols = cluster_columns(colsim)

# Group the clustering result by cluster id and display it.
partcolid_to_colids = aggr_by_val(partcols.items())
partcolid_to_colids

Getting KBClassMatcher candidates: 100%|██████████| 29/29 [00:32<00:00,  1.12s/it]
Getting column text: 100%|██████████| 29/29 [00:23<00:00,  1.21it/s]
Extracting features: 100%|██████████| 269/269 [00:06<00:00, 39.29it/s] 
DEBUG:root:Got (269, 36958) column features. Calculating similarities...
DEBUG:root:Clustering columns...


{2: {"0.json~Col5 ('name',)",
  "1.json~Col5 ('name',)",
  "10.json~Col5 ('name',)",
  "11.json~Col5 ('name',)",
  "12.json~Col5 ('name',)",
  "13.json~Col5 ('name',)",
  "14.json~Col5 ('name',)",
  "15.json~Col5 ('name',)",
  "19.json~Col5 ('name',)",
  "2.json~Col5 ('name',)",
  "20.json~Col5 ('name',)",
  "23.json~Col5 ('name',)",
  "24.json~Col6 ('representative',)",
  "25.json~Col5 ('name',)",
  "3.json~Col5 ('name',)",
  "4.json~Col5 ('name',)",
  "5.json~Col5 ('name',)",
  "6.json~Col5 ('name',)",
  "7.json~Col5 ('name',)",
  "8.json~Col5 ('name',)",
  "9.json~Col5 ('name',)"},
 4: {"0.json~Col6 ('occupation',)",
  "1.json~Col6 ('occupation',)",
  "10.json~Col6 ('occupation',)",
  "11.json~Col6 ('occupation',)",
  "12.json~Col6 ('occupation',)",
  "13.json~Col6 ('occupation',)",
  "14.json~Col6 ('occupation',)",
  "15.json~Col6 ('occupation',)",
  "19.json~Col6 ('occupation',)",
  "2.json~Col6 ('occupation',)",
  "20.json~Col6 ('occupation',)",
  "23.json~Col6 ('occupation',)",


In [298]:
# Partition the tables using the column clusters, then list each partition
# with its size and member table ids.
partid_to_tabids = partition_connected_components(
    tabid_df,
    partcolid_to_colids,
    idpairs,
)
for partid, member_tabids in partid_to_tabids.items():
    print(f"part-{partid}", len(member_tabids), member_tabids)

part-0 21 {'4.json', '23.json', '0.json', '12.json', '1.json', '11.json', '14.json', '20.json', '13.json', '3.json', '2.json', '5.json', '9.json', '8.json', '19.json', '25.json', '15.json', '6.json', '24.json', '10.json', '7.json'}
part-1 8 {'17.json', '27.json', '22.json', '21.json', '26.json', '18.json', '28.json', '16.json'}


In [299]:
from snow_pipeline import *

# Preview settings. The seed is fixed so that the sampled rows shown below are
# identical on every run (Restart Kernel -> Run All reproduces the output).
PREVIEW_ROWS = 5
PREVIEW_SEED = 0

# Stitch the column-clustered tables; each result is unpacked as a
# (DataFrame, column header tuples) pair and stored under a "part-<n>" key.
stitched = stitch_colclustered_tables(tabid_df, partcols, idpairs)
stitched_df = {}
for partid, (df, cols) in enumerate(stitched):
    # Label the frame with the header tuples that came with it.
    df.columns = pd.MultiIndex.from_tuples(cols)
    stitched_df[f"part-{partid}"] = df
    # Print the headers transposed: one tuple per header level.
    print(*zip(*cols))
    # Never ask for more rows than the frame has, or `sample` would raise.
    display(df.sample(min(PREVIEW_ROWS, len(df)), random_state=PREVIEW_SEED))

DEBUG:root:Stitching 21 aligned tables


('page title', 'table heading', 'uri 0', 'uri 1', 'uri 2', 'name', 'occupation', 'birth', 'death', 'known for', 'district', 'since', 'party')


Unnamed: 0,page title,table heading,uri 0,uri 1,uri 2,name,occupation,birth,death,known for,district,since,party
81461,murder on flight 502,,films,860,205245,ralph bellamy,actor,17-jun-1904,29-nov-1991,trading places,,,
25376,risk factor: lyme disease,,lists,101,369988,richard gere,actor,31-aug-1949,,american gigolo,,,
2393,exodus,,films,701,35596,sal mineo,actor,10-jan-1939,12-feb-1976,dies in rebel without a cause,,,
1620,burglary,,crime,314,47173,ryan leaf,football,15-may-1976,,disappointing nfl quarterback,,,
39093,occupation: sculptor,,lists,332,96044,aristide maillol,sculptor,8-apr-1861,27-sep-1944,sculptor of the female nude,,,


DEBUG:root:Stitching 8 aligned tables


('page title', 'table heading', 'uri 0', 'uri 1', 'uri 2', 'company', 'founded', 'sales', 'employees', 'disambiguation of sales (year)', 'type', 'population', 'orchestra')


Unnamed: 0,page title,table heading,uri 0,uri 1,uri 2,company,founded,sales,employees,disambiguation of sales (year),type,population,orchestra
3,"springfield, ma",,geo,724,69517,american international college,1885.0,,,,,,
614,"philadelphia, pa",,geo,659,69452,dechert llp,1875.0,,,,,,
787,"houston, tx",,geo,540,69333,quanta services,1997.0,,,,,,
80,sector: communications,,lists,628,98334,bellsouth,,,,,,,
91,"charlotte, nc",,geo,453,69246,johnson c. smith university,1867.0,,,,,,


In [300]:
# Inspect the first stitched table: row count, per-column summary statistics,
# and the best-scoring KB class for each column that `kb.get_sim` scored.
df = stitched_df['part-0']
print(len(df))

sim = kb.get_sim(df)
display(df.describe().T)

# The column labels of `sim` are used as positions into `df.columns`.
scored_positions = list(sim.columns)
pred = pd.DataFrame(
    {
        "column": list(df.columns[scored_positions]),
        "class": sim.idxmax(),
        "score": sim.max(),
    }
)
display(pred.style.background_gradient())

227226


Unnamed: 0,count,unique,top,freq
page title,227226,22007,university of pennsylvania,523
table heading,227226,1,,227226
uri 0,227226,20,films,104010
uri 1,227226,997,640,1114
uri 2,227226,22791,000068313,523
name,227226,37563,donald sutherland,82
occupation,227180,164,actor,117879
birth,227180,23854,,951
death,227180,13018,,130437
known for,227180,34338,american character actor,625


Unnamed: 0,column,class,score
0,"('page title',)",Film,0.050354
1,"('table heading',)",Hospital,0.0
2,"('uri 0',)",Company,1e-06
5,"('name',)",Person,0.121111
6,"('occupation',)",Country,2e-06
9,"('known for',)",Film,0.098536


In [301]:
# Run the KB class prediction on the stitched tables this time. Judging by the
# displayed value, the result maps each part id to a (column number, class name)
# pair; it is passed to `iter_decomposed` in the decomposition cell.
tabid_to_colnr_and_fkclass = predict_fkclasses(stitched_df, dataset_name, kb)
tabid_to_colnr_and_fkclass

DEBUG:root:[www.nndb.com] [part-0] Class predictions: ['page title:Film/5.04e-02', 'uri 0:Company/1.34e-06', 'name:Person/1.21e-01', 'occupation:Country/1.72e-06', 'known for:Film/9.85e-02']
DEBUG:root:[www.nndb.com] [part-1] Class predictions: ['page title:Company/3.22e-03', 'uri 0:Company/1.86e-07', 'company:EducationalInstitution/5.49e-01']


{'part-0': (5, 'Person'), 'part-1': (5, 'EducationalInstitution')}

In [None]:
# Decompose the stitched tables using the predicted (column number, class)
# pairs, post-process the result, and preview each resulting table.
nary_induction = True
decomposed = iter_decomposed(
    stitched_df,
    dataset_name,
    tabid_to_colnr_and_fkclass,
    nary=nary_induction,
    nary_stoplevel=2,
    nary_minp=0.95,
)
for t in postprocess_tables(decomposed, numeric_threshold=0.5):
    print(f"{t._id}: {len(t.df)} rows")
    # Fixed seed: the three preview rows are the same on every run. The row
    # count is capped at the table length, or `sample` would raise.
    display(t.df.sample(min(len(t.df), 3), random_state=0))

DEBUG:root:[itunes.apple.com] [part-0] Decomposing class Single for col 10 (('nome',))
DEBUG:root:[itunes.apple.com] [part-0] Not decomposing context columns [('page title',), ('uri 1 (se)',), ('disambiguation of name',), ('table heading',), ('disambiguation of album',), ('uri 0',), ('uri 3',), ('uri 2 (album)',), ('uri 1 (music video)',), ('uri 2 (music video)',), ('uri 2 (artist)',), ('uri 2',), ('uri 1',), ('uri 1 (gb)',), ('uri 1 (podcast)',), ('uri 2 (podcast)',), ('disambiguation of page title',), ('uri 1 (hn)',)]
DEBUG:root:[itunes.apple.com] [part-0] Inferring FDs for [page title|NULL|duração|uri 0|uri 2 (album)|preço|artista|FK|name|album|interpret|álbum]
FD candidates:  46%|████▋     | 31/67 [00:29<00:27,  1.30it/s]