In [1]:
%%capture --no-display
from dask.distributed import Client
# Connect to an already-running Dask scheduler; `client` is handed to takco when
# the tables are loaded further down.
# NOTE(review): the scheduler address is hard-coded, so this cell only works where
# that host is reachable -- consider reading it from configuration or the environment.
client = Client(address = 'tcp://192.168.62.207:8786')
# Bare last expression so the notebook shows the client/cluster summary.
client

0,1
Client  Scheduler: tcp://192.168.62.207:8786  Dashboard: http://192.168.62.207:8787/status,Cluster  Workers: 10  Cores: 10  Memory: 673.47 GB


In [2]:
import takco
# Glob over the JSONL shards of the table dump on HDFS (only shards whose name starts with 0100).
# NOTE(review): the HDFS host and user path are hard-coded -- confirm this is intended for sharing.
fnames = 'hdfs://bricks07:9000/user/kruit/wikidump-tables/0100*.jsonl'
# Load the matching files into a takco bag bound to the Dask client and persist it,
# so later cells reuse the loaded data (presumably held on the workers -- confirm in takco).
tables = takco.DaskHashBag(client=client).load(fnames).persist()
# Count the loaded records (1800 in the recorded run).
tables.bag.count().compute()

1800

In [3]:
import takco

# Assemble the pipeline step definitions from the graph and pipeline TOML files.
steps = takco.config.build(
    'step',
    load=[
        'resources/graphs/wikidata.toml',
        'resources/pipelines/wikitables.toml',
    ],
)

# Both reshape settings are taken from the first configured step.
reshape_step = steps[0]
prefix_header_rules = reshape_step['prefix_header_rules']
unpivot_heuristics = reshape_step['unpivot_heuristics']

# Reshape the loaded tables with those settings and keep the result persisted
# so the following cells can reuse it.
reshaped = takco.TableSet.reshape(
    tables,
    restructure=True,
    prefix_header_rules=prefix_header_rules,
    unpivot_heuristics=unpivot_heuristics,
).tables.persist()
len(reshaped)

1229

In [4]:
from collections import Counter
def get_pivoted_cells(ts):
    """Collect the header cells that were unpivoted, tagged with the heuristic used.

    Args:
        ts: Iterable of table objects exposing a ``provenance`` mapping. Tables
            without a ``'pivot'`` entry in that mapping are ignored.

    Returns:
        A list of ``(cell, heuristic)`` tuples, one for every header cell in row
        ``pivot['level']`` between columns ``pivot['colfrom']`` and
        ``pivot['colto']`` (inclusive), in table order.
    """
    pivot_infos = (t.provenance['pivot'] for t in ts if 'pivot' in t.provenance)
    return [
        (header_cell, info['heuristic'])
        for info in pivot_infos
        # colto is inclusive, hence the +1 on the slice end.
        for header_cell in info['headers'][info['level']][info['colfrom']:info['colto'] + 1]
    ]

# Tally how often each (header cell, heuristic) pair occurs across all partitions
# of the reshaped tables.
unpivoted_freq = (
    reshaped.bag
    .map_partitions(get_pivoted_cells)
    .frequencies()
    .compute()
)
# No interpolation is needed for the heading, so a plain string literal is used.
print("Top unpivoted headers:")
# Report only the 50 most frequent pairs, most frequent first.
for (cell, heuristic), freq in Counter(dict(unpivoted_freq)).most_common(50):
    print(f"{freq:4d} {cell:<40s} ({heuristic})")

Top unpivoted headers:
  15 0                                        (sports-round)
  13 1                                        (NumSuffix)
  13 2                                        (NumSuffix)
  13 3                                        (NumSuffix)
  13 4                                        (NumSuffix)
  13 5                                        (NumSuffix)
  13 6                                        (NumSuffix)
  13 7                                        (NumSuffix)
  13 8                                        (NumSuffix)
  13 9                                        (NumSuffix)
  12 AR                                       (AgentLikeHyperlink)
  12 ER                                       (AgentLikeHyperlink)
  12 GR                                       (AgentLikeHyperlink)
  12 MR                                       (AgentLikeHyperlink)
  12 SHBG                                     (AgentLikeHyperlink)
  11 US                                       (AgentLikeHyp

In [9]:
# Cluster the reshaped tables and persist the result for reuse.
# `addcontext` and `headerunions` are passed straight to takco.TableSet.cluster;
# their exact semantics live in takco and are not visible in this notebook.
# NOTE(review): presumably this merges tables with matching headers, using the
# page title as extra context -- confirm against the takco documentation.
headclustered = takco.TableSet.cluster(
    reshaped,
    addcontext = ['pgTitle'],
    headerunions = True
).tables.persist()
# Size of the clustered result (477 in the recorded run, versus 1229 reshaped tables).
len(headclustered)

477