In [1]:
%%time
import pandas as pd
# Candidate top-level category names to crawl. Further down, spaces are replaced
# by underscores and each name is looked up as a DBpedia `Category:` resource.
# NOTE(review): the saved cell output also lists a 'Health' topic that is not in
# this list, so that output appears to come from an earlier version of the list.
topics = [
    'Business', 'Economy', 'Education', 'Energy', 
    'Engineering', 'Events', 'Food and drink', 'Geography', 
    'Government', 'History', 'Industry', 
    'Language', 'Law', 'Mathematics', 'Military', 
    'Music', 'Nature', 
    'Religion', 'Science and technology', 'Sports'
]
# Alternative, shorter topic selection (disabled).
# topics = ['Music', 'Politics', 'Sports', 'Business', 'Science']

import urllib.parse as ul
import trident
cat_db = trident.Db('/scratch/kruit/kb/dbpedia_20201001/categories.tridentdb')

# The two predicate ids do not depend on the topic, so resolve them once here
# instead of once per loop iteration.
br = cat_db.lookup_id('<http://www.w3.org/2004/02/skos/core#broader>')
subj = cat_db.lookup_id('<http://purl.org/dc/terms/subject>')

# topic name (underscored) -> set of category ids / set of article ids
topic_subcats = {}
topic_articles = {}
for topic in topics:
    topic = topic.replace(" ","_")
    c = cat_db.lookup_id(f'<http://dbpedia.org/resource/Category:{topic}>')
    if not c:
        # Topics whose category resource cannot be resolved are skipped silently.
        continue

    # NOTE(review): cat_db.s(p, o) is assumed to return the ids of all subjects s
    # with a triple (s, p, o), i.e. narrower categories for `br` and member
    # articles for `subj` -- confirm against the trident API.
    arts = set()
    cats = set([c])
    arts.update(cat_db.s(subj, c))
    # Walk three levels of sub-categories below the topic category. Articles are
    # collected at every level; only levels 0-2 are recorded in `cats`, so the
    # printed sub-category count does not include the deepest level.
    for c2 in cat_db.s(br, c):
        arts.update(cat_db.s(subj, c2))
        cats.add(c2)
        for c3 in cat_db.s(br, c2):
            if c3 not in cats:
                arts.update(cat_db.s(subj, c3))
                cats.add(c3)
                for c4 in cat_db.s(br, c3):
                    if c4 not in cats:
                        arts.update(cat_db.s(subj, c4))

    topic_subcats[topic] = cats
    topic_articles[topic] = arts
    print(f"{topic:>30s}: {len(cats):5d} subcats, {len(arts):8d} articles")
    
def get_article_titles(arts):
    """Lazily yield a readable title for every article id in `arts`.

    Each id is resolved to its string form through `cat_db`; the first and last
    character are dropped (presumably the angle brackets around the URI), the
    DBpedia resource prefix is removed and underscores become spaces.
    """
    resource_prefix = 'http://dbpedia.org/resource/'
    for art_id in arts:
        uri = cat_db.lookup_str(art_id)[1:-1]
        local_name = uri.replace(resource_prefix, '')
        yield local_name.replace('_', ' ')

# Keep only the ten topics with the most articles, largest first
# (ties stay in their original order).
topics = sorted(topic_articles, key=lambda k: len(topic_articles[k]), reverse=True)[:10]
print("Largest topics:", topics)
topic_articles = {t: topic_articles[t] for t in topics}

# Write one text file per kept topic, with one article title per line.
for topic, articles in topic_articles.items():
    out_path = f'/export/scratch1/home/kruit/scratch/wikicat/{topic}.txt'
    with open(out_path, 'w') as fw:
        for a in get_article_titles(articles):
            fw.write(f"{a}\n")

                      Business:   454 subcats,    73164 articles
                       Economy:   352 subcats,    54204 articles
                     Education:   523 subcats,    50648 articles
                        Energy:   441 subcats,    18630 articles
                   Engineering:   195 subcats,    38870 articles
                        Events:   789 subcats,    51487 articles
                Food_and_drink:   647 subcats,    41434 articles
                     Geography:   246 subcats,    44074 articles
                    Government:  1064 subcats,   141264 articles
                        Health:   587 subcats,   112370 articles
                       History:   516 subcats,    85833 articles
                      Industry:   504 subcats,    45565 articles
                      Language:   339 subcats,    48625 articles
                           Law:   421 subcats,    53521 articles
                   Mathematics:   208 subcats,    30739 articles
                      Mil

In [2]:
%%capture --no-display
from dask.distributed import Client
# Connect to a Dask scheduler at a fixed, hard-coded address.
# NOTE(review): this assumes the scheduler is already running and reachable from
# the notebook host -- the cell will not start one.
client = Client(address = 'tcp://192.168.62.207:8786')
# Bare last expression so the notebook renders the client summary.
client

0,1
Client  Scheduler: tcp://192.168.62.207:8786  Dashboard: http://192.168.62.207:8787/status,Cluster  Workers: 10  Cores: 10  Memory: 673.47 GB


In [3]:
import dask.bag as db
from takco.util import robust_json_loads_lines
# Glob of input text files, read from HDFS.
fnames = 'hdfs:///user/kruit/output/tabel-small-6+7+8+9/9-triples/*'
# Alternative local input (disabled).
# fnames = '/export/scratch1/home/kruit/scratch/output/tabel-small-4/9-triples/*'
# Read the files as text, convert each partition with robust_json_loads_lines
# (presumably one JSON table per line -- confirm in takco.util) and keep the
# result materialised on the cluster via persist().
bag = db.read_text(fnames).map_partitions(robust_json_loads_lines).persist()
# Number of records in the bag; displayed as the cell output.
bag.count().compute()

14702

In [4]:
# Top header cells

from collections import Counter

def get_pagetitles(prov):
    """Yield every page title found in a provenance record, in depth-first order.

    A record that has a 'pgTitle' key contributes that title and is not
    descended into. Otherwise, if it has a 'concat' key, each of its entries is
    searched the same way. Records with neither key contribute nothing.
    """
    pending = [prov]
    while pending:
        node = pending.pop()
        if 'pgTitle' in node:
            yield node['pgTitle']
        elif 'concat' in node:
            # Push children reversed so they are popped in their original order.
            pending.extend(reversed(list(node['concat'])))
            
            
def frequencies(bag, func, pagetitle_filter=()):
    """Count the items that `func` extracts from every table in `bag`.

    Parameters
    ----------
    bag : dask Bag or any iterable of tables
        Each table must expose a `provenance` attribute when a filter is given.
    func : callable
        Maps one table to an iterable of hashable items.
    pagetitle_filter : collection of page titles, optional
        When non-empty, every item of a table is counted once per distinct page
        title of that table that is in the filter (so not at all when none
        match). When empty, every item is counted once.

    Returns
    -------
    collections.Counter
        Item -> total count.
    """
    def multiply_matched(ts, func, pagetitle_filter):
        # Build the lookup set once per call instead of once per table.
        wanted = set(pagetitle_filter)
        items = []
        for t in ts:
            n = 1
            if wanted:
                n = len(wanted.intersection(get_pagetitles(t.provenance)))
            for x in func(t):
                items.extend([x] * n)
        return items

    if isinstance(bag, db.Bag):
        # Distributed path: expand per partition, then count on the cluster.
        extract = bag.map_partitions(multiply_matched, func, pagetitle_filter)
        return Counter(dict(extract.frequencies().compute()))
    else:
        # Local path: plain iterable of tables.
        return Counter(multiply_matched(bag, func, pagetitle_filter))



def get_header_cells(t):
    """Yield the stripped text of every usable header cell of table `t`.

    `t` is a mapping whose optional 'tableHeaders' entry is a list of header
    rows, each a list of cell mappings with an optional 'text' entry. Cells
    whose text is missing, empty after stripping, or starts with an underscore
    are skipped. A table without headers yields nothing.
    """
    # `or []` covers both a missing key and an explicit None, matching the
    # empty-default behaviour of the other extractors in this notebook.
    for hr in t.get('tableHeaders') or []:
        for c in hr:
            text = (c.get('text') or '').strip()
            if text and text[0] != '_':
                yield text

def top_headers(bag, pagetitle_filter=(), n=10):
    """Return the `n` most frequent header cell texts as a pandas Series.

    The Series is indexed by header text and holds the counts computed by
    `frequencies`, in most-common-first order.
    """
    header_counts = frequencies(bag, get_header_cells, pagetitle_filter)
    ranked = header_counts.most_common(n)
    return pd.Series(dict(ranked))

top_headers(bag)

Year      2133
Name      1771
Date      1549
Team      1317
Title     1182
Rank      1048
Notes     1034
Result     885
W          853
L          838
dtype: int64

In [5]:
import trident
# Trident database used by `label` below to look up labels.
prop_db = trident.Db('/export/scratch1/home/kruit/20200713-prop-skos')
# Id of the skos:prefLabel predicate, resolved once for reuse in every lookup.
plabel = prop_db.lookup_id('<http://www.w3.org/2004/02/skos/core#prefLabel>')
def label(uri):
    """Return the English preferred label of `uri`, or None if there is none.

    The URI is looked up in `prop_db` wrapped in angle brackets. The first
    prefLabel value whose string form ends in '@en' wins; its first character
    and last four characters are cut off (presumably the opening quote and the
    closing `"@en` of the literal).
    """
    node_id = prop_db.lookup_id(f"<{uri}>")
    if not node_id:
        return None
    for literal_id in prop_db.o(node_id, plabel):
        literal = prop_db.lookup_str(literal_id).strip()
        if literal.endswith('@en'):
            return literal[1:-4]
    return None

In [6]:
# Top classes
from collections import Counter
# Class ids that are excluded from the column-type statistics.
bad = [
    'Q17442446', 'Q12139612', 'Q15633587', 'Q14204246', 'Q4167410', 'Q48522',
    'Q11266439', 'Q4167836', 'Q13406463', 'Q21025364', 'Q56248902', 'Q164509',
    # natural person and human (redundant if we have "person")
    'Q154954', 'Q5',
]

def get_coltypes(t):
    """Yield every class URI annotated on table `t` that is not excluded.

    `t` is a mapping whose optional 'classes' entry maps a column key to a
    mapping of class URI -> score. A URI is dropped when it ends with any of
    the ids in `bad`; scores are ignored.
    """
    excluded = tuple(bad)
    for column_classes in t.get('classes', {}).values():
        for class_uri in column_classes.keys():
            if class_uri.endswith(excluded):
                continue
            yield class_uri

def ent_name(uri):
    """Return the last path segment of `uri`, a space, and its label ('' if none)."""
    local_id = uri.split('/')[-1]
    return local_id + " " + (label(uri) or '')
                
def top_coltypes(bag, pagetitle_filter=(), n=10):
    """Return the most frequent column classes as a Series of label -> count.

    The `n` most common class URIs are taken first and then translated to
    labels; classes without a label are dropped, so the result can have fewer
    than `n` rows. If two classes share a label, the later one overwrites the
    earlier one.
    """
    freqs = frequencies(bag, get_coltypes, pagetitle_filter).most_common(n)
    labelled = {}
    for cls, count in freqs:
        # Resolve the label once per class; each call is a database lookup.
        name = label(cls)
        if name:
            labelled[name] = count
    return pd.Series(labelled)

top_coltypes(bag)

sports season                     1377
sports season of a sports club     608
sports festival                    400
person                             398
season                             385
competition                        363
dtype: int64

In [7]:
wikiprops = '/export/scratch1/home/kruit/nary/data/kb/wikidata/wikidata-properties.txt'
# Map the first space-separated token of every line to the tuple of that line
# split once on a space, i.e. (token, rest-of-line); a line without a space
# gives a 1-tuple. The file is opened in a `with` block so the handle is closed,
# and blank lines are skipped so they do not create an empty-string key.
with open(wikiprops) as fr:
    puri_name = {
        line.split(' ', 1)[0]: tuple(line.split(' ', 1))
        for line in map(str.strip, fr)
        if line
    }

In [8]:
# Top properties
from collections import Counter

def get_props(t):
    """Yield the last path segment of every property URI annotated on table `t`.

    `t` is a mapping whose optional 'properties' entry is nested two levels deep
    before reaching a mapping of property URI -> score; both outer keys and the
    scores are ignored.
    """
    for per_column in t.get('properties', {}).values():
        for prop_scores in per_column.values():
            for prop_uri in prop_scores.keys():
                yield prop_uri.rsplit('/', 1)[-1]

def top_props(bag, pagetitle_filter=(), n=10):
    """Return the `n` most frequent properties as a Series of name -> count.

    Property ids are translated through `puri_name`, whose values are
    (id, name) tuples. An id that is unknown, or whose entry has no name part,
    is shown under the id itself. If two ids map to the same name, the later
    one overwrites the earlier one.
    """
    freqs = frequencies(bag, get_props, pagetitle_filter).most_common(n)
    named = {}
    for prp, count in freqs:
        entry = puri_name.get(prp, ())
        # Fall back to the whole id; indexing the id string itself with [1]
        # would return only its second character.
        name = entry[1] if len(entry) > 1 else prp
        named[name] = count
    return pd.Series(named)
                
top_props(bag)

member of sports team                               103
cast member                                         101
winner                                              100
located in the administrative territorial entity     94
follows                                              64
participant of                                       59
performer                                            50
category's main topic                                49
topic's main category                                44
part of                                              43
dtype: int64

In [9]:
def get_pivots(prov, pagetitle_filter = ()):
    """Yield the 'pivot' entries found in a provenance record, depth-first.

    A record with a 'pivot' key is a leaf: its pivot is yielded when no filter
    is given or when the record's 'pgTitle' is in `pagetitle_filter`. A record
    without 'pivot' but with 'concat' has each of its entries searched the same
    way. Anything else contributes nothing.
    """
    pending = [prov]
    while pending:
        node = pending.pop()
        if 'pivot' in node:
            if not pagetitle_filter or node['pgTitle'] in pagetitle_filter:
                yield node['pivot']
        elif 'concat' in node:
            # Reverse so that children are popped in their original order.
            pending.extend(reversed(list(node['concat'])))
            
def get_table_pivots(t, pagetitle_filter = ()):
    """Yield the header cells covered by each pivot recorded for table `t`.

    For every pivot found in `t.provenance` (optionally restricted by page
    title), the header row at index 'level' is taken and its cells from
    'colfrom' through 'colto' inclusive are yielded.
    """
    for pivot in get_pivots(t.provenance, pagetitle_filter=pagetitle_filter):
        header_row = pivot['headers'][pivot['level']]
        first = pivot['colfrom']
        last = pivot['colto']
        yield from header_row[first:last + 1]

def top_pivots(bag, pagetitle_filter=(), n=10):
    """Return the `n` most frequent pivoted header cells as a pandas Series.

    The page-title filter is applied while extracting pivots; it is not passed
    on to `frequencies`, so every extracted cell is counted once.
    """
    def filtered_pivots(t):
        return get_table_pivots(t, pagetitle_filter)

    pivot_counts = frequencies(bag, filtered_pivots)
    return pd.Series(dict(pivot_counts.most_common(n)))
        
top_pivots(bag)

1         6937
Total     6607
2         6218
3         6207
4         6085
League    4691
5         4267
6         4258
7         4064
8         3970
dtype: int64

In [12]:
%%time
import pandas as pd
from pathlib import Path
        
def filter_tables(ts, topic, wikicat_dir='/export/scratch1/home/kruit/scratch/wikicat'):
    """Return the tables in `ts` that come from at least one article of `topic`.

    Parameters
    ----------
    ts : iterable of tables
        Each table must expose a `provenance` attribute.
    topic : str
        Topic name; article titles are read from `<wikicat_dir>/<topic>.txt`,
        one title per line.
    wikicat_dir : str, optional
        Directory holding the per-topic title files. Defaults to the location
        the notebook writes them to.

    Returns
    -------
    list
        The matching tables, in input order.
    """
    # Use a context manager so the file handle is closed deterministically.
    with open(f'{wikicat_dir}/{topic}.txt') as fr:
        articles = set(fr.read().splitlines())
    return [
        t for t in ts
        if not articles.isdisjoint(get_pagetitles(t.provenance))
    ]

def table_pagetitles(t):
    """Return all page titles found in the provenance of table `t` as a list."""
    return [title for title in get_pagetitles(t.provenance)]
        
# Every page title in a table's provenance stands for one original table.
# Build the flattened title bag once and derive both counts from it.
origtable_titles = bag.map(table_pagetitles).flatten()
n_origtables = origtable_titles.count().compute()
n_pages = origtable_titles.distinct().count().compute()
print(f"Got a total of {n_origtables} original tables on {n_pages} pages")
print()

# Output directory for the generated figures/tables; parents=True so the cell
# also works when `fig/` does not exist yet.
root = Path('fig/wikicat')
root.mkdir(parents=True, exist_ok=True)

# Explicit dispatch table instead of eval()-ing a function name.
top_funcs = {
    'headers': top_headers,
    'coltypes': top_coltypes,
    'props': top_props,
    'pivots': top_pivots,
}

ntopics = len(topic_articles)
# kind -> topic -> Series of the top items for that topic
tops = {}
# Process topics from fewest to most articles.
for ti, (topic, articles) in enumerate(sorted(topic_articles.items(), key=lambda ta: len(ta[1]))):
    article_titles = set(get_article_titles(articles))
    print(topic, f"({len(article_titles)} articles)", f"[{ti+1}/{ntopics}]")

    # Select, on the cluster, the tables that stem from at least one topic article.
    tables = bag.map_partitions(filter_tables, topic).compute()

    tabletitles = [pt for t in tables for pt in table_pagetitles(t)]
    ntables = len(tables)
    norigtables = len(tabletitles)
    ncols = sum(t['numCols'] for t in tables)
    print(f'{ntables} supertables with {ncols} columns; {norigtables} original tables')

    matched_titles = set(tabletitles) & article_titles
    print(f"{len(matched_titles)} article matches")
    print('Examples:', list(matched_titles)[:10])

    if ntables:
        for kind, top_func in top_funcs.items():
            top = top_func(tables, pagetitle_filter=article_titles, n=20)
            tops.setdefault(kind, {})[topic] = top
            print(kind, dict(top.head(3)))

    print()

Got a total of 211209 original tables on 102646 pages

Law (53521 articles) [1/10]
179 supertables with 1013 columns; 46862 original tables
404 article matches
Examples: ['Robert Kingscote', 'Paul Fletcher (politician)', 'Croatia–Serbia genocide case', 'Jan I of Żagań', 'Christopher Palles', 'Celestine Babayaro', 'William Watson, Baron Watson', '2009 Honduran constitutional crisis', 'Hamoodur Rahman', 'Research, Development and Evaluation Commission']
headers {'Preceded by': 234, 'Succeeded by': 195, 'Year': 59}
coltypes {'person': 255, 'legislature': 37, 'organ': 8}
props {'child': 414, 'father': 222, 'position held': 196}
pivots {'Became heir': 18, 'Ceased to be heir': 18, 'Total': 9}

Economy (54204 articles) [2/10]
150 supertables with 771 columns; 42270 original tables
183 article matches
Examples: ['Starvation', 'Jun Song', 'Pirate Party UK', 'List of AEW&C aircraft operators', 'List of Charvet customers', "Angelo Dell'Acqua", 'Hollyoaks', 'CRDB Bank', 'Manchester United F.C.', '

In [19]:
# kind ('headers', 'coltypes', ...) -> side-by-side table of the top-10 per topic
top_combined = {}
for toptype, top in tops.items():
    # Column title derived from the kind: drop the last character and capitalise
    # ('headers' -> 'Header', 'props' -> 'Prop').
    dim = toptype[:-1].title()
    dfs = []
    for t, s in top.items():
        if t not in ['Health', 'Politics', 'Economy']:
            # NOTE: this renames the index of the Series stored in `tops` in place.
            s.index.name = dim
            dfs.append(s.reset_index(name='n').assign(Top=t))

    df = pd.concat(dfs)
    # Shorten long labels to 16 characters plus an ellipsis; labels of exactly
    # 16 characters are left alone because nothing would be cut.
    df[dim] = df[dim].map(lambda x: x if len(x) <= 16 else x[:16] + '...')
    # One (label, n) column pair per topic, aligned on the rank position.
    df = df.pivot(columns=['Top'])
    df['n'] = df['n'].astype('Int64')
    # Blank out missing cells: object NaN stringifies to 'nan', while the
    # nullable Int64 missing value stringifies to '<NA>'.
    df = df.astype('str').replace(['nan', '<NA>'], '')
    df.columns = df.columns.swaplevel()
    df = df.sort_index(axis=1).head(10)
    df.index += 1  # ranks start at 1
    # NOTE(review): `display` is not imported in the cells visible here;
    # presumably `from IPython import display` ran in a cell that is not shown.
    display.display(df)
    top_combined[toptype] = df
    df.to_latex(root.joinpath(toptype+'.tex'))

Top,Business,Business,Government,Government,History,History,Law,Law,Military,Military,Music,Music,Sports,Sports
Unnamed: 0_level_1,Header,n,Header,n,Header,n,Header,n,Header,n,Header,n,Header,n
1,Preceded by,197,Preceded by,1612,Preceded by,360,Preceded by,234,Preceded by,481,Year,1569,Gold,595
2,Succeeded by,145,Succeeded by,1231,Succeeded by,172,Succeeded by,195,Succeeded by,403,Title,954,Silver,590
3,Year,37,Name,283,Name,68,Year,59,Notes,57,Chart,849,Bronze,588
4,Name,26,Party,187,Year,67,Notes,31,Name,51,Album,803,Year,509
5,Title,25,Notes,98,Date,56,Name,29,Date,37,Peak position,773,Event,423
6,No.,19,Began active ser...,84,Notes,53,Title,27,Type,36,Peak chart posit...,688,Preceded by,420
7,Notes,15,Ended active ser...,84,Location,39,Role,26,Division,21,Source,466,Rank,403
8,Location,15,Left office,84,Event,25,Date,24,Origin,20,Preceded by,419,Date,365
9,Rank,11,Judge,77,Description,24,Party,20,Brigade,20,Album details,375,Total,327
10,Japanese release...,9,State,61,Country,21,Number,16,#,18,Single,307,Nation,300


Top,Business,Business,Government,Government,History,History,Law,Law,Military,Military,Music,Music,Sports,Sports
Unnamed: 0_level_1,Coltype,n,Coltype,n,Coltype,n,Coltype,n,Coltype,n,Coltype,n,Coltype,n
1,person,204,person,1629,person,373,person,255,person,501,release,1354,person,575
2,business,14,position,49,fossil taxon,30,legislature,37,order of battle,25,single,1172,sports festival,416
3,manga series,10,cabinet,38,Wikimedia timeli...,20,organ,8,command hierarch...,16,discography,1156,sports season,169
4,organization,8,legislature,38,taxon,19,discography,5,military unit,9,album,851,sport competitio...,136
5,economic unit,7,regency of Indon...,27,formation,15,position,4,battle,9,person,463,natural number,71
6,comic book serie...,6,executive branch...,18,lithostratigraph...,14,film,4,album,7,collection,399,Formula One race...,41
7,release,6,administrative t...,18,timeline,13,organization,4,video game,7,series of creati...,351,competition,39
8,single,6,electoral result...,15,national timelin...,11,audiovisual work...,3,Wikipedia:Books,6,bibliography,140,horse race,37
9,radio station,4,government agenc...,15,release,10,coup d'\u00E9tat...,3,historical event...,5,events in a spec...,81,sporting event,36
10,free content,3,Wikimedia naviga...,14,human settlement...,9,white coup,3,ship type,5,opera,66,recurring sporti...,32


Top,Business,Business,Government,Government,History,History,Law,Law,Military,Military,Music,Music,Sports,Sports
Unnamed: 0_level_1,Prop,n,Prop,n,Prop,n,Prop,n,Prop,n,Prop,n,Prop,n
1,child,325,child,2766,child,527,child,414,child,866,child,498,child,676
2,father,183,father,1543,father,359,father,222,father,471,performer,496,father,419
3,position held,143,position held,1277,position held,168,position held,196,position held,395,father,415,position held,258
4,sibling,28,sibling,134,sibling,6,cast member,25,sibling,20,part of,289,followed by,180
5,member of sports...,6,located in the a...,33,parent taxon,4,sibling,24,airline hub,6,record label,91,participant of,137
6,cast member,5,capital,28,part of,4,given name,22,cast member,5,position held,83,follows,128
7,given name,4,member of politi...,16,category's main ...,4,genre,15,item operated,4,cast member,71,sports disciplin...,121
8,notable work,4,cast member,10,derivative work,3,notable work,14,category's main ...,4,genre,57,part of,96
9,genre,4,given name,7,topic's main cat...,3,member of sports...,5,topic's main cat...,4,given name,39,has part,93
10,office held by h...,2,airline hub,7,director,3,based on,3,subclass of,3,residence,38,participant,92


Top,Business,Business,Government,Government,History,History,Law,Law,Military,Military,Music,Music,Sports,Sports
Unnamed: 0_level_1,Pivot,n,Pivot,n,Pivot,n,Pivot,n,Pivot,n,Pivot,n,Pivot,n
1,2006,8,Pop. (2001),24,Spouse,16,Became heir,18,Combatant 1,6,US,686,Total,93
2,2008,8,Population Censu...,23,Plate No.,8,Ceased to be hei...,18,Combatant 2,6,UK,521,League,84
3,2009,8,Province,14,Recession period...,6,Total,9,Rate,3,GER,287,1,76
4,2007,7,EPDP,14,Back,6,RWR 2,8,Tonnage (GRT),3,CAN,286,2,47
5,2005,7,NLF,14,Front,5,RWR 1,8,League,3,AUS,285,4,46
6,2010,7,Ind,14,Combatant 1,5,RPG,7,1977,3,US Country,282,3,45
7,2004,6,DPLF,12,Combatant 2,5,APG,7,1979,3,NZ,242,8,44
8,2011,6,PA,10,Pres,4,SPG,7,1980,3,SWI,237,6,44
9,2012,6,JVP,10,Impf,4,BPG,7,County,2,US R&B,212,Cup,44
10,1,6,SU,10,Fut,4,US,6,Aircraft,2,AUT,178,7,43
