In [1]:
import pandas as pd
# Topic names; the next cell turns each one into a DBpedia `Category:<topic>` IRI
# (with spaces replaced by underscores) and collects its sub-categories and articles.
# NOTE(review): the recorded outputs further down mention Ethics, Philosophy and
# Knowledge, which are not in this list — those outputs may come from an earlier
# version of the list. Re-run the notebook to confirm.
topics = [
    'Business', 'Culture', 'Economy', 'Education', 'Energy', 
    'Engineering', 'Entertainment', 'Events', 'Food and drink', 'Geography', 
    'Government', 'Health', 'History', 'Human nature', 'Humanities', 'Industry', 
    'Language', 'Law', 'Life', 'Mass media', 'Mathematics', 'Military', 
    'Music', 'Nature', 'Organizations', 'People', 'Policy', 'Politics', 
    'Religion', 'Science and technology', 'Society', 'Sports', 'Universe', 'World'
]

In [2]:
%%time
import urllib.parse as ul
import trident
g = trident.Db('/scratch/kruit/kb/dbpedia_20201001/categories.tridentdb')

# The two predicate ids do not depend on the topic, so resolve them once here
# instead of repeating the lookups on every loop iteration.
br = g.lookup_id('<http://www.w3.org/2004/02/skos/core#broader>')
subj = g.lookup_id('<http://purl.org/dc/terms/subject>')

# Both dicts are keyed by the underscore form of the topic name.
topic_subcats = {}   # topic -> category ids reached within two `br` hops (plus the topic itself)
topic_articles = {}  # topic -> ids returned by g.s(subj, cat) for categories within three hops
for topic in topics:
    topic = topic.replace(" ","_")
    c = g.lookup_id(f'<http://dbpedia.org/resource/Category:{topic}>')
    if not c:
        # No node for this category in the database: skip the topic entirely.
        # NOTE(review): a falsy check would also skip an id of 0, if trident can
        # return one — confirm against the trident API.
        continue

    arts = set()
    cats = set([c])
    arts.update(g.s(subj, c))
    # Three nested levels of g.s(br, <category>), presumably the direct
    # sub-categories of <category> (TODO confirm against the trident API).
    for c2 in g.s(br, c):
        arts.update(g.s(subj, c2))
        cats.add(c2)
        for c3 in g.s(br, c2):
            if c3 not in cats:
                arts.update(g.s(subj, c3))
                cats.add(c3)
                for c4 in g.s(br, c3):
                    if c4 not in cats:
                        # c4 is deliberately NOT added to `cats`: if it were, a
                        # category first seen here at depth 3 would later fail
                        # the `c3 not in cats` test above and its own children
                        # would never be visited. As a consequence `cats` only
                        # records categories up to depth 2, while `arts` also
                        # includes the articles of depth-3 categories.
                        arts.update(g.s(subj, c4))

    topic_subcats[topic] = cats
    topic_articles[topic] = arts
    print(f"{topic:>30s}: {len(cats):5d} subcats, {len(arts):8d} articles")

                      Business:   454 subcats,    73164 articles
                       Culture:   886 subcats,   404422 articles
                       Economy:   352 subcats,    54204 articles
                     Education:   523 subcats,    50648 articles
                        Energy:   441 subcats,    18630 articles
                   Engineering:   195 subcats,    38870 articles
                 Entertainment:   981 subcats,   132142 articles
                        Ethics:   245 subcats,    38774 articles
                        Events:   789 subcats,    51487 articles
                Food_and_drink:   647 subcats,    41434 articles
                     Geography:   246 subcats,    44074 articles
                    Government:  1064 subcats,   141264 articles
                        Health:   587 subcats,   112370 articles
                       History:   516 subcats,    85833 articles
                  Human_nature:   127 subcats,  1040093 articles
                    Human

In [3]:
%%capture --no-display
from dask.distributed import Client
# Connect to a Dask scheduler at a hard-coded address.
# NOTE(review): the address is environment-specific; the notebook cannot be
# re-run elsewhere without editing it.
client = Client(address = 'tcp://192.168.62.207:8786')
# Last expression of the cell: shows the client summary as the cell output.
client

0,1
Client  Scheduler: tcp://192.168.62.207:8786  Dashboard: http://192.168.62.207:8787/status,Cluster  Workers: 10  Cores: 10  Memory: 673.47 GB


In [4]:
import dask.bag as db
from takco.util import robust_json_loads_lines
# Glob of the input files on HDFS.
fnames = 'hdfs:///user/kruit/output/tabel-small-4/9-triples/*'
# Read the files as text lines, parse every partition with takco's
# robust_json_loads_lines, and persist the resulting bag so the cells below
# can reuse it.
bag = db.read_text(fnames).map_partitions(robust_json_loads_lines).persist()
# Number of records in the bag (shown as the cell output).
bag.count().compute()

4103

In [5]:
# Top header cells

from collections import Counter

def frequencies(bag, func):
    """Count how often each item produced by ``func`` occurs across ``bag``.

    ``func`` is called on every element of every partition and must return an
    iterable; all produced items are tallied together. The result is a
    ``Counter`` mapping item -> number of occurrences.
    """
    def flatten_partition(tables):
        # Concatenate everything func() produces for the tables of one partition.
        items = []
        for table in tables:
            items.extend(func(table))
        return items

    pairs = bag.map_partitions(flatten_partition).frequencies().compute()
    return Counter(dict(pairs))


def get_header_cells(t):
    """Yield the stripped text of every non-empty header cell of table ``t``.

    ``t`` is a mapping whose ``'tableHeaders'`` entry is a list of header rows,
    each row being a list of cell mappings with an optional ``'text'`` entry.
    Cells whose stripped text is empty or starts with ``'_'`` are skipped.

    A table without ``'tableHeaders'`` (or with it set to ``None``) yields
    nothing instead of raising, which matches how ``get_coltypes`` and
    ``get_props`` treat a missing key. A cell whose ``'text'`` is ``None`` is
    treated as empty.
    """
    for header_row in t.get('tableHeaders') or ():
        for cell in header_row:
            # `or ''` also covers an explicit None value, not just a missing key.
            text = (cell.get('text') or '').strip()
            if text and not text.startswith('_'):
                yield text

def top_headers(bag):
    """Return the ten most frequent header-cell texts in ``bag`` as a Series."""
    header_counts = frequencies(bag, get_header_cells)
    top_ten = header_counts.most_common(10)
    return pd.Series(dict(top_ten))

top_headers(bag)

Year      620
Team      420
Date      415
Name      410
Title     348
W         333
L         328
Notes     283
Result    248
Player    230
dtype: int64

In [6]:
import trident
# NOTE: this rebinds the global `g`, which an earlier cell bound to a different
# trident database (the DBpedia categories one). Code that runs after this cell
# and uses `g` sees this database instead.
g = trident.Db('/export/scratch1/home/kruit/20200713-prop-skos')
# Id of the skos:prefLabel predicate in this database; used by label() below.
plabel = g.lookup_id('<http://www.w3.org/2004/02/skos/core#prefLabel>')
def label(uri):
    """Return the ``@en`` prefLabel string stored for ``uri``, or ``None``.

    Looks up ``<uri>`` in the global database ``g`` and scans the values of
    ``g.o(id, plabel)``. The first one whose stripped string form ends in
    ``@en`` is returned with its first character and last four characters
    removed (presumably the surrounding quotes and the ``@en`` tag).
    """
    node_id = g.lookup_id(f"<{uri}>")
    if not node_id:
        return None
    for literal_id in g.o(node_id, plabel):
        literal = g.lookup_str(literal_id).strip()
        if literal.endswith('@en'):
            return literal[1:-4]
    return None

In [7]:
# Top classes
from collections import Counter
# Identifier suffixes of classes that get_coltypes() leaves out of its output.
bad = [
    'Q17442446', 'Q12139612', 'Q15633587', 'Q14204246', 'Q4167410', 'Q48522',
    'Q11266439', 'Q4167836', 'Q13406463', 'Q21025364', 'Q56248902',
]

def get_coltypes(t):
    """Yield every class URI annotated on table ``t`` that is not blacklisted.

    ``t['classes']`` (optional) maps a column key to a mapping of class URI ->
    score. A URI is dropped when it ends with any entry of the global ``bad``.
    """
    for column_classes in t.get('classes', {}).values():
        for class_uri in column_classes.keys():
            excluded = False
            for suffix in bad:
                if class_uri.endswith(suffix):
                    excluded = True
                    break
            if not excluded:
                yield class_uri

def ent_name(uri):
    """Return the last ``/``-separated part of ``uri``, a space, and its label ('' if none)."""
    local_id = uri.split('/')[-1]
    return local_id + " " + (label(uri) or '')
                
def top_coltypes(bag):
    """Return the ten most frequent column classes in ``bag`` as a Series indexed by ``ent_name``."""
    named_counts = {}
    for cls, count in frequencies(bag, get_coltypes).most_common(10):
        named_counts[ent_name(cls)] = count
    return pd.Series(named_counts)

top_coltypes(bag)

XMLSchema#string             10144
XMLSchema#decimal             6769
XMLSchema#dateTime            2303
Q13406554 sports festival      549
Q27020041 sports season        484
owl#Thing                      483
Q5 human                       478
Q215627 person                 478
Q164509 omnivore               478
Q154954 natural person         478
dtype: int64

In [8]:
wikiprops = '/export/scratch1/home/kruit/nary/data/kb/wikidata/wikidata-properties.txt'
# Map the first space-separated token of every (stripped) line of the
# properties file to the whole stripped line; top_props() uses this mapping to
# turn a property id into a readable name.
# The file is opened in a `with` block so the handle is closed as soon as the
# mapping is built, instead of being left open for the life of the kernel.
with open(wikiprops) as props_file:
    puri_name = {
        line.split(' ', 1)[0]: line
        for line in map(str.strip, props_file)
    }

In [9]:
# Top properties
from collections import Counter

def get_props(t):
    """Yield the last ``/``-separated part of every property URI annotated on table ``t``.

    ``t['properties']`` (optional) is a two-level mapping whose innermost
    values map property URI -> score; the scores are ignored.
    """
    for by_other_column in t.get('properties', {}).values():
        for prop_scores in by_other_column.values():
            for prop_uri in prop_scores.keys():
                yield prop_uri.split('/')[-1]

def top_props(bag):
    """Return the ten most frequent properties in ``bag`` as a Series.

    Index entries are looked up in ``puri_name``; ids missing from it are kept as-is.
    """
    named_counts = {}
    for prp, count in frequencies(bag, get_props).most_common(10):
        named_counts[puri_name.get(prp, prp)] = count
    return pd.Series(named_counts)

top_props(bag)

P161 cast member                                         37
P54 member of sports team                                33
P131 located in the administrative territorial entity    27
P1346 winner                                             25
P1344 participant of                                     21
P1923 participating team                                 20
P175 performer                                           19
P301 category's main topic                               12
P57 director                                             11
P910 topic's main category                               11
dtype: int64

In [10]:
def get_pivots(prov):
    """Yield every ``'pivot'`` entry found in a provenance record.

    A record that has a ``'pivot'`` key yields just that value. Otherwise, if
    it has a ``'concat'`` key, each element of that list is searched
    recursively, in order. Records with neither key yield nothing.
    """
    if 'pivot' in prov:
        yield prov['pivot']
        return
    if 'concat' not in prov:
        return
    for part in prov['concat']:
        for pivot in get_pivots(part):
            yield pivot
            
def get_table_pivots(t):
    """Yield the header cells covered by each pivot recorded in ``t.provenance``.

    For every pivot, the row ``pivot['headers'][pivot['level']]`` is sliced
    from ``pivot['colfrom']`` to ``pivot['colto']`` inclusive.
    """
    for pivot in get_pivots(t.provenance):
        header_row = pivot['headers'][pivot['level']]
        first_col = pivot['colfrom']
        last_col = pivot['colto']
        for cell in header_row[first_col:last_col + 1]:
            yield cell

def top_pivots(bag):
    """Return the ten most frequent pivoted header cells in ``bag`` as a Series."""
    pivot_counts = frequencies(bag, get_table_pivots)
    top_ten = pivot_counts.most_common(10)
    return pd.Series(dict(top_ten))

top_pivots(bag)

1        1568
3        1425
2        1415
4        1383
Total    1002
5         991
7         968
6         959
Final     946
8         939
dtype: int64

In [11]:
import pandas as pd



def get_pagetitles(prov):
    """Yield every ``'pgTitle'`` entry found in a provenance record.

    A record that has a ``'pgTitle'`` key yields just that value. Otherwise, if
    it has a ``'concat'`` key, each element of that list is searched
    recursively, in order. Records with neither key yield nothing.
    """
    if 'pgTitle' in prov:
        yield prov['pgTitle']
        return
    if 'concat' not in prov:
        return
    for part in prov['concat']:
        for title in get_pagetitles(part):
            yield title
        
def filter_tables(ts, articles):
    """Return the list of tables in ``ts`` that have at least one page title in ``articles``.

    ``articles`` must support ``&`` with a set (the caller passes a set of
    titles); page titles are taken from each table's ``provenance`` attribute.
    """
    return [
        table
        for table in ts
        if articles & set(get_pagetitles(table.provenance))
    ]
        
        
import trident
# Separate handle on the categories database, used to turn article ids back into strings.
catdb = trident.Db('/scratch/kruit/kb/dbpedia_20201001/categories.tridentdb')
def get_article_titles(arts):
    """Yield a title string for every id in ``arts``.

    Each id is resolved with ``catdb.lookup_str``; the first and last
    character are dropped (presumably the enclosing ``<`` and ``>``), the
    DBpedia resource prefix is removed and underscores become spaces.
    """
    for art_id in arts:
        iri = catdb.lookup_str(art_id)[1:-1]
        local_name = iri.replace('http://dbpedia.org/resource/', '')
        yield local_name.replace('_', ' ')

# Number of distinct page titles that occur in the provenance of any table in the bag.
n_pages = (
    bag.map(lambda t: list(get_pagetitles(t.provenance)))
    .flatten()
    .distinct()
    .count()
    .compute()
)
print(f"Got a total of {n_pages} pages")
        
# `display.display(...)` below needs the IPython `display` module. It is not
# imported anywhere else in this notebook, and on a fresh kernel the bare name
# `display` is IPython's display *function*, which has no `.display` attribute.
from IPython import display

# Per-topic report: for every topic (smallest article set first), keep the
# tables whose page titles belong to the topic and show the top-10 summaries
# defined in the cells above.
ntopics = len(topic_articles)
for ti, (topic, articles) in enumerate(sorted(topic_articles.items(), key=lambda ta: len(ta[1]))):
    article_titles = set(get_article_titles(articles))
    print(topic, f"({len(article_titles)} articles)", f"[{ti+1}/{ntopics}]")
    # Persist the filtered bag: it is reused by the count and by all four summaries.
    tables = bag.map_partitions(filter_tables, article_titles).persist()
    ntables = tables.count().compute()
    print(ntables, 'tables')
    if ntables:
        headers = top_headers(tables)
        print('headers')
        display.display(headers)

        coltypes = top_coltypes(tables)
        print('coltypes')
        display.display(coltypes)

        props = top_props(tables)
        print('props')
        display.display(props)

        pivots = top_pivots(tables)
        print('pivots')
        display.display(pivots)

    print()

Got a total of 25699 pages
Universe (11963 articles)
11 tables
headers


TWh                    4
Facility               2
Preceded by            1
Rank                   1
Country/Region         1
Date of information    1
Capita                 1
Prim. energy           1
Production             1
Export                 1
dtype: int64

coltypes


XMLSchema#string                         25
XMLSchema#decimal                        17
owl#Thing                                 3
XMLSchema#dateTime                        3
Q215627 person                            1
Q5 human                                  1
Q164509 omnivore                          1
Q154954 natural person                    1
Q21025364 WikiProject                     1
Q56248902 Wikimedia community project     1
dtype: int64

props


P155 follows        1
P156 followed by    1
dtype: int64

pivots


1                                   6
10                                  6
100                                 6
1 k                                 6
10 k                                6
100 k                               6
HST , GST , or GST + PST/QST (%)    2
Oil - consumption ( bbl /day)       1
Oil - exports ( bbl /day)           1
Oil - imports ( bbl /day)           1
dtype: int64

Energy (18630 articles)
12 tables
headers


TWh                    4
Preceded by            3
Facility               2
Succeeded by           1
Rank                   1
Country/Region         1
Date of information    1
Date                   1
Event                  1
Capita                 1
dtype: int64

coltypes


XMLSchema#string                         25
XMLSchema#decimal                        14
Q215627 person                            3
Q154954 natural person                    3
Q164509 omnivore                          3
Q5 human                                  3
XMLSchema#dateTime                        3
owl#Thing                                 2
Q21025364 WikiProject                     1
Q56248902 Wikimedia community project     1
dtype: int64

props


P40 child            3
P22 father           2
P39 position held    1
P361 part of         1
P155 follows         1
P156 followed by     1
dtype: int64

pivots


HST , GST , or GST + PST/QST (%)    2
Oil - consumption ( bbl /day)       1
Oil - exports ( bbl /day)           1
Oil - imports ( bbl /day)           1
dtype: int64

Mathematics (30739 articles)
13 tables
headers


Name           3
DeptName       3
1st throw      2
2nd throw      2
Equation       2
Result         2
Preceded by    2
Year           2
in × in        2
mm × mm        2
dtype: int64

coltypes


XMLSchema#string             19
XMLSchema#decimal            11
XMLSchema#dateTime            5
owl#Thing                     3
Q11410 game                   2
Q1150958 play                 2
Q17538258 recreative work     2
Q173799 entertainment         2
Q215627 person                2
Q154954 natural person        2
dtype: int64

props


P40 child            2
P22 father           1
P39 position held    1
P361 part of         1
P155 follows         1
P156 followed by     1
dtype: int64

pivots


2007    13
2006    13
2005    12
2008    11
2003     9
2010     9
2009     9
2000     8
2001     8
2004     8
dtype: int64

World (30954 articles)
51 tables
headers


Year                  10
Event                  6
Name                   6
Date                   6
Rank                   5
Season                 3
Country                3
Principal location     3
Location               3
Preceded by            2
dtype: int64

coltypes


XMLSchema#string                         112
XMLSchema#dateTime                        33
XMLSchema#decimal                         26
Q215627 person                            13
Q154954 natural person                    13
Q164509 omnivore                          13
Q5 human                                  13
owl#Thing                                 12
Q21025364 WikiProject                      9
Q56248902 Wikimedia community project      9
dtype: int64

props


P361 part of         3
P40 child            2
P39 position held    2
P22 father           1
P155 follows         1
P156 followed by     1
P138 named after     1
P161 cast member     1
P31 instance of      1
P106 occupation      1
dtype: int64

pivots


Total     421
1         417
3         408
2         393
4         392
Canada     22
2007       18
2010       18
2006       18
2008       18
dtype: int64

Ethics (38774 articles)
18 tables
headers


Year               5
Chart              3
Preceded by        2
Succeeded by       2
Country            2
Peak position      2
Region             1
Certification      1
Sales/shipments    1
Venue              1
dtype: int64

coltypes


XMLSchema#string           46
XMLSchema#dateTime          8
Q2031291 release            7
Q4502142 visual artwork     7
XMLSchema#decimal           6
Q482994 album               5
Q215627 person              3
Q154954 natural person      3
Q164509 omnivore            3
Q5 human                    3
dtype: int64

props


P40 child            3
P22 father           2
P39 position held    2
P161 cast member     2
P361 part of         1
dtype: int64

pivots


Chart (2008)    222
Chart (2007)     55
Chart (2002)     48
Chart (2003)     46
Chart (2004)     42
Chart (1993)     41
Chart (1995)     36
Chart (2006)     34
Chart (1999)     34
Chart (2001)     32
dtype: int64

Engineering (38870 articles)
19 tables
headers


Year                     5
Network                  4
Play-by-play             4
Preceded by              3
Description              3
Name                     3
Date                     2
in × in                  2
mm × mm                  2
Colour commentator(s)    2
dtype: int64

coltypes


XMLSchema#string                   58
XMLSchema#decimal                   6
XMLSchema#dateTime                  6
Q215627 person                      5
Q154954 natural person              5
Q164509 omnivore                    5
Q5 human                            5
owl#Thing                           4
Q23959932 fixed-order metaclass     2
Q24017414 first-order metaclass     2
dtype: int64

props


P40 child             3
P22 father            2
P2061 aspect ratio    2
P39 position held     1
P361 part of          1
P155 follows          1
P156 followed by      1
dtype: int64

pivots


PSIP Short Name       32
B series               4
In Design Patterns     3
In Code Complete       3
A series               2
C series               2
Shiroku ban            2
Kiku                   2
PSIP short name        2
In POSA2               1
dtype: int64

Philosophy (38872 articles)
11 tables
headers


Preceded by     2
Succeeded by    2
No.             2
Year            2
Actor           1
Instrument      1
Uses            1
Latitude        1
Longitude       1
Diameter        1
dtype: int64

coltypes


XMLSchema#string                      25
XMLSchema#dateTime                     6
Q215627 person                         3
Q154954 natural person                 3
Q164509 omnivore                       3
Q5 human                               3
XMLSchema#decimal                      3
Q4502142 visual artwork                2
Q11086742 anime television program     2
Q581714 animated series                2
dtype: int64

props


P40 child            2
P39 position held    2
P22 father           1
P361 part of         1
P155 follows         1
P156 followed by     1
P279 subclass of     1
P138 named after     1
dtype: int64

pivots


Eötvös       1
Babcock      1
Arnold       1
Archytas     1
Frost        1
Arrhenius    1
Avogadro     1
Fowler       1
Finsen       1
Borman       1
dtype: int64

Knowledge (38935 articles)
26 tables
headers


Notes              4
Date               4
Name               4
Location           4
Preceded by        3
Year               3
Yield              3
Code               3
PIM application    2
Platform(s)        2
dtype: int64

coltypes


XMLSchema#string                                       80
XMLSchema#dateTime                                     13
XMLSchema#decimal                                       5
Q215627 person                                          4
Q154954 natural person                                  4
Q164509 omnivore                                        4
Q5 human                                                4
Q17305522 ISO 3166-2 data set of a specific country     4
Q1172284 data set                                       4
owl#Thing                                               2
dtype: int64

props


P40 child                        3
P22 father                       2
P39 position held                1
P361 part of                     1
P155 follows                     1
P156 followed by                 1
P36 capital                      1
P1376 capital of                 1
P398 child astronomical body     1
P397 parent astronomical body    1
dtype: int64

pivots


Total     421
1         417
3         408
2         393
4         392
Canada     22
Front      17
Back       16
OT         13
Mass       13
dtype: int64

Religion (39902 articles)
25 tables
headers


Year            14
Notes            5
Role             5
Title            4
Preceded by      3
Succeeded by     2
Abbey            2
Type             2
Founded          2
Location         2
dtype: int64

coltypes


XMLSchema#string           59
XMLSchema#dateTime         23
Q5 human                   13
Q154954 natural person     13
Q215627 person             13
Q164509 omnivore           13
XMLSchema#decimal          12
Q2031291 release            5
Q482994 album               3
Q4502142 visual artwork     3
dtype: int64

props


P161 cast member     5
P735 given name      4
P40 child            3
P22 father           2
P39 position held    2
P361 part of         1
P488 chairperson     1
P155 follows         1
P156 followed by     1
P106 occupation      1
dtype: int64

pivots


Chart (2008)    210
RPG             110
APG             110
PPG             110
SPG             108
BPG             106
Chart (2007)     52
Chart (2002)     47
Chart (2003)     41
Chart (1993)     41
dtype: int64

Food_and_drink (41434 articles)
29 tables
headers


Description      5
Country          4
Year             4
Name             3
Title            3
Notes            3
Certification    2
Preceded by      2
Succeeded by     2
Date             2
dtype: int64

coltypes


XMLSchema#string          60
XMLSchema#decimal         17
XMLSchema#dateTime        13
owl#Thing                  7
Q5 human                   6
Q164509 omnivore           6
Q154954 natural person     6
Q215627 person             6
Q2031291 release           5
Q134556 single             5
dtype: int64

props


P735 given name             3
P40 child                   2
P39 position held           2
P161 cast member            2
P22 father                  1
P361 part of                1
P927 anatomical location    1
P155 follows                1
P156 followed by            1
P106 occupation             1
dtype: int64

pivots


Chart (2008)    226
Chart (1993)     62
Chart (2007)     57
Chart (2002)     54
Chart (2003)     49
Chart (2004)     48
Chart (1995)     43
Chart (1992)     39
Chart (1999)     38
Chart (1998)     36
dtype: int64

Science_and_technology (42470 articles)
25 tables
headers


Name           8
Year           4
Preceded by    3
Date           3
Description    3
Location       3
Yield          3
DIN            2
TITLE          2
STATUS         2
dtype: int64

coltypes


XMLSchema#string                             77
XMLSchema#dateTime                           11
XMLSchema#decimal                             6
owl#Thing                                     5
Q215627 person                                3
Q154954 natural person                        3
Q164509 omnivore                              3
Q5 human                                      3
Q11753321 Wikimedia navigational template     2
Q6156112 chronostratigraphic unit             2
dtype: int64

props


P40 child              3
P22 father             2
P361 part of           2
P39 position held      1
P166 award received    1
P155 follows           1
P156 followed by       1
P527 has part          1
P57 director           1
P161 cast member       1
dtype: int64

pivots


Chart (2008)    210
Chart (2007)     52
Chart (2002)     47
Chart (2003)     41
Chart (1993)     41
Chart (2004)     40
Chart (1995)     36
Chart (1999)     33
Chart (2006)     32
Chart (2001)     30
dtype: int64

Geography (44074 articles)
62 tables
headers


Name             12
Year              7
Notes             6
Serial format     4
Rank              4
Country           4
Code              4
Municipality      3
First issued      3
Design            3
dtype: int64

coltypes


XMLSchema#string                                       148
XMLSchema#decimal                                       29
XMLSchema#dateTime                                      25
owl#Thing                                               18
Q21025364 WikiProject                                   10
Q56248902 Wikimedia community project                   10
Q17305522 ISO 3166-2 data set of a specific country      4
Q1172284 data set                                        4
Q177634 community                                        3
Q486972 human settlement                                 3
dtype: int64

props


P40 child                     3
P361 part of                  3
P31 instance of               2
P22 father                    2
P39 position held             1
P155 follows                  1
P156 followed by              1
P527 has part                 1
P910 topic's main category    1
P301 category's main topic    1
dtype: int64

pivots


Chart (2008)    210
Chart (2007)     52
Chart (2002)     47
Chart (2003)     41
Chart (1993)     41
Chart (2004)     40
Chart (1995)     36
Chart (1999)     33
Chart (2006)     32
Chart (2001)     30
dtype: int64

Industry (45565 articles)
24 tables
headers


Year            3
Description     3
Name            3
Country         3
Preceded by     2
Succeeded by    2
Notes           2
Team            2
Overall         2
Date            2
dtype: int64

coltypes


XMLSchema#string                         55
XMLSchema#dateTime                       11
XMLSchema#decimal                        11
Q215627 person                            7
Q154954 natural person                    7
Q164509 omnivore                          7
Q5 human                                  7
Q21025364 WikiProject                     5
Q56248902 Wikimedia community project     5
owl#Thing                                 3
dtype: int64

props


P40 child             2
P39 position held     2
P735 given name       2
P22 father            1
P361 part of          1
P155 follows          1
P156 followed by      1
P106 occupation       1
P161 cast member      1
P19 place of birth    1
dtype: int64

pivots


2007    13
2006    13
2005    12
2008    11
2003     9
2010     9
2009     9
2000     8
2001     8
2004     8
dtype: int64

Nature (46753 articles)
46 tables
headers


Year                 8
Country              7
Location             6
Creature             5
Date                 5
Name                 5
Page                 4
Other appearances    4
Variants             4
TWh                  4
dtype: int64

coltypes


XMLSchema#string                         107
XMLSchema#decimal                         37
XMLSchema#dateTime                        23
owl#Thing                                 10
Q56248902 Wikimedia community project      7
Q21025364 WikiProject                      7
Q2031291 release                           4
Q482994 album                              3
Q4502142 visual artwork                    3
Q215627 person                             2
dtype: int64

props


P40 child            2
P361 part of         2
P22 father           1
P39 position held    1
P155 follows         1
P156 followed by     1
P527 has part        1
P137 operator        1
dtype: int64

pivots


Chart (2008)    210
Chart (2007)     52
Chart (2002)     47
Chart (2003)     41
Chart (1993)     41
Chart (2004)     40
Chart (1995)     36
Chart (1999)     33
Chart (2006)     32
Chart (2001)     30
dtype: int64

Language (48625 articles)
45 tables
headers


Year            7
Record          5
Word            5
Meaning         4
Title           4
Name            4
Network         4
Play-by-play    4
Preceded by     3
Notes           3
dtype: int64

coltypes


XMLSchema#string                         139
XMLSchema#decimal                         19
owl#Thing                                 14
XMLSchema#dateTime                        11
Q215627 person                             7
Q154954 natural person                     7
Q164509 omnivore                           7
Q5 human                                   7
Q21025364 WikiProject                      3
Q56248902 Wikimedia community project      3
dtype: int64

props


P40 child            3
P22 father           2
P361 part of         2
P735 given name      2
P161 cast member     2
P39 position held    1
P155 follows         1
P156 followed by     1
P106 occupation      1
P527 has part        1
dtype: int64

pivots


1         421
Total     421
3         408
2         393
4         392
Front      23
Canada     22
Back       22
OT         13
China      12
dtype: int64

Education (50648 articles)
25 tables
headers


Year                       7
Name                       5
Grade Point Equivalence    4
Role                       3
Equivalence                3
Description                3
Preceded by                2
Title                      2
Winner                     2
Notes                      2
dtype: int64

coltypes


XMLSchema#string          50
XMLSchema#dateTime        18
XMLSchema#decimal         10
owl#Thing                  7
Q215627 person             6
Q154954 natural person     6
Q164509 omnivore           6
Q5 human                   6
Q2031291 release           3
Q482994 album              2
dtype: int64

props


P735 given name      3
P161 cast member     3
P40 child            2
P106 occupation      2
P22 father           1
P39 position held    1
P361 part of         1
P155 follows         1
P156 followed by     1
P31 instance of      1
dtype: int64

pivots


Chart (1993)    21
Chart (2008)    16
2007            14
2006            14
2005            13
Chart (1992)    13
2008            12
2003            10
2010             9
2004             9
dtype: int64

Events (51487 articles)
72 tables
headers


Date                        17
Country                     13
Year                        13
Name                         9
Team                         8
Location                     6
Result                       5
Eastern Conference v t e     5
Notes                        4
Killed                       3
dtype: int64

coltypes


XMLSchema#string                         164
XMLSchema#decimal                         53
XMLSchema#dateTime                        45
Q56248902 Wikimedia community project     17
Q21025364 WikiProject                     17
Q2031291 release                           9
Q154954 natural person                     8
Q164509 omnivore                           8
Q5 human                                   8
Q215627 person                             8
dtype: int64

props


P197 adjacent station       3
P40 child                   2
P39 position held           2
P137 operator               2
P1923 participating team    2
P22 father                  1
P361 part of                1
P155 follows                1
P156 followed by            1
P945 allegiance             1
dtype: int64

pivots


National Rail       508
Disused railways    394
Line                338
Chart (2008)        238
RPG                 110
APG                 110
PPG                 110
NOR                 109
SPG                 109
BPG                 106
dtype: int64

Law (53521 articles)
46 tables
headers


Year       15
Date       10
Name        8
Country     6
Party       5
Result      5
Score       5
Notes       5
Award       4
Reason      4
dtype: int64

coltypes


XMLSchema#string                         135
XMLSchema#dateTime                        39
XMLSchema#decimal                         38
Q215627 person                            18
Q154954 natural person                    18
Q164509 omnivore                          18
Q5 human                                  18
owl#Thing                                  9
Q56248902 Wikimedia community project      7
Q21025364 WikiProject                      7
dtype: int64

props


P40 child            4
P161 cast member     4
P22 father           3
P39 position held    3
P735 given name      3
P106 occupation      2
P175 performer       1
P361 part of         1
P155 follows         1
P156 followed by     1
dtype: int64

pivots


League        117
RPG           110
APG           110
PPG           110
SPG           108
BPG           106
Total          78
League Cup     72
FA Cup         70
Other [A ]     42
dtype: int64

Economy (54204 articles)
29 tables
headers


Location                6
Traction Type           5
Date (From)             5
Date (To)               5
Notes                   5
Thresholds per award    4
Name of System          3
Year                    3
Rank                    3
Name                    3
dtype: int64

coltypes


XMLSchema#string                         53
XMLSchema#dateTime                       25
XMLSchema#decimal                        20
owl#Thing                                12
Q215627 person                            3
Q154954 natural person                    3
Q164509 omnivore                          3
Q5 human                                  3
Q21025364 WikiProject                     3
Q56248902 Wikimedia community project     3
dtype: int64

props


P40 child            3
P22 father           2
P361 part of         2
P39 position held    1
P155 follows         1
P156 followed by     1
dtype: int64

pivots


Chart (2008)    210
Chart (2007)     52
Chart (2002)     47
Chart (2003)     41
Chart (1993)     41
Chart (2004)     40
Chart (1995)     36
Chart (1999)     33
Chart (2006)     32
Chart (2001)     30
dtype: int64

Military (59929 articles)
53 tables
headers


Type              12
Notes             11
Name              11
Origin             8
In service         5
From               5
To                 5
Class              4
Aircraft           4
Term of office     4
dtype: int64

coltypes


XMLSchema#string                         147
XMLSchema#dateTime                        33
XMLSchema#decimal                         27
Q154954 natural person                     7
Q164509 omnivore                           7
Q5 human                                   7
Q215627 person                             7
Q56248902 Wikimedia community project      6
Q21025364 WikiProject                      6
Q6540697 Wikipedia:Books                   5
dtype: int64

props


P40 child                             3
P39 position held                     3
P22 father                            2
P361 part of                          1
P31 instance of                       1
P155 follows                          1
P156 followed by                      1
P1308 officeholder                    1
P176 manufacturer                     1
P1056 product or material produced    1
dtype: int64

pivots


Chart (2008)    210
Chart (2007)     52
Chart (2002)     47
Chart (2003)     41
Chart (1993)     41
Chart (2004)     40
Chart (1995)     36
Chart (1999)     33
Chart (2006)     32
Chart (2001)     30
dtype: int64

Policy (61033 articles)
37 tables
headers


Name           12
Party           6
Left Office     4
Preceded by     3
Notes           3
#               3
Inaugurated     3
Years           2
Minister        2
Image           2
dtype: int64

coltypes


XMLSchema#string          95
XMLSchema#decimal         25
XMLSchema#dateTime        23
Q4164871 position          8
Q214339 role               6
owl#Thing                  5
Q215627 person             4
Q154954 natural person     4
Q164509 omnivore           4
Q5 human                   4
dtype: int64

props


P39 position held            4
P40 child                    3
P22 father                   2
P361 part of                 2
P155 follows                 1
P156 followed by             1
P279 subclass of             1
P54 member of sports team    1
dtype: int64

pivots


+/–                                 25
HST , GST , or GST + PST/QST (%)     2
dtype: int64

Life (69236 articles)
44 tables
headers


Date                 11
Year                 11
Team                  7
Country               6
Record                6
Creature              5
Event                 5
Page                  4
Other appearances     4
Variants              4
dtype: int64

coltypes


XMLSchema#string                         153
XMLSchema#decimal                         40
XMLSchema#dateTime                        33
Q215627 person                            16
Q154954 natural person                    16
Q164509 omnivore                          16
Q5 human                                  16
Q56248902 Wikimedia community project      7
Q21025364 WikiProject                      7
owl#Thing                                  7
dtype: int64

props


P40 child             3
P22 father            2
P39 position held     2
P361 part of          1
P155 follows          1
P156 followed by      1
P19 place of birth    1
dtype: int64

pivots


Chart (2008)    226
RPG             110
APG             110
PPG             110
SPG             108
BPG             106
Chart (1993)     62
Chart (2007)     57
Chart (2002)     54
Chart (2003)     49
dtype: int64

Business (73164 articles)
52 tables
headers


Year            13
Title            6
Name             5
#                4
Party            4
Channel          4
Play-by-play     4
Preceded by      3
Team             3
Description      3
dtype: int64

coltypes


XMLSchema#string                         116
XMLSchema#decimal                         39
XMLSchema#dateTime                        26
Q215627 person                            12
Q154954 natural person                    12
Q164509 omnivore                          12
Q5 human                                  12
owl#Thing                                  9
Q21025364 WikiProject                      7
Q56248902 Wikimedia community project      7
dtype: int64

props


P40 child             3
P22 father            2
P361 part of          2
P735 given name       2
P39 position held     1
P155 follows          1
P156 followed by      1
P106 occupation       1
P161 cast member      1
P19 place of birth    1
dtype: int64

pivots


2007    13
2006    13
2005    12
2008    11
2000     9
2003     9
2010     9
2009     9
2001     8
2004     8
dtype: int64

Organizations (73520 articles)
70 tables
headers


Year              17
Name              12
Country            6
Type               4
Location           4
Notes              4
Term of office     4
Album              4
Description        3
MEPs               3
dtype: int64

coltypes


XMLSchema#string                         162
XMLSchema#decimal                         45
XMLSchema#dateTime                        41
owl#Thing                                 11
Q215627 person                            10
Q154954 natural person                    10
Q164509 omnivore                          10
Q5 human                                  10
Q21025364 WikiProject                     10
Q56248902 Wikimedia community project     10
dtype: int64

props


P39 position held                     4
P40 child                             2
P361 part of                          2
P106 occupation                       2
P735 given name                       2
P161 cast member                      2
P931 place served by transport hub    2
P22 father                            1
P155 follows                          1
P156 followed by                      1
dtype: int64

pivots


1     108
5      83
4      79
2      78
6      77
7      73
9      69
8      60
3      60
US     49
dtype: int64

Sports (76740 articles)
237 tables
headers


Year        63
Date        40
Team        39
Event       30
Name        24
Season      23
Score       21
Goals       19
Position    17
Apps        17
dtype: int64

coltypes


XMLSchema#string                         608
XMLSchema#decimal                        292
XMLSchema#dateTime                       166
Q5 human                                  78
Q154954 natural person                    78
Q164509 omnivore                          78
Q215627 person                            78
Q13406554 sports festival                 47
Q21025364 WikiProject                     47
Q56248902 Wikimedia community project     47
dtype: int64

props


P54 member of sports team    7
P1344 participant of         3
P161 cast member             3
P40 child                    3
P735 given name              3
P22 father                   2
P136 genre                   2
P1923 participating team     2
P39 position held            1
P361 part of                 1
dtype: int64

pivots


Total           722
1               635
2               581
3               578
4               567
League          312
Chart (2008)    210
5               179
6               173
7               169
dtype: int64

Music (80424 articles)
265 tables
headers


Year                    131
Title                    64
Album                    50
Peak chart positions     36
Peak position            30
Country                  23
Date                     21
Album details            20
Result                   19
Award                    16
dtype: int64

coltypes


XMLSchema#string                     524
XMLSchema#dateTime                   195
XMLSchema#decimal                    137
Q2031291 release                      88
Q273057 discography                   71
Q1631107 bibliography                 70
Q7725310 series of creative works     70
Q2668072 collection                   69
Q17489659 group of works              69
Q482994 album                         59
dtype: int64

props


P175 performer         15
P161 cast member        7
P361 part of            5
P735 given name         4
P40 child               3
P264 record label       3
P22 father              2
P166 award received     2
P156 followed by        2
P155 follows            2
dtype: int64

pivots


US              394
UK              265
US Country      265
Chart (2008)    238
CAN             148
AUS             142
GER             140
NZ              118
CAN Country     116
SWI             102
dtype: int64

Politics (80611 articles)
69 tables
headers


Year             13
Name             10
Party             8
Notes             7
Country           5
Role              5
Chart             5
Peak position     5
Date              5
Result            5
dtype: int64

coltypes


XMLSchema#string          152
XMLSchema#dateTime         50
XMLSchema#decimal          39
Q2031291 release           14
Q5 human                   13
Q164509 omnivore           13
Q215627 person             13
Q154954 natural person     13
Q134556 single             10
owl#Thing                   9
dtype: int64

props


P161 cast member                  6
P39 position held                 5
P40 child                         3
P735 given name                   3
P22 father                        2
P361 part of                      1
P102 member of political party    1
P155 follows                      1
P156 followed by                  1
P106 occupation                   1
dtype: int64

pivots


Total           427
1               423
3               408
2               393
4               392
Chart (2008)    226
Chart (1993)     62
Chart (2007)     57
Chart (2002)     54
Chart (2003)     49
dtype: int64

History (85833 articles)
71 tables
headers


Name            15
Date            11
Country          9
Year             9
Notes            6
Title            4
Highest rank     4
Reason           4
Party            4
Location         4
dtype: int64

coltypes


XMLSchema#string                         178
XMLSchema#dateTime                        52
XMLSchema#decimal                         47
Q215627 person                            14
Q154954 natural person                    14
Q164509 omnivore                          14
Q5 human                                  14
owl#Thing                                 12
Q56248902 Wikimedia community project     10
Q21025364 WikiProject                     10
dtype: int64

props


P40 child            4
P22 father           3
P161 cast member     3
P39 position held    2
P361 part of         2
P735 given name      2
P155 follows         1
P156 followed by     1
P106 occupation      1
P97 noble title      1
dtype: int64

pivots


Chart (2008)    226
Chart (1993)     62
Chart (2007)     57
Chart (2002)     54
Chart (2003)     49
Chart (2004)     48
Chart (1995)     43
Chart (1992)     39
Chart (1999)     38
Chart (1998)     36
dtype: int64

Health (112370 articles)
89 tables
headers


Year       35
Date       13
Country     8
Team        8
Award       7
Result      7
Title       7
Notes       7
Role        6
Name        6
dtype: int64

coltypes


XMLSchema#string          241
XMLSchema#decimal          80
XMLSchema#dateTime         65
Q5 human                   38
Q154954 natural person     38
Q164509 omnivore           38
Q215627 person             38
owl#Thing                  16
Q2031291 release           14
Q21025364 WikiProject      14
dtype: int64

props


P161 cast member         5
P175 performer           4
P735 given name          4
P40 child                3
P197 adjacent station    3
P22 father               2
P39 position held        2
P361 part of             2
P137 operator            2
P106 occupation          2
dtype: int64

pivots


National Rail       508
1                   436
Total               425
3                   421
2                   406
4                   405
Disused railways    394
Line                338
Chart (2008)        238
RPG                 110
dtype: int64

Entertainment (132142 articles)
294 tables
headers


Year       82
Title      81
Notes      26
Role       21
Country    20
Rank       18
Result     18
Studio     18
Album      18
Award      16
dtype: int64

coltypes


XMLSchema#string                         713
XMLSchema#dateTime                       183
XMLSchema#decimal                        164
Q2031291 release                          49
Q7725310 series of creative works         47
Q5 human                                  44
Q215627 person                            44
Q164509 omnivore                          44
Q154954 natural person                    44
Q56248902 Wikimedia community project     37
dtype: int64

props


P161 cast member         15
P175 performer           13
P57 director              6
P735 given name           4
P361 part of              3
P156 followed by          3
P197 adjacent station     3
P40 child                 2
P39 position held         2
P155 follows              2
dtype: int64

pivots


National Rail       508
Disused railways    394
Line                338
Chart (2008)        238
US                  236
US Country          201
UK                  144
NOR                 140
SWI                 110
BAR                  98
dtype: int64

Government (141264 articles)
121 tables
headers


Name           28
Party          19
Notes          14
Year           11
#              10
Type            9
Minister        8
Title           7
Term            6
Took office     5
dtype: int64

coltypes


XMLSchema#string                         304
XMLSchema#dateTime                        85
XMLSchema#decimal                         60
owl#Thing                                 26
Q4164871 position                         12
Q215627 person                            11
Q154954 natural person                    11
Q164509 omnivore                          11
Q5 human                                  11
Q56248902 Wikimedia community project      8
dtype: int64

props


P39 position held                                        7
P40 child                                                5
P22 father                                               4
P31 instance of                                          2
P102 member of political party                           2
P150 contains administrative territorial entity          2
P131 located in the administrative territorial entity    2
P361 part of                                             1
P166 award received                                      1
P155 follows                                             1
dtype: int64

pivots


+/–              25
2007             13
2006             13
2005             12
2008             11
Senior status    11
Senior           10
2000              9
2003              9
2010              9
dtype: int64

Humanities (164416 articles)
150 tables
headers


Title      37
Year       37
Rank       19
Studio     18
Opening    14
Notes      13
Date       11
Result     10
Role        9
Country     8
dtype: int64

coltypes


XMLSchema#string           385
XMLSchema#decimal           96
XMLSchema#dateTime          84
owl#Thing                   21
Q5 human                    20
Q215627 person              20
Q164509 omnivore            20
Q154954 natural person      20
Q2031291 release            17
Q4502142 visual artwork     16
dtype: int64

props


P161 cast member       7
P40 child              3
P39 position held      3
P175 performer         3
P735 given name        3
P22 father             2
P361 part of           2
P106 occupation        2
P57 director           2
P166 award received    1
dtype: int64

pivots


Total           421
1               417
3               408
2               393
4               392
Chart (2008)    226
Chart (1993)     62
Chart (2007)     57
Chart (2002)     54
Chart (2003)     49
dtype: int64

Society (187633 articles)
157 tables
headers


Year        49
Date        18
Name        17
Country     15
Notes       15
Result      12
Role        12
Location    11
%           10
Award        9
dtype: int64

coltypes


XMLSchema#string                         374
XMLSchema#decimal                        125
XMLSchema#dateTime                       114
Q5 human                                  41
Q154954 natural person                    41
Q215627 person                            41
Q164509 omnivore                          41
owl#Thing                                 25
Q56248902 Wikimedia community project     21
Q21025364 WikiProject                     21
dtype: int64

props


P161 cast member                      7
P40 child                             4
P361 part of                          4
P735 given name                       4
P22 father                            3
P39 position held                     3
P197 adjacent station                 3
P106 occupation                       3
P137 operator                         2
P931 place served by transport hub    2
dtype: int64

pivots


Total               559
National Rail       508
1                   429
3                   414
2                   399
4                   398
Disused railways    394
Line                338
Chart (2008)        226
League              122
dtype: int64

Mass_media (211310 articles)
337 tables
headers


Year                    124
Title                   115
Album                    44
Peak chart positions     35
Notes                    26
Director                 24
Peak position            23
Rank                     20
Album details            20
#                        18
dtype: int64

coltypes


XMLSchema#string                         796
XMLSchema#dateTime                       233
XMLSchema#decimal                        208
Q7725310 series of creative works         96
Q273057 discography                       71
Q17489659 group of works                  71
Q1631107 bibliography                     70
Q2668072 collection                       69
Q2031291 release                          63
Q56248902 Wikimedia community project     52
dtype: int64

props


P175 performer           12
P161 cast member         12
P361 part of              5
P57 director              5
P735 given name           4
P136 genre                4
P40 child                 3
P197 adjacent station     3
P22 father                2
P86 composer              2
dtype: int64

pivots


National Rail       508
1                   429
Total               421
3                   420
2                   405
4                   404
Disused railways    394
US                  365
Line                338
US Country          249
dtype: int64

Culture (404422 articles)
338 tables
headers


Year        146
Title        72
Team         41
Record       34
Date         30
Notes        29
Role         23
Result       22
Album        22
Director     21
dtype: int64

coltypes


XMLSchema#string                         921
XMLSchema#dateTime                       264
XMLSchema#decimal                        258
Q5 human                                 161
Q154954 natural person                   161
Q215627 person                           161
Q164509 omnivore                         161
owl#Thing                                 44
Q56248902 Wikimedia community project     37
Q21025364 WikiProject                     37
dtype: int64

props


P161 cast member             21
P175 performer                8
P57 director                  7
P735 given name               5
P136 genre                    4
P40 child                     3
P197 adjacent station         3
P106 occupation               3
P54 member of sports team     3
P22 father                    2
dtype: int64

pivots


Total               682
National Rail       508
1                   473
3                   460
2                   445
4                   430
Disused railways    394
Line                338
League              275
US Country          230
dtype: int64

Human_nature (1040093 articles)
507 tables
headers


Year      249
Goals     144
Apps      141
Title      81
League     66
Date       63
Team       59
Total      57
Result     54
Season     52
dtype: int64

coltypes


XMLSchema#string                         1357
XMLSchema#decimal                         686
XMLSchema#dateTime                        443
Q5 human                                  401
Q215627 person                            401
Q164509 omnivore                          401
Q154954 natural person                    401
Q21025364 WikiProject                      51
Q56248902 Wikimedia community project      51
owl#Thing                                  50
dtype: int64

props


P161 cast member               23
P54 member of sports team      18
P175 performer                 16
P735 given name                 5
P361 part of                    4
P57 director                    3
P40 child                       3
P1424 topic's main template     3
P1423 template's main topic     3
P106 occupation                 3
dtype: int64

pivots


Total         887
1             609
3             602
2             587
4             585
League        490
US            367
League Cup    284
US Country    280
Cup           260
dtype: int64

People (1264659 articles)
622 tables
headers


Year      302
Goals     144
Apps      141
Title     118
Album      68
League     66
Team       61
Result     61
Date       60
Notes      60
dtype: int64

coltypes


XMLSchema#string                     1617
XMLSchema#decimal                     741
XMLSchema#dateTime                    540
Q5 human                              414
Q215627 person                        414
Q164509 omnivore                      414
Q154954 natural person                414
Q7725310 series of creative works      99
Q1631107 bibliography                  97
Q273057 discography                    95
dtype: int64

props


P161 cast member             25
P54 member of sports team    19
P175 performer               17
P735 given name               6
P40 child                     5
P361 part of                  5
P22 father                    4
P39 position held             4
P57 director                  3
P197 adjacent station         3
dtype: int64

pivots


Total               887
1                   611
3                   604
2                   589
4                   587
National Rail       508
League              490
US                  437
Disused railways    394
Line                338
dtype: int64