# Music Charts

In this example, we will extract information about music charts from tables on Wikipedia.
We will populate the Wikidata predicate "charted in" ([P2291](https://www.wikidata.org/wiki/Property:P2291)), which often occurs with the qualifiers "point in time" ([P585](https://www.wikidata.org/wiki/Property:P585)) and "ranking" ([P1352](https://www.wikidata.org/wiki/Property:P1352)).

In [1]:
from rdflib.plugins.stores.sparqlstore import SPARQLStore
# Query the Wikidata SPARQL endpoint for every entity that has a
# "charted in" (P2291) statement, together with the English Wikipedia
# article that is about it.
st = SPARQLStore('http://query.wikidata.org/sparql')
r = st.query("""
SELECT DISTINCT ?entity ?article WHERE {
    # ?entity p:P726 ?p . ?p ps:P726 ?o . ?p pq:P1111 ?votes .
    ?entity wdt:P2291 ?chart .
    ?article schema:about ?entity .
    ?article schema:isPartOf <https://en.wikipedia.org/>.
}
""")

# Turn each result binding into an (entity, article URL) pair, with the
# en.wikipedia.org prefix swapped for the server on localhost:8989
# (presumably a local Wikipedia mirror -- confirm), and sort the pairs.
ent_abouturl = sorted(
    (
        entity,
        article.replace(
            'https://en.wikipedia.org/wiki/',
            'http://localhost:8989/wikipedia_en_all_nopic_2020-10/A/',
        ),
    )
    for entity, article in (
        tuple(binding[var] for var in r.vars) for binding in r.bindings
    )
)
len(ent_abouturl)



435

In [None]:
%%time
import takco
# Only the first `sample` (entity, article URL) pairs are processed.
sample = 100
pages = takco.extract.WikiPages(
    ent_abouturl[:sample],
    encoding='utf8',
).load()

# Materialise the extracted tables so they can be counted and previewed.
extracted = [
    *takco.extract.extract_tables(pages, link_pattern=r'[^\W](?!ttp:)')
]
print(f"Got {len(extracted)} tables")
takco.preview(extracted, nrows=5, ntables=25)

In [None]:
# Build the 'step' configuration from the Wikidata graph and TabEL pipeline
# resource files; the unpivot heuristics are taken from the first step.
steps = takco.config.build(
    'step',
    load=[
        'resources/graphs/wikidata.toml',
        'resources/pipelines/TabEL.toml',
    ],
)
unpivot_heuristics = steps[0]['unpivot_heuristics']

# Reshape the extracted tables using those heuristics.
reshaped = [
    *takco.TableSet.reshape(extracted, unpivot_heuristics=unpivot_heuristics)
]
print(f"Processed {len(reshaped)} tables")
takco.preview(reshaped, nrows=5, ntables=25)

In [None]:
# Cluster the reshaped tables, passing "pgTitle" as added context and no
# matchers, then order them so the tables with the most data rows come first.
clustered = [
    *takco.TableSet.cluster(reshaped, addcontext=["pgTitle"], matchers=[])
]
clustered.sort(key=lambda tbl: -tbl.get('numDataRows', 0))

print(f"Processed {len(clustered)} tables")
takco.preview(clustered, nrows=5, ntables=25)

In [None]:
# Link the clustered tables using a SQLite-backed lookup, with the MediaWiki
# API as fallback. Per-cell lookup is switched off.
linked = list(takco.TableSet.link(
    clustered,
    lookup_cells=False,
    lookup=takco.link.SQLiteLookup(
        sqlitedb='data/wdid_wpname.sqlitedb',
        baseuri='http://www.wikidata.org/entity/Q',
        # Raw string: a plain literal containing `\.` is an invalid escape
        # sequence (a SyntaxWarning on Python 3.12+). The pattern value
        # itself is unchanged.
        extract=r'http://[^\.]+.wikipedia.org/wiki/([^?]+)',
        fallback=takco.link.MediaWikiAPI(),
    ),
))

print(f"Processed {len(linked)} tables")
takco.preview(linked, nrows=5, ntables=25)

In [None]:
%%time
# Searcher over an RDF store; it is used both for column typing and as the
# database for integration below.
searcher = takco.link.RDFSearcher(
    typeProperties=["http://www.wikidata.org/prop/direct/P31"],
    statementURIprefix="http://www.wikidata.org/entity/statement/",
    store_classname='takco.link.Trident',
    store_kwargs={'configuration': "/export/scratch1/home/kruit/20200713-prop-skos"},
)

# Column typer that reads entity types via the P31 property.
typer = takco.link.EntityTyper(
    db=searcher,
    type_prop="http://www.wikidata.org/prop/direct/P31",
    cover_threshold=0.2,
)
typed = list(takco.TableSet.coltypes(linked, typer=typer))

# Feed the typed tables into integration. The original passed `linked` here,
# which left `typed` unused and only worked if coltypes annotated the tables
# in place.
integrated = list(takco.TableSet.integrate(typed, pfd_threshold=0.95, db=searcher))
print(f"Processed {len(integrated)} tables")

# Show the tables with the most data rows first.
integrated = sorted(integrated, key=lambda table: -table.get('numDataRows', 0))
takco.preview(integrated, nrows=5, ntables=25)

In [None]:
# Extract triples from the integrated tables and report how many there are.
triples = takco.TableSet.triples(integrated)
# A table without a 'triples' entry (or with None) counts as zero instead of
# making len() raise a TypeError.
print(f"Extracted {sum(len(table.get('triples') or ()) for table in triples)} triples")