# SPARQL data analysis

In [1]:
const _ = require('lodash')
const traverse = require('traverse')
const sparqljs = require('sparqljs')

const eachFileInDir = require('./eachFileInDir')
const countInQueries = require('./countInQueries')
const namespaces = require('./namespaces')
const parser = new sparqljs.Parser(namespaces)
const Query = require('./Query')
const makePlot = require('./makePlot')
const analysis = require('./analysis')

## Queries per Wiki

In [3]:
// One bar per dataset: the value countInQueries resolves to when run with
// analysis.isValidQuery over that dataset's directory under data/.
Promise.all(
    ['cywiki', 'dewiki', 'enwiki', 'euwiki', 'wikidatawiki', 'normalized']
        .map((dataset) => countInQueries('data/' + dataset, analysis.isValidQuery))
).then((validCounts) => {
    // Display labels, in the same order as the dataset directories above.
    const wikiLabels = ['cywiki', 'dewiki', 'enwiki', 'euwiki', 'wikidata', 'all queries normalized']
    makePlot([{ x: wikiLabels, y: validCounts, type: 'bar' }])
})

## Features used

In [4]:
// Feature usage for data/cywiki: run countInQueries once per feature
// predicate and chart the resolved values, one bar per feature.
var wiki = 'cywiki'
Promise.all([
    analysis.hasOptional,
    analysis.hasFilter,
    analysis.hasOrderBy,
    analysis.hasDistinct,
    analysis.hasGroupBy,
    analysis.hasValues,
    analysis.hasUnion,
    analysis.hasMinus,
    analysis.hasBind,
    analysis.hasSubQuery,
    analysis.hasPropertyPath,
].map((featureTest) => countInQueries('data/' + wiki, featureTest))).then((counts) => {
    // Labels are listed in the same order as the predicates above.
    const featureLabels = ['optional', 'filter', 'order by', 'distinct', 'group by', 'values', 'union', 'minus', 'bind', 'sub query', 'property path']
    makePlot([{ x: featureLabels, y: counts, type: 'bar' }])
})

In [6]:
// Feature usage for data/dewiki: run countInQueries once per feature
// predicate and chart the resolved values, one bar per feature.
var wiki = 'dewiki'
Promise.all([
    analysis.hasOptional,
    analysis.hasFilter,
    analysis.hasOrderBy,
    analysis.hasDistinct,
    analysis.hasGroupBy,
    analysis.hasValues,
    analysis.hasUnion,
    analysis.hasMinus,
    analysis.hasBind,
    analysis.hasSubQuery,
    analysis.hasPropertyPath,
].map((featureTest) => countInQueries('data/' + wiki, featureTest))).then((counts) => {
    // Labels are listed in the same order as the predicates above.
    const featureLabels = ['optional', 'filter', 'order by', 'distinct', 'group by', 'values', 'union', 'minus', 'bind', 'sub query', 'property path']
    makePlot([{ x: featureLabels, y: counts, type: 'bar' }])
})

In [7]:
// Feature usage for data/enwiki: run countInQueries once per feature
// predicate and chart the resolved values, one bar per feature.
var wiki = 'enwiki'
Promise.all([
    analysis.hasOptional,
    analysis.hasFilter,
    analysis.hasOrderBy,
    analysis.hasDistinct,
    analysis.hasGroupBy,
    analysis.hasValues,
    analysis.hasUnion,
    analysis.hasMinus,
    analysis.hasBind,
    analysis.hasSubQuery,
    analysis.hasPropertyPath,
].map((featureTest) => countInQueries('data/' + wiki, featureTest))).then((counts) => {
    // Labels are listed in the same order as the predicates above.
    const featureLabels = ['optional', 'filter', 'order by', 'distinct', 'group by', 'values', 'union', 'minus', 'bind', 'sub query', 'property path']
    makePlot([{ x: featureLabels, y: counts, type: 'bar' }])
})

In [8]:
// Feature usage for data/euwiki: run countInQueries once per feature
// predicate and chart the resolved values, one bar per feature.
var wiki = 'euwiki'
Promise.all([
    analysis.hasOptional,
    analysis.hasFilter,
    analysis.hasOrderBy,
    analysis.hasDistinct,
    analysis.hasGroupBy,
    analysis.hasValues,
    analysis.hasUnion,
    analysis.hasMinus,
    analysis.hasBind,
    analysis.hasSubQuery,
    analysis.hasPropertyPath,
].map((featureTest) => countInQueries('data/' + wiki, featureTest))).then((counts) => {
    // Labels are listed in the same order as the predicates above.
    const featureLabels = ['optional', 'filter', 'order by', 'distinct', 'group by', 'values', 'union', 'minus', 'bind', 'sub query', 'property path']
    makePlot([{ x: featureLabels, y: counts, type: 'bar' }])
})

In [9]:
// Feature usage for data/wikidatawiki: run countInQueries once per feature
// predicate and chart the resolved values, one bar per feature.
var wiki = 'wikidatawiki'
Promise.all([
    analysis.hasOptional,
    analysis.hasFilter,
    analysis.hasOrderBy,
    analysis.hasDistinct,
    analysis.hasGroupBy,
    analysis.hasValues,
    analysis.hasUnion,
    analysis.hasMinus,
    analysis.hasBind,
    analysis.hasSubQuery,
    analysis.hasPropertyPath,
].map((featureTest) => countInQueries('data/' + wiki, featureTest))).then((counts) => {
    // Labels are listed in the same order as the predicates above.
    const featureLabels = ['optional', 'filter', 'order by', 'distinct', 'group by', 'values', 'union', 'minus', 'bind', 'sub query', 'property path']
    makePlot([{ x: featureLabels, y: counts, type: 'bar' }])
})

In [9]:
// Feature usage for data/all: run countInQueries once per feature
// predicate and chart the resolved values, one bar per feature.
var wiki = 'all'
Promise.all([
    analysis.hasOptional,
    analysis.hasFilter,
    analysis.hasOrderBy,
    analysis.hasDistinct,
    analysis.hasGroupBy,
    analysis.hasValues,
    analysis.hasUnion,
    analysis.hasMinus,
    analysis.hasBind,
    analysis.hasSubQuery,
    analysis.hasPropertyPath,
].map((featureTest) => countInQueries('data/' + wiki, featureTest))).then((counts) => {
    // Labels are listed in the same order as the predicates above.
    const featureLabels = ['optional', 'filter', 'order by', 'distinct', 'group by', 'values', 'union', 'minus', 'bind', 'sub query', 'property path']
    makePlot([{ x: featureLabels, y: counts, type: 'bar' }])
})

In [10]:
// Feature usage for data/normalized: run countInQueries once per feature
// predicate and chart the resolved values, one bar per feature.
var wiki = 'normalized'
Promise.all([
    analysis.hasOptional,
    analysis.hasFilter,
    analysis.hasOrderBy,
    analysis.hasDistinct,
    analysis.hasGroupBy,
    analysis.hasValues,
    analysis.hasUnion,
    analysis.hasMinus,
    analysis.hasBind,
    analysis.hasSubQuery,
    analysis.hasPropertyPath,
].map((featureTest) => countInQueries('data/' + wiki, featureTest))).then((counts) => {
    // Labels are listed in the same order as the predicates above.
    const featureLabels = ['optional', 'filter', 'order by', 'distinct', 'group by', 'values', 'union', 'minus', 'bind', 'sub query', 'property path']
    makePlot([{ x: featureLabels, y: counts, type: 'bar' }])
})

## Query Shapes

### Examples
The examples below are written in pseudo-Turtle.

#### Star/source
Natural language: Things that are cities and are located in Germany.
```
?foo instanceOf city .
?foo locatedIn germany .
```

#### Cycle
Natural language: Inventors killed by their own invention
```
?invention inventedBy ?inventor .
?inventor causeOfDeath ?invention .
```

#### Path
Natural language: Female head of government
```
?city headOfGovernment ?person .
?person sexOrGender female .
```

#### Sink
Natural language: Actors appearing in both The Hobbit movie and in Avengers
```
hobbit castMember ?actor .
avengers castMember ?actor .
```

In [12]:
// Query shapes for data/cywiki: run countInQueries once per shape predicate
// and chart the resolved values, one bar per shape.
var wiki = 'cywiki'
Promise.all([
    analysis.hasSource,
    analysis.hasPath,
    analysis.hasSink,
    analysis.hasCycle,
].map((shapeTest) => countInQueries('data/' + wiki, shapeTest))).then((counts) => {
    // Labels are listed in the same order as the predicates above.
    const shapeLabels = ['source', 'path', 'sink', 'cycle']
    makePlot([{ x: shapeLabels, y: counts, type: 'bar' }])
})

In [14]:
// Query shapes for data/dewiki: run countInQueries once per shape predicate
// and chart the resolved values, one bar per shape.
var wiki = 'dewiki'
Promise.all([
    analysis.hasSource,
    analysis.hasPath,
    analysis.hasSink,
    analysis.hasCycle,
].map((shapeTest) => countInQueries('data/' + wiki, shapeTest))).then((counts) => {
    // Labels are listed in the same order as the predicates above.
    const shapeLabels = ['source', 'path', 'sink', 'cycle']
    makePlot([{ x: shapeLabels, y: counts, type: 'bar' }])
})

In [15]:
// Query shapes for data/enwiki: run countInQueries once per shape predicate
// and chart the resolved values, one bar per shape.
var wiki = 'enwiki'
Promise.all([
    analysis.hasSource,
    analysis.hasPath,
    analysis.hasSink,
    analysis.hasCycle,
].map((shapeTest) => countInQueries('data/' + wiki, shapeTest))).then((counts) => {
    // Labels are listed in the same order as the predicates above.
    const shapeLabels = ['source', 'path', 'sink', 'cycle']
    makePlot([{ x: shapeLabels, y: counts, type: 'bar' }])
})

In [16]:
// Query shapes for data/euwiki: run countInQueries once per shape predicate
// and chart the resolved values, one bar per shape.
var wiki = 'euwiki'
Promise.all([
    analysis.hasSource,
    analysis.hasPath,
    analysis.hasSink,
    analysis.hasCycle,
].map((shapeTest) => countInQueries('data/' + wiki, shapeTest))).then((counts) => {
    // Labels are listed in the same order as the predicates above.
    const shapeLabels = ['source', 'path', 'sink', 'cycle']
    makePlot([{ x: shapeLabels, y: counts, type: 'bar' }])
})

In [17]:
// Query shapes for data/wikidatawiki: run countInQueries once per shape
// predicate and chart the resolved values, one bar per shape.
var wiki = 'wikidatawiki'
Promise.all([
    analysis.hasSource,
    analysis.hasPath,
    analysis.hasSink,
    analysis.hasCycle,
].map((shapeTest) => countInQueries('data/' + wiki, shapeTest))).then((counts) => {
    // Labels are listed in the same order as the predicates above.
    const shapeLabels = ['source', 'path', 'sink', 'cycle']
    makePlot([{ x: shapeLabels, y: counts, type: 'bar' }])
})

In [17]:
// Query shapes for data/all: run countInQueries once per shape predicate
// and chart the resolved values, one bar per shape.
var wiki = 'all'
Promise.all([
    analysis.hasSource,
    analysis.hasPath,
    analysis.hasSink,
    analysis.hasCycle,
].map((shapeTest) => countInQueries('data/' + wiki, shapeTest))).then((counts) => {
    // Labels are listed in the same order as the predicates above.
    const shapeLabels = ['source', 'path', 'sink', 'cycle']
    makePlot([{ x: shapeLabels, y: counts, type: 'bar' }])
})

In [18]:
// Query shapes for data/normalized: run countInQueries once per shape
// predicate and chart the resolved values, one bar per shape.
var wiki = 'normalized'
Promise.all([
    analysis.hasSource,
    analysis.hasPath,
    analysis.hasSink,
    analysis.hasCycle,
].map((shapeTest) => countInQueries('data/' + wiki, shapeTest))).then((counts) => {
    // Labels are listed in the same order as the predicates above.
    const shapeLabels = ['source', 'path', 'sink', 'cycle']
    makePlot([{ x: shapeLabels, y: counts, type: 'bar' }])
})

## "Backwards" queries

In [19]:
// One bar per dataset: the value countInQueries resolves to when run with
// analysis.hasURISubject over that dataset's directory under data/.
Promise.all(
    ['cywiki', 'dewiki', 'enwiki', 'euwiki', 'wikidatawiki', 'normalized']
        .map((dataset) => countInQueries('data/' + dataset, analysis.hasURISubject))
).then((uriSubjectCounts) => {
    // Display labels, in the same order as the dataset directories above.
    const wikiLabels = ['cywiki', 'dewiki', 'enwiki', 'euwiki', 'wikidata', 'all queries normalized']
    makePlot([{ x: wikiLabels, y: uriSubjectCounts, type: 'bar' }])
})

## Property Paths

In [21]:
// Bar chart of what analysis.getPropertyPathPredicates resolves to for
// data/cywiki: the object's keys become x labels, its values the bar heights.
analysis.getPropertyPathPredicates('data/cywiki').then((properties) => {
    // The original code dropped the first 36 characters of every key, which is
    // exactly the length of this prefix. NOTE(review): presumably the keys are
    // predicate IRIs, mostly in Wikidata's "direct" namespace - confirm.
    const directPrefix = 'http://www.wikidata.org/prop/direct/'
    // Only strip the prefix when it is actually there; a key from any other
    // namespace keeps its full text instead of being cut to a meaningless
    // (possibly empty) label.
    const toLabel = (key) => (key.startsWith(directPrefix) ? key.slice(directPrefix.length) : key)
    makePlot([{
        x: Object.keys(properties).map(toLabel),
        y: Object.values(properties),
        type: 'bar'
    }])
})

In [23]:
// Bar chart of what analysis.getPropertyPathPredicates resolves to for
// data/dewiki: the object's keys become x labels, its values the bar heights.
analysis.getPropertyPathPredicates('data/dewiki').then((properties) => {
    // The original code dropped the first 36 characters of every key, which is
    // exactly the length of this prefix. NOTE(review): presumably the keys are
    // predicate IRIs, mostly in Wikidata's "direct" namespace - confirm.
    const directPrefix = 'http://www.wikidata.org/prop/direct/'
    // Only strip the prefix when it is actually there; a key from any other
    // namespace keeps its full text instead of being cut to a meaningless
    // (possibly empty) label.
    const toLabel = (key) => (key.startsWith(directPrefix) ? key.slice(directPrefix.length) : key)
    makePlot([{
        x: Object.keys(properties).map(toLabel),
        y: Object.values(properties),
        type: 'bar'
    }])
})

In [24]:
// Bar chart of what analysis.getPropertyPathPredicates resolves to for
// data/enwiki: the object's keys become x labels, its values the bar heights.
analysis.getPropertyPathPredicates('data/enwiki').then((properties) => {
    // The original code dropped the first 36 characters of every key, which is
    // exactly the length of this prefix. NOTE(review): presumably the keys are
    // predicate IRIs, mostly in Wikidata's "direct" namespace - confirm.
    const directPrefix = 'http://www.wikidata.org/prop/direct/'
    // Only strip the prefix when it is actually there; a key from any other
    // namespace keeps its full text instead of being cut to a meaningless
    // (possibly empty) label.
    const toLabel = (key) => (key.startsWith(directPrefix) ? key.slice(directPrefix.length) : key)
    makePlot([{
        x: Object.keys(properties).map(toLabel),
        y: Object.values(properties),
        type: 'bar'
    }])
})

In [25]:
// Bar chart of what analysis.getPropertyPathPredicates resolves to for
// data/euwiki: the object's keys become x labels, its values the bar heights.
analysis.getPropertyPathPredicates('data/euwiki').then((properties) => {
    // The original code dropped the first 36 characters of every key, which is
    // exactly the length of this prefix. NOTE(review): presumably the keys are
    // predicate IRIs, mostly in Wikidata's "direct" namespace - confirm.
    const directPrefix = 'http://www.wikidata.org/prop/direct/'
    // Only strip the prefix when it is actually there; a key from any other
    // namespace keeps its full text instead of being cut to a meaningless
    // (possibly empty) label.
    const toLabel = (key) => (key.startsWith(directPrefix) ? key.slice(directPrefix.length) : key)
    makePlot([{
        x: Object.keys(properties).map(toLabel),
        y: Object.values(properties),
        type: 'bar'
    }])
})

In [26]:
// Bar chart of what analysis.getPropertyPathPredicates resolves to for
// data/wikidatawiki: the object's keys become x labels, its values the bar
// heights.
analysis.getPropertyPathPredicates('data/wikidatawiki').then((properties) => {
    // The original code dropped the first 36 characters of every key, which is
    // exactly the length of this prefix. NOTE(review): presumably the keys are
    // predicate IRIs, mostly in Wikidata's "direct" namespace - confirm.
    const directPrefix = 'http://www.wikidata.org/prop/direct/'
    // Only strip the prefix when it is actually there; a key from any other
    // namespace keeps its full text instead of being cut to a meaningless
    // (possibly empty) label.
    const toLabel = (key) => (key.startsWith(directPrefix) ? key.slice(directPrefix.length) : key)
    makePlot([{
        x: Object.keys(properties).map(toLabel),
        y: Object.values(properties),
        type: 'bar'
    }])
})

In [27]:
// Bar chart of what analysis.getPropertyPathPredicates resolves to for
// data/all: the object's keys become x labels, its values the bar heights.
analysis.getPropertyPathPredicates('data/all').then((properties) => {
    // The original code dropped the first 36 characters of every key, which is
    // exactly the length of this prefix. NOTE(review): presumably the keys are
    // predicate IRIs, mostly in Wikidata's "direct" namespace - confirm.
    const directPrefix = 'http://www.wikidata.org/prop/direct/'
    // Only strip the prefix when it is actually there; a key from any other
    // namespace keeps its full text instead of being cut to a meaningless
    // (possibly empty) label.
    const toLabel = (key) => (key.startsWith(directPrefix) ? key.slice(directPrefix.length) : key)
    makePlot([{
        x: Object.keys(properties).map(toLabel),
        y: Object.values(properties),
        type: 'bar'
    }])
})

## Usage by data model part

In [28]:
// Data-model usage for data/cywiki: run countInQueries once per predicate
// (qualifiers, references, sitelinks) and chart the resolved values.
var wiki = 'cywiki'
Promise.all([
    analysis.hasQualifiers,
    analysis.hasReferences,
    analysis.hasSitelinks,
].map((partTest) => countInQueries('data/' + wiki, partTest))).then((counts) => {
    // Labels are listed in the same order as the predicates above.
    const partLabels = ['qualifiers', 'references', 'sitelinks']
    makePlot([{ x: partLabels, y: counts, type: 'bar' }])
})

In [28]:
// Data-model usage for data/dewiki: run countInQueries once per predicate
// (qualifiers, references, sitelinks) and chart the resolved values.
var wiki = 'dewiki'
Promise.all([
    analysis.hasQualifiers,
    analysis.hasReferences,
    analysis.hasSitelinks,
].map((partTest) => countInQueries('data/' + wiki, partTest))).then((counts) => {
    // Labels are listed in the same order as the predicates above.
    const partLabels = ['qualifiers', 'references', 'sitelinks']
    makePlot([{ x: partLabels, y: counts, type: 'bar' }])
})

In [30]:
// Data-model usage for data/enwiki: run countInQueries once per predicate
// (qualifiers, references, sitelinks) and chart the resolved values.
var wiki = 'enwiki'
Promise.all([
    analysis.hasQualifiers,
    analysis.hasReferences,
    analysis.hasSitelinks,
].map((partTest) => countInQueries('data/' + wiki, partTest))).then((counts) => {
    // Labels are listed in the same order as the predicates above.
    const partLabels = ['qualifiers', 'references', 'sitelinks']
    makePlot([{ x: partLabels, y: counts, type: 'bar' }])
})

In [31]:
// Data-model usage for data/euwiki: run countInQueries once per predicate
// (qualifiers, references, sitelinks) and chart the resolved values.
var wiki = 'euwiki'
Promise.all([
    analysis.hasQualifiers,
    analysis.hasReferences,
    analysis.hasSitelinks,
].map((partTest) => countInQueries('data/' + wiki, partTest))).then((counts) => {
    // Labels are listed in the same order as the predicates above.
    const partLabels = ['qualifiers', 'references', 'sitelinks']
    makePlot([{ x: partLabels, y: counts, type: 'bar' }])
})

In [32]:
// Data-model usage for data/wikidatawiki: run countInQueries once per
// predicate (qualifiers, references, sitelinks) and chart the resolved values.
var wiki = 'wikidatawiki'
Promise.all([
    analysis.hasQualifiers,
    analysis.hasReferences,
    analysis.hasSitelinks,
].map((partTest) => countInQueries('data/' + wiki, partTest))).then((counts) => {
    // Labels are listed in the same order as the predicates above.
    const partLabels = ['qualifiers', 'references', 'sitelinks']
    makePlot([{ x: partLabels, y: counts, type: 'bar' }])
})

In [33]:
// Data-model usage for data/all: run countInQueries once per predicate
// (qualifiers, references, sitelinks) and chart the resolved values.
var wiki = 'all'
Promise.all([
    analysis.hasQualifiers,
    analysis.hasReferences,
    analysis.hasSitelinks,
].map((partTest) => countInQueries('data/' + wiki, partTest))).then((counts) => {
    // Labels are listed in the same order as the predicates above.
    const partLabels = ['qualifiers', 'references', 'sitelinks']
    makePlot([{ x: partLabels, y: counts, type: 'bar' }])
})

In [33]:
// Data-model usage for data/normalized: run countInQueries once per predicate
// (qualifiers, references, sitelinks) and chart the resolved values.
var wiki = 'normalized'
Promise.all([
    analysis.hasQualifiers,
    analysis.hasReferences,
    analysis.hasSitelinks,
].map((partTest) => countInQueries('data/' + wiki, partTest))).then((counts) => {
    // Labels are listed in the same order as the predicates above.
    const partLabels = ['qualifiers', 'references', 'sitelinks']
    makePlot([{ x: partLabels, y: counts, type: 'bar' }])
})