# SPARQL data analysis

In [1]:
const _ = require('lodash')
const traverse = require('traverse')
const sparqljs = require('sparqljs')

const eachFileInDir = require('./eachFileInDir')
const countInQueries = require('./countInQueries')
const namespaces = require('./namespaces')
const parser = new sparqljs.Parser(namespaces)
const Query = require('./Query')

/**
 * Render a Plotly chart in the notebook output.
 *
 * Emits a container element with a random numeric id plus a script that loads
 * the 'plotly' RequireJS module and calls plotly.newPlot on that container.
 *
 * @param {*} data - value serialised with JSON.stringify and passed as the
 *     second argument of plotly.newPlot.
 */
function makePlot(data) {
    // The same id is used for the container and for the newPlot target.
    const containerId = Math.round(Math.random() * 10000)
    const serializedTraces = JSON.stringify(data)
    const markup = `
        <div id="${containerId}"/>
        <script>
        requirejs( ['plotly'], function(plotly) {
            plotly.newPlot('${containerId}', ${serializedTraces})
        } )
        </script>
    `
    $$.html(markup)
}
// Register the Plotly bundle with RequireJS under the module id 'plotly', which
// is the id makePlot requests via requirejs(['plotly'], ...).
// NOTE(review): the '?noext' suffix presumably stops RequireJS from appending
// '.js' to the URL - confirm. Also, 'plotly-latest' is believed to be frozen at
// the last v1 release; consider pinning an explicit version - TODO confirm.
$$.html(`<script>
requirejs.config({
    paths: {
        plotly: 'https://cdn.plot.ly/plotly-latest.min.js?noext',
    },
});
</script>`)

/**
 * Predicate that accepts every query it is given.
 *
 * Per the original author's note, invalid queries never reach this point
 * because parsing throws first.
 *
 * @param {*} query - ignored.
 * @returns {boolean} always true.
 */
var isValidQuery = (query) => true
/**
 * Reports whether any non-leaf node of the traversed query has type 'optional'.
 *
 * @param {object} query - object whose traverse() result exposes reduce().
 * @returns {*} truthy once a matching node has been seen, falsy otherwise.
 */
var hasOptional = (query) => {
    // Must stay a `function`: `this` is the traversal context (notLeaf).
    const sawOptional = function (found, node) {
        if (found) return found
        return this.notLeaf && node.type === 'optional'
    }
    return query.traverse().reduce(sawOptional, false)
}
/**
 * Reports whether any non-leaf node of the traversed query has type 'filter'.
 *
 * @param {object} query - object whose traverse() result exposes reduce().
 * @returns {*} truthy once a matching node has been seen, falsy otherwise.
 */
var hasFilter = (query) => {
    // Must stay a `function`: `this` is the traversal context (notLeaf).
    const sawFilter = function (found, node) {
        if (found) return found
        return this.notLeaf && node.type === 'filter'
    }
    return query.traverse().reduce(sawFilter, false)
}
/**
 * Reports whether the parsed query carries a truthy `order` property.
 *
 * @param {object} query - object exposing getParsed().
 * @returns {boolean} true when `order` is set on the parsed query.
 */
var hasOrderBy = (query) => {
    // Coerce so the predicate yields a strict boolean like its siblings
    // instead of leaking the raw parsed value (or undefined).
    return Boolean(query.getParsed().order)
}
/**
 * Reports whether the parsed query carries a truthy `distinct` property.
 *
 * @param {object} query - object exposing getParsed().
 * @returns {boolean} true when `distinct` is set on the parsed query.
 */
var hasDistinct = (query) => {
    // Coerce so a missing property yields false rather than undefined.
    return Boolean(query.getParsed().distinct)
}
/**
 * Reports whether the parsed query carries a truthy `group` property.
 *
 * @param {object} query - object exposing getParsed().
 * @returns {boolean} true when `group` is set on the parsed query.
 */
var hasGroupBy = (query) => {
    // Coerce so the predicate yields a strict boolean like its siblings
    // instead of leaking the raw parsed value (or undefined).
    return Boolean(query.getParsed().group)
}
/**
 * Reports whether any non-leaf node of the traversed query has type 'values'.
 *
 * @param {object} query - object whose traverse() result exposes reduce().
 * @returns {*} truthy once a matching node has been seen, falsy otherwise.
 */
var hasValues = (query) => {
    // Must stay a `function`: `this` is the traversal context (notLeaf).
    const sawValues = function (found, node) {
        if (found) return found
        return this.notLeaf && node.type === 'values'
    }
    return query.traverse().reduce(sawValues, false)
}
/**
 * Reports whether any non-leaf node of the traversed query has type 'union'.
 *
 * @param {object} query - object whose traverse() result exposes reduce().
 * @returns {*} truthy once a matching node has been seen, falsy otherwise.
 */
var hasUnion = (query) => {
    // Must stay a `function`: `this` is the traversal context (notLeaf).
    const sawUnion = function (found, node) {
        if (found) return found
        return this.notLeaf && node.type === 'union'
    }
    return query.traverse().reduce(sawUnion, false)
}
/**
 * Reports whether any non-leaf node of the traversed query has its `operator`
 * property equal to 'notexists'.
 *
 * @param {object} query - object whose traverse() result exposes reduce().
 * @returns {*} truthy once a matching node has been seen, falsy otherwise.
 */
var hasNotExist = (query) => {
    // Must stay a `function`: `this` is the traversal context (notLeaf).
    const sawNotExists = function (found, node) {
        if (found) return found
        return this.notLeaf && node.operator === 'notexists'
    }
    return query.traverse().reduce(sawNotExists, false)
}
/**
 * Reports whether any non-leaf node of the traversed query has type 'bind'.
 *
 * @param {object} query - object whose traverse() result exposes reduce().
 * @returns {*} truthy once a matching node has been seen, falsy otherwise.
 */
var hasBind = (query) => {
    // Must stay a `function`: `this` is the traversal context (notLeaf).
    const sawBind = function (found, node) {
        if (found) return found
        return this.notLeaf && node.type === 'bind'
    }
    return query.traverse().reduce(sawBind, false)
}
/**
 * Reports whether any non-leaf node of the traversed query has type 'minus'.
 *
 * @param {object} query - object whose traverse() result exposes reduce().
 * @returns {*} truthy once a matching node has been seen, falsy otherwise.
 */
var hasMinus = (query) => {
    // Must stay a `function`: `this` is the traversal context (notLeaf).
    const sawMinus = function (found, node) {
        if (found) return found
        return this.notLeaf && node.type === 'minus'
    }
    return query.traverse().reduce(sawMinus, false)
}
/**
 * Reports whether a non-leaf node of type 'query' occurs below the root of
 * the traversed query (traversal level greater than 0).
 *
 * @param {object} query - object whose traverse() result exposes reduce().
 * @returns {*} truthy once a matching node has been seen, falsy otherwise.
 */
var hasSubQuery = (query) => {
    // Must stay a `function`: `this` is the traversal context (level, notLeaf).
    const sawNestedQuery = function (found, node) {
        if (found) return found
        return this.level > 0 && this.notLeaf && node.type === 'query'
    }
    return query.traverse().reduce(sawNestedQuery, false)
}
/**
 * Reports whether at least one predicate of the query has type 'path'.
 *
 * @param {object} query - object exposing getPredicates().
 * @returns {boolean} true when some predicate's `type` equals 'path'.
 */
var hasPropertyPath = (query) => {
    const isPath = ({ type }) => type === 'path'
    return query.getPredicates().some(isPath)
}

// shapes
/**
 * Reports whether some string subject starting with '?' also appears among
 * the query's objects.
 *
 * @param {object} query - object exposing getObjects() and getSubjects().
 * @returns {boolean} true when such a shared term exists.
 */
var hasPath = (query) => {
    const objectTerms = query.getObjects()

    const isSharedVariable = (term) => {
        if (typeof term !== 'string') return false
        if (!term.startsWith('?')) return false
        return objectTerms.includes(term)
    }

    return query.getSubjects().some(isSharedVariable)
}
/**
 * Reports whether some object term starting with '?' occurs more than once
 * among the query's objects.
 *
 * @param {object} query - object exposing getObjects().
 * @returns {boolean} true when at least one such repeated term exists.
 */
var hasSink = (query) => {
    const occurrences = _.countBy(query.getObjects(), _.identity)
    const repeatedVariables = _.pickBy(
        occurrences,
        (count, term) => _.startsWith(term, '?') && count > 1
    )
    return _.size(repeatedVariables) > 0
}
/**
 * Reports whether some subject term starting with '?' occurs more than once
 * among the query's subjects.
 *
 * @param {object} query - object exposing getSubjects().
 * @returns {boolean} true when at least one such repeated term exists.
 */
var hasSource = (query) => {
    const occurrences = _.countBy(query.getSubjects(), _.identity)
    const repeatedVariables = _.pickBy(
        occurrences,
        (count, term) => _.startsWith(term, '?') && count > 1
    )
    return _.size(repeatedVariables) > 0
}
/**
 * Expands an adjacency map in place until it is transitively closed.
 *
 * Naive fixed-point iteration: each pass appends, for every key, the
 * adjacency lists of its current neighbours, and passes repeat until no
 * list grows any more.
 *
 * @param {Object<string, string[]>} graph - key -> list of reachable keys;
 *     mutated by this function.
 * @returns {Object<string, string[]>} the same `graph` object.
 */
var transitive = (graph) => {
    let grew = true

    while (grew) {
        grew = false

        for (const node in graph) {
            const direct = graph[node]
            let reachable = direct

            for (const neighbour of direct) {
                // Only neighbours that are themselves keys contribute edges.
                if (graph[neighbour]) {
                    reachable = reachable.concat(graph[neighbour])
                }
            }

            const deduped = _.uniq(reachable)
            if (deduped.length > direct.length) {
                grew = true
                graph[node] = deduped
            }
        }
    }

    return graph
}

/**
 * Reports whether any key of the adjacency map lists itself as reachable.
 *
 * @param {Object<string, string[]>} graph - key -> list of reachable keys.
 * @returns {boolean} true when some key appears in its own list.
 */
var hasCycles = (graph) => {
    let selfReachable = false

    for (const node in graph) {
        if (graph[node].includes(node)) {
            selfReachable = true
            break
        }
    }

    return selfReachable
}

/**
 * Reports whether the query's triples form a cycle.
 *
 * Builds a subject -> objects adjacency map from every traversed node that
 * has a `subject` and a string `object` starting with '?', closes it with
 * transitive(), and checks it with hasCycles().
 *
 * @param {object} query - object whose traverse() result exposes reduce().
 * @returns {boolean} result of hasCycles() on the closed adjacency map.
 */
var hasCycle = (query) => {
    var reachability = query.traverse().reduce((acc, node) => {
        // The traversal visits every node, so guard against null/undefined
        // nodes and non-string objects instead of throwing a TypeError.
        if (node && node.subject && typeof node.object === 'string' && node.object.startsWith('?')) {
            if (!acc[node.subject]) acc[node.subject] = []
            acc[node.subject].push(node.object)
        }

        return acc
    }, {})

    return hasCycles(transitive(reachability))
}

// concrete Item
/**
 * Reports whether some subject is a string starting with
 * 'http://www.wikidata.org/entity/Q'.
 *
 * @param {object} query - object exposing getSubjects().
 * @returns {boolean} true when at least one subject matches.
 */
var hasURISubject = (query) => {
    // Same string guard hasPath applies to getSubjects(): non-string
    // subjects are skipped instead of throwing on `.startsWith`.
    return query.getSubjects().some((subject) => {
        return typeof subject === 'string' && subject.startsWith('http://www.wikidata.org/entity/Q')
    })
}

// Property paths
/**
 * Tallies the strings containing 'prop/direct/P' that occur inside the
 * 'path'-typed predicates of every query handed over by eachFileInDir.
 *
 * @param {string} wiki - directory passed through to eachFileInDir.
 * @returns {Promise<Object<string, number>>} resolves to a map from each
 *     collected string to the number of times it was collected.
 */
function getPropertyPathPredicates(wiki) {
    const collected = []

    const collectFromQuery = (rawQuery, done) => {
        const parsedQuery = new Query(rawQuery, parser.parse(rawQuery))
        const pathPredicates = parsedQuery.getPredicates().filter(({ type }) => type === 'path')

        // Walk every node of the path predicates and keep the matching strings.
        traverse(pathPredicates).reduce((found, node) => {
            const isDirectProperty = typeof node === 'string' && node.includes('prop/direct/P')
            if (isDirectProperty) {
                found.push(node)
            }

            return found
        }, collected)

        done()
    }

    return eachFileInDir(wiki, collectFromQuery).then(() => _.countBy(collected, _.identity))
}

// Wikidata features
/**
 * Reports whether the raw query text contains the substring 'pq:P'.
 *
 * @param {object} query - object exposing getRaw().
 * @returns {boolean} result of the substring check.
 */
var hasQualifiers = (query) => query.getRaw().includes('pq:P')
/**
 * Reports whether the raw query text contains the substring 'pr:P'.
 *
 * @param {object} query - object exposing getRaw().
 * @returns {boolean} result of the substring check.
 */
var hasReferences = (query) => query.getRaw().includes('pr:P')
/**
 * Reports whether the raw query text contains the substring 'schema:about'.
 *
 * @param {object} query - object exposing getRaw().
 * @returns {boolean} result of the substring check.
 */
var hasSitelinks = (query) => query.getRaw().includes('schema:about')

## Queries per Wiki

In [6]:
// Run isValidQuery over each wiki's data directory via countInQueries and
// plot the five results as one bar chart.
Promise.all(
    ['cywiki', 'dewiki', 'enwiki', 'euwiki', 'wikidatawiki']
        .map((wikiName) => countInQueries('data/' + wikiName, isValidQuery))
).then((totals) => {
    makePlot([{
        x: ['cywiki', 'dewiki', 'enwiki', 'euwiki', 'wikidata'],
        y: totals,
        type: 'bar'
    }])
})

## Features used

In [14]:
// Run every feature predicate over data/cywiki via countInQueries and plot
// the results as a bar chart, one bar per feature.
var wiki = 'cywiki'
Promise.all([
    hasOptional,
    hasFilter,
    hasOrderBy,
    hasDistinct,
    hasGroupBy,
    hasValues,
    hasUnion,
    hasMinus,
    hasBind,
    hasSubQuery,
    hasPropertyPath,
].map((usesFeature) => countInQueries('data/' + wiki, usesFeature))).then((counts) => {
    makePlot([{
        x: ['optional', 'filter', 'order by', 'distinct', 'group by', 'values', 'union', 'minus', 'bind', 'sub query', 'property path'],
        y: counts,
        type: 'bar'
    }])
})

In [15]:
// Run every feature predicate over data/dewiki via countInQueries and plot
// the results as a bar chart, one bar per feature.
var wiki = 'dewiki'
Promise.all([
    hasOptional,
    hasFilter,
    hasOrderBy,
    hasDistinct,
    hasGroupBy,
    hasValues,
    hasUnion,
    hasMinus,
    hasBind,
    hasSubQuery,
    hasPropertyPath,
].map((usesFeature) => countInQueries('data/' + wiki, usesFeature))).then((counts) => {
    makePlot([{
        x: ['optional', 'filter', 'order by', 'distinct', 'group by', 'values', 'union', 'minus', 'bind', 'sub query', 'property path'],
        y: counts,
        type: 'bar'
    }])
})

In [16]:
// Run every feature predicate over data/enwiki via countInQueries and plot
// the results as a bar chart, one bar per feature.
var wiki = 'enwiki'
Promise.all([
    hasOptional,
    hasFilter,
    hasOrderBy,
    hasDistinct,
    hasGroupBy,
    hasValues,
    hasUnion,
    hasMinus,
    hasBind,
    hasSubQuery,
    hasPropertyPath,
].map((usesFeature) => countInQueries('data/' + wiki, usesFeature))).then((counts) => {
    makePlot([{
        x: ['optional', 'filter', 'order by', 'distinct', 'group by', 'values', 'union', 'minus', 'bind', 'sub query', 'property path'],
        y: counts,
        type: 'bar'
    }])
})

In [18]:
// Run every feature predicate over data/euwiki via countInQueries and plot
// the results as a bar chart, one bar per feature.
var wiki = 'euwiki'
Promise.all([
    hasOptional,
    hasFilter,
    hasOrderBy,
    hasDistinct,
    hasGroupBy,
    hasValues,
    hasUnion,
    hasMinus,
    hasBind,
    hasSubQuery,
    hasPropertyPath,
].map((usesFeature) => countInQueries('data/' + wiki, usesFeature))).then((counts) => {
    makePlot([{
        x: ['optional', 'filter', 'order by', 'distinct', 'group by', 'values', 'union', 'minus', 'bind', 'sub query', 'property path'],
        y: counts,
        type: 'bar'
    }])
})

In [18]:
// Run every feature predicate over data/wikidatawiki via countInQueries and
// plot the results as a bar chart, one bar per feature.
var wiki = 'wikidatawiki'
Promise.all([
    hasOptional,
    hasFilter,
    hasOrderBy,
    hasDistinct,
    hasGroupBy,
    hasValues,
    hasUnion,
    hasMinus,
    hasBind,
    hasSubQuery,
    hasPropertyPath,
].map((usesFeature) => countInQueries('data/' + wiki, usesFeature))).then((counts) => {
    makePlot([{
        x: ['optional', 'filter', 'order by', 'distinct', 'group by', 'values', 'union', 'minus', 'bind', 'sub query', 'property path'],
        y: counts,
        type: 'bar'
    }])
})

In [7]:
// Run every feature predicate over data/all via countInQueries and plot the
// results as a bar chart, one bar per feature.
var wiki = 'all'
Promise.all([
    hasOptional,
    hasFilter,
    hasOrderBy,
    hasDistinct,
    hasGroupBy,
    hasValues,
    hasUnion,
    hasMinus,
    hasBind,
    hasSubQuery,
    hasPropertyPath,
].map((usesFeature) => countInQueries('data/' + wiki, usesFeature))).then((counts) => {
    makePlot([{
        x: ['optional', 'filter', 'order by', 'distinct', 'group by', 'values', 'union', 'minus', 'bind', 'sub query', 'property path'],
        y: counts,
        type: 'bar'
    }])
})

## Query Shapes

### Examples
The examples below are written in pseudo-Turtle.

#### Star/source
Natural language: Things that are cities and are located in Germany.
```
?foo instanceOf city .
?foo locatedIn germany .
```

#### Cycle
Natural language: Inventors killed by their own invention
```
?invention inventedBy ?inventor .
?inventor causeOfDeath ?invention .
```

#### Path
Natural language: Female head of government
```
?city headOfGovernment ?person .
?person sexOrGender female .
```

#### Sink
Natural language: Actors appearing in both The Hobbit movie and in Avengers
```
hobbit castMember ?actor .
avengers castMember ?actor .
```

In [3]:
// Run the four shape predicates over data/cywiki via countInQueries and plot
// the results as a bar chart.
var wiki = 'cywiki'
Promise.all(
    [hasSource, hasPath, hasSink, hasCycle].map((matchesShape) => countInQueries('data/' + wiki, matchesShape))
).then((counts) => {
    makePlot([{
        x: ['source', 'path', 'sink', 'cycle'],
        y: counts,
        type: 'bar'
    }])
})

In [2]:
// Run the four shape predicates over data/dewiki via countInQueries and plot
// the results as a bar chart.
var wiki = 'dewiki'
Promise.all(
    [hasSource, hasPath, hasSink, hasCycle].map((matchesShape) => countInQueries('data/' + wiki, matchesShape))
).then((counts) => {
    makePlot([{
        x: ['source', 'path', 'sink', 'cycle'],
        y: counts,
        type: 'bar'
    }])
})

In [4]:
// Run the four shape predicates over data/enwiki via countInQueries and plot
// the results as a bar chart.
var wiki = 'enwiki'
Promise.all(
    [hasSource, hasPath, hasSink, hasCycle].map((matchesShape) => countInQueries('data/' + wiki, matchesShape))
).then((counts) => {
    makePlot([{
        x: ['source', 'path', 'sink', 'cycle'],
        y: counts,
        type: 'bar'
    }])
})

In [5]:
// Run the four shape predicates over data/euwiki via countInQueries and plot
// the results as a bar chart.
var wiki = 'euwiki'
Promise.all(
    [hasSource, hasPath, hasSink, hasCycle].map((matchesShape) => countInQueries('data/' + wiki, matchesShape))
).then((counts) => {
    makePlot([{
        x: ['source', 'path', 'sink', 'cycle'],
        y: counts,
        type: 'bar'
    }])
})

In [6]:
// Run the four shape predicates over data/wikidatawiki via countInQueries and
// plot the results as a bar chart.
var wiki = 'wikidatawiki'
Promise.all(
    [hasSource, hasPath, hasSink, hasCycle].map((matchesShape) => countInQueries('data/' + wiki, matchesShape))
).then((counts) => {
    makePlot([{
        x: ['source', 'path', 'sink', 'cycle'],
        y: counts,
        type: 'bar'
    }])
})

In [8]:
// Run the four shape predicates over data/all via countInQueries and plot the
// results as a bar chart.
var wiki = 'all'
Promise.all(
    [hasSource, hasPath, hasSink, hasCycle].map((matchesShape) => countInQueries('data/' + wiki, matchesShape))
).then((counts) => {
    makePlot([{
        x: ['source', 'path', 'sink', 'cycle'],
        y: counts,
        type: 'bar'
    }])
})

## "Backwards" queries

In [3]:
// Run hasURISubject over each wiki's data directory via countInQueries and
// plot the five results as one bar chart.
Promise.all(
    ['cywiki', 'dewiki', 'enwiki', 'euwiki', 'wikidatawiki']
        .map((wikiName) => countInQueries('data/' + wikiName, hasURISubject))
).then((totals) => {
    makePlot([{
        x: ['cywiki', 'dewiki', 'enwiki', 'euwiki', 'wikidata'],
        y: totals,
        type: 'bar'
    }])
})

## Property Paths

In [8]:
// Plot the tallies from getPropertyPathPredicates for data/cywiki; each label
// is the key with its first 36 characters removed.
getPropertyPathPredicates('data/cywiki').then((properties) => {
    const usage = Object.entries(properties)
    makePlot([{
        x: usage.map(([iri]) => iri.slice(36)),
        y: usage.map(([, count]) => count),
        type: 'bar'
    }])
})

In [7]:
// Plot the tallies from getPropertyPathPredicates for data/dewiki; each label
// is the key with its first 36 characters removed.
getPropertyPathPredicates('data/dewiki').then((properties) => {
    const usage = Object.entries(properties)
    makePlot([{
        x: usage.map(([iri]) => iri.slice(36)),
        y: usage.map(([, count]) => count),
        type: 'bar'
    }])
})

In [9]:
// Plot the tallies from getPropertyPathPredicates for data/enwiki; each label
// is the key with its first 36 characters removed.
getPropertyPathPredicates('data/enwiki').then((properties) => {
    const usage = Object.entries(properties)
    makePlot([{
        x: usage.map(([iri]) => iri.slice(36)),
        y: usage.map(([, count]) => count),
        type: 'bar'
    }])
})

In [10]:
// Plot the tallies from getPropertyPathPredicates for data/euwiki; each label
// is the key with its first 36 characters removed.
getPropertyPathPredicates('data/euwiki').then((properties) => {
    const usage = Object.entries(properties)
    makePlot([{
        x: usage.map(([iri]) => iri.slice(36)),
        y: usage.map(([, count]) => count),
        type: 'bar'
    }])
})

In [11]:
// Plot the tallies from getPropertyPathPredicates for data/wikidatawiki; each
// label is the key with its first 36 characters removed.
getPropertyPathPredicates('data/wikidatawiki').then((properties) => {
    const usage = Object.entries(properties)
    makePlot([{
        x: usage.map(([iri]) => iri.slice(36)),
        y: usage.map(([, count]) => count),
        type: 'bar'
    }])
})

In [9]:
// Plot the tallies from getPropertyPathPredicates for data/all; each label is
// the key with its first 36 characters removed.
getPropertyPathPredicates('data/all').then((properties) => {
    const usage = Object.entries(properties)
    makePlot([{
        x: usage.map(([iri]) => iri.slice(36)),
        y: usage.map(([, count]) => count),
        type: 'bar'
    }])
})

## Usage by data model part

In [4]:
// Run the three data-model predicates over data/cywiki via countInQueries and
// plot the results as a bar chart.
var wiki = 'cywiki'
Promise.all(
    [hasQualifiers, hasReferences, hasSitelinks].map((usesPart) => countInQueries('data/' + wiki, usesPart))
).then((counts) => {
    makePlot([{
        x: ['qualifiers', 'references', 'sitelinks'],
        y: counts,
        type: 'bar'
    }])
})

In [2]:
// Run the three data-model predicates over data/dewiki via countInQueries and
// plot the results as a bar chart.
var wiki = 'dewiki'
Promise.all(
    [hasQualifiers, hasReferences, hasSitelinks].map((usesPart) => countInQueries('data/' + wiki, usesPart))
).then((counts) => {
    makePlot([{
        x: ['qualifiers', 'references', 'sitelinks'],
        y: counts,
        type: 'bar'
    }])
})

In [4]:
// Run the three data-model predicates over data/enwiki via countInQueries and
// plot the results as a bar chart.
var wiki = 'enwiki'
Promise.all(
    [hasQualifiers, hasReferences, hasSitelinks].map((usesPart) => countInQueries('data/' + wiki, usesPart))
).then((counts) => {
    makePlot([{
        x: ['qualifiers', 'references', 'sitelinks'],
        y: counts,
        type: 'bar'
    }])
})

In [5]:
// Run the three data-model predicates over data/euwiki via countInQueries and
// plot the results as a bar chart.
var wiki = 'euwiki'
Promise.all(
    [hasQualifiers, hasReferences, hasSitelinks].map((usesPart) => countInQueries('data/' + wiki, usesPart))
).then((counts) => {
    makePlot([{
        x: ['qualifiers', 'references', 'sitelinks'],
        y: counts,
        type: 'bar'
    }])
})

In [6]:
// Run the three data-model predicates over data/wikidatawiki via
// countInQueries and plot the results as a bar chart.
var wiki = 'wikidatawiki'
Promise.all(
    [hasQualifiers, hasReferences, hasSitelinks].map((usesPart) => countInQueries('data/' + wiki, usesPart))
).then((counts) => {
    makePlot([{
        x: ['qualifiers', 'references', 'sitelinks'],
        y: counts,
        type: 'bar'
    }])
})

In [10]:
// Run the three data-model predicates over data/all via countInQueries and
// plot the results as a bar chart.
var wiki = 'all'
Promise.all(
    [hasQualifiers, hasReferences, hasSitelinks].map((usesPart) => countInQueries('data/' + wiki, usesPart))
).then((counts) => {
    makePlot([{
        x: ['qualifiers', 'references', 'sitelinks'],
        y: counts,
        type: 'bar'
    }])
})