In [1]:
const _ = require('lodash')
const traverse = require('traverse')
const sparqljs = require('sparqljs')

const eachFileInDir = require('./eachFileInDir')
const countInQueries = require('./countInQueries')
const namespaces = require('./namespaces')
const parser = new sparqljs.Parser(namespaces)
const Query = require('./Query')

const EN_SPARQL = 'data/sparql_queries_enwiki'
const DE_SPARQL = 'data/sparql_queries_dewiki'

## Valid Queries

In [2]:
var isValidQuery = (query) => {
    return true // will internally throw an exception when parsed
}

countInQueries(EN_SPARQL, isValidQuery)
countInQueries(DE_SPARQL, isValidQuery)

data/sparql_queries_dewiki 669 669
data/sparql_queries_enwiki 992 992


## OPTIONAL

In [3]:
var hasOptional = (query) => {
    return query.traverse().reduce(function(acc, node) {
        return acc || this.notLeaf && node.type === 'optional'
    }, false)
}
countInQueries(EN_SPARQL, hasOptional)
countInQueries(DE_SPARQL, hasOptional)

data/sparql_queries_dewiki 669 40
data/sparql_queries_enwiki 992 144


## FILTER

In [4]:
var hasFilter = (query) => {
    return query.traverse().reduce(function(acc, node) {
        return acc || this.notLeaf && node.type === 'filter'
    }, false)
}
countInQueries(EN_SPARQL, hasFilter)
countInQueries(DE_SPARQL, hasFilter)

data/sparql_queries_dewiki 669 51
data/sparql_queries_enwiki 992 375


## ORDER BY

In [5]:
var hasOrderBy = (query) => {
    return query.getParsed().order
}
countInQueries(EN_SPARQL, hasOrderBy)
countInQueries(DE_SPARQL, hasOrderBy)

data/sparql_queries_dewiki 669 27
data/sparql_queries_enwiki 992 98


## DISTINCT

In [6]:
var hasDistinct = (query) => {
    return query.getParsed().distinct
}
countInQueries(EN_SPARQL, hasDistinct)
countInQueries(DE_SPARQL, hasDistinct)

data/sparql_queries_dewiki 669 33
data/sparql_queries_enwiki 992 46


## GROUP BY

In [7]:
var hasGroupBy = (query) => {
    return query.getParsed().group
}
countInQueries(EN_SPARQL, hasGroupBy)
countInQueries(DE_SPARQL, hasGroupBy)

data/sparql_queries_dewiki 669 14
data/sparql_queries_enwiki 992 76


## VALUES

In [8]:
var hasValues = (query) => {
    return query.traverse().reduce(function(acc, node) {
        return acc || this.notLeaf && node.type === 'values'
    }, false)
}
countInQueries(EN_SPARQL, hasValues)
countInQueries(DE_SPARQL, hasValues)

data/sparql_queries_dewiki 669 13
data/sparql_queries_enwiki 992 57


## UNION

In [9]:
var hasUnion = (query) => {
    return query.traverse().reduce(function(acc, node) {
        return acc || this.notLeaf && node.type === 'union'
    }, false)
}
countInQueries(EN_SPARQL, hasUnion)
countInQueries(DE_SPARQL, hasUnion)

data/sparql_queries_dewiki 669 148
data/sparql_queries_enwiki 992 86


## MINUS

In [10]:
var hasMinus = (query) => {
    return query.traverse().reduce(function(acc, node) {
        return acc || this.notLeaf && node.type === 'minus'
    }, false)
}
countInQueries(EN_SPARQL, hasMinus)
countInQueries(DE_SPARQL, hasMinus)

data/sparql_queries_dewiki 669 16
data/sparql_queries_enwiki 992 32


## Subqueries

In [11]:
var hasSubQuery = (query) => {
    return query.traverse().reduce(function(acc, node) {
        return acc || this.level > 0 && this.notLeaf && node.type === 'query'
    }, false)
}
countInQueries(EN_SPARQL, hasSubQuery)
countInQueries(DE_SPARQL, hasSubQuery)

data/sparql_queries_dewiki 669 23
data/sparql_queries_enwiki 992 16


## Multiple Subject Queries

In [12]:
var hasMultipleSubjects = (query) => {
    const objects = query.getObjects()
    
    return query.getSubjects().some((subject) => {
        return typeof subject === 'string' && subject.startsWith('?') && objects.includes(subject)
    })
}
countInQueries(EN_SPARQL, hasMultipleSubjects)
countInQueries(DE_SPARQL, hasMultipleSubjects)

data/sparql_queries_dewiki 669 99
data/sparql_queries_enwiki 992 430


## Property Path

In [13]:
var hasPropertyPath = (query) => {
    return query.getPredicates().some(predicate => predicate.type === 'path')
}
countInQueries(EN_SPARQL, hasPropertyPath)
countInQueries(DE_SPARQL, hasPropertyPath)

data/sparql_queries_dewiki 669 531
data/sparql_queries_enwiki 992 113


## Object Variable Referencing (TODO: how is this really called?)

In [14]:
// This is about queries with object variables that are references as objects in another triple.
// This functionality is needed for cyclic links to answer questions like "What movies have actors starring together with their children?

var hasObjectReference = (query) => {
    const objectCounts = _.countBy(query.getObjects(), _.identity)
    return _.size(_.pickBy(objectCounts, (count, obj) => count > 1 && _.startsWith(obj, '?'))) > 0
}
countInQueries(EN_SPARQL, hasObjectReference)
countInQueries(DE_SPARQL, hasObjectReference)

data/sparql_queries_dewiki 669 477
data/sparql_queries_enwiki 992 256


## Relevant Object Variable References

In [15]:
var hasRelevantObjectReference = (query) => {
    const objectCounts = _.countBy(query.getObjects(), _.identity)
    const referencedObjects = _.pickBy(objectCounts, (count, obj) => count > 1 && _.startsWith(obj, '?'))
    const predicatesWithObj = query.traverse().reduce(function(acc, node) {

      if (typeof node == 'object' && node.object && referencedObjects[node.object]) acc.push(node.predicate)

      return acc
    }, [])
    const relevantPredicatesWithObj = predicatesWithObj.filter(predicate => {
        return traverse(predicate).reduce((acc, node) => {
            return acc || typeof node === 'string' && node.includes('http://www.wikidata.org/prop/direct/')
        }, false)
    })
    
    
    return _.size(relevantPredicatesWithObj) > 1
}
countInQueries(EN_SPARQL, hasRelevantObjectReference)
countInQueries(DE_SPARQL, hasRelevantObjectReference)

data/sparql_queries_dewiki 669 465
data/sparql_queries_enwiki 992 17


## wdt:P31/wdt:P279*

In [16]:
var hasInstanceOfSubclassOf = (query) => {
    return query.getPredicates().some(predicate => {
        return ( predicate.type === 'path'
            && predicate.pathType === '/'
            && predicate.items[0] === 'http://www.wikidata.org/prop/direct/P31'
            && predicate.items[1].pathType === '*'
            && predicate.items[1].items[0] === 'http://www.wikidata.org/prop/direct/P279' )
    })
}
countInQueries(EN_SPARQL, hasInstanceOfSubclassOf)
countInQueries(DE_SPARQL, hasInstanceOfSubclassOf)

data/sparql_queries_dewiki 669 25
data/sparql_queries_enwiki 992 22


## wdt:P279*

In [17]:
var hasSubclassOf = function hasSubclassOf(predicate) {
    if (predicate.type !== 'path') return false
    if (predicate.pathType === '*' && predicate.items.includes('http://www.wikidata.org/prop/direct/P279')) {
        return true
    }
    
    return predicate.items.reduce((any, predicate) => { return any || hasSubclassOf(predicate) }, false)
}
var hasInstanceOfSubclassOf = (query) => {
    return query.getPredicates().some(hasSubclassOf)
}
countInQueries(EN_SPARQL, hasInstanceOfSubclassOf)
countInQueries(DE_SPARQL, hasInstanceOfSubclassOf)

data/sparql_queries_dewiki 669 74
data/sparql_queries_enwiki 992 68


## Use of Qualifiers

In [18]:
var hasQualifiers = (query) => {
    return query.getRaw().includes('pq:P')
}

countInQueries(EN_SPARQL, hasQualifiers)
countInQueries(DE_SPARQL, hasQualifiers)

data/sparql_queries_dewiki 669 22
data/sparql_queries_enwiki 992 37


## Use of References

In [19]:
var hasReferences = (query) => {
    return query.getRaw().includes('pr:P')
}

countInQueries(EN_SPARQL, hasReferences)
countInQueries(DE_SPARQL, hasReferences)

data/sparql_queries_dewiki 669 0
data/sparql_queries_enwiki 992 4


In [20]:
var predicates = {}

eachFileInDir(DE_SPARQL, (query, resolve) => {
    var wikibasePredicates = query.match(/wikibase:\w+/g)
    if (wikibasePredicates) wikibasePredicates.forEach((predicate) => {
        if (predicates[predicate]) predicates[predicate]++
        else predicates[predicate] = 1
    })
    resolve()
}).then(() => {
    var tuples = []
    for (var predicate in predicates) tuples.push([predicate, predicates[predicate]])
    console.log(tuples.sort((a, b) => b[1] - a[1]).slice(0, 5))
})

Promise { <pending> }

[ [ 'wikibase:label', 13 ],
  [ 'wikibase:language', 13 ],
  [ 'wikibase:rank', 9 ],
  [ 'wikibase:DeprecatedRank', 9 ],
  [ 'wikibase:around', 6 ] ]


## Features per Query

In [21]:
var allFeatures = [
    hasOptional, hasFilter, hasOrderBy,
    hasValues, hasUnion, hasMinus,
    hasSubQuery, hasMultipleSubjects, hasPropertyPath,
    hasSubclassOf, hasQualifiers, hasReferences,
    hasGroupBy
]
var featuresPerQuery = (new Array(12)).fill(0)

eachFileInDir(EN_SPARQL, (query, resolve) => {
    let features = 0
    allFeatures.forEach((hasFeature) => {
        if (hasFeature(new Query(query, parser.parse(query)))) features++
    })
    featuresPerQuery[features]++
    resolve()
}).then(() => {
    console.log(featuresPerQuery)
})

// EN: [ 432, 111, 248, 117, 71, 9, 3, 1, 0, 0, 0, 0 ]
// DE: [ 85,  451,  74,  40,  7, 3, 0, 9, 0, 0, 0, 0 ]

// Updated
// EN: [ 432, 101, 222, 103, 79, 40, 9, 4, 2, 0, 0, 0 ]
// DE: [ 84, 343, 170, 27, 26, 7, 3, 9, 0, 0, 0, 0 ]

Promise { <pending> }

[ 432, 101, 222, 103, 79, 40, 9, 4, 2, 0, 0, 0 ]


## Finding a subset of features that is sufficient for most queries

In [22]:
var allFeatures = [
    hasOptional, hasFilter, hasOrderBy,
    hasValues, hasUnion, hasMinus,
    hasSubQuery, hasMultipleSubjects, hasPropertyPath,
    hasSubclassOf, hasQualifiers, hasReferences,
    hasGroupBy
]
var subset = [
    hasOptional, hasFilter, hasMultipleSubjects,
    hasQualifiers, hasValues, hasOrderBy,
    hasUnion
];
var queriesProduceable = 0

var isProduceableWithSubset = (query) => {
    let featuresUsed = 0
    let subsetFeaturesUsed = 0

    allFeatures.forEach((hasFeature) => {
        if (hasFeature(query)) featuresUsed++
    })
    subset.forEach((hasFeature) => {
        if (hasFeature(query)) subsetFeaturesUsed++
    })
    
    return featuresUsed === subsetFeaturesUsed
}

countInQueries(EN_SPARQL, isProduceableWithSubset)
countInQueries(DE_SPARQL, isProduceableWithSubset)

data/sparql_queries_dewiki 669 129
data/sparql_queries_enwiki 992 797


## Queries with Wikidata-URI subjects (not variables)

In [23]:
var hasURISubject = (query) => {
    return query.getSubjects().some(subject => subject.startsWith('http://www.wikidata.org/entity/Q'))
}

countInQueries(EN_SPARQL, hasURISubject)
countInQueries(DE_SPARQL, hasURISubject)

data/sparql_queries_dewiki 669 0
data/sparql_queries_enwiki 992 2
