In [1]:
const _ = require('lodash')
const traverse = require('traverse')
const sparqljs = require('sparqljs')

const eachFileInDir = require('./eachFileInDir')
const countInQueries = require('./countInQueries')
const namespaces = require('./namespaces')
const parser = new sparqljs.Parser(namespaces)
const Query = require('./Query')

const WIKI = 'data/enwiki'

## Valid Queries

In [2]:
var isValidQuery = (query) => {
    return true // will internally throw an exception when parsed
}

countInQueries(WIKI, isValidQuery)

data/enwiki 2105 2105


## OPTIONAL

In [3]:
var hasOptional = (query) => {
    return query.traverse().reduce(function(acc, node) {
        return acc || this.notLeaf && node.type === 'optional'
    }, false)
}
countInQueries(WIKI, hasOptional)

data/enwiki 2105 568


## FILTER

In [4]:
var hasFilter = (query) => {
    return query.traverse().reduce(function(acc, node) {
        return acc || this.notLeaf && node.type === 'filter'
    }, false)
}
countInQueries(WIKI, hasFilter)

data/enwiki 2105 733


## ORDER BY

In [5]:
var hasOrderBy = (query) => {
    return query.getParsed().order
}
countInQueries(WIKI, hasOrderBy)

data/enwiki 2105 209


## DISTINCT

In [6]:
var hasDistinct = (query) => {
    return query.getParsed().distinct
}
countInQueries(WIKI, hasDistinct)

data/enwiki 2105 169


## GROUP BY

In [7]:
var hasGroupBy = (query) => {
    return query.getParsed().group
}
countInQueries(WIKI, hasGroupBy)

data/enwiki 2105 182


## VALUES

In [8]:
var hasValues = (query) => {
    return query.traverse().reduce(function(acc, node) {
        return acc || this.notLeaf && node.type === 'values'
    }, false)
}
countInQueries(WIKI, hasValues)

data/enwiki 2105 290


## UNION

In [9]:
var hasUnion = (query) => {
    return query.traverse().reduce(function(acc, node) {
        return acc || this.notLeaf && node.type === 'union'
    }, false)
}
countInQueries(WIKI, hasUnion)

data/enwiki 2105 182


## NOT EXIST

In [10]:
var hasNotExist = (query) => {
    return query.traverse().reduce(function(acc, node) {
        return acc || this.notLeaf && node.operator === 'notexists'
    }, false)
}
countInQueries(WIKI, hasNotExist)

data/enwiki 2105 549


## BIND

In [11]:
var hasBind = (query) => {
    return query.traverse().reduce(function(acc, node) {
        return acc || this.notLeaf && node.type === 'bind'
    }, false)
}
countInQueries(WIKI, hasBind)

data/enwiki 2105 159


## MINUS

In [12]:
var hasMinus = (query) => {
    return query.traverse().reduce(function(acc, node) {
        return acc || this.notLeaf && node.type === 'minus'
    }, false)
}
countInQueries(WIKI, hasMinus)

data/enwiki 2105 83


## Subqueries

In [13]:
var hasSubQuery = (query) => {
    return query.traverse().reduce(function(acc, node) {
        return acc || this.level > 0 && this.notLeaf && node.type === 'query'
    }, false)
}
countInQueries(WIKI, hasSubQuery)

data/enwiki 2105 4


## Multiple Subject Queries

In [14]:
var hasMultipleSubjects = (query) => {
    const objects = query.getObjects()
    
    return query.getSubjects().some((subject) => {
        return typeof subject === 'string' && subject.startsWith('?') && objects.includes(subject)
    })
}
countInQueries(WIKI, hasMultipleSubjects)

data/enwiki 2105 903


## Property Path

In [15]:
var hasPropertyPath = (query) => {
    return query.getPredicates().some(predicate => predicate.type === 'path')
}
countInQueries(WIKI, hasPropertyPath)

data/enwiki 2105 223


## Most Common Property Path Predicates

In [16]:
var predicates = []
eachFileInDir(WIKI, (query, resolve) => {
    var query = new Query(query, parser.parse(query))
    
    predicates = traverse(query.getPredicates().filter(predicate => predicate.type === 'path')).reduce((acc, node) => {
        if (typeof node === 'string' && node.includes('prop/direct/P')) {
            acc.push(node)
        }
        
        return acc
    }, predicates)
    
    resolve()
}).then(() => {
    console.log(_.countBy(predicates, _.identity))
})

Promise { <pending> }

{ 'http://www.wikidata.org/prop/direct/P131': 70,
  'http://www.wikidata.org/prop/direct/P361': 40,
  'http://www.wikidata.org/prop/direct/P541': 38,
  'http://www.wikidata.org/prop/direct/P279': 211,
  'http://www.wikidata.org/prop/direct/P360': 1,
  'http://www.wikidata.org/prop/direct/P31': 84,
  'http://www.wikidata.org/prop/direct/P171': 6,
  'http://www.wikidata.org/prop/direct/P106': 16,
  'http://www.wikidata.org/prop/direct/P172': 2,
  'http://www.wikidata.org/prop/direct/P527': 4,
  'http://www.wikidata.org/prop/direct/P150': 13,
  'http://www.wikidata.org/prop/direct/P1343': 1,
  'http://www.wikidata.org/prop/direct/P1142': 4,
  'http://www.wikidata.org/prop/direct/P1387': 4,
  'http://www.wikidata.org/prop/direct/P2632': 1,
  'http://www.wikidata.org/prop/direct/P20': 2,
  'http://www.wikidata.org/prop/direct/P136': 1,
  'http://www.wikidata.org/prop/direct/P17': 1,
  'http://www.wikidata.org/prop/direct/P166': 2,
  'http://www.wikidata.org/prop/direct/P176': 1,
  'http://w

## Object Variable Referencing (TODO: how is this really called?)

In [17]:
// This is about queries with object variables that are references as objects in another triple.
// This functionality is needed for cyclic links to answer questions like "What movies have actors starring together with their children?

var hasObjectReference = (query) => {
    const objectCounts = _.countBy(query.getObjects(), _.identity)
    return _.size(_.pickBy(objectCounts, (count, obj) => count > 1 && _.startsWith(obj, '?'))) > 0
}
countInQueries(WIKI, hasObjectReference)

data/enwiki 2105 170


## Relevant Object Variable References

In [18]:
var hasRelevantObjectReference = (query) => {
    const objectCounts = _.countBy(query.getObjects(), _.identity)
    const referencedObjects = _.pickBy(objectCounts, (count, obj) => count > 1 && _.startsWith(obj, '?'))
    const predicatesWithObj = query.traverse().reduce(function(acc, node) {

      if (typeof node == 'object' && node.object && referencedObjects[node.object]) acc.push(node.predicate)

      return acc
    }, [])
    const relevantPredicatesWithObj = predicatesWithObj.filter(predicate => {
        return traverse(predicate).reduce((acc, node) => {
            return acc || typeof node === 'string' && node.includes('http://www.wikidata.org/prop/direct/')
        }, false)
    })
    
    
    return _.size(relevantPredicatesWithObj) > 1
}
countInQueries(WIKI, hasRelevantObjectReference)

data/enwiki 2105 28


## Cycles

In [19]:
var transitive = (graph) => { // silly
    do {
        var change = false
        
        for (var subject in graph) {
            var subjectReachability = graph[subject]
            for (var i = 0; i < graph[subject].length; i++) {
                if (graph[graph[subject][i]]) {
                    subjectReachability = subjectReachability.concat(graph[graph[subject][i]])
                }
            }
            if (_.uniq(subjectReachability).length > graph[subject].length) {
                change = true
                graph[subject] = _.uniq(subjectReachability)
            }
        }
    } while (change)
        
    return graph
}

var hasCycles = (graph) => {
    for (var subject in graph) {
        if (graph[subject].includes(subject)) return true
    }
    return false
}

var hasCycle = (query) => {
    var reachability = query.traverse().reduce((acc, node) => {
        if (node.subject && node.object.startsWith('?')) {
            if (!acc[node.subject]) acc[node.subject] = []
            acc[node.subject].push(node.object)
        }
        
        return acc
    }, {})

    return hasCycles(transitive(reachability))
}

countInQueries(WIKI, hasCycle)

data/enwiki 2105 0


## wdt:P31/wdt:P279*

In [20]:
var hasInstanceOfSubclassOf = (query) => {
    return query.getPredicates().some(predicate => {
        return ( predicate.type === 'path'
            && predicate.pathType === '/'
            && predicate.items[0] === 'http://www.wikidata.org/prop/direct/P31'
            && predicate.items[1].pathType === '*'
            && predicate.items[1].items[0] === 'http://www.wikidata.org/prop/direct/P279' )
    })
}
countInQueries(WIKI, hasInstanceOfSubclassOf)

data/enwiki 2105 31


## wdt:P279*

In [21]:
var hasSubclassOf = function hasSubclassOf(predicate) {
    if (predicate.type !== 'path') return false
    if (predicate.pathType === '*' && predicate.items.includes('http://www.wikidata.org/prop/direct/P279')) {
        return true
    }
    
    return predicate.items.reduce((any, predicate) => { return any || hasSubclassOf(predicate) }, false)
}
var hasInstanceOfSubclassOf = (query) => {
    return query.getPredicates().some(hasSubclassOf)
}
countInQueries(WIKI, hasInstanceOfSubclassOf)

data/enwiki 2105 129


## Use of Qualifiers

In [22]:
var hasQualifiers = (query) => {
    return query.getRaw().includes('pq:P')
}

countInQueries(WIKI, hasQualifiers)

data/enwiki 2105 114


## Use of References

In [23]:
var hasReferences = (query) => {
    return query.getRaw().includes('pr:P')
}

countInQueries(WIKI, hasReferences)

data/enwiki 2105 2


## Sitelinks

In [24]:
var hasSitelinks = (query) => {
    return query.getRaw().includes('schema:about')
}

countInQueries(WIKI, hasSitelinks)

data/enwiki 2105 641


In [25]:
var predicates = {}

eachFileInDir(WIKI, (query, resolve) => {
    var wikibasePredicates = query.match(/wikibase:\w+/g)
    if (wikibasePredicates) wikibasePredicates.forEach((predicate) => {
        if (predicates[predicate]) predicates[predicate]++
        else predicates[predicate] = 1
    })
    resolve()
}).then(() => {
    var tuples = []
    for (var predicate in predicates) tuples.push([predicate, predicates[predicate]])
    console.log(tuples.sort((a, b) => b[1] - a[1]).slice(0, 5))
})

Promise { <pending> }

[ [ 'wikibase:sitelinks', 399 ],
  [ 'wikibase:language', 316 ],
  [ 'wikibase:label', 316 ],
  [ 'wikibase:statements', 51 ],
  [ 'wikibase:timeValue', 12 ] ]


## Features per Query

In [26]:
var allFeatures = [
    hasOptional, hasFilter, hasOrderBy,
    hasValues, hasUnion, hasMinus,
    hasSubQuery, hasMultipleSubjects, hasPropertyPath,
    hasSubclassOf, hasQualifiers, hasReferences,
    hasGroupBy
]
var featuresPerQuery = (new Array(12)).fill(0)

eachFileInDir(WIKI, (query, resolve) => {
    let features = 0
    allFeatures.forEach((hasFeature) => {
        if (hasFeature(new Query(query, parser.parse(query)))) features++
    })
    featuresPerQuery[features]++
    resolve()
}).then(() => {
    console.log(featuresPerQuery)
})

// EN: [ 432, 111, 248, 117, 71, 9, 3, 1, 0, 0, 0, 0 ]
// DE: [ 85,  451,  74,  40,  7, 3, 0, 9, 0, 0, 0, 0 ]

// Updated
// EN: [ 432, 101, 222, 103, 79, 40, 9, 4, 2, 0, 0, 0 ]
// DE: [ 84, 343, 170, 27, 26, 7, 3, 9, 0, 0, 0, 0 ]

Promise { <pending> }

## Finding a subset of features that is sufficient for most queries

In [31]:
var allFeatures = [
    hasOptional, hasFilter, hasOrderBy,
    hasValues, hasUnion, hasMinus,
    hasSubQuery, 
    hasGroupBy, hasPropertyPath
]
var subset = [
    hasOptional, hasFilter,
    hasValues, hasOrderBy, hasPropertyPath
];
var queriesProduceable = 0

var isProduceableWithSubset = (query) => {
    let featuresUsed = 0
    let subsetFeaturesUsed = 0

    allFeatures.forEach((hasFeature) => {
        if (hasFeature(query)) featuresUsed++
    })
    subset.forEach((hasFeature) => {
        if (hasFeature(query)) subsetFeaturesUsed++
    })
    
    return featuresUsed === subsetFeaturesUsed
}

countInQueries(WIKI, isProduceableWithSubset)

[ 1185, 2410, 958, 958, 716, 815, 158, 93, 18, 38, 1, 0 ]
[ 1411, 2975, 1121, 1270, 901, 869, 170, 96, 18, 38, 2, 0 ]
data/wikidatawiki 7349 4337


## Queries with Wikidata-URI subjects (not variables)

In [30]:
var hasURISubject = (query) => {
    return query.getSubjects().some(subject => subject.startsWith('http://www.wikidata.org/entity/Q'))
}

countInQueries(WIKI, hasURISubject)

data/sparql_queries_dewiki 669 0
data/sparql_queries_enwiki 992 2
