In [2]:
const _ = require('lodash')
const traverse = require('traverse')
const sparqljs = require('sparqljs')

const eachFileInDir = require('./eachFileInDir')
const countInQueries = require('./countInQueries')
const namespaces = require('./namespaces')
const parser = new sparqljs.Parser(namespaces)
const Query = require('./Query')

const WIKI = 'data/normalized'

## Valid Queries

In [3]:
/**
 * Predicate that accepts every query it is given.
 *
 * The argument is never inspected; per the original note, invalid queries
 * are expected to fail earlier, by throwing while they are parsed.
 *
 * @param {Object} query - ignored
 * @returns {boolean} always `true`
 */
var isValidQuery = (query) => true

countInQueries(WIKI, isValidQuery)

data/normalized 2484 2484


## OPTIONAL

In [4]:
/**
 * Reports whether the query's traversal visits a non-leaf node whose
 * `type` is `'optional'`.
 *
 * @param {Object} query - object exposing `traverse()`; the returned walker
 *   must call the `reduce` callback with the traversal context as `this`
 * @returns {boolean} truthy once such a node has been seen
 */
var hasOptional = (query) => {
    const walker = query.traverse()

    // Must be a regular function: `notLeaf` is read from the walker-supplied `this`.
    return walker.reduce(function (found, node) {
        if (found) return found
        return this.notLeaf && node.type === 'optional'
    }, false)
}
countInQueries(WIKI, hasOptional)

data/normalized 2484 1148


## FILTER

In [9]:
/**
 * Reports whether the query's traversal visits a non-leaf node whose
 * `type` is `'filter'`.
 *
 * @param {Object} query - object exposing `traverse()`; the returned walker
 *   must call the `reduce` callback with the traversal context as `this`
 * @returns {boolean} truthy once such a node has been seen
 */
var hasFilter = (query) => {
    const walker = query.traverse()

    // Must be a regular function: `notLeaf` is read from the walker-supplied `this`.
    return walker.reduce(function (found, node) {
        if (found) return found
        return this.notLeaf && node.type === 'filter'
    }, false)
}
countInQueries(WIKI, hasFilter)

data/normalized 2484 1078


## ORDER BY

In [10]:
/**
 * Returns the `order` property of the parsed query.
 *
 * The value is handed back unchanged, so callers get whatever the parsed
 * representation holds there (or `undefined` when the property is absent)
 * and are expected to test it for truthiness.
 *
 * @param {Object} query - object exposing `getParsed()`
 * @returns {*} the parsed query's `order` value
 */
var hasOrderBy = (query) => {
    const { order } = query.getParsed()
    return order
}
countInQueries(WIKI, hasOrderBy)

data/normalized 2484 904


## DISTINCT

In [11]:
/**
 * Returns the `distinct` property of the parsed query.
 *
 * The value is handed back unchanged (or `undefined` when the property is
 * absent); callers are expected to test it for truthiness.
 *
 * @param {Object} query - object exposing `getParsed()`
 * @returns {*} the parsed query's `distinct` value
 */
var hasDistinct = (query) => {
    const { distinct } = query.getParsed()
    return distinct
}
countInQueries(WIKI, hasDistinct)

data/normalized 2484 657


## GROUP BY

In [12]:
/**
 * Returns the `group` property of the parsed query.
 *
 * The value is handed back unchanged (or `undefined` when the property is
 * absent); callers are expected to test it for truthiness.
 *
 * @param {Object} query - object exposing `getParsed()`
 * @returns {*} the parsed query's `group` value
 */
var hasGroupBy = (query) => {
    const { group } = query.getParsed()
    return group
}
countInQueries(WIKI, hasGroupBy)

data/normalized 2484 368


## VALUES

In [25]:
/**
 * Reports whether the query's traversal visits a non-leaf node whose
 * `type` is `'values'`.
 *
 * @param {Object} query - object exposing `traverse()`; the returned walker
 *   must call the `reduce` callback with the traversal context as `this`
 * @returns {boolean} truthy once such a node has been seen
 */
var hasValues = (query) => {
    const walker = query.traverse()

    // Must be a regular function: `notLeaf` is read from the walker-supplied `this`.
    return walker.reduce(function (found, node) {
        if (found) return found
        return this.notLeaf && node.type === 'values'
    }, false)
}
countInQueries(WIKI, hasValues)

data/normalized 2484 337


## UNION

In [26]:
/**
 * Reports whether the query's traversal visits a non-leaf node whose
 * `type` is `'union'`.
 *
 * @param {Object} query - object exposing `traverse()`; the returned walker
 *   must call the `reduce` callback with the traversal context as `this`
 * @returns {boolean} truthy once such a node has been seen
 */
var hasUnion = (query) => {
    const walker = query.traverse()

    // Must be a regular function: `notLeaf` is read from the walker-supplied `this`.
    return walker.reduce(function (found, node) {
        if (found) return found
        return this.notLeaf && node.type === 'union'
    }, false)
}
countInQueries(WIKI, hasUnion)

data/normalized 2484 475


## NOT EXISTS

In [11]:
/**
 * Reports whether the query's traversal visits a non-leaf node whose
 * `operator` is `'notexists'`.
 *
 * Unlike the sibling feature checks, this one looks at the node's
 * `operator` property rather than its `type`.
 *
 * @param {Object} query - object exposing `traverse()`; the returned walker
 *   must call the `reduce` callback with the traversal context as `this`
 * @returns {boolean} truthy once such a node has been seen
 */
var hasNotExist = (query) => {
    const walker = query.traverse()

    // Must be a regular function: `notLeaf` is read from the walker-supplied `this`.
    return walker.reduce(function (found, node) {
        if (found) return found
        return this.notLeaf && node.operator === 'notexists'
    }, false)
}
countInQueries(WIKI, hasNotExist)

data/normalized 2484 364


## BIND

In [12]:
/**
 * Reports whether the query's traversal visits a non-leaf node whose
 * `type` is `'bind'`.
 *
 * @param {Object} query - object exposing `traverse()`; the returned walker
 *   must call the `reduce` callback with the traversal context as `this`
 * @returns {boolean} truthy once such a node has been seen
 */
var hasBind = (query) => {
    const walker = query.traverse()

    // Must be a regular function: `notLeaf` is read from the walker-supplied `this`.
    return walker.reduce(function (found, node) {
        if (found) return found
        return this.notLeaf && node.type === 'bind'
    }, false)
}
countInQueries(WIKI, hasBind)

data/normalized 2484 618


## MINUS

In [28]:
/**
 * Reports whether the query's traversal visits a non-leaf node whose
 * `type` is `'minus'`.
 *
 * @param {Object} query - object exposing `traverse()`; the returned walker
 *   must call the `reduce` callback with the traversal context as `this`
 * @returns {boolean} truthy once such a node has been seen
 */
var hasMinus = (query) => {
    const walker = query.traverse()

    // Must be a regular function: `notLeaf` is read from the walker-supplied `this`.
    return walker.reduce(function (found, node) {
        if (found) return found
        return this.notLeaf && node.type === 'minus'
    }, false)
}
countInQueries(WIKI, hasMinus)

data/normalized 2484 465


## Subqueries

In [29]:
/**
 * Reports whether the query's traversal visits a nested node whose `type`
 * is `'query'`.
 *
 * Nodes at traversal level 0 are excluded, so a root node of type
 * `'query'` does not count as a sub-query of itself.
 *
 * @param {Object} query - object exposing `traverse()`; the returned walker
 *   must call the `reduce` callback with the traversal context as `this`
 * @returns {boolean} truthy once a non-leaf `'query'` node below level 0 is seen
 */
var hasSubQuery = (query) => {
    const walker = query.traverse()

    // Must be a regular function: `level` and `notLeaf` come from the walker-supplied `this`.
    return walker.reduce(function (found, node) {
        if (found) return found
        return this.level > 0 && this.notLeaf && node.type === 'query'
    }, false)
}
countInQueries(WIKI, hasSubQuery)

data/normalized 2484 165


## Multiple Subject Queries

In [13]:
/**
 * Reports whether some variable is used both as a subject and as an object
 * in the query.
 *
 * A subject counts only when it is a string starting with `?` and the very
 * same value also appears among the query's objects.
 *
 * @param {Object} query - object exposing `getObjects()` and `getSubjects()`,
 *   each returning an array
 * @returns {boolean} `true` when at least one such variable exists
 */
var hasMultipleSubjects = (query) => {
    // Build the lookup once so each subject costs a constant-time membership
    // test instead of a scan over every object.
    const objects = new Set(query.getObjects())

    return query.getSubjects().some((subject) => {
        return typeof subject === 'string' && subject.startsWith('?') && objects.has(subject)
    })
}
countInQueries(WIKI, hasMultipleSubjects)

data/normalized 2483 1475


## Property Path

In [14]:
/**
 * Reports whether at least one predicate of the query has `type === 'path'`.
 *
 * @param {Object} query - object exposing `getPredicates()`, returning an array
 * @returns {boolean} `true` when a predicate with `type` `'path'` exists
 */
var hasPropertyPath = (query) => {
    const isPath = (predicate) => predicate.type === 'path'
    const predicates = query.getPredicates()

    return predicates.some(isPath)
}
countInQueries(WIKI, hasPropertyPath)

data/normalized 2483 849


## Most Common Property Path Predicates

In [15]:
// Every string containing 'prop/direct/P' found inside a property-path
// predicate, accumulated across all files (duplicates kept for counting).
var predicates = []
eachFileInDir(WIKI, (rawQuery, resolve) => {
    const parsedQuery = new Query(rawQuery, parser.parse(rawQuery))
    const pathPredicates = parsedQuery.getPredicates().filter((predicate) => predicate.type === 'path')

    // Walk every node nested in the path predicates and keep the matching strings.
    predicates = traverse(pathPredicates).reduce((collected, node) => {
        const isDirectProperty = typeof node === 'string' && node.includes('prop/direct/P')
        if (isDirectProperty) collected.push(node)

        return collected
    }, predicates)

    resolve()
}).then(() => {
    // Print how often each collected string occurred.
    console.log(_.countBy(predicates, _.identity))
})

{ 'http://www.wikidata.org/prop/direct/P777': 2328 }


## Object Variable Referencing (TODO: how is this really called?)

In [16]:
// This is about queries with object variables that are referenced as objects in another triple.
// This functionality is needed for cyclic links to answer questions like "What movies have actors starring together with their children?"

/**
 * Reports whether some variable occurs more than once among the query's
 * objects.
 *
 * Objects are tallied by value; the result is `true` when any tallied key
 * starts with `?` and has a count above one.
 *
 * @param {Object} query - object exposing `getObjects()`, returning an array
 * @returns {boolean} `true` when a `?`-prefixed object appears at least twice
 */
var hasObjectReference = (query) => {
    const occurrences = _.countBy(query.getObjects(), _.identity)

    return Object.keys(occurrences).some((obj) => occurrences[obj] > 1 && obj.startsWith('?'))
}
countInQueries(WIKI, hasObjectReference)

data/normalized 2483 442


## Relevant Object Variable References

In [17]:
/**
 * Reports whether a repeated object variable is reached through at least
 * two predicates that involve a Wikidata direct property.
 *
 * Steps:
 *  1. find the `?`-prefixed objects that occur more than once;
 *  2. collect the `predicate` of every traversed node whose `object` is one
 *     of them;
 *  3. keep the predicates that contain, at any depth, a string including
 *     `http://www.wikidata.org/prop/direct/`;
 *  4. answer `true` when more than one predicate remains.
 *
 * @param {Object} query - object exposing `getObjects()` (array) and
 *   `traverse()` (walker with a `reduce` method)
 * @returns {boolean} `true` when more than one relevant predicate was found
 */
var hasRelevantObjectReference = (query) => {
    const DIRECT_PROPERTY_PREFIX = 'http://www.wikidata.org/prop/direct/'

    const objectCounts = _.countBy(query.getObjects(), _.identity)
    // A Set gives exact membership: a plain-object lookup would also match
    // inherited keys such as 'constructor'.
    const referencedObjects = new Set(
        Object.keys(objectCounts).filter((obj) => objectCounts[obj] > 1 && obj.startsWith('?'))
    )

    const predicatesWithObj = query.traverse().reduce(function (acc, node) {
        // `typeof null` is 'object', so null must be excluded before reading `node.object`.
        const isObjectNode = node !== null && typeof node === 'object'
        if (isObjectNode && referencedObjects.has(node.object)) acc.push(node.predicate)

        return acc
    }, [])

    // A predicate may be a plain string or a nested structure, so every
    // node inside it is inspected.
    const relevantPredicatesWithObj = predicatesWithObj.filter((predicate) => {
        return traverse(predicate).reduce((acc, node) => {
            return acc || typeof node === 'string' && node.includes(DIRECT_PROPERTY_PREFIX)
        }, false)
    })

    return relevantPredicatesWithObj.length > 1
}
countInQueries(WIKI, hasRelevantObjectReference)

data/normalized 2483 236


## Cycles

In [18]:
/**
 * Expands an adjacency map, in place, into its transitive closure.
 *
 * Each pass adds to every subject the successors of its current
 * successors; passes repeat until nothing grows any more. Growth is
 * measured on the set of distinct successors, so adjacency lists that
 * contain the same node several times (two triples linking the same pair)
 * do not hide newly reachable nodes.
 *
 * A list is replaced (by a duplicate-free one, in first-seen order) only
 * when it actually grew; lists that do not grow are left untouched.
 *
 * @param {Object<string, Array<string>>} graph - subject -> directly
 *   reachable nodes; mutated
 * @returns {Object<string, Array<string>>} the same `graph` object
 */
var transitive = (graph) => {
    // Own-key check so names like 'constructor' are never treated as graph nodes.
    const isSubject = (node) => Object.prototype.hasOwnProperty.call(graph, node)

    let changed
    do {
        changed = false

        for (const subject of Object.keys(graph)) {
            const reachable = new Set(graph[subject])
            const knownBefore = reachable.size

            for (const successor of graph[subject]) {
                if (isSubject(successor)) {
                    graph[successor].forEach((node) => reachable.add(node))
                }
            }

            if (reachable.size > knownBefore) {
                changed = true
                graph[subject] = [...reachable]
            }
        }
    } while (changed)

    return graph
}

/**
 * Reports whether any node of the graph lists itself among the nodes it
 * can reach.
 *
 * @param {Object<string, Array<string>>} graph - node -> reachable nodes
 * @returns {boolean} `true` when some key is contained in its own list
 */
var hasCycles = (graph) => {
    return Object.keys(graph).some((node) => graph[node].includes(node))
}

/**
 * Reports whether the variables of a query are linked in a cycle.
 *
 * Builds a subject -> objects adjacency map from every traversed node that
 * has a `subject` and a string `object` starting with `?`, then asks
 * `hasCycles` about the result of `transitive` on that map.
 *
 * Traversal nodes that are `null`/`undefined`, or whose `object` is not a
 * string, are skipped instead of raising a TypeError.
 *
 * @param {Object} query - object exposing `traverse()` (walker with a
 *   `reduce` method)
 * @returns {boolean} the result of `hasCycles`
 */
var hasCycle = (query) => {
    const reachability = query.traverse().reduce((graph, node) => {
        // Leaves (strings, numbers, null, ...) cannot be triples.
        if (node === null || typeof node !== 'object') return graph

        const linksToVariable = typeof node.object === 'string' && node.object.startsWith('?')
        if (node.subject && linksToVariable) {
            if (!graph[node.subject]) graph[node.subject] = []
            graph[node.subject].push(node.object)
        }

        return graph
    }, {})

    return hasCycles(transitive(reachability))
}

countInQueries(WIKI, hasCycle)

data/normalized 2483 21


## Use of Qualifiers

In [20]:
/**
 * Reports whether the raw query text contains the substring `pq:P`.
 *
 * @param {Object} query - object exposing `getRaw()`
 * @returns {boolean} `true` when `pq:P` occurs in the raw text
 */
var hasQualifiers = (query) => {
    const rawText = query.getRaw()
    return rawText.includes('pq:P')
}

countInQueries(WIKI, hasQualifiers)

data/normalized 2484 497


## Use of References

In [21]:
/**
 * Reports whether the raw query text contains the substring `pr:P`.
 *
 * @param {Object} query - object exposing `getRaw()`
 * @returns {boolean} `true` when `pr:P` occurs in the raw text
 */
var hasReferences = (query) => {
    const rawText = query.getRaw()
    return rawText.includes('pr:P')
}

countInQueries(WIKI, hasReferences)

data/normalized 2484 23


## Sitelinks

In [22]:
/**
 * Reports whether the raw query text contains the substring `schema:about`.
 *
 * @param {Object} query - object exposing `getRaw()`
 * @returns {boolean} `true` when `schema:about` occurs in the raw text
 */
var hasSitelinks = (query) => {
    const rawText = query.getRaw()
    return rawText.includes('schema:about')
}

countInQueries(WIKI, hasSitelinks)

data/normalized 2484 518


In [23]:
// Occurrence count of every `wikibase:<word>` token, keyed by the token.
var predicates = {}

eachFileInDir(WIKI, (query, resolve) => {
    // `match` yields null when nothing is found; treat that as "no tokens".
    const tokens = query.match(/wikibase:\w+/g) || []
    for (const token of tokens) {
        predicates[token] = (predicates[token] || 0) + 1
    }
    resolve()
}).then(() => {
    const tuples = Object.keys(predicates).map((token) => [token, predicates[token]])
    // Most frequent first; only the top five are printed.
    console.log(tuples.sort((a, b) => b[1] - a[1]).slice(0, 5))
})

[
  [ 'wikibase:label', 616 ],
  [ 'wikibase:language', 616 ],
  [ 'wikibase:sitelinks', 275 ],
  [ 'wikibase:statements', 125 ],
  [ 'wikibase:timePrecision', 43 ]
]


## Features per Query

In [32]:
// Feature predicates whose simultaneous use is tallied per query.
var allFeatures = [
    hasOptional, hasFilter, hasOrderBy,
    hasValues, hasUnion, hasMinus,
    hasSubQuery, hasMultipleSubjects, hasPropertyPath,
    hasQualifiers, hasReferences,
    hasGroupBy
]
// Histogram: index = number of features a query uses, value = number of
// such queries. A query can use anything from 0 up to ALL features, so one
// more bucket than there are features is required; with a fixed size equal
// to the feature count, a query using every feature would increment an
// undefined slot and produce NaN.
var featuresPerQuery = new Array(allFeatures.length + 1).fill(0)

eachFileInDir(WIKI, (query, resolve) => {
    // Parse once per file and share the result between all feature checks.
    const parsedQuery = new Query(query, parser.parse(query))
    const features = allFeatures.filter((hasFeature) => hasFeature(parsedQuery)).length

    featuresPerQuery[features]++
    resolve()
}).then(() => {
    console.log(featuresPerQuery)
})

[
  133, 372, 491, 496, 424,
  311, 157,  69,   8,  21,
    1,   0
]


## Finding a subset of features that is sufficient for most queries

In [33]:
// Feature predicates taken into account by this cell (this redeclares the
// `allFeatures` variable with a different list).
var allFeatures = [
    hasOptional, hasFilter, hasOrderBy,
    hasValues, hasUnion, hasMinus,
    hasSubQuery, 
    hasGroupBy, hasPropertyPath
]
// Candidate feature subset under evaluation; every entry also appears in
// `allFeatures` above.
var subset = [
    hasOptional, hasFilter,
    hasValues, hasOrderBy, hasPropertyPath
];
// NOTE(review): not referenced by the code visible in this cell — confirm
// whether it is still needed.
var queriesProduceable = 0

/**
 * Reports whether a query uses as many features from `subset` as it uses
 * from `allFeatures`.
 *
 * Both lists are read from the enclosing scope. Every predicate of
 * `allFeatures` is evaluated first, then every predicate of `subset`; a
 * predicate counts when its result is truthy.
 *
 * @param {Object} query - value handed to each feature predicate
 * @returns {boolean} `true` when both counts are equal
 */
var isProduceableWithSubset = (query) => {
    const countUsed = (features) => features.filter((hasFeature) => hasFeature(query)).length

    const featuresUsed = countUsed(allFeatures)
    const subsetFeaturesUsed = countUsed(subset)

    return featuresUsed === subsetFeaturesUsed
}

countInQueries(WIKI, isProduceableWithSubset)

data/normalized 2483 1340


## Queries with Wikidata-URI subjects (not variables)

In [31]:
/**
 * Reports whether at least one subject of the query is a string starting
 * with `http://www.wikidata.org/entity/Q`.
 *
 * Non-string subjects are skipped (the same guard `hasMultipleSubjects`
 * applies) instead of raising a TypeError on the `startsWith` call.
 *
 * @param {Object} query - object exposing `getSubjects()`, returning an array
 * @returns {boolean} `true` when such a subject exists
 */
var hasURISubject = (query) => {
    const ENTITY_PREFIX = 'http://www.wikidata.org/entity/Q'

    return query.getSubjects().some((subject) => {
        return typeof subject === 'string' && subject.startsWith(ENTITY_PREFIX)
    })
}

countInQueries(WIKI, hasURISubject)

data/normalized 2483 34
