Skip to content

Commit

Permalink
Working FAST reconcile.
Browse files Browse the repository at this point in the history
  • Loading branch information
lawlesst committed Aug 13, 2013
1 parent 13f4c41 commit cbcb6df
Show file tree
Hide file tree
Showing 3 changed files with 88 additions and 91 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@
*pyc
fast_cache.sqlite
12 changes: 8 additions & 4 deletions README.md
@@ -1,10 +1,14 @@
A Google Refine reconciliation service that queries the API provided by the [JournalTOCs](http://www.journaltocs.ac.uk/api_help.php?subAction=journals) project.
An OpenRefine reconciliation service for [FAST](http://www.oclc.org/research/activities/fast.html?urlm=159754).

You will need to register for the API at http://www.journaltocs.ac.uk/api_help.php?subAction=journals.
>FAST is available as Linked Data, which is an approach to publishing data which enhances the utility of information on the web by making references to persons, places, things, etc. more consistent and linkable across domains.
Run as:
The service queries the [FAST AutoSuggest API](http://www.oclc.org/developer/documentation/fast-linked-data-api/request-types) and provides normalized scores across queries for reconciling in Refine.

* As of 8/13/13, only Corporate Name queries are supported. This will be expanded soon.

Run locally as:
~~~~
$ python reconcile.py --debug -u your_api_email@none.com
$ python reconcile.py --debug
~~~~

Michael Stephens wrote a [demo reconciliation service](https://github.com/mikejs/reconcile-demo) that this code is based on.
165 changes: 78 additions & 87 deletions reconcile.py
@@ -1,13 +1,12 @@
"""
A Google Refine reconciliation service for the API provided by
the JournalTOCs project.
An OpenRefine reconciliation service for the API provided by
OCLC for FAST.
See API documentation:
http://www.journaltocs.ac.uk/api_help.php?subAction=journals
http://www.oclc.org/developer/documentation/fast-linked-data-api/request-types
An example reconciliation service API for Google Refine 2.0.
See http://code.google.com/p/google-refine/wiki/ReconciliationServiceApi.
This code is adapted from Michael Stephens:
https://github.com/mikejs/reconcile-demo
"""

from flask import Flask
Expand All @@ -18,32 +17,49 @@
from operator import itemgetter
import urllib

import feedparser
#For scoring results
from fuzzywuzzy import fuzz
import requests
import requests_cache
requests_cache.install_cache('fast_corporate_cache')

import text

#Flask application object; the reconcile route is registered on it
#with @app.route.
app = Flask(__name__)

#some config
#Endpoint of the FAST suggest API that search() sends its queries to.
api_base_url = 'http://fast.oclc.org/searchfast/fastsuggest'
#For constructing links to FAST.
#{0} is filled in by make_uri() with the cleaned-up FAST id.
fast_uri_base = 'http://id.worldcat.org/fast/{0}'

#If it's installed, use the requests_cache library to
#cache calls to the FAST API.
#The cache is registered under the name 'fast_cache'; when the import
#fails the service keeps running and only logs a debug message.
try:
import requests_cache
requests_cache.install_cache('fast_cache')
except ImportError:
app.logger.debug("No request cache found.")
pass

#Helper text processing
import text

# Basic service metadata. There are a number of other documented options
# but this is all we need for a simple service.
metadata = {
    "name": "Fast Corporate Name Reconciliation Service",
    #ToDo add support for all types.
    # Exactly one "defaultTypes" entry is kept. A dict literal silently
    # discards earlier duplicates of a key, so a second entry listed before
    # this one would never be visible to clients.
    "defaultTypes": [
        {"id": "/fast/corporate-name", "name": "Corporate Name"},
    ],
}

api_base_url = 'http://fast.oclc.org/searchfast/fastsuggest'

fast_uri_base = 'http://id.worldcat.org/fast/{0}'


def make_uri(fast_id):
    """
    Prepare a FAST url from the ID returned by the API.

    The literal 'fst' prefix (when present) and any leading zeros are
    removed from fast_id, and the remainder is substituted into
    fast_uri_base. For example 'fst00012345' becomes
    'http://id.worldcat.org/fast/12345'.

    fast_id -- the id string as returned by the API.
    Returns the FAST url as a string.
    """
    prefix = 'fst'
    # str.lstrip('fst') would strip any run of the characters f, s and t,
    # not the prefix itself, so the prefix is removed explicitly.
    if fast_id.startswith(prefix):
        fid = fast_id[len(prefix):]
    else:
        fid = fast_id
    # Here lstrip is the right tool: drop every leading '0' character.
    fid = fid.lstrip('0')
    return fast_uri_base.format(fid)


def jsonpify(obj):
"""
Helper to support JSONP
Expand All @@ -56,81 +72,63 @@ def jsonpify(obj):
except KeyError:
return jsonify(obj)

#skip these terms for lookup
#Tokens equal to one of these entries are too generic to be worth
#sending to the API on their own.
skip_words = [
    'the university of',
    'univ',
    'univer',
    'universi',
    # The comma after 'university' matters: without it Python joins the
    # adjacent literals into the single entry 'universityof'.
    'university',
    'of',
    'the',
]


def search(raw_query):
"""
Hit the FAST API for names.

The query is normalized with text.normalize, sent to the FAST suggest
endpoint (api_base_url), and the returned docs are turned into
candidate dicts with the keys "id", "name", "score", "match" and
"type". Candidates are sorted by "score", highest first, before being
returned.

NOTE(review): this span comes from a rendered diff, so statements from
two revisions of the function appear one after another (for example
two request blocks and two return statements). Confirm statement order
against the real file before relying on it.
"""
out = []
#Hit the suggest api for each token
#tokens = [text.normalize(t) for t in text.tokenize(raw_query)]
tokens = []
done = False
query_scrubbed = text.normalize(raw_query).replace('the university of', 'university of').strip()
#minimum of 4 characters
for i in xrange(4, len(query_scrubbed) + 2, 2):
tokens.append(''.join(query_scrubbed[:i]))
for token in [query_scrubbed]:
if done is True:
break
if token in skip_words:
unique_fast_ids = []
query = text.normalize(raw_query).replace('the university of', 'university of').strip()
try:
#FAST api requires spaces to be encoded as %20 rather than +
url = api_base_url + '?query=' + urllib.quote(query) + '&rows=30&queryReturn=suggestall%2Cidroot%2Cauth%2cscore&suggest=autoSubject&queryIndex=suggest10&wt=json'
resp = requests.get(url)
results = resp.json()
except Exception, e:
#Log the failure and hand back whatever has been collected so far.
app.logger.warning(e)
return out
for position, item in enumerate(results['response']['docs']):
match = False
name = item.get('auth')
alternate = item.get('suggestall')
if (len(alternate) > 0):
alt = alternate[0]
else:
alt = ''
fid = item.get('idroot')
fast_uri = make_uri(fid)
#The FAST service returns many duplicates. Avoid returning many of the
#same result
if fid in unique_fast_ids:
continue
try:
#FAST api requires spaces to be encoded as %20 rather than +
url = api_base_url + '?query=' + urllib.quote(token) + '&rows=30&queryReturn=suggestall%2Cidroot%2Cauth%2cscore&suggest=autoSubject&queryIndex=suggest10&wt=json'
resp = requests.get(url)
results = resp.json()
except Exception, e:
print e
for position, item in enumerate(results['response']['docs']):
match = False
score2 = 0
name = item.get('auth')
alternate = item.get('suggestall')
score = item.get('score')
if (len(alternate) > 0):
alt = alternate[0]
else:
alt = ''
pid = item.get('idroot')
normal_query = text.normalize(raw_query)
if normal_query == text.normalize(name):
match = True
elif normal_query == text.normalize(alt):
match = True
resource = {
"id": make_uri(pid),
"name": name,
"score": score,
"match": match,
"type": [
{
"id": "http://www.w3.org/2004/02/skos/core#",
"name": "skos:Concept",
}
]
}
#The FAST service returns many duplicates.
if resource not in out:
out.append(resource)
#Break out of the query loop if we've found a good candidate
if (match is True):
done = True
break
else:
unique_fast_ids.append(fid)
#Fuzzy-score the query against the 'auth' value and against the first
#'suggestall' entry (or '' when there is none).
score_1 = fuzz.token_sort_ratio(query, name)
score_2 = fuzz.token_sort_ratio(query, alt)
#Return a maximum score
score = max(score_1, score_2)
#An exact hit after normalization on either form marks the candidate
#as a match.
if query == text.normalize(name):
match = True
elif query == text.normalize(alt):
match = True
resource = {
"id": fast_uri,
"name": name,
"score": score,
"match": match,
"type": [
{
"id": "/fast/corporate-name",
"name": "Corporate Name",
}
]
}
out.append(resource)
#Sort this list by score
sorted_out = sorted(out, key=itemgetter('score'), reverse=True)
return sorted_out
#Refine will only handle the top three matches.
#NOTE(review): the slice below keeps two items, not three -- confirm
#which of the two is intended.
return sorted_out[:2]


@app.route("/fast-corporate/reconcile", methods=['POST', 'GET'])
Expand Down Expand Up @@ -167,13 +165,6 @@ def reconcile():
from optparse import OptionParser
oparser = OptionParser()
oparser.add_option('-d', '--debug', action='store_true', default=False)
oparser.add_option('-u', '--user', dest='api_user', default=False)
opts, args = oparser.parse_args()
if opts.api_user is False:
raise Exception("No API user provided.\
Pass as --user.\
Typically an email address.")
else:
TOC_USER = opts.api_user
app.debug = opts.debug
app.run(host='0.0.0.0')

0 comments on commit cbcb6df

Please sign in to comment.