Working FAST reconile.

lawlesst · Aug 13, 2013 · cbcb6df · cbcb6df
1 parent 13f4c41
commit cbcb6df
Show file tree

Hide file tree

Showing 3 changed files with 88 additions and 91 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,2 @@
+*pyc
+fast_cache.sqlite
diff --git a/README.md b/README.md
@@ -1,10 +1,14 @@
-A Google Refine reconciliation service that queries the API provided by the [JournalTOCs](http://www.journaltocs.ac.uk/api_help.php?subAction=journals) project.
+An OpenRefine reconciliation service for [FAST](http://www.oclc.org/research/activities/fast.html?urlm=159754).
 
-You will need to registor for the api at http://www.journaltocs.ac.uk/api_help.php?subAction=journals.
+>FAST is available as Linked Data, which is an approach to publishing data which enhances the utility of information on the web by making references to persons, places, things, etc. more consistent and linkable across domains.
 
-Run as:
+The service queries the [FAST AutoSuggest API](http://www.oclc.org/developer/documentation/fast-linked-data-api/request-types) and provides normalized scores across queries for recoiling in Refine.
+
+ * as of 8/13/13 only supports Corporate Name queries.  This will be expanded soon.
+
+Run locally as:
 ~~~~
-$ python reconcile.py python reconcile.py --debug -u your_api_email@none.com
+$ python reconcile.py python reconcile.py --debug
 ~~~~
 
 Michael Stephens wrote a [demo reconcilliation service](https://github.com/mikejs/reconcile-demo) that this code is based on.
diff --git a/reconcile.py b/reconcile.py
@@ -1,13 +1,12 @@
 """
-A Google Refine reconcillation servce for the api provided by
-the JournalTOCs project.
+An OpenRefine reconciliation service for the API provided by
+OCLC for FAST.
 
 See API documentation:
-http://www.journaltocs.ac.uk/api_help.php?subAction=journals
+http://www.oclc.org/developer/documentation/fast-linked-data-api/request-types
 
-An example reconciliation service API for Google Refine 2.0.
-
-See http://code.google.com/p/google-refine/wiki/ReconciliationServiceApi.
+This code is adapted from Michael Stephens:
+https://github.com/mikejs/reconcile-demo
 """
 
 from flask import Flask
@@ -18,32 +17,49 @@
 from operator import itemgetter
 import urllib
 
-import feedparser
 #For scoring results
 from fuzzywuzzy import fuzz
 import requests
-import requests_cache
-requests_cache.install_cache('fast_corporate_cache')
-
-import text
 
 app = Flask(__name__)
 
+#some config
+api_base_url = 'http://fast.oclc.org/searchfast/fastsuggest'
+#For constructing links to FAST.
+fast_uri_base = 'http://id.worldcat.org/fast/{0}'
+
+#If it's installed, use the requests_cache library to
+#cache calls to the FAST API.
+try:
+    import requests_cache
+    requests_cache.install_cache('fast_cache')
+except ImportError:
+    app.logger.debug("No request cache found.")
+    pass
+
+#Helper text processing
+import text
+
 # Basic service metadata. There are a number of other documented options
 # but this is all we need for a simple service.
 metadata = {
     "name": "Fast Corporate Name Reconciliation Service",
-    "defaultTypes": [{"id": "http://www.w3.org/2004/02/skos/core#", "name": "skos:Concept"}],
+    #ToDo add support for all types.
+    "defaultTypes": [
+        {"id": "/fast/corporate-name", "name": "Corporate Name"}
+    ],
 }
 
-api_base_url = 'http://fast.oclc.org/searchfast/fastsuggest'
 
-fast_uri_base = 'http://id.worldcat.org/fast/{0}'
 def make_uri(fast_id):
+    """
+    Prepare a FAST url from the ID returned by the API.
+    """
     fid = fast_id.lstrip('fst').lstrip('0')
     fast_uri = fast_uri_base.format(fid)
     return fast_uri
 
+
 def jsonpify(obj):
     """
     Helper to support JSONP
@@ -56,81 +72,63 @@ def jsonpify(obj):
     except KeyError:
         return jsonify(obj)
 
-#skip these terms for lookup
-skip_words = [
-    'the university of',
-    'univ',
-    'univer',
-    'universi',
-    'university'
-    'of',
-    'the'
-]
-
 
 def search(raw_query):
     """
     Hit the FAST API for names.
     """
     out = []
-    #Hit the suggest api for each token
-    #tokens = [text.normalize(t) for t in text.tokenize(raw_query)]
-    tokens = []
-    done = False
-    query_scrubbed = text.normalize(raw_query).replace('the university of', 'university of').strip()
-    #minimum of 4 characters
-    for i in xrange(4, len(query_scrubbed) + 2, 2):
-        tokens.append(''.join(query_scrubbed[:i]))
-    for token in [query_scrubbed]:
-        if done is True:
-            break
-        if token in skip_words:
+    unique_fast_ids = []
+    query = text.normalize(raw_query).replace('the university of', 'university of').strip()
+    try:
+        #FAST api requires spaces to be encoded as %20 rather than +
+        url = api_base_url + '?query=' + urllib.quote(query) + '&rows=30&queryReturn=suggestall%2Cidroot%2Cauth%2cscore&suggest=autoSubject&queryIndex=suggest10&wt=json'
+        resp = requests.get(url)
+        results = resp.json()
+    except Exception, e:
+        app.logger.warning(e)
+        return out
+    for position, item in enumerate(results['response']['docs']):
+        match = False
+        name = item.get('auth')
+        alternate = item.get('suggestall')
+        if (len(alternate) > 0):
+            alt = alternate[0]
+        else:
+            alt = ''
+        fid = item.get('idroot')
+        fast_uri = make_uri(fid)
+        #The FAST service returns many duplicates.  Avoid returning many of the
+        #same result
+        if fid in unique_fast_ids:
             continue
-        try:
-            #FAST api requires spaces to be encoded as %20 rather than +
-            url = api_base_url + '?query=' + urllib.quote(token) + '&rows=30&queryReturn=suggestall%2Cidroot%2Cauth%2cscore&suggest=autoSubject&queryIndex=suggest10&wt=json'
-            resp = requests.get(url)
-            results = resp.json()
-        except Exception, e:
-            print e
-        for position, item in enumerate(results['response']['docs']):
-            match = False
-            score2 = 0
-            name = item.get('auth')
-            alternate = item.get('suggestall')
-            score = item.get('score')
-            if (len(alternate) > 0):
-                alt = alternate[0]
-            else:
-                alt = ''
-            pid = item.get('idroot')
-            normal_query = text.normalize(raw_query)
-            if normal_query == text.normalize(name):
-                match = True
-            elif normal_query == text.normalize(alt):
-                match = True
-            resource = {
-                "id": make_uri(pid),
-                "name": name,
-                "score": score,
-                "match": match,
-                "type": [
-                    {
-                        "id": "http://www.w3.org/2004/02/skos/core#",
-                        "name": "skos:Concept",
-                    }
-                ]
-            }
-            #The FAST service returns many duplicates.
-            if resource not in out:
-                out.append(resource)
-            #Break out of the query loop if we've found a good candidate
-            if (match is True):
-                done = True
-                break
+        else:
+            unique_fast_ids.append(fid)
+        score_1 = fuzz.token_sort_ratio(query, name)
+        score_2 = fuzz.token_sort_ratio(query, alt)
+        #Return a maximum score
+        score = max(score_1, score_2)
+        if query == text.normalize(name):
+            match = True
+        elif query == text.normalize(alt):
+            match = True
+        resource = {
+            "id": fast_uri,
+            "name": name,
+            "score": score,
+            "match": match,
+            "type": [
+                {
+                    "id": "/fast/corporate-name",
+                    "name": "Corporate Name",
+                }
+            ]
+        }
+        out.append(resource)
     #Sort this list by score
     sorted_out = sorted(out, key=itemgetter('score'), reverse=True)
-    return sorted_out
+    #Refine only will handle top three matches.
+    return sorted_out[:2]
 
 
 @app.route("/fast-corporate/reconcile", methods=['POST', 'GET'])
@@ -167,13 +165,6 @@ def reconcile():
     from optparse import OptionParser
     oparser = OptionParser()
     oparser.add_option('-d', '--debug', action='store_true', default=False)
-    oparser.add_option('-u', '--user', dest='api_user', default=False)
     opts, args = oparser.parse_args()
-    if opts.api_user is False:
-        raise Exception("No API user provided.\
-                        Pass as --user.\
-                        Typically an email address.")
-    else:
-        TOC_USER = opts.api_user
     app.debug = opts.debug
     app.run(host='0.0.0.0')