From 140a3f240ebd198f491d023b87882eaee51fde7b Mon Sep 17 00:00:00 2001 From: harishvc Date: Sat, 14 Feb 2015 20:42:37 -0800 Subject: [PATCH] Using fuzzywuzzy show similar queries (did you mean or suggestion) if query has no results - harishvc/githubanalytics#34 --- DBQueries.py | 7 +- RunFlask.py | 9 +- Suggestions.py | 44 +++-- fuzzywuzzy/StringMatcher.py | 78 +++++++++ fuzzywuzzy/__init__.py | 2 + fuzzywuzzy/fuzz.py | 284 ++++++++++++++++++++++++++++++++ fuzzywuzzy/process.py | 126 ++++++++++++++ fuzzywuzzy/string_processing.py | 34 ++++ fuzzywuzzy/utils.py | 73 ++++++++ 9 files changed, 636 insertions(+), 21 deletions(-) create mode 100644 fuzzywuzzy/StringMatcher.py create mode 100644 fuzzywuzzy/__init__.py create mode 100644 fuzzywuzzy/fuzz.py create mode 100644 fuzzywuzzy/process.py create mode 100644 fuzzywuzzy/string_processing.py create mode 100644 fuzzywuzzy/utils.py diff --git a/DBQueries.py b/DBQueries.py index 77b0a54..8800b63 100755 --- a/DBQueries.py +++ b/DBQueries.py @@ -10,7 +10,6 @@ #Local modules import RandomQuotes -import Suggestions import Neo4jQueries import MyMoment @@ -19,7 +18,7 @@ #Configure for production or development based on environment variables if (os.environ['deployEnv'] == "production"): MONGO_URL = os.environ['connectURLRead'] - connection = MongoClient(MONGO_URL,auto_start_request=False) + connection = MongoClient(MONGO_URL) db = connection.githublive.pushevent else: MONGO_URL = os.environ['connectURLReaddev'] @@ -40,10 +39,8 @@ DE = "" def ProcessQuery(query): - global ShowSuggestion - ShowSuggestion = False if (query == ""): - return "" + return "EMPTY" else: app.logger.debug("processing ............ ->%s<-" , query) if (query == "active repositories"): diff --git a/RunFlask.py b/RunFlask.py index 94a6118..4f54ef0 100755 --- a/RunFlask.py +++ b/RunFlask.py @@ -15,11 +15,11 @@ from json import dumps #Local modules -import RandomQuotes +import Suggestions import DBQueries #Global variables -NORESULT="

You've got me stumped!

" #No result +NORESULT="

You've got me stumped!

" #No result @@ -37,8 +37,6 @@ def numformat(value): def index(): query = "" processed_text1 = "" - global ShowSuggestion - ShowSuggestion = False #Debug #time.sleep(5) if request.method == 'GET': @@ -54,7 +52,8 @@ def index(): #End: Uncomment to trigger slow response time processed_text1 = DBQueries.ProcessQuery(query) if (processed_text1 == "EMPTY") : - processed_text1 = NORESULT + t1 = Suggestions.compare("now") if (query == "") else Suggestions.compare(query) + processed_text1 = NORESULT + t1 else: query ="" processed_text1 ="" diff --git a/Suggestions.py b/Suggestions.py index 5b52c4a..2c55c4f 100755 --- a/Suggestions.py +++ b/Suggestions.py @@ -1,14 +1,36 @@ -import random +#https://github.com/seatgeek/fuzzywuzzy +#https://pypi.python.org/pypi/fuzzywuzzy/0.4.0 -def RandomQuerySuggestions(): - foo = ["active repositories", - "active users", - "total commits", - "trending now", - "top active repositories by contributors", - "top active repositories by commits", - "top active repositories by branches" - ] - return("Suggestion: " + random.choice(foo)) +from fuzzywuzzy import fuzz +from fuzzywuzzy import process +choices = ["active users", + "active repositories", + "total commits", + "trending now", + "top active repositories by contributors", + "top active repositories by branches", + "top active repositories by commits"] + +def compare(input): + #print "comparing ....", input + r = process.extract(input, choices,limit=5) + suggestionList = "" + #Pick top 3 if more than 75% exact + if (r[0][1] >= 75): + suggestionList += "

Did you mean:

" + #Pick one if no exact + elif (r[0][1] >= 0): + suggestionList += "

Suggestion:

" + str(r[0][0]) + "" + + #print suggestionList + return suggestionList diff --git a/fuzzywuzzy/StringMatcher.py b/fuzzywuzzy/StringMatcher.py new file mode 100644 index 0000000..9dccfe7 --- /dev/null +++ b/fuzzywuzzy/StringMatcher.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python +# encoding: utf-8 +""" +StringMatcher.py + +ported from python-Levenshtein +[https://github.com/miohtama/python-Levenshtein] +""" + +from Levenshtein import * +from warnings import warn + +class StringMatcher: + """A SequenceMatcher-like class built on the top of Levenshtein""" + + def _reset_cache(self): + self._ratio = self._distance = None + self._opcodes = self._editops = self._matching_blocks = None + + def __init__(self, isjunk=None, seq1='', seq2=''): + if isjunk: + warn("isjunk not NOT implemented, it will be ignored") + self._str1, self._str2 = seq1, seq2 + self._reset_cache() + + def set_seqs(self, seq1, seq2): + self._str1, self._str2 = seq1, seq2 + self._reset_cache() + + def set_seq1(self, seq1): + self._str1 = seq1 + self._reset_cache() + + def set_seq2(self, seq2): + self._str2 = seq2 + self._reset_cache() + + def get_opcodes(self): + if not self._opcodes: + if self._editops: + self._opcodes = opcodes(self._editops, self._str1, self._str2) + else: + self._opcodes = opcodes(self._str1, self._str2) + return self._opcodes + + def get_editops(self): + if not self._editops: + if self._opcodes: + self._editops = editops(self._opcodes, self._str1, self._str2) + else: + self._editops = editops(self._str1, self._str2) + return self._editops + + def get_matching_blocks(self): + if not self._matching_blocks: + self._matching_blocks = matching_blocks(self.get_opcodes(), + self._str1, self._str2) + return self._matching_blocks + + def ratio(self): + if not self._ratio: + self._ratio = ratio(self._str1, self._str2) + return self._ratio + + def quick_ratio(self): + # This is usually quick enough :o) + if not self._ratio: + self._ratio = ratio(self._str1, self._str2) + return self._ratio + + def real_quick_ratio(self): + len1, len2 = len(self._str1), len(self._str2) + return 2.0 * min(len1, len2) / (len1 + len2) + + def distance(self): + if not self._distance: + self._distance = distance(self._str1, self._str2) + return self._distance \ No newline at end of file diff --git a/fuzzywuzzy/__init__.py b/fuzzywuzzy/__init__.py new file mode 100644 index 0000000..4543f94 --- /dev/null +++ b/fuzzywuzzy/__init__.py @@ -0,0 +1,2 @@ +# -*- coding: utf-8 -*- +__version__ = '0.5.0' diff --git a/fuzzywuzzy/fuzz.py b/fuzzywuzzy/fuzz.py new file mode 100644 index 0000000..62c8f19 --- /dev/null +++ b/fuzzywuzzy/fuzz.py @@ -0,0 +1,284 @@ +#!/usr/bin/env python +# encoding: utf-8 +""" +fuzz.py + +Copyright (c) 2011 Adam Cohen + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +""" +from __future__ import unicode_literals +import warnings + +try: + from .StringMatcher import StringMatcher as SequenceMatcher +except ImportError: + warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning') + from difflib import SequenceMatcher + +from . import utils + + +########################### +# Basic Scoring Functions # +########################### + +def ratio(s1, s2): + + if s1 is None: + raise TypeError("s1 is None") + if s2 is None: + raise TypeError("s2 is None") + s1, s2 = utils.make_type_consistent(s1, s2) + + if len(s1) == 0 or len(s2) == 0: + return 0 + + m = SequenceMatcher(None, s1, s2) + return utils.intr(100 * m.ratio()) + + +# todo: skip duplicate indexes for a little more speed +def partial_ratio(s1, s2): + """"Return the ratio of the most similar substring + as a number between 0 and 100.""" + + if s1 is None: + raise TypeError("s1 is None") + if s2 is None: + raise TypeError("s2 is None") + s1, s2 = utils.make_type_consistent(s1, s2) + if len(s1) == 0 or len(s2) == 0: + return 0 + + if len(s1) <= len(s2): + shorter = s1 + longer = s2 + else: + shorter = s2 + longer = s1 + + m = SequenceMatcher(None, shorter, longer) + blocks = m.get_matching_blocks() + + # each block represents a sequence of matching characters in a string + # of the form (idx_1, idx_2, len) + # the best partial match will block align with at least one of those blocks + # e.g. shorter = "abcd", longer = XXXbcdeEEE + # block = (1,3,3) + # best score === ratio("abcd", "Xbcd") + scores = [] + for block in blocks: + long_start = block[1] - block[0] if (block[1] - block[0]) > 0 else 0 + long_end = long_start + len(shorter) + long_substr = longer[long_start:long_end] + + m2 = SequenceMatcher(None, shorter, long_substr) + r = m2.ratio() + if r > .995: + return 100 + else: + scores.append(r) + + return int(100 * max(scores)) + + +############################## +# Advanced Scoring Functions # +############################## + +def _process_and_sort(s, force_ascii): + """Return a cleaned string with token sorted.""" + # pull tokens + tokens = utils.full_process(s, force_ascii=force_ascii).split() + + # sort tokens and join + sorted_string = u" ".join(sorted(tokens)) + return sorted_string.strip() + +# Sorted Token +# find all alphanumeric tokens in the string +# sort those tokens and take ratio of resulting joined strings +# controls for unordered string elements +def _token_sort(s1, s2, partial=True, force_ascii=True): + + if s1 is None: + raise TypeError("s1 is None") + if s2 is None: + raise TypeError("s2 is None") + + sorted1 = _process_and_sort(s1, force_ascii) + sorted2 = _process_and_sort(s2, force_ascii) + + if partial: + return partial_ratio(sorted1, sorted2) + else: + return ratio(sorted1, sorted2) + +def token_sort_ratio(s1, s2, force_ascii=True): + """Return a measure of the sequences' similarity between 0 and 100 + but sorting the token before comparing. + """ + return _token_sort(s1, s2, partial=False, force_ascii=force_ascii) + + +def partial_token_sort_ratio(s1, s2, force_ascii=True): + """Return the ratio of the most similar substring as a number between + 0 and 100 but sorting the token before comparing. + """ + return _token_sort(s1, s2, partial=True, force_ascii=force_ascii) + + +def _token_set(s1, s2, partial=True, force_ascii=True): + """Find all alphanumeric tokens in each string... + - treat them as a set + - construct two strings of the form: + + - take ratios of those two strings + - controls for unordered partial matches""" + + if s1 is None: + raise TypeError("s1 is None") + if s2 is None: + raise TypeError("s2 is None") + + p1 = utils.full_process(s1, force_ascii=force_ascii) + p2 = utils.full_process(s2, force_ascii=force_ascii) + + if not utils.validate_string(p1): + return 0 + if not utils.validate_string(p2): + return 0 + + # pull tokens + tokens1 = set(utils.full_process(p1).split()) + tokens2 = set(utils.full_process(p2).split()) + + intersection = tokens1.intersection(tokens2) + diff1to2 = tokens1.difference(tokens2) + diff2to1 = tokens2.difference(tokens1) + + sorted_sect = " ".join(sorted(intersection)) + sorted_1to2 = " ".join(sorted(diff1to2)) + sorted_2to1 = " ".join(sorted(diff2to1)) + + combined_1to2 = sorted_sect + " " + sorted_1to2 + combined_2to1 = sorted_sect + " " + sorted_2to1 + + # strip + sorted_sect = sorted_sect.strip() + combined_1to2 = combined_1to2.strip() + combined_2to1 = combined_2to1.strip() + + if partial: + ratio_func = partial_ratio + else: + ratio_func = ratio + + pairwise = [ + ratio_func(sorted_sect, combined_1to2), + ratio_func(sorted_sect, combined_2to1), + ratio_func(combined_1to2, combined_2to1) + ] + return max(pairwise) + + +def token_set_ratio(s1, s2, force_ascii=True): + return _token_set(s1, s2, partial=False, force_ascii=force_ascii) + + +def partial_token_set_ratio(s1, s2, force_ascii=True): + return _token_set(s1, s2, partial=True, force_ascii=force_ascii) + + +# TODO: numerics + +################### +# Combination API # +################### + +# q is for quick +def QRatio(s1, s2, force_ascii=True): + + p1 = utils.full_process(s1, force_ascii=force_ascii) + p2 = utils.full_process(s2, force_ascii=force_ascii) + + if not utils.validate_string(p1): + return 0 + if not utils.validate_string(p2): + return 0 + + return ratio(p1, p2) + + +def UQRatio(s1, s2): + return QRatio(s1, s2, force_ascii=False) + + +# w is for weighted +def WRatio(s1, s2, force_ascii=True): + """Return a measure of the sequences' similarity between 0 and 100, + using different algorithms. + """ + + p1 = utils.full_process(s1, force_ascii=force_ascii) + p2 = utils.full_process(s2, force_ascii=force_ascii) + + if not utils.validate_string(p1): + return 0 + if not utils.validate_string(p2): + return 0 + + # should we look at partials? + try_partial = True + unbase_scale = .95 + partial_scale = .90 + + base = ratio(p1, p2) + len_ratio = float(max(len(p1), len(p2))) / min(len(p1), len(p2)) + + # if strings are similar length, don't use partials + if len_ratio < 1.5: + try_partial = False + + # if one string is much much shorter than the other + if len_ratio > 8: + partial_scale = .6 + + if try_partial: + partial = partial_ratio(p1, p2) * partial_scale + ptsor = partial_token_sort_ratio(p1, p2, force_ascii=force_ascii) \ + * unbase_scale * partial_scale + ptser = partial_token_set_ratio(p1, p2, force_ascii=force_ascii) \ + * unbase_scale * partial_scale + + return int(max(base, partial, ptsor, ptser)) + else: + tsor = token_sort_ratio(p1, p2, force_ascii=force_ascii) * unbase_scale + tser = token_set_ratio(p1, p2, force_ascii=force_ascii) * unbase_scale + + return int(max(base, tsor, tser)) + + +def UWRatio(s1, s2): + """Return a measure of the sequences' similarity between 0 and 100, + using different algorithms. Same as WRatio but preserving unicode. + """ + return WRatio(s1, s2, force_ascii=False) diff --git a/fuzzywuzzy/process.py b/fuzzywuzzy/process.py new file mode 100644 index 0000000..bbc38a9 --- /dev/null +++ b/fuzzywuzzy/process.py @@ -0,0 +1,126 @@ +#!/usr/bin/env python +# encoding: utf-8 +""" +process.py + +Copyright (c) 2011 Adam Cohen + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +""" +import itertools + +from . import fuzz +from . import utils + + +def extract(query, choices, processor=None, scorer=None, limit=5): + """Find best matches in a list or dictionary of choices, return a + list of tuples containing the match and its score. If a dictionery + is used, also returns the key for each match. + + Arguments: + query -- an object representing the thing we want to find + choices -- a list or dictionary of objects we are attempting + to extract values from. The dictionary should + consist of {key: str} pairs. + scorer -- f(OBJ, QUERY) --> INT. We will return the objects + with the highest score by default, we use + score.WRatio() and both OBJ and QUERY should be + strings + processor -- f(OBJ_A) --> OBJ_B, where the output is an input + to scorer for example, "processor = lambda x: + x[0]" would return the first element in a + collection x (of, say, strings) this would then + be used in the scoring collection by default, we + use utils.full_process() + + """ + if choices is None: + return [] + + # Catch generators without lengths + try: + if len(choices) == 0: + return [] + except TypeError: + pass + + # default, turn whatever the choice is into a workable string + if not processor: + processor = utils.full_process + + # default: wratio + if not scorer: + scorer = fuzz.WRatio + + sl = list() + + if isinstance(choices, dict): + for key, choice in choices.items(): + processed = processor(choice) + score = scorer(query, processed) + tuple = (choice, score, key) + sl.append(tuple) + + else: + for choice in choices: + processed = processor(choice) + score = scorer(query, processed) + tuple = (choice, score) + sl.append(tuple) + + sl.sort(key=lambda i: i[1], reverse=True) + return sl[:limit] + + +def extractBests(query, choices, processor=None, scorer=None, score_cutoff=0, limit=5): + """Find best matches above a score in a list of choices, return a + list of tuples containing the match and its score. + + Convenience method which returns the choices with best scores, see + extract() for full arguments list + + Optional parameter: score_cutoff. + If the choice has a score of less than or equal to score_cutoff + it will not be included on result list + + """ + best_list = extract(query, choices, processor, scorer, limit) + return list(itertools.takewhile(lambda x: x[1] >= score_cutoff, best_list)) + + +def extractOne(query, choices, processor=None, scorer=None, score_cutoff=0): + """Find the best match above a score in a list of choices, return a + tuple containing the match and its score if it's above the threshold + or None. + + Convenience method which returns the single best choice, see + extract() for full arguments list + + Optional parameter: score_cutoff. + If the best choice has a score of less than or equal to + score_cutoff we will return none (intuition: not a good enough + match) + + """ + best_list = extract(query, choices, processor, scorer, limit=1) + if len(best_list) > 0 and best_list[0][1] >= score_cutoff: + return best_list[0] + return None diff --git a/fuzzywuzzy/string_processing.py b/fuzzywuzzy/string_processing.py new file mode 100644 index 0000000..0fee736 --- /dev/null +++ b/fuzzywuzzy/string_processing.py @@ -0,0 +1,34 @@ +from __future__ import unicode_literals +import re +import string +import sys + + +PY3 = sys.version_info[0] == 3 + + +class StringProcessor(object): + """ + This class defines method to process strings in the most + efficient way. Ideally all the methods below use unicode strings + for both input and output. + """ + + regex = re.compile(r"(?ui)\W") + + @classmethod + def replace_non_letters_non_numbers_with_whitespace(cls, a_string): + """ + This function replaces any sequence of non letters and non + numbers with a single white space. + """ + return cls.regex.sub(u" ", a_string) + + if PY3: + strip = staticmethod(str.strip) + to_lower_case = staticmethod(str.lower) + to_upper_case = staticmethod(str.upper) + else: + strip = staticmethod(string.strip) + to_lower_case = staticmethod(string.lower) + to_upper_case = staticmethod(string.upper) diff --git a/fuzzywuzzy/utils.py b/fuzzywuzzy/utils.py new file mode 100644 index 0000000..2c68824 --- /dev/null +++ b/fuzzywuzzy/utils.py @@ -0,0 +1,73 @@ +from __future__ import unicode_literals +import sys + +from fuzzywuzzy.string_processing import StringProcessor + + +PY3 = sys.version_info[0] == 3 + + + +def validate_string(s): + try: + return len(s) > 0 + except TypeError: + return False + +bad_chars = str("").join([chr(i) for i in range(128, 256)]) # ascii dammit! +if PY3: + translation_table = dict((ord(c), None) for c in bad_chars) + + +def asciionly(s): + if PY3: + return s.translate(translation_table) + else: + return s.translate(None, bad_chars) + + +def asciidammit(s): + if type(s) is str: + return asciionly(s) + elif type(s) is unicode: + return asciionly(s.encode('ascii', 'ignore')) + else: + return asciidammit(unicode(s)) + + +def make_type_consistent(s1, s2): + """If both objects aren't either both string or unicode instances force them to unicode""" + if isinstance(s1, str) and isinstance(s2, str): + return s1, s2 + + elif isinstance(s1, unicode) and isinstance(s2, unicode): + return s1, s2 + + else: + return unicode(s1), unicode(s2) + + +def full_process(s, force_ascii=False): + """Process string by + -- removing all but letters and numbers + -- trim whitespace + -- force to lower case + if force_ascii == True, force convert to ascii""" + + if s is None: + return "" + + if force_ascii: + s = asciidammit(s) + # Keep only Letters and Numbers (see Unicode docs). + string_out = StringProcessor.replace_non_letters_non_numbers_with_whitespace(s) + # Force into lowercase. + string_out = StringProcessor.to_lower_case(string_out) + # Remove leading and trailing whitespaces. + string_out = StringProcessor.strip(string_out) + return string_out + + +def intr(n): + '''Returns a correctly rounded integer''' + return int(round(n))