Skip to content

Commit

Permalink
Basic string similarity
Browse files Browse the repository at this point in the history
  • Loading branch information
kvh committed Mar 25, 2017
1 parent e18ee66 commit b36665a
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 3 deletions.
38 changes: 38 additions & 0 deletions match/similarity.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@


def make_ngrams(s, grams=3):
    """
    Return every contiguous slice of ``s`` of length ``grams``, in order.

    :param s: sequence to slice (normally a string).
    :param grams: n-gram size; must be at least 1.
    :return: list of ``len(s) - grams + 1`` overlapping slices, or an empty
        list when ``s`` is shorter than ``grams``.
    :raises ValueError: if ``grams`` is less than 1.
    """
    if grams < 1:
        # A zero or negative size would silently produce empty or oddly
        # overlapping slices instead of real n-grams.
        raise ValueError(
            'grams must be a positive integer, got {0!r}'.format(grams))
    return [s[i:i + grams] for i in range(len(s) - grams + 1)]


def dice_coefficient(a, b, grams=3):
    """
    Dice coefficient for character ngrams.

    The score is ``2 * shared / (ngrams_in_a + ngrams_in_b)``, where
    ``shared`` counts the n-grams common to both strings with multiplicity
    (a repeated n-gram only matches as often as it occurs in both inputs).

    :param a: first string.
    :param b: second string.
    :param grams: n-gram size in characters (default 3).
    :return: float score; 0.0 if either input is empty, 1.0 if the inputs
        are equal, 0.0 if they differ and either is shorter than ``grams``.
    """
    # Imported locally so the function carries its own dependency.
    from collections import Counter

    if not len(a) or not len(b):
        return 0.0
    # quick case for true duplicates
    if a == b:
        return 1.0
    # if a != b, and a or b are smaller than 'grams', then they can't
    # possibly match
    if len(a) < grams or len(b) < grams:
        return 0.0

    a_ngrams = make_ngrams(a, grams)
    b_ngrams = make_ngrams(b, grams)

    # Counter intersection keeps the minimum count of each n-gram, which is
    # exactly the number of pairings a merge over the two sorted lists
    # would find.
    shared = sum((Counter(a_ngrams) & Counter(b_ngrams)).values())

    # 2.0 forces true division regardless of interpreter version.
    return 2.0 * shared / (len(a_ngrams) + len(b_ngrams))
4 changes: 4 additions & 0 deletions match/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@ def __missing__(self, key):
return memodict().__getitem__


"""
Text utils
"""

def clean_to_alphanum(s):
if not s:
return ''
Expand Down
24 changes: 21 additions & 3 deletions tests/test_match.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
'''
test_match
----------------------------------
Tests for `match` module.
"""
'''


import sys
Expand Down Expand Up @@ -36,6 +36,16 @@
'1234(608)-345-6789',
]

# (first string, second string, expected similarity) triples used by the
# string-similarity test, which compares scores against the expected value
# within a tolerance rather than exactly.
similar_strings_dice_3grams = [
    ('howdy', 'howdya', .8),
    ('here we go', 'there we are', .7),
    ('a', 'b', .0),
    ('a', 'ab', .0),
    ('ac', 'ab', .0),
    ('acd', 'abd', .0),
    ('acdb', 'abdb', .0),
]


class TestDataTypes(unittest.TestCase):

Expand All @@ -55,7 +65,7 @@ def test_000_phone_number_detect(self):
def test_001_phone_number_match(self):
for s in valid_us_phone_numbers:
for s2 in valid_us_phone_numbers:
score, detected_type = match.score(s, s2)
score, detected_type = match.score_similarity(s, s2)
self.assertEqual(detected_type, datatypes.PhoneNumberType,
'{0} {1} were not detected as PhoneNumberType'.format(s, s2))
self.assertTrue(score > .9,
Expand All @@ -67,3 +77,11 @@ def test_002_non_phone_number_detect(self):
# Doesn't match phone type
self.assertEqual(dtype, datatypes.StringDataType,
'{0} was not detected as StringType but {1}'.format(s, dtype))

def test_003_string_similarity(self):
    """Each fixture pair must score strictly within ``tol`` of its expected value."""
    tol = .1
    for s, s2, exp_sim in similar_strings_dice_3grams:
        # score_similarity returns a (score, detected type) pair; only the
        # score is checked here.
        sim, _detected_type = match.score_similarity(s, s2)
        lower_bound = exp_sim - tol
        upper_bound = exp_sim + tol
        failure_msg = '{2}: {0} != {1}'.format(sim, exp_sim, (s, s2))
        self.assertTrue(lower_bound < sim and sim < upper_bound, failure_msg)


0 comments on commit b36665a

Please sign in to comment.