Skip to content
Browse files

Delay lowercasing and more general util functions

  • Loading branch information...
1 parent 2d8fa52 commit c2a3fdbd1409f49e2b60f9b6882d0c4ad195adc2 Zack Maril committed Jun 26, 2014
Showing with 12 additions and 6 deletions.
  1. +8 −4 dedupe/graph.py
  2. +4 −2 dedupe/text.py
View
12 dedupe/graph.py
@@ -1,4 +1,4 @@
-from being import groupMerge, countTypes, matchTypeAndHaveField
+from being import groupMerge, countTypes, matchTypeAndHasFields
from load import loadData
from norvig import correctSentence
from pprint import pprint
@@ -16,20 +16,24 @@ def mnf(f):
return lambda v: mineNames(correctSentence(model,v[f]))
groupMerge(universe,
- matchTypeAndHaveField("client","name"),
+ matchTypeAndHasFields("client",["name"]),
mnf("name"),
description="Merged clients based on *corrected* name")
groupMerge(universe,
- matchTypeAndHaveField("firm","orgname"),
+ matchTypeAndHasFields("firm",["orgname"]),
mnf("orgname"),
description="Merging firms based on *corrected* orgname")
groupMerge(universe,
- matchTypeAndHaveField("firm","printedname"),
+ matchTypeAndHasFields("firm",["printedname"]),
mnf("printedname"),
description="Merging firms based on *corrected* printedname")
+ project(universe,"clientnames.txt",
+ lambda v: v["type"] == "client",
+ lambda v: v["name"])
+
if __name__ == "__main__":
main()
View
6 dedupe/text.py
@@ -9,7 +9,7 @@ def preProcess(s):
org = s
s = s.encode("ascii","ignore")
s = re.sub('\n', ' ', s)
- s = s.strip().strip('"').strip("'").lower().strip()
+ s = s.strip().strip('"').strip("'").strip()
s = replaceWhitespace(s)
if s == 'legi\\x company': #LEGI\X is ridiculous
s = "legi-x company"
@@ -107,7 +107,8 @@ def processClientName(org):
"and":["a",""],
"for":["f",""],
"in":["i",""],
- "southwest":["s","sw"]
+ "southwest":["s","sw"],
+ "of":["o",""]
}
if g is not None:
gs = g.groups()
@@ -127,6 +128,7 @@ def processClientName(org):
def formerSplitter(name):
+ name = preProcess(name).lower()
if "\"fka\"" in name: # The " mess up the word boundaries
return re.split("\"fka\"",name)

0 comments on commit c2a3fdb

Please sign in to comment.
Something went wrong with that request. Please try again.