Permalink
Browse files

Rough pass at a windowed merge

On just LD1, it finds roughly 550 real mistakes (mostly spelling errors
and missing spaces) and makes about 50 mistakes itself (e.g. single-word
company names being too close to each other, or companies' initials).
  • Loading branch information...
1 parent ac64961 commit 2f712096ae038ce57836a784117edbcd2f6fcac7 Zack Maril committed Jul 1, 2014
Showing with 49 additions and 5 deletions.
  1. +37 −2 bokonon/being.py
  2. +9 −2 bokonon/graph.py
  3. +3 −1 bokonon/text.py
View
@@ -2,6 +2,7 @@
import copy
from networkx import nx
import uuid
+from Levenshtein import distance
being = lambda : {"type": "Being"}
represents = lambda : {"relation":"represents"}
@@ -63,7 +64,6 @@ def groupMerge(universe, pred, extract,description=None):
d[s].append(k)
for k,v in d.iteritems():
merged = reduce(lambda x,y: mergeTheirBeings(universe,x,y),v)
- found = findBeing(universe,merged)
cullHermits(universe)
if description != None:
@@ -73,7 +73,42 @@ def groupMerge(universe, pred, extract,description=None):
txt += k + " " + str(v-start[k]) + " "
print(txt)
print("")
-
+
def windowMerge(universe, pred, extract, windowSize, maxDistance, description=None, minLength=6):
    """Merge beings whose extracted names are near-duplicates of each other.

    Every name produced by ``extract`` for a node accepted by ``pred`` is
    collected and sorted. Each name is then compared with the next
    ``windowSize`` names in sorted order; when two names are within
    ``maxDistance`` edits of each other, are both at least ``minLength``
    characters long, and currently resolve to different beings, their beings
    are merged.

    Parameters:
        universe: graph holding the nodes; must support ``nodes(data=True)``.
        pred: callable taking a node's data dict, true for nodes to consider.
        extract: callable taking a node's data dict, returning an iterable of
            name strings.
        windowSize: how many following names (in sorted order) each name is
            compared against.
        maxDistance: largest Levenshtein distance still treated as a match.
        description: optional label; when given, it is printed along with the
            per-type change in counts reported by ``countTypes``.
        minLength: shortest name length eligible for merging. The default of
            6 keeps the original "longer than 5 characters" rule, which
            guards against short names that are trivially close.
    """
    if description is not None:
        print(description)
        start = countTypes(universe)

    # One node per distinct name: if several nodes yield the same name, the
    # last one seen wins.
    nameToNode = {}
    for node, data in universe.nodes(data=True):
        if pred(data):
            for name in extract(data):
                nameToNode[name] = node

    items = sorted(nameToNode.items(), key=lambda item: item[0])
    total = len(items)
    for i in range(total):
        nameA, nodeA = items[i]
        if len(nameA) < minLength:
            continue
        # Clamp the window to the end of the list so the last names are still
        # compared with each other (the original loop bound skipped them, and
        # skipped everything when there were <= windowSize names).
        for j in range(i + 1, min(i + 1 + windowSize, total)):
            nameB, nodeB = items[j]
            # Cheap length test first, then the edit distance, and only then
            # the being lookups.
            if len(nameB) < minLength:
                continue
            if distance(nameA, nameB) > maxDistance:
                continue
            # Looked up on every pair because earlier merges change which
            # being a node resolves to.
            if findBeing(universe, nodeA) != findBeing(universe, nodeB):
                print(nameA)
                print(nameB)
                print("\n")
                mergeTheirBeings(universe, nodeA, nodeB)

    cullHermits(universe)
    if description is not None:
        end = countTypes(universe)
        txt = "".join(k + " " + str(v - start[k]) + " " for k, v in end.items())
        print(txt)
        print("")
+
+
def matchTypeAndHasFields(t, fs):
    """Build a predicate over node data dicts.

    The returned callable is true when the dict's ``"type"`` equals ``t`` and
    every field named in ``fs`` is present with a value other than the empty
    string. A dict that lacks ``"type"`` or one of the fields is treated as a
    non-match instead of raising ``KeyError``.
    """
    def matches(v):
        if "type" not in v or v["type"] != t:
            return False
        return all(f in v and v[f] != "" for f in fs)
    return matches
View
@@ -1,7 +1,7 @@
-from being import groupMerge, countTypes, matchTypeAndHasFields
+from being import countTypes, groupMerge, matchTypeAndHasFields, windowMerge
from load import loadData
from pprint import pprint
-from save import steralize, save, project
+from save import project, steralize, save
from text import extractNames
def represent(v):
@@ -23,6 +23,13 @@ def main():
matchTypeAndHasFields("client",["name"]),
lambda v: extractNames(v["name"]),
description="Merged clients based on extracted and cleaned name match")
+
+ windowMerge(universe,
+ matchTypeAndHasFields("client",["name"]),
+ lambda v: extractNames(v["name"]),
+ 5,
+ 1,
+ description="Merged clients based on windowed extracted name matchs")
project(universe,"clientnames.txt", lambda v: v["type"] == "client", represent)
View
@@ -34,7 +34,9 @@ def preProcess(s):
"incorperated", "ltd","l t d","company",
"corporations",
"corps",
- "corporation","corp","companies","incorporated","inc"]
+ "corporation","corp","companies","incorporated","inc"] #North america?
+#operations
+#of america?
subs = {
"assn" : "association",

0 comments on commit 2f71209

Please sign in to comment.