Permalink
Browse files

wip

* Working on a more solid solution to logging and viewing the results of
  a merge. A logging function can be provided that formats each record
  into a line on stdout. Nothing more than piping to a file, but useful
  enough.
* Removing the spaces from names before matching seems like a pretty solid step.
  • Loading branch information...
1 parent 2f71209 commit 1b74fe4da4bb74912b777691b5d071a1ade77225 Zack Maril committed Jul 1, 2014
Showing with 23 additions and 10 deletions.
  1. +15 −7 bokonon/being.py
  2. +7 −2 bokonon/graph.py
  3. +1 −1 bokonon/load.py
View
@@ -1,8 +1,9 @@
from collections import defaultdict
import copy
+from Levenshtein import distance
from networkx import nx
+from pprint import pprint
import uuid
-from Levenshtein import distance
being = lambda : {"type": "Being"}
represents = lambda : {"relation":"represents"}
@@ -52,7 +53,7 @@ def countTypes(universe):
d[universe.node[ns[0]]["type"]] += 1
return d
-def groupMerge(universe, pred, extract,description=None):
+def groupMerge(universe, pred,extract, description=None,logging=None):
if description != None:
print(description)
start = countTypes(universe)
@@ -62,9 +63,16 @@ def groupMerge(universe, pred, extract,description=None):
for k,v in nodes:
for s in extract(v):
d[s].append(k)
- for k,v in d.iteritems():
- merged = reduce(lambda x,y: mergeTheirBeings(universe,x,y),v)
+ for k,vs in d.iteritems():
+ if logging != None:
+ bs = map(lambda x: findBeing(universe,x),vs)
+ if len(bs) > 1 and len(set(bs)) != 1:
+ for l in map(lambda x: logging(universe.node[x]),vs):
+ print(l)
+ print("")
+ merged = reduce(lambda x,y: mergeTheirBeings(universe,x,y),vs)
+
cullHermits(universe)
if description != None:
d = countTypes(universe)
@@ -94,9 +102,9 @@ def windowMerge(universe, pred, extract, windowSize, maxDistance, description=No
bQ = findBeing(universe,a[1]) != findBeing(universe,b[1])
lQ = len(a[0]) > 5 and len(b[0]) > 5
if dQ and bQ and lQ:
- print(a[0])
- print(b[0])
- print("\n")
+ # print(a[0])
+ # print(b[0])
+ # print("\n")
mergeTheirBeings(universe,a[1],b[1])
cullHermits(universe)
View
@@ -1,13 +1,12 @@
from being import countTypes, groupMerge, matchTypeAndHasFields, windowMerge
from load import loadData
from pprint import pprint
+import re
from save import project, steralize, save
from text import extractNames
def represent(v):
l = ", ".join([v["name"],v["address"],v["city"],v["state"]])
- if "specific_issuse" in v:
- l += "\n"+v["specific_issues"]
return l
def main():
@@ -24,6 +23,12 @@ def main():
lambda v: extractNames(v["name"]),
description="Merged clients based on extracted and cleaned name match")
+ groupMerge(universe,
+ matchTypeAndHasFields("client",["name"]),
+ lambda v: [re.sub(" ","",v["name"])],
+ description="Merged clients based on exact match without spaces",
+ logging=represent)
+
windowMerge(universe,
matchTypeAndHasFields("client",["name"]),
lambda v: extractNames(v["name"]),
View
@@ -126,7 +126,7 @@ def loadData():
print("Loading and processing files now")
p = multiprocessing.Pool(8)
data = p.map(loadForm1,glob(os.environ["HOUSEXML"]+"/LD1/*/*/*.json"),10)
- #data += p.map(loadForm2,glob(os.environ["HOUSEXML"]+"/LD2/*/*/*.json"),10)
+# data += p.map(loadForm2,glob(os.environ["HOUSEXML"]+"/LD2/*/*/*.json"),10)
print("Starting from {} records".format(len(data)))
print("Building universe")

0 comments on commit 1b74fe4

Please sign in to comment.