Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP

Comparing changes

Choose two branches to see what's changed or to start a new pull request. If you need to, you can also compare across forks.

Open a pull request

Create a new pull request by comparing changes across two branches. If you need to, you can also compare across forks.
base fork: influence-usa/utils_bokonon
base: 9c3864258f
...
head fork: influence-usa/utils_bokonon
compare: 2f712096ae
Checking mergeability… Don't worry, you can still create the pull request.
  • 2 commits
  • 4 files changed
  • 0 commit comments
  • 1 contributor
Commits on Jul 01, 2014
@zmaril zmaril Only strip acronyms now, not match on them
ac64961
@zmaril zmaril Rough pass at a windowed merge
On just LD1', it finds roughly 550 real mistakes (mostly spelling errors
and missing spaces) and makes about 50 mistakes itself (e.g. single-word
company names that are too close to each other, or company initials).
2f71209
View
39 bokonon/being.py
@@ -2,6 +2,7 @@
import copy
from networkx import nx
import uuid
+from Levenshtein import distance
being = lambda : {"type": "Being"}
represents = lambda : {"relation":"represents"}
@@ -63,7 +64,6 @@ def groupMerge(universe, pred, extract,description=None):
d[s].append(k)
for k,v in d.iteritems():
merged = reduce(lambda x,y: mergeTheirBeings(universe,x,y),v)
- found = findBeing(universe,merged)
cullHermits(universe)
if description != None:
@@ -73,7 +73,42 @@ def groupMerge(universe, pred, extract,description=None):
txt += k + " " + str(v-start[k]) + " "
print(txt)
print("")
-
+
+def windowMerge(universe, pred, extract, windowSize, maxDistance, description=None):
+ if description != None:
+ print(description)
+ start = countTypes(universe)
+
+ nodes = filter(lambda t: pred(t[1]),universe.nodes(data=True))
+ d = {}
+ for k,v in nodes:
+ for s in extract(v):
+ d[s] = k
+
+ items = sorted(d.iteritems(),key=lambda x: x[0])
+ for i in range(0,len(items)-windowSize):
+ a = items[i]
+ for j in range(1,windowSize+1):
+ b = items[i+j]
+ dQ = distance(a[0],b[0]) <= maxDistance
+ bQ = findBeing(universe,a[1]) != findBeing(universe,b[1])
+ lQ = len(a[0]) > 5 and len(b[0]) > 5
+ if dQ and bQ and lQ:
+ print(a[0])
+ print(b[0])
+ print("\n")
+ mergeTheirBeings(universe,a[1],b[1])
+
+ cullHermits(universe)
+ if description != None:
+ d = countTypes(universe)
+ txt = ""
+ for k,v in d.iteritems():
+ txt += k + " " + str(v-start[k]) + " "
+ print(txt)
+ print("")
+
+
def matchTypeAndHasFields(t,fs):
return lambda v: v["type"] == t and all([v[f] != "" for f in fs])
View
11 bokonon/graph.py
@@ -1,7 +1,7 @@
-from being import groupMerge, countTypes, matchTypeAndHasFields
+from being import countTypes, groupMerge, matchTypeAndHasFields, windowMerge
from load import loadData
from pprint import pprint
-from save import steralize, save, project
+from save import project, steralize, save
from text import extractNames
def represent(v):
@@ -23,6 +23,13 @@ def main():
matchTypeAndHasFields("client",["name"]),
lambda v: extractNames(v["name"]),
description="Merged clients based on extracted and cleaned name match")
+
+ windowMerge(universe,
+ matchTypeAndHasFields("client",["name"]),
+ lambda v: extractNames(v["name"]),
+ 5,
+ 1,
+ description="Merged clients based on windowed extracted name matchs")
project(universe,"clientnames.txt", lambda v: v["type"] == "client", represent)
View
4 bokonon/text.py
@@ -34,7 +34,9 @@ def preProcess(s):
"incorperated", "ltd","l t d","company",
"corporations",
"corps",
- "corporation","corp","companies","incorporated","inc"]
+ "corporation","corp","companies","incorporated","inc"] #North america?
+#operations
+#of america?
subs = {
"assn" : "association",
View
4 tests/test_text.py
@@ -12,8 +12,8 @@
(['international buddhism sangha association', 'master wan ko yee'],
"Intl. Buddhism Sangha Assn. (FKA Dr. David Wu on behalf of Master Wan Ko Yee)"),
- (["dca", "dredging contractors of america"],"DREDGING CONTRACTORS OF AMERICA (DCA)"), #DCA
- (["hart", "housing action resource trust"],"housing action resource trust (\"hart\")"), #DCA
+ (["dredging contractors of america"],"DREDGING CONTRACTORS OF AMERICA (DCA)"), #DCA
+ ([ "housing action resource trust"],"housing action resource trust (\"hart\")"), #DCA
(["orange broadband holding"],"orange broadband holding company") #dba
]

No commit comments for this range

Something went wrong with that request. Please try again.