Browse files

wip

* Handle the various "frmly" (formerly) spellings when splitting names
* Acronyms shouldn't be used for matching; they should only be stripped out
* Expand more abbreviations (e.g. "ass'n", "nat'l")
* Skip records that have no client information (empty client name)
  • Loading branch information...
1 parent 235ae07 commit e102c37fac4e0f4934cb931c91a08feba93f51c8 Zack Maril committed Jun 30, 2014
Showing with 25 additions and 10 deletions.
  1. +9 −8 bokonon/graph.py
  2. +4 −1 bokonon/load.py
  3. +12 −1 bokonon/text.py
View
17 bokonon/graph.py
@@ -4,6 +4,9 @@
from save import steralize, save, project
from text import extractNames
def represent(v):
    """Return a one-line, comma-separated description of a vertex.

    The line is built from the vertex's "name", "address", "city" and
    "state" fields, in that order, joined with ", ".

    A field whose lookup raises KeyError, or whose value is None, is
    rendered as an empty string so that an incomplete record still
    yields a line instead of raising.
    """
    parts = []
    for field in ("name", "address", "city", "state"):
        try:
            value = v[field]
        except KeyError:
            # Field absent on this vertex; keep its column, leave it blank.
            value = None
        parts.append("" if value is None else value)
    return ", ".join(parts)
def main():
print("Loading universe...")
universe = loadData()
@@ -18,11 +21,11 @@ def main():
lambda v: extractNames(v["name"]),
description="Merged clients based on extracted and cleaned name match")
- p = matchTypeAndHasFields("client",["address","city","country","state","zip"])
- groupMerge(universe,
- lambda v: p(v) and v["state"] not in ["DC","VA","MD"] and v["city"] != "DC",
- lambda v: [(v["address"],v["city"],v["country"],v["state"],v["zip"])],
- description="Merging clients based on exact matching of address fields (sans DC area)")
+ # p = matchTypeAndHasFields("client",["address","city","country","state","zip"])
+ # groupMerge(universe,
+ # lambda v: p(v) and v["state"] not in ["DC","VA","MD"] and v["city"] != "DC",
+ # lambda v: [(v["address"],v["city"],v["country"],v["state"],v["zip"])],
+ # description="Merging clients based on exact matching of address fields (sans DC area)")
# groupMerge(universe,
# matchTypeAndHasFields("firm",["orgname"]),
@@ -34,9 +37,7 @@ def main():
# mnf("printedname"),
# description="Merging firms based on *corrected* printedname")
- project(universe,"clientnames.txt",
- lambda v: v["type"] == "client",
- lambda v: ", ".join([v["name"],v["address"],v["city"],v["country"],v["state"],v["zip"]]))
+ project(universe,"clientnames.txt", lambda v: v["type"] == "client", represent)
if __name__ == "__main__":
View
5 bokonon/load.py
@@ -137,7 +137,10 @@ def loadData():
for col in data:
if col == None:
continue
- (client,firm,employs) = col
+ (client,firm,employs) = col
+ if client["name"] == "":
+ continue
+
cnode = str(uuid.uuid1())
fnode = str(uuid.uuid1())
cbeing = str(uuid.uuid1())
View
13 bokonon/text.py
@@ -29,6 +29,7 @@ def preProcess(s):
#ad hoc informal coalitions
useless = ["l l c","llc", "l c","lc", "l l p","llp", "l p","lp", "pllc",
+ "innc",
"pllp",
"incorperated", "ltd","l t d","company",
"corporations",
@@ -37,6 +38,8 @@ def preProcess(s):
subs = {
"assn" : "association",
+ "ass'n" : "association",
+ "nat'l" : "national",
"intl" : "international"
}
#processClientName(preProcess("assn of J.H.Christ & The-All-Mighty l c llc lp"))
@@ -81,6 +84,9 @@ def processClientName(org):
"capitol insight",
"dci group",
"dci group az",
+ "whitmer and worrall",
+ "mayer brown",
+ "fukuda gakuen usa"
]
s = preProcess(s)
for b in breakers:
@@ -127,6 +133,11 @@ def cleanCruft(s):
splitters = ["fka:","fka","f/k/a","f/k/a/",
"formerly known as",
"formerly know as",
+ "frmly filed as",
+ "frmly registered as",
+ "frmly",
+ "frly",
+ "frmly field",
"formerly filed as",
"formerly reported as",
"formerly",
@@ -178,7 +189,7 @@ def splitName(s):
ac = ac[1:-1]
for w in words:
if ac == w:
- return [preProcess(w),preProcess(gs[0])]
+ return [preProcess(gs[0])]#don't include ac
return [s]

0 comments on commit e102c37

Please sign in to comment.