Permalink
Browse files

wip

* Changing output of project to not be lowercased
* Loading LD2 forms in addition to LD1 forms (LD2 results previously overwrote the LD1 results)
* Started work on precompiling regexes
* Refactoring text so it is reusable
* More test cases
  • Loading branch information...
1 parent 78b8397 commit 235ae07b794461ba474a5114740d3f7662dc7c91 Zack Maril committed Jun 30, 2014
Showing with 71 additions and 50 deletions.
  1. +1 −1 bokonon/graph.py
  2. +1 −1 bokonon/load.py
  3. +53 −37 bokonon/text.py
  4. +16 −11 tests/test_text.py
View
@@ -36,7 +36,7 @@ def main():
project(universe,"clientnames.txt",
lambda v: v["type"] == "client",
- lambda v: ", ".join([v["name"],v["address"],v["city"],v["country"],v["state"],v["zip"]]).lower())
+ lambda v: ", ".join([v["name"],v["address"],v["city"],v["country"],v["state"],v["zip"]]))
if __name__ == "__main__":
View
@@ -126,7 +126,7 @@ def loadData():
print("Loading and processing files now")
p = multiprocessing.Pool(8)
data = p.map(loadForm1,glob(os.environ["HOUSEXML"]+"/LD1/*/*/*.json"),10)
- data = p.map(loadForm2,glob(os.environ["HOUSEXML"]+"/LD2/*/*/*.json"),10)
+ data += p.map(loadForm2,glob(os.environ["HOUSEXML"]+"/LD2/*/*/*.json"),10)
print "Saving processed files"
with open(processed_files,"w") as f:
View
@@ -1,6 +1,12 @@
import itertools
import re
def mapcat(f,vs):
    """Apply f to every element of vs and concatenate the results.

    f must return an iterable for each element; the items of each result
    are added, in order, to a single flat list. Only one level is
    flattened. Returns a new list, which is empty when vs is empty.
    """
    return list(itertools.chain.from_iterable(map(f,vs)))
+
# Compiled once at import time instead of being looked up on every call.
_multipleSpaces = re.compile(' +')

def replaceWhitespace(s):
    """Collapse every run of consecutive spaces in s into a single space.

    Only the space character is affected; tabs and newlines are left
    untouched. Returns the resulting string.
    """
    return _multipleSpaces.sub(' ', s)
@@ -29,7 +35,7 @@ def preProcess(s):
"corps",
"corporation","corp","companies","incorporated","inc"]
-shortenings = {
+subs = {
"assn" : "association",
"intl" : "international"
}
@@ -48,7 +54,7 @@ def processClientName(org):
s = re.sub('\\bu s\\b','na',s) #replace n a with na
s = re.sub('&',' and ',s)#replace "&" with " and "
- for k,v in shortenings.iteritems():
+ for k,v in subs.iteritems():
s = re.sub("\\b"+k+"\\b",v,s)
#remove various stopwords
@@ -59,7 +65,8 @@ def processClientName(org):
#on behalf of cassidy & associates
#on behalf of akin gump
#"on behalf of akin gump strauss hauer & feld"
- breakers = ["on behalf of", "obo","o/b/o", "on behalf",
+ breakers = ["on behalf of the", #return multiple values?
+ "on behalf of", "obo","o/b/o", "on behalf",
"public policy partners",
"the livingston group",
"akin gump strauss hauer and feld"
@@ -82,8 +89,13 @@ def processClientName(org):
for b in ["(for "]:
if b in s and len(s) > len(b) + 4:
s = re.split(re.escape(b),s)[-1]
+
+ s = re.sub('-',' ',s)
+
+ return cleanCruft(preProcess(s))
+
-
+def cleanCruft(s):
old = None
while old != s:
s=preProcess(s)
@@ -110,39 +122,39 @@ def processClientName(org):
while "(" == s[-1] and ")" not in s:
s = s[:-1]
-
- s = re.sub('-',' ',s)
-
- return preProcess(s)
-
+ return s
-def splitName(name):
- name = preProcess(name).lower()
- if "\"fka\"" in name: # The "'s mess up the word boundaries
- return re.split("\"fka\"",name)
# Phrases that separate one name from a former or alternate name.
# splitName splits on the FIRST pattern in `splitters` that matches, so a
# phrase must be listed before any shorter phrase it starts with
# ("name changed to" before "name changed", "f/k/a/" before "f/k/a").
# Misspellings such as "formerly know as" are deliberate: they are matched
# against the raw input text.
_splitterPhrases = ["fka:","fka","f/k/a/","f/k/a",
    "formerly known as",
    "formerly know as",
    "formerly filed as",
    "formerly reported as",
    "formerly",
    "formally known as",
    "also known as",
    "formally",
    "former", #united natural products alliance (former utah natural products alliance)?
    "d/b/a",
    "dba",
    "name changed to",
    "name changed from",
    "name changed",
    "name change to",
    "name change from",
    "name change"]

def _compileSplitter(phrase):
    """Compile phrase into a regex matching it literally as a whole phrase."""
    # A trailing \b after punctuation (as in "fka:" or "f/k/a/") only matches
    # when a word character follows, so "fka: foo" would never match. Add the
    # trailing boundary only when the phrase itself ends in a word character.
    tail = "\\b" if re.match(r"\w", phrase[-1]) else ""
    return re.compile("\\b" + re.escape(phrase) + tail)

# Precompiled patterns, in priority order, used by splitName.
splitters = [_compileSplitter(phrase) for phrase in _splitterPhrases]
+
+#Example input that no splitter phrase above covers yet: "MYFAMILY.COM ***client has changed it's name to The Generations Networks"
+def splitName(s):
+ s = cleanCruft(preProcess(s).lower())
+ if "\"fka\"" in s: # The "'s mess up the word boundaries
+ return mapcat(splitName,re.split("\"fka\"",s))
+
+ for c in splitters:
+ if re.search(c,s) != None and s != "dba international":
+ return mapcat(splitName,re.split(c,s))
- splitters = ["fka:","fka","f/k/a","f/k/a/",
- "formerly known as",
- "formerly know as",
- "formerly filed as",
- "formerly reported as",
- "formerly",
- "formally known as",
- "also known as",
- "formally",
- "former", #united natural products alliance (former utah natural products alliance)?
- "d/b/a",
- "dba",
- ]#todo: compile regex ahead of times
- for s in splitters:
- if s in name and name != "dba international":
- return re.split("\\b"+s+"\\b",name)
-
- #remove acronyms
- #greater richmond transit company (grtc) ==> greater richmond transit (grtc)
- #housing action resource trust (hart) ==> housing action resource trust
- #ousing action resource trust ("hart")
-
g = re.match(r"([\w' ]*)\((.*)\)$",s)
mappings = {
"and":["a",""],
@@ -151,6 +163,7 @@ def splitName(name):
"southwest":["s","sw"],
"of":["o",""]
}
+
if g is not None:
gs = g.groups()
ws = []
@@ -160,11 +173,14 @@ def splitName(name):
else:
ws.append([w[0]])
words = map("".join,list(itertools.product(*ws)))
+ ac = gs[1]
+ if ac[0] == "\"" and ac[-1] == "\"":
+ ac = ac[1:-1]
for w in words:
- if gs[1] == w:
+ if ac == w:
return [preProcess(w),preProcess(gs[0])]
- return [name]
+ return [s]
def extractNames(s):
    """Return the list of non-empty names extracted from the raw string s.

    s is split into candidate names by splitName, each candidate is passed
    through processClientName, and candidates that come back as the empty
    string are dropped. The result is always a real list (never a lazy
    filter object), so it can be compared against a list or indexed on
    Python 3 as well as Python 2.
    """
    processed = [processClientName(part) for part in splitName(s)]
    return [name for name in processed if name != ""]
View
@@ -2,20 +2,25 @@
import unittest
+cases = [(["google"],"Google inc"), #each case is (expected list of names, raw input passed to extractNames)
+
+ (["boys town usa", "girls and boys town usa"],
+ "Boys town usa (formerly girls and boys town usa)"),
+
+ (["esm group"],"the livingston group llc (on behalf of esm group, inc.)"),
+
+ (['international buddhism sangha association', 'master wan ko yee'],
+ "Intl. Buddhism Sangha Assn. (FKA Dr. David Wu on behalf of Master Wan Ko Yee)"),
+
+ (["dca", "dredging contractors of america"],"DREDGING CONTRACTORS OF AMERICA (DCA)"), #trailing acronym in parentheses is returned as its own name
+ (["hart", "housing action resource trust"],"housing action resource trust (\"hart\")"), #same as above, but the acronym is wrapped in double quotes
+
+ (["orange broadband holding"],"orange broadband holding company") #"dba" occurs inside "broadband"; expected result is a single, unsplit name
+]
+
class TestExtractNames(unittest.TestCase):
def test_function(self):
- cases = [(["google"],"Google inc"),
-
- (["boys town usa", "girls and boys town usa"],
- "Boys town usa (formerly girls and boys town usa)"),
-
- (["esm group"],"the livingston group llc (on behalf of esm group, inc.)"),
-
- (['international buddhism sangha association', 'master wan ko yee'],
- "intl. buddhism sangha assn. (fka dr. david wu on behalf of master wan ko yee)"),
-
- ]
for a,b in cases:
self.assertEqual(a,extractNames(b))

0 comments on commit 235ae07

Please sign in to comment.