Permalink
Browse files

wip

* Deduping lobbyists some
* Running on pypy
  • Loading branch information...
1 parent 9ed6fd5 commit b428192e3a2e2e4f4c0e31827a36f269bca2a3c4 Zack Maril committed Jul 2, 2014
Showing with 73 additions and 25 deletions.
  1. +12 −11 bokonon/being.py
  2. +18 −8 bokonon/graph.py
  3. +20 −5 bokonon/load.py
  4. +1 −1 bokonon/save.py
  5. +22 −0 bokonon/text.py
View
@@ -1,6 +1,6 @@
from collections import defaultdict
import copy
-from Levenshtein import distance
+from text import levenshtein
from networkx import nx
from pprint import pprint
import uuid
@@ -53,25 +53,26 @@ def countTypes(universe):
d[universe.node[ns[0]]["type"]] += 1
return d
-def groupMerge(universe, pred,extract, description=None,logging=None):
+def groupMerge(universe, selector,extract, description=None,logging=None):
if description != None:
print(description)
start = countTypes(universe)
- nodes = filter(lambda t: pred(t[1]),universe.nodes(data=True))
+ nodes = filter(lambda t: selector(t[1]),universe.nodes(data=True))
d = defaultdict(list)
for k,v in nodes:
- for s in extract(v):
- d[s].append(k)
+ for s in extract(k,v):
+ d[s].append((k,v))
for k,vs in d.iteritems():
if logging != None:
bs = map(lambda x: findBeing(universe,x),vs)
if len(bs) > 1 and len(set(bs)) != 1:
- for l in map(lambda x: logging(universe.node[x]),vs):
+ for l in map(lambda x: logging(x,universe.node[x]),vs):
print(l)
print("")
-
- merged = reduce(lambda x,y: mergeTheirBeings(universe,x,y),vs)
+
+ toMerge = map(lambda x: x[0], vs)
+ merged = reduce(lambda x,y: mergeTheirBeings(universe,x,y),toMerge)
cullHermits(universe)
if description != None:
@@ -100,16 +101,16 @@ def windowMerge(universe, selector, extract, windowSize, maxDistance,
a = items[i]
for j in range(1,windowSize+1):
b = items[i+j]
- dQ = distance(a[0],b[0]) <= maxDistance
+ dQ = levenshtein(a[0],b[0]) <= maxDistance
bQ = findBeing(universe,a[1]) != findBeing(universe,b[1])
lQ = len(a[0]) > 5 and len(b[0]) > 5
an = universe.node[a[1]]
bn = universe.node[b[1]]
if dQ and bQ and lQ and pred(an,bn):
if logging != None:
- print(logging(an))
- print(logging(bn))
+ print(logging(a[1],an))
+ print(logging(b[1],bn))
print("")
mergeTheirBeings(universe,a[1],b[1])
View
@@ -5,7 +5,7 @@
from save import project, steralize, save
from text import extractNames
-def represent(v):
+def represent(i,v):
l = ", ".join([v["name"],v["address"],v["city"],v["state"]])
return l
@@ -16,40 +16,50 @@ def main():
#Solid matching
groupMerge(universe,
matchTypeAndHasFields("client",["name"]),
- lambda v: [v["name"]],
+ lambda i,v: [v["name"]],
description="Merged clients based on exact name match")
#Surprisingly solid
groupMerge(universe,
matchTypeAndHasFields("client",["name"]),
- lambda v: [re.sub(" ","",v["name"])],
+ lambda i,v: [re.sub(" ","",v["name"])],
description="Merged clients based on exact match without spaces")
#Solid
groupMerge(universe,
matchTypeAndHasFields("client",["name"]),
- lambda v: [re.sub("'","",v["name"])],
+ lambda i,v: [re.sub("'","",v["name"])],
description="Merged clients based on exact match without 's")
#Most likely solid
groupMerge(universe,
matchTypeAndHasFields("client",["name"]),
- lambda v: extractNames(v["name"]),
+ lambda i,v: extractNames(v["name"]),
description="Merged clients based on extracted and cleaned name match"
)
- #Not great
+ #Not so bad
windowMerge(universe,
matchTypeAndHasFields("client",["name"]),
lambda v: extractNames(v["name"]),
5,
1,
pred=lambda v,w: v["state"] == w["state"] and v["city"] == w["city"] and v["address"] == w["address"],
- description="Merged clients based on windowed extracted name matchs",
- logging=represent)
+ description="Merged clients based on windowed extracted name matchs")
project(universe,"clientnames.txt", lambda v: v["type"] == "client", represent)
+ groupMerge(universe,
+ matchTypeAndHasFields("lobbyist",["firstname","lastname"]),
+ lambda i,v: [(v["firstname"].lower(),v["lastname"].lower())],
+ description="Merged lobbyists based on exact name match")
+
+ def repper(i,v):
+ fid = filter(lambda x: x[2]['relation'] == "workedfor",universe.edges([i],data=True))[0][1]
+ return (v["lastname"]+", "+v["firstname"]+":"+universe.node[fid]["orgname"]).lower()
+
+ project(universe,"lobbyistnames.txt", lambda v: v["type"] == "lobbyist",repper)
+
if __name__ == "__main__":
main()
View
@@ -43,6 +43,7 @@ def loadForm(f,t):
jOb = {}
fo = codecs.open(f,"r",encoding="utf-8")
jOb = json.loads(fo.read())
+ fo.close()
if corruption+u'LOBBYINGDISCLOSURE{}'.format(t) in jOb:
jOb = clean(jOb)
@@ -107,7 +108,18 @@ def loadForm(f,t):
"houseID": preProcess(jOb["houseID"]),
"senate": preProcess(jOb["senateID"]),
}
- return (client, firm, employs)
+ lobbyists = []
+ if "lobbyists" in jOb:
+ for l in jOb["lobbyists"]:
+ if "lobbyistFirstName" in l and l["lobbyistFirstName"] != "":
+ lobbyists.append({"firstname": l["lobbyistFirstName"],
+ "lastname": l["lobbyistLastName"],
+ "suffix": l["lobbyistSuffix"],
+ "position": l["coveredPosition"],
+ "type": "lobbyist",
+ "filename": f})
+
+ return (client, firm, employs, lobbyists)
def loadForm1(x):
return loadForm(x,1)
@@ -125,21 +137,24 @@ def loadData():
universe = nx.Graph()
print("Loading and processing files now")
p = multiprocessing.Pool(8)
- data = p.map(loadForm1,glob(os.environ["HOUSEXML"]+"/LD1/*/*/*.json"),10)
-# data += p.map(loadForm2,glob(os.environ["HOUSEXML"]+"/LD2/*/*/*.json"),10)
+ data = p.map(loadForm1,glob(os.environ["HOUSEXML"]+"/LD1/*/*/*.json"))
+ data += p.map(loadForm2,glob(os.environ["HOUSEXML"]+"/LD2/*/*/*.json"))
print("Starting from {} records".format(len(data)))
print("Building universe")
for col in data:
if col == None:
continue
- (client,firm,employs) = col
+ (client,firm,employs,lobbyists) = col
if client["name"] == "":
continue
cid = addRecord(universe,client)
- fid = addRecord(universe,firm)
+ fid = addRecord(universe,firm)
universe.add_edge(fid,cid,employs)
+ for l in lobbyists:
+ lid = addRecord(universe,l)
+ universe.add_edge(lid,fid,{"relation":"workedfor"})
print("Universe loaded and built, saving now")
with open(processed_graph,"w") as f:
View
@@ -27,7 +27,7 @@ def project(universe,fo,pred,extract):
for b in beings:
ns = nx.neighbors(universe,b[0])
if pred(universe.node[ns[0]]):
- fs = list(set(map(lambda x: extract(universe.node[x]), ns)))
+ fs = list(set(map(lambda x: extract(x,universe.node[x]), ns)))
fs = sorted(fs)
lst.append(fs)
View
@@ -1,6 +1,28 @@
import itertools
import re
+#http://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance#Python
def levenshtein(s1, s2):
    """Return the Levenshtein (edit) distance between ``s1`` and ``s2``.

    The distance is the minimum number of single-element insertions,
    deletions and substitutions needed to turn one sequence into the other.
    Only two rows of the dynamic-programming table are held at any time.

    Args:
        s1: First sequence to compare (typically a string).
        s2: Second sequence to compare (typically a string).

    Returns:
        The edit distance as a non-negative integer; ``0`` when the inputs
        are equal, and the length of the other input when one is empty.
    """
    # Keep s1 as the longer input so each row is sized by the shorter one.
    if len(s1) < len(s2):
        s1, s2 = s2, s1

    # len(s1) >= len(s2): turning s1 into nothing costs one deletion each.
    if len(s2) == 0:
        return len(s1)

    # Row for the empty prefix of s1. ``range`` is used instead of the
    # Python-2-only ``xrange`` so the function runs on either major version.
    previous_row = list(range(len(s2) + 1))
    for i, c1 in enumerate(s1):
        current_row = [i + 1]
        for j, c2 in enumerate(s2):
            # Rows are one element longer than s2, hence j + 1 for the cell
            # directly above the one being computed.
            insertions = previous_row[j + 1] + 1
            deletions = current_row[j] + 1
            # A bool adds 0 when the characters match and 1 otherwise.
            substitutions = previous_row[j] + (c1 != c2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row

    return previous_row[-1]
+
+
def mapcat(f,vs):
nvs = []
for v in map(f,vs):

0 comments on commit b428192

Please sign in to comment.