In [198]:
import os
import sys

import pandas as pd
pd.options.display.max_columns = 999
import sqlalchemy as sqla
from sqlalchemy import create_engine

# The warehouse connection URI is read from the CD_DWH environment variable so
# that credentials never live in the notebook itself.
DB_URI = os.getenv('CD_DWH')
if not DB_URI:
    # os.getenv returns None when the variable is unset; fail here with an
    # actionable message instead of passing None on to create_engine.
    raise RuntimeError(
        "Environment variable CD_DWH is not set; "
        "export it with the data warehouse connection URI before running this notebook."
    )
engine = create_engine(DB_URI)

In [199]:
# One row per distinct raw recipient_candidate_name, with:
#   clean_name - the raw name lower-cased with every character other than
#                a-z and space removed by regexp_replace,
#   count      - count(recipient_candidate_name) within that group,
#   recipient_candidate_name - the raw name as stored.
# The query has no ORDER BY, so the row order is whatever the database returns.
QUERY = """
select
    regexp_replace(lower(recipient_candidate_name), '[^a-z ]', '', 'g') as clean_name,
    count(recipient_candidate_name),
    recipient_candidate_name
from trg_analytics.candidate_contributions
group by recipient_candidate_name
"""
# Run the query on a connection from the engine and load the result into a DataFrame.
with engine.begin() as conn:
    results = pd.read_sql(QUERY, conn)
    
# NOTE: DataFrame.size is the total number of cells (rows x columns), not the
# number of rows; use len(results) for the number of distinct raw names.
print("Size:", results.size)
# Display the first 100 rows as the cell output.
results.head(100)

Size: 4692


Unnamed: 0,clean_name,count,recipient_candidate_name
0,,0,
1,mc hugh peter a,33,"MC HUGH, PETER A."
2,ali anila,163,"ALI, ANILA"
3,knight william j pete,195,"KNIGHT, WILLIAM J. \PETE\"""""
4,hill jerry a,5404,"HILL, JERRY A."
5,harris elihu,22,"HARRIS, ELIHU"
6,acosta georgia l,26,"ACOSTA, GEORGIA L."
7,guillen abel,1300,"GUILLEN, ABEL"
8,gerber donna c,1026,"GERBER, DONNA C."
9,mc iver barbara g,259,"MC IVER, BARBARA G."


In [28]:
from fuzzywuzzy import fuzz

# Minimum fuzz.ratio score (0-100) a pair must reach to be printed.
SIMILARITY_THRESHOLD = 77  # need to find how low this can go

# Pull the columns out into plain Python lists once. Indexing the pandas Series
# inside the O(n^2) loop below repeats a comparatively slow lookup for every pair.
clean_names = results["clean_name"].tolist()
counts = results["count"].tolist()
size = len(results["clean_name"])

# Compare every unordered pair of cleaned names exactly once (j > i) and print
# the pairs whose similarity reaches the threshold.
for i in range(size):
    name = clean_names[i]
    if pd.isna(name):
        # The group of rows with no candidate name has nothing to compare.
        continue
    for j in range(i + 1, size):
        test = clean_names[j]
        if pd.isna(test):
            continue
        fuz = fuzz.ratio(name, test)
        if fuz >= SIMILARITY_THRESHOLD:
            print("{}% Similar: {:^25}  {:^25} {:>5} {:>5}".format(fuz, name, test, counts[i], counts[j]))

94% Similar:     mc iver barbara g           mc iver barbara        259     1
93% Similar:      runner george c             runner george        2816    63
83% Similar:        hill steve               phillips steve          28   416
92% Similar:        hanson mark               hanson mark n           5    57
91% Similar:       dutra john a                dutra john          2238    17
93% Similar:       yamada mariko             yamada mariko m        292  2171
78% Similar:        adams steve               davis steven          212    14
78% Similar:    dominguez francisco         ramirez francisco       121     1
92% Similar:       gardner dean              gardner m dean          36   732
92% Similar:       price curren              price curren d        1017  1583
87% Similar:       akili gregory               akili greg           118     4
90% Similar:  mcdonald kristine lang      mc donald kristine l        9    26
91% Similar:        king david                king david a      

86% Similar:         leyes mak                leyes mark a          130   252
92% Similar:        medina jose               medina jose j          59  1740
92% Similar:        gaines beth               gaines beth b         111  1214
94% Similar:     schaupp charles e           schaupp charles        149     4
90% Similar:         cook paul                 cook paul j           30  1618
93% Similar:       ortiz deborah             ortiz deborah v          1  1537
81% Similar:   johannessen k maurice        johannessen mark        420   178
83% Similar:       jenkins stew             jenkins stewart d         6   199
78% Similar:  dickerson richard dick         dickerson dick         842     1
93% Similar:       hodges sherry             hodges sherry m          9  1096
87% Similar:       gaines edward            gaines edward ted       200   954
93% Similar:       gaines edward             gaines edward t        200  3503
78% Similar:       gaines edward               gaines ted       

95% Similar:   antonovich michael d        antonovich michael      1406     5
78% Similar:       mullin kevin                mullin gene         2093   994
93% Similar:      mc carthy kevin             mccarty kevin        3417   212
78% Similar:       leonard bill                conrad bill          755   112
85% Similar:      polanco richard              pan richard           59  4566
93% Similar:     keating janice e            keating janice         161    62
88% Similar:        roesch jean              roesch jean dr          12    58
95% Similar:    von szeliski heidi        von szeliski heidi j        3   340
82% Similar:        rao robert                frost robert          235    50
88% Similar:     gutierrez richard           aguirre richard          5    14
91% Similar:    mc fadden bradley j         mcfadden bradley         24     1
92% Similar:      marquez luis h              marquez luis          392    31
90% Similar:      ebenstein jeff            ebenstein jeffery   

In [29]:
# Noticing things like:
# 80% Similar:       blount steve               blanton steve          14     6 are these 4
# 78% Similar:       blount steve                young steve           14    20 different people?
# 78% Similar:       davis michael               davis mike           322  1559 
# 80% Similar:        diep tyler                 izen tyler           151    39 80%, possibly different people
# 80% Similar:     harrison michael            wilson michael         315    39 
# 88% Similar:     harrison michael          harrington michael       315     3 80% for 4 people
# 85% Similar:    camejo peter miguel          camejo peter m         472     2 85% for a probable match
# 78% Similar:    dominguez francisco         ramirez francisco       121     1 
# 79% Similar:      cohelan timothy        cohelan timothy douglas     26   145 79% same person?

In [214]:
from fuzzywuzzy import fuzz

size = len(results["clean_name"])
# Not used in this cell; kept in case other cells reference it.
orgi_name = results["recipient_candidate_name"]

print("Fuzzy comparing")

# Same pairwise comparison as the printing loop earlier in the notebook, but the
# scores are stored together with each row's original name and count.
#
# Rows without a cleaned name are dropped up front, wherever they appear: the
# query has no ORDER BY, so the null-name group is not guaranteed to be first.
rows = [
    (clean, original, count)
    for clean, original, count in zip(
        results["clean_name"].tolist(),
        results["recipient_candidate_name"].tolist(),
        results["count"].tolist(),
    )
    if not pd.isna(clean)
]

# data[k] == [clean_name, comparisons, original_name, count], where comparisons
# is a list of [fuzz_ratio, other_clean_name, other_original_name, other_count]
# covering only the rows that come AFTER row k, so each pair is scored once.
data = [
    [
        clean,
        [
            [fuzz.ratio(clean, other_clean), other_clean, other_original, other_count]
            for other_clean, other_original, other_count in rows[i + 1:]
        ],
        original,
        count,
    ]
    for i, (clean, original, count) in enumerate(rows)
]


Fuzzy comparing


In [215]:
print("Sorting comparisons")
# Order every entry's comparison list from best match to worst, in place.
# The final two entries are left out of the loop.
for entry in data[:-2]:
    comparisons = entry[1]
    comparisons.sort(key=lambda comparison: comparison[0], reverse=True)
    
              


Sorting comparisons


In [216]:
# Taking a peek at the third entry from the end of `data`.
print(data[-3])
# Each entry should only be compared against the rows that come after it, so the
# last few entries have a short, easily readable comparison list.

['maze bill', [[29, 'harrington michael mickey', 'HARRINGTON, MICHAEL \\MICKEY\\""', 44], [11, 'scott jack', 'SCOTT, JACK', 671]], 'MAZE, BILL', 1536]


In [259]:
print("Everyone with at least 96 % match in this dataset")
print("Match% name (count) 'name_in_db' -- 'name_in_db (count)")

print()
# One output line per (candidate, other candidate) pair scoring 96 or more.
row_template = "{:>3}% {:>20}: ({}) {:^25} -- {:^25} ({})"
for clean_name, comparisons, db_name, count in data:
    for score, other_clean_name, other_db_name, other_count in comparisons:
        if score >= 96:
            print(row_template.format(score, clean_name, count, db_name, other_db_name, other_count))
        else:
            # Stop scanning this candidate at the first score below 96; this
            # expects the comparison list to be ordered best match first.
            break

Everyone with at least 96 % match in this dataset
Match% name (count) 'name_in_db' -- 'name_in_db (count)

 96%         mc cann john: (616)       MC CANN, JOHN       --       MCCANN, JOHN        (6)
100%    dickson kenneth c: (2)    DICKSON, KENNETH C     --    DICKSON, KENNETH C.    (69)
 97%     de saulnier mark: (2517)     DE SAULNIER, MARK     --     DESAULNIER, MARK      (658)
100%        vargas juan c: (8)      VARGAS, JUAN C       --      VARGAS, JUAN C.      (2845)
 96%        papan virgina: (1)      PAPAN, VIRGINA       --      PAPAN, VIRGINIA      (8)
 96%         yee  betty t: (5048)      YEE , BETTY T.       --       YEE, BETTY T.       (351)
100%          baca jr joe: (12)      BACA. JR., JOE       --       BACA JR., JOE       (2013)
 96%       garamendi john: (9487)      GARAMENDI, JOHN      --      GARAMEDI, JOHN       (234)
 97%       mcgarry nellie: (9)      MCGARRY, NELLIE      --     MC GARRY, NELLIE      (626)
 97%      quintero andres: (99)     QUINTERO, ANDRES    

In [218]:
# Walk through the layout of a single entry, using the fifth one from the end.
clean_name, comparisons, db_name, count = data[-5]
print(clean_name)       # cleaned name
print(db_name)          # 'original' name as stored in the db
print(count)            # count for that name
print(comparisons)      # comparisons against the other people
print(comparisons[-1])  # last comparison: [fuzzy%, clean_name, original_name, count]

renison john
RENISON, JOHN
93
[[36, 'scott jack', 'SCOTT, JACK', 671], [32, 'harrington michael mickey', 'HARRINGTON, MICHAEL \\MICKEY\\""', 44], [25, 'wayne howard', 'WAYNE, HOWARD', 475], [19, 'maze bill', 'MAZE, BILL', 1536]]
[19, 'maze bill', 'MAZE, BILL', 1536]
