# Deduplicación de registros

* *60:00 min* | Última modificación: Agosto 11, 2021 | YouTube

In [10]:
%%writefile /tmp/data.csv
first_name,last_name,birtdate,phone
Kayne,Taffie,2/10/1985,+86 (669) 916-2473
Daisey S,Heisham,5/19/1990,+55 (858) 758-7630
Clair W,Brik,10/3/1976,+351 (509) 289-3191
Kippy L,Frome,9/18/1992,+420 (195) 491-9791
Burgess Jr,Klimes,2/4/1977,+86 (762) 990-4484
Dermot R,Garwill,9/27/1984,+86 (699) 948-9318
Hadley P,Gosker,2/15/1993,+48 (457) 883-3998
S Jackqueline,Papes,6/18/1983,+86 (784) 978-0726
BURGESS,Klimes,2/4/1977,+86 (762) 990-4484
Dermot,Garwill,9/27/1984,+86 (699) 948-9318
Hadley,GOSKER,2/15/1993,+48 (457) 883-3998
Jackqueline,Papes,6/18/1983,+86 (784) 978-0726
Alastair,Barge,3/9/1971,+33 (182) 729-8581
Theobald,Bastian,11/15/1987,+62 (397) 242-4366
Pammi,Daffey,9/5/1986,+86 (761) 567-4803
Marcus,Charlo,7/7/1974,+86 (928) 602-4540
Burgess,KLIMES,2/4/1977,+86 (762) 990-4484
Dermot,Garwill,9/27/1984,+86 (699) 948-9318

Overwriting /tmp/data.csv


In [11]:
import pandas as pd

# Render DataFrames as plain-text tables instead of HTML in notebook output.
pd.options.display.notebook_repr_html = False

In [12]:
# Load the CSV produced by the %%writefile cell and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='/tmp/data.csv')
df.head(n=5)

   first_name last_name   birtdate                phone
0       Kayne    Taffie  2/10/1985   +86 (669) 916-2473
1    Daisey S   Heisham  5/19/1990   +55 (858) 758-7630
2     Clair W      Brik  10/3/1976  +351 (509) 289-3191
3     Kippy L     Frome  9/18/1992  +420 (195) 491-9791
4  Burgess Jr    Klimes   2/4/1977   +86 (762) 990-4484

In [13]:
#
# Candidate pair generation.
#
# Blocking would restrict the candidates to records that are identical in
# one or more fields. This cell does NOT block: it builds a full index, so
# every record is paired with every other record (n * (n - 1) / 2 pairs,
# i.e. 153 for the 18 rows of the sample file).
#
import recordlinkage

indexer = recordlinkage.Index()
indexer.full()  # no blocking key: all pairwise combinations become candidates
candidate_links = indexer.index(df)  # single frame passed: rows of df are paired with each other
candidate_links



MultiIndex([( 1,  0),
            ( 2,  0),
            ( 2,  1),
            ( 3,  0),
            ( 3,  1),
            ( 3,  2),
            ( 4,  0),
            ( 4,  1),
            ( 4,  2),
            ( 4,  3),
            ...
            (17,  7),
            (17,  8),
            (17,  9),
            (17, 10),
            (17, 11),
            (17, 12),
            (17, 13),
            (17, 14),
            (17, 15),
            (17, 16)],
           length=153)

In [25]:
#
# Record comparison
#
# Every name field is scored with Levenshtein similarity and binarised by
# `threshold`: the feature is 1.0 when the similarity reaches 0.50 and 0.0
# otherwise. One feature column is produced per field, labelled with the
# field name.
#
# NOTE(review): the strings are compared exactly as stored, so variants that
# differ only in letter case may fall below the threshold -- confirm whether
# the names should be case-normalised before comparing.
#
compare_cl = recordlinkage.Compare()

for field in ("first_name", "last_name"):
    compare_cl.string(
        field,
        field,
        method="levenshtein",
        threshold=0.50,
        label=field,
    )

features = compare_cl.compute(candidate_links, df)
features.head()

     first_name  last_name
1 0         0.0        0.0
2 0         0.0        0.0
  1         0.0        0.0
3 0         0.0        0.0
  1         0.0        0.0

In [26]:
# Count, for each candidate pair, how many compared fields agree, then show
# how many pairs share each count, listed from the highest count down.
agreement_per_pair = features.sum(axis=1)
agreement_per_pair.value_counts().sort_index(ascending=False)

2.0      4
1.0      5
0.0    144
dtype: int64

In [32]:
# Keep every candidate pair that agrees on at least one compared field.
#
# NOTE(review): a single-field agreement is enough to be selected here, so
# pairs matching only on first name or only on last name are included --
# confirm this is intended before treating all of them as duplicates.
field_agreements = features.sum(axis=1)
potential_matches = features[field_agreements > 0]
potential_matches

      first_name  last_name
6  1         1.0        0.0
8  4         0.0        1.0
9  5         1.0        1.0
10 6         1.0        0.0
11 7         1.0        1.0
14 0         0.0        1.0
16 4         1.0        0.0
17 5         1.0        1.0
   9         1.0        1.0

In [33]:
# Take the second member (index level 1) of every selected pair as the row
# to discard; a row label can appear more than once if it is in several pairs.
matched_pairs = potential_matches.index
duplicate_rows = matched_pairs.get_level_values(1)
duplicate_rows

Int64Index([1, 4, 5, 6, 7, 0, 4, 5, 9], dtype='int64')

In [34]:
# Drop every row flagged as a duplicate and list the survivors by first name.
is_duplicate = df.index.isin(duplicate_rows)
df[~is_duplicate].sort_values(by='first_name')

     first_name last_name    birtdate                phone
12     Alastair     Barge    3/9/1971   +33 (182) 729-8581
8       BURGESS    Klimes    2/4/1977   +86 (762) 990-4484
16      Burgess    KLIMES    2/4/1977   +86 (762) 990-4484
2       Clair W      Brik   10/3/1976  +351 (509) 289-3191
17       Dermot   Garwill   9/27/1984   +86 (699) 948-9318
10       Hadley    GOSKER   2/15/1993   +48 (457) 883-3998
11  Jackqueline     Papes   6/18/1983   +86 (784) 978-0726
3       Kippy L     Frome   9/18/1992  +420 (195) 491-9791
15       Marcus    Charlo    7/7/1974   +86 (928) 602-4540
14        Pammi    Daffey    9/5/1986   +86 (761) 567-4803
13     Theobald   Bastian  11/15/1987   +62 (397) 242-4366