## Duplicate checks

In [1]:
import pandas as pd
import numpy as np


In [2]:
df= pd.read_excel("./input/data.xlsx", sheet_name="Sheet1")
print(df.shape) # "Shape" shows count of columns and rows


(10000, 12)


In [3]:
df.columns.to_list()

['customer_id',
 'gender',
 'first_name',
 'last_name',
 'dob',
 'date_enrollment',
 'city',
 'country',
 'residential address',
 'date_renewal',
 'lifetime_id',
 'registration_id']

This is the basic command to select the duplicates.  
We are telling python to retrieve the dataframe records for which there are duplicated customer_id, keeping all of the duplicates ("keep" can be used also to get only the first or last instance)

In [5]:
df_duplicates = df[df.duplicated(['customer_id'],keep=False)]
df_duplicates

Unnamed: 0,customer_id,gender,first_name,last_name,dob,date_enrollment,city,country,residential address,date_renewal,lifetime_id,registration_id
20,21,M,Norman,Morales,2007-09-26,2019-10-19,Lake Wendy,Togo,43340 Nash Ridge Suite 093,2021-12-19,266-49197945,78-4916288210733
21,21,M,Norman,Morales,2007-09-26,2019-10-19,Lake Wendy,Togo,43340 Nash Ridge Suite 093,2021-12-19,266-49197945,78-4916288210733
34,34,F,Kimberly,Taylor,1998-08-14,2011-02-22,Hannaborough,Iceland,903 Phillips Drive,2022-04-25,054-85642973,03-8999397667052
35,34,F,Kimberly,Taylor,1998-08-14,2011-02-22,Hannaborough,Iceland,903 Phillips Drive,2022-04-25,054-85642973,03-8999397667052
145,144,F,Julie,Johnson,1985-11-07,2018-09-10,Sanchezton,Antarctica (the territory South of 60 deg S),5732 Wagner Knoll Apt. 609,2022-06-04,062-82294748,74-8215857652117
...,...,...,...,...,...,...,...,...,...,...,...,...
9787,9788,M,Ryan,Nguyen,2001-06-02,2004-12-18,Lake Donna,Trinidad and Tobago,52354 Cooper Spurs,2021-09-03,170-63505412,85-1166317253597
9882,9884,F,Audrey,Solis,2003-09-06,2001-10-19,North Jerome,Croatia,27423 Amy Parkways Apt. 438,2021-12-22,232-95915999,34-7906412092141
9883,9884,F,Lisa,Hahn,1979-07-19,2005-06-14,Bennettberg,Nauru,40138 Michael Radial,2021-12-24,232-95915999,25-5216844241867
9885,9886,M,Willie,Goodman,1999-09-15,2012-11-17,Calvinstad,Latvia,07488 Wilson Lane,2022-05-29,279-58871010,56-0130166283471


In [6]:
df_duplicates.to_excel("./output/102_Duplicates.xlsx", sheet_name='Duplicates', index=False)


Then we can use "drop_duplicates" to cleanup the main file.

In [9]:
df_deduped= df.drop_duplicates(['customer_id'], keep=False)

In [10]:
df_deduped.shape

(9920, 12)

In [11]:
df.drop_duplicates() # this just gets rid of entire duplicate rows 

# remember that the command doesnt affect df, the resulting dataframe is returned but not assigned back to df (i.e. not "in place"),  you need to do df_deduped =df.drop_duplicates() to save the output somewhere or use the df.drop_duplicates(inplace=True) option.

Unnamed: 0,customer_id,gender,first_name,last_name,dob,date_enrollment,city,country,residential address,date_renewal,lifetime_id,registration_id
0,1,M,Chris,Thomas,2013-02-08,2010-09-06,Williamstad,Guinea-Bissau,4077 Estrada Fort,2022-05-25,134-59672845,25-4658556733024
1,2,F,Ashley,Williams,1993-03-07,2010-03-17,Bergerstad,Ecuador,16094 Arthur Grove Suite 710,2021-07-02,290-53438249,03-7038076507850
2,3,M,Juan,Campos,1983-06-30,2009-11-03,West Brandon,Jamaica,67126 Mcneil Forest,2021-07-16,046-01329377,28-6714621613580
3,4,M,Jesse,Gaines,2017-08-21,2013-07-28,Shawnport,Grenada,48604 Alexander Station Suite 115,2021-08-01,070-34526750,51-6976675184441
4,5,M,Benjamin,Gray,1995-02-05,2017-07-14,Torreshaven,Chad,0143 Burch Grove,2022-04-16,044-14256714,48-1752335900362
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9995,F,Emily,Holloway,2013-05-11,2019-03-07,North Kevin,Central African Republic,2664 Key Terrace,2022-05-15,089-48537049,59-3070383760546
9996,9996,F,Karen,Morton,2004-01-09,2004-01-11,Taylorton,China,59770 Campbell Fort Suite 880,2022-05-18,276-66361879,72-1787498076784
9997,9998,M,Thomas,Grimes,1975-10-15,2020-01-25,Stokesville,Gabon,014 Tate Locks Apt. 811,2021-12-11,286-26019857,17-7948712377504
9998,9999,F,Kathleen,Turner,1997-06-30,2009-04-26,North Clayton,Equatorial Guinea,51521 Kim Creek,2022-01-20,143-89953228,65-2913435498480


In [12]:
df.drop_duplicates(['customer_id','first_name']) # looking just at two column, that is allowing some fuzziness regarding the rest.

Unnamed: 0,customer_id,gender,first_name,last_name,dob,date_enrollment,city,country,residential address,date_renewal,lifetime_id,registration_id
0,1,M,Chris,Thomas,2013-02-08,2010-09-06,Williamstad,Guinea-Bissau,4077 Estrada Fort,2022-05-25,134-59672845,25-4658556733024
1,2,F,Ashley,Williams,1993-03-07,2010-03-17,Bergerstad,Ecuador,16094 Arthur Grove Suite 710,2021-07-02,290-53438249,03-7038076507850
2,3,M,Juan,Campos,1983-06-30,2009-11-03,West Brandon,Jamaica,67126 Mcneil Forest,2021-07-16,046-01329377,28-6714621613580
3,4,M,Jesse,Gaines,2017-08-21,2013-07-28,Shawnport,Grenada,48604 Alexander Station Suite 115,2021-08-01,070-34526750,51-6976675184441
4,5,M,Benjamin,Gray,1995-02-05,2017-07-14,Torreshaven,Chad,0143 Burch Grove,2022-04-16,044-14256714,48-1752335900362
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9995,F,Emily,Holloway,2013-05-11,2019-03-07,North Kevin,Central African Republic,2664 Key Terrace,2022-05-15,089-48537049,59-3070383760546
9996,9996,F,Karen,Morton,2004-01-09,2004-01-11,Taylorton,China,59770 Campbell Fort Suite 880,2022-05-18,276-66361879,72-1787498076784
9997,9998,M,Thomas,Grimes,1975-10-15,2020-01-25,Stokesville,Gabon,014 Tate Locks Apt. 811,2021-12-11,286-26019857,17-7948712377504
9998,9999,F,Kathleen,Turner,1997-06-30,2009-04-26,North Clayton,Equatorial Guinea,51521 Kim Creek,2022-01-20,143-89953228,65-2913435498480


In [13]:
df.drop_duplicates(['customer_id'], keep='first') # keep can be first, last or False, default is first

Unnamed: 0,customer_id,gender,first_name,last_name,dob,date_enrollment,city,country,residential address,date_renewal,lifetime_id,registration_id
0,1,M,Chris,Thomas,2013-02-08,2010-09-06,Williamstad,Guinea-Bissau,4077 Estrada Fort,2022-05-25,134-59672845,25-4658556733024
1,2,F,Ashley,Williams,1993-03-07,2010-03-17,Bergerstad,Ecuador,16094 Arthur Grove Suite 710,2021-07-02,290-53438249,03-7038076507850
2,3,M,Juan,Campos,1983-06-30,2009-11-03,West Brandon,Jamaica,67126 Mcneil Forest,2021-07-16,046-01329377,28-6714621613580
3,4,M,Jesse,Gaines,2017-08-21,2013-07-28,Shawnport,Grenada,48604 Alexander Station Suite 115,2021-08-01,070-34526750,51-6976675184441
4,5,M,Benjamin,Gray,1995-02-05,2017-07-14,Torreshaven,Chad,0143 Burch Grove,2022-04-16,044-14256714,48-1752335900362
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9995,F,Emily,Holloway,2013-05-11,2019-03-07,North Kevin,Central African Republic,2664 Key Terrace,2022-05-15,089-48537049,59-3070383760546
9996,9996,F,Karen,Morton,2004-01-09,2004-01-11,Taylorton,China,59770 Campbell Fort Suite 880,2022-05-18,276-66361879,72-1787498076784
9997,9998,M,Thomas,Grimes,1975-10-15,2020-01-25,Stokesville,Gabon,014 Tate Locks Apt. 811,2021-12-11,286-26019857,17-7948712377504
9998,9999,F,Kathleen,Turner,1997-06-30,2009-04-26,North Clayton,Equatorial Guinea,51521 Kim Creek,2022-01-20,143-89953228,65-2913435498480


If we want to get the unique values we have a few other ways
ways:

a) use the "unique" method from pandas, but note that it returns a Series object

In [22]:
df.customer_id.unique

<bound method Series.unique of 0           1
1           2
2           3
3           4
4           5
        ...  
9995     9995
9996     9996
9997     9998
9998     9999
9999    10000
Name: customer_id, Length: 10000, dtype: int64>

b) use the groupby commands

In [25]:
# TODO

c) use the set of unique keys and joining (concat in pandas) with the rest having been converted into a dictionary beforehand. I ve seen it in StackOverflow and could be a faster, anyway it is more convenient than using groupby when you want to just collapse duplicate keys. This approach will return an arbitrary value of other fields for each key (ie if you have three duplicated keys and collapse using the df.to_dict it may not bring the values from the last record found, afaik)


In [23]:
pd.concat({k: pd.Series(list(set(v))) for k, v in df.to_dict('list').items()}, axis=1)

Unnamed: 0,customer_id,gender,first_name,last_name,dob,date_enrollment,city,country,residential address,date_renewal,lifetime_id,registration_id
0,1.0,M,Timothy,Rodriguez,1979-05-17,2008-08-27,Port Denise,Croatia,2510 Henderson Rapid,2022-03-07,264-37506742,95-9373226427083
1,2.0,F,Yvonne,Fletcher,1979-05-24,2009-11-27,Lake Aprilshire,Thailand,2329 Robinson Trace,2022-03-19,290-43897551,69-7770324045768
2,3.0,,Bradley,Colon,1979-05-20,2008-03-07,New Shanefurt,France,90953 Gonzalez Mill Suite 527,2021-12-12,105-90971563,54-4131912002236
3,4.0,,Levi,Wiley,1979-05-23,2008-03-22,Lake Edwardside,Saint Helena,93780 Sherry River,2021-12-28,256-04644774,67-0501383013095
4,5.0,,Bailey,Becker,1979-05-28,2008-03-04,Port Matthew,Palestinian Territory,2427 Kelsey Mission,2021-12-24,167-88303970,56-8067202461901
...,...,...,...,...,...,...,...,...,...,...,...,...
9975,,,,,NaT,NaT,,,980 Rebecca Camp Apt. 312,NaT,,74-6264243627347
9976,,,,,NaT,NaT,,,0291 Gonzales Brook Suite 559,NaT,,100-7996967195237
9977,,,,,NaT,NaT,,,615 Pearson Villages,NaT,,88-1579520922707
9978,,,,,NaT,NaT,,,469 Matthew Inlet,NaT,,33-1003386014110


There is a very useful module in python called pandas-profiling that takes a dataframe and does all sorts of analysis, including duplicates and renders it in a beautiful html interactive format.

It doesnt come in the standard library and you need to get it via:

install -c conda-forge pandas-profiling

In [14]:
# conda 
try:
    from pandas_profiling import ProfileReport
    profile = ProfileReport(df, title="Profiling Report")
    profile.to_file("./output/profiling report.html")
except ModuleNotFoundError as err:
    # Error handling
    print(err)



Summarize dataset: 100%|██████████| 25/25 [00:13<00:00,  1.91it/s, Completed]
Generate report structure: 100%|██████████| 1/1 [00:07<00:00,  7.47s/it]
Render HTML: 100%|██████████| 1/1 [00:01<00:00,  1.43s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 111.41it/s]
