## Finding Duplicates

#### There are times when you will need to review a file for duplicates. Python 'Pandas' has a function that allows you easily select the duplicates for review. You can select them all, the first occurrences or the prior occurrences.

In [31]:
# First I will import some libararies
import pandas as pd
import numpy as np

In [32]:
#  Then I will read an excel file as a data frame
df1=pd.read_excel('Sales_Customers.xlsx')

In [33]:
# Now, let's have a preview of the dataframe
df1.head()

Unnamed: 0,custid,companyname,contactname,contacttitle,address,city,region,postalcode,country,phone,fax
0,1,Customer NRZBB,"Allen, Michael",Sales Representative,Obere Str. 0123,Berlin,,10092,Germany,030-3456789,030-0123456
1,2,Customer MLTDN,"Hassall, Mark",Owner,Avda. de la Constitución 5678,México D.F.,,10077,Mexico,(5) 789-0123,(5) 456-7890
2,3,Customer KBUDE,"Strome, David",Owner,Mataderos 7890,México D.F.,,10097,Mexico,(5) 123-4567,
3,4,Customer HFBZG,"Cunningham, Conor",Sales Representative,7890 Hanover Sq.,London,,10046,UK,(171) 456-7890,(171) 456-7891
4,5,Customer HGVLZ,"Higginbotham, Tom",Order Administrator,Berguvsvägen 5678,Luleå,,10112,Sweden,0921-67 89 01,0921-23 45 67


In [34]:
# For the purpose of this project, I will grab a sample from the 'Sales_Customers' file 
df2 = df1.sample(n=15)

In [35]:
# not necessary, but I will preview the sample
df2.head()

Unnamed: 0,custid,companyname,contactname,contacttitle,address,city,region,postalcode,country,phone,fax
3,4,Customer HFBZG,"Cunningham, Conor",Sales Representative,7890 Hanover Sq.,London,,10046,UK,(171) 456-7890,(171) 456-7891
36,37,Customer FRXZL,"Óskarsson, Jón Harry",Sales Associate,9012 Johnstown Road,Cork,Co. Cork,10051,Ireland,8901 234,5678 9012
87,88,Customer SRQVM,"Li, Yan",Sales Manager,"Rua do Mercado, 4567",Resende,SP,10084,Brazil,(14) 234-5678,
33,34,Customer IBVRG,"Zhang, Frank",Accounting Manager,"Rua do Paço, 7890",Rio de Janeiro,RJ,10076,Brazil,(21) 789-0123,(21) 789-0124
54,55,Customer KZQZT,"Wood, Robin",Sales Representative,7890 Bering St.,Anchorage,AK,10050,USA,(907) 555-0115,(907) 555-0128


In [36]:
# Now I will join to two files
df3=pd.concat([df1, df2], ignore_index=True)

In [37]:
# Lets get a row count for the original file
df1.shape

(91, 11)

In [38]:
# And a row count for the sample file
df2.shape

(15, 11)

In [39]:
# Finally we will compare the row count for the combined files
df3.shape

(106, 11)

In [40]:
# We will add the two counts for confirmation
91 + 15

106

In [41]:
# Now let's create a seperate data frame to look at the duplicates
dupes=df3.loc[df3.duplicated(keep=False), :]

In [42]:
# Now I will sort the duplicated data by the column 'custid'
dupes.sort_values('custid')

Unnamed: 0,custid,companyname,contactname,contacttitle,address,city,region,postalcode,country,phone,fax
0,1,Customer NRZBB,"Allen, Michael",Sales Representative,Obere Str. 0123,Berlin,,10092,Germany,030-3456789,030-0123456
102,1,Customer NRZBB,"Allen, Michael",Sales Representative,Obere Str. 0123,Berlin,,10092,Germany,030-3456789,030-0123456
3,4,Customer HFBZG,"Cunningham, Conor",Sales Representative,7890 Hanover Sq.,London,,10046,UK,(171) 456-7890,(171) 456-7891
91,4,Customer HFBZG,"Cunningham, Conor",Sales Representative,7890 Hanover Sq.,London,,10046,UK,(171) 456-7890,(171) 456-7891
105,5,Customer HGVLZ,"Higginbotham, Tom",Order Administrator,Berguvsvägen 5678,Luleå,,10112,Sweden,0921-67 89 01,0921-23 45 67
4,5,Customer HGVLZ,"Higginbotham, Tom",Order Administrator,Berguvsvägen 5678,Luleå,,10112,Sweden,0921-67 89 01,0921-23 45 67
10,11,Customer UBHAU,"Jaffe, David",Sales Representative,Fauntleroy Circus 4567,London,,10064,UK,(171) 789-0123,
103,11,Customer UBHAU,"Jaffe, David",Sales Representative,Fauntleroy Circus 4567,London,,10064,UK,(171) 789-0123,
13,14,Customer WNMAF,"Jelitto, Jacek",Owner,Hauptstr. 0123,Bern,,10065,Switzerland,0452-678901,
104,14,Customer WNMAF,"Jelitto, Jacek",Owner,Hauptstr. 0123,Bern,,10065,Switzerland,0452-678901,
