In [1]:
import pandas as pd

In [2]:
user_cols = ['user_id', 'age', 'gender', 'occupation', 'zipcode']
users = pd.read_csv('http://bit.ly/movieusers', sep = '|', header = None, names = user_cols, index_col = 'user_id')
users.head(10)

Unnamed: 0_level_0,age,gender,occupation,zipcode
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213
6,42,M,executive,98101
7,57,M,administrator,91344
8,36,M,administrator,5201
9,29,M,student,1002
10,53,M,lawyer,90703


In [9]:
users.index

Int64Index([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,
            ...
            934, 935, 936, 937, 938, 939, 940, 941, 942, 943],
           dtype='int64', name='user_id', length=943)

In [3]:
users.shape

(943, 4)

In [4]:
# to find duplicate zipcodes
users['zipcode'].duplicated().head(12)

user_id
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
Name: zipcode, dtype: bool

In [5]:
# to find duplicate zipcodes
# True -> repitions of zipcode
users['zipcode'].duplicated().value_counts()

False    795
True     148
Name: zipcode, dtype: int64

In [6]:
# To get count of duplicate zipcodes
users['zipcode'].duplicated().sum()

148

### Duplication in dataframe as a whole instead of duplication in series

In [10]:
users.duplicated().sum()   # so 7 rows are repetion

7

In [11]:
# to see the repeated rows
users.loc[users.duplicated(),:]

Unnamed: 0_level_0,age,gender,occupation,zipcode
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
496,21,F,student,55414
572,51,M,educator,20003
621,17,M,student,60402
684,28,M,student,55414
733,44,F,other,60630
805,27,F,other,20009
890,32,M,student,97301


In [12]:
# to see the repeated rows - by default - keep = 'first' -  keeps the first of the rows whose elements are duplicated
users.loc[users.duplicated(keep = 'first'),:]

Unnamed: 0_level_0,age,gender,occupation,zipcode
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
496,21,F,student,55414
572,51,M,educator,20003
621,17,M,student,60402
684,28,M,student,55414
733,44,F,other,60630
805,27,F,other,20009
890,32,M,student,97301


In [13]:
# to see the repeated rows - by default - keep = 'last' -  keeps the last of the repeated rows whose elements are duplicated
users.loc[users.duplicated(keep = 'last'),:]

Unnamed: 0_level_0,age,gender,occupation,zipcode
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
67,17,M,student,60402
85,51,M,educator,20003
198,21,F,student,55414
350,32,M,student,97301
428,28,M,student,55414
437,27,F,other,20009
460,44,F,other,60630


In [14]:
users.loc[users.duplicated(keep = False),:]   # mark all duplicates as True so all rows are shown with loc method

Unnamed: 0_level_0,age,gender,occupation,zipcode
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
67,17,M,student,60402
85,51,M,educator,20003
198,21,F,student,55414
350,32,M,student,97301
428,28,M,student,55414
437,27,F,other,20009
460,44,F,other,60630
496,21,F,student,55414
572,51,M,educator,20003
621,17,M,student,60402


### How to delete duplicate rows from dataframe

In [15]:
users.drop_duplicates().shape   # numberof rows reduced

(936, 4)

In [18]:
# to remove all rows that are duplicates including the first occurence
users.drop_duplicates(keep = False).shape

(929, 4)

### how to identify certain columns only when searching for duplicates

Eg - we need age + zipcode must be a unique identifier in dataset  

In [20]:
users.duplicated(subset = ['age', 'zipcode']).sum()   # there are 16 duplicates for age and zipcode in df

16

In [21]:
users.drop_duplicates(subset = ['age', 'zipcode']).shape   # df with those 16 rows dropped

(927, 4)