# Pandas Tips: `drop_duplicates()`

In [1]:
import pandas as pd

Check your pandas version to ensure similar behavior. 

_Version 2.1.1 was released September 2023._

In [2]:
# Report the installed pandas version so readers can compare it with the one used here
pd.__version__

'2.1.1'

### Load data

Load data from GitHub.

_Check out my [read_csv video](https://youtu.be/sTXr73fqybc) to learn more about this step._

In [3]:
# Raw CSV hosted on GitHub; reading it requires network access
pet_data_url = 'https://raw.githubusercontent.com/kimfetti/Videos/master/Pandas_Tips/data/pet_data.csv'
df = pd.read_csv(pet_data_url)

In [4]:
# Preview the first few rows to see the columns and how missing values appear
df.head()

Unnamed: 0,name,pet_type,food_type,amount,brand
0,Simba,Lizard,Insects,212g,
1,Winston,Ferret,Dry Food,,Pawsome
2,Bella,Parrot,Seeds,,
3,Shadow,Hamster,Pellets,58g,
4,Milo,Rabbit,Vegetables,124g,


### Drop blank rows

_Check out my dropna video to learn more about this step._

In [5]:
# Remove rows in which every column is missing.
# Rebinding `df` to the returned frame (rather than inplace=True) keeps the
# step chainable and makes the cell safe to re-run.
df = df.dropna(how='all')

In [6]:
# (rows, columns) remaining once the fully blank rows are gone
df.shape

(496, 5)

## Basics

### Find duplicate rows

In [7]:
# keep=False flags every member of a duplicated group, not only the later repeats
is_repeated = df.duplicated(keep=False)
df[is_repeated]

Unnamed: 0,name,pet_type,food_type,amount,brand
28,Cooper,Guinea Pig,Pellets,,
95,Luna,Cat,,,
144,Rocky,Fish,Pellets,,
152,Stella,Snake,,,
242,Cooper,Guinea Pig,Pellets,,
275,Rocky,Fish,Pellets,,
323,Stella,Snake,,,
485,Luna,Cat,,,


### Drop duplicate rows

In [8]:
# Returns a new DataFrame with repeated rows removed; `df` itself is not modified
df.drop_duplicates()

Unnamed: 0,name,pet_type,food_type,amount,brand
0,Simba,Lizard,Insects,212g,
1,Winston,Ferret,Dry Food,,Pawsome
2,Bella,Parrot,Seeds,,
3,Shadow,Hamster,Pellets,58g,
4,Milo,Rabbit,Vegetables,124g,
...,...,...,...,...,...
495,Shadow,Dog,Wet Food,229g,TropicalFeast
496,Nala,Snake,Eggs,114g,
497,Finn,Turtle,Pellets,,
498,Finn,Hamster,Seeds,55g,


In [9]:
# Row count after de-duplication, without storing the result
df.drop_duplicates().shape

(492, 5)

In [10]:
# Keep the de-duplicated result under its own name; `df` still holds the duplicates
no_dupes = df.drop_duplicates()

In [11]:
# Sanity check: are any rows still flagged as duplicates? An empty result means none are left
no_dupes[no_dupes.duplicated()]

Unnamed: 0,name,pet_type,food_type,amount,brand


## $\star$ Level Up $\star$

### Drop duplicates based on specific column(s)

In [12]:
# Another look at the columns before choosing which ones define a duplicate
df.head()

Unnamed: 0,name,pet_type,food_type,amount,brand
0,Simba,Lizard,Insects,212g,
1,Winston,Ferret,Dry Food,,Pawsome
2,Bella,Parrot,Seeds,,
3,Shadow,Hamster,Pellets,58g,
4,Milo,Rabbit,Vegetables,124g,


In [13]:
# How many rows share each pet name
df['name'].value_counts()

name
Charlie    18
Luna       18
Lucy       16
Oliver     15
Lily       15
Lola       14
Cooper     14
Oscar      14
Chloe      14
Winston    13
Sophie     13
Stella     13
Finn       13
Shadow     13
Mocha      13
Zoe        12
Nala       12
Riley      12
Kitty      12
Loki       12
Harley     11
Jasper     11
Coco       11
Buddy      10
Duke       10
Sadie      10
Tiger      10
Ruby        9
Simba       9
Lulu        9
Penny       9
Leo         9
Zeus        9
Mittens     9
Milo        9
Rocky       8
Tucker      7
Max         7
Molly       7
Casper      7
Bailey      7
Gizmo       6
Cleo        6
Rosie       5
Bella       5
Daisy       5
Name: count, dtype: int64

In [14]:
# Compare only the `name` column when deciding what counts as a duplicate;
# by default the first row seen for each name is the one kept
df.drop_duplicates(subset='name')

Unnamed: 0,name,pet_type,food_type,amount,brand
0,Simba,Lizard,Insects,212g,
1,Winston,Ferret,Dry Food,,Pawsome
2,Bella,Parrot,Seeds,,
3,Shadow,Hamster,Pellets,58g,
4,Milo,Rabbit,Vegetables,124g,
5,Zoe,Ferret,Dry Food,114g,
6,Casper,Rabbit,Vegetables,62g,
7,Gizmo,Lizard,Insects,179g,
8,Lily,Hamster,Seeds,149g,
9,Tiger,Fish,Pellets,173g,ChowTime


In [15]:
# Number of rows left when only one row per name is kept.
# NOTE(review): drop_duplicates treats a missing name as a value of its own, whereas
# value_counts() skips missing values by default, so this row count may not match
# the number of names listed by value_counts() -- confirm against the data
df.drop_duplicates(subset='name').shape

(47, 5)

In [16]:
# After de-duplicating on `name`, every name should appear exactly once
one_row_per_name = df.drop_duplicates(subset='name')
one_row_per_name['name'].value_counts()

name
Simba      1
Loki       1
Rocky      1
Penny      1
Chloe      1
Buddy      1
Nala       1
Kitty      1
Ruby       1
Stella     1
Sadie      1
Luna       1
Winston    1
Molly      1
Lola       1
Finn       1
Oliver     1
Oscar      1
Bailey     1
Riley      1
Mocha      1
Daisy      1
Coco       1
Lulu       1
Tucker     1
Cooper     1
Bella      1
Shadow     1
Milo       1
Zoe        1
Casper     1
Gizmo      1
Lily       1
Tiger      1
Mittens    1
Zeus       1
Leo        1
Lucy       1
Rosie      1
Charlie    1
Harley     1
Jasper     1
Sophie     1
Max        1
Duke       1
Cleo       1
Name: count, dtype: int64

In [17]:
# Revisit the columns before de-duplicating on more than one of them
df.head()

Unnamed: 0,name,pet_type,food_type,amount,brand
0,Simba,Lizard,Insects,212g,
1,Winston,Ferret,Dry Food,,Pawsome
2,Bella,Parrot,Seeds,,
3,Shadow,Hamster,Pellets,58g,
4,Milo,Rabbit,Vegetables,124g,


In [18]:
# Pass a list to `subset` to treat rows as duplicates only when every listed column matches
df.drop_duplicates(subset=['name', 'pet_type'])

Unnamed: 0,name,pet_type,food_type,amount,brand
0,Simba,Lizard,Insects,212g,
1,Winston,Ferret,Dry Food,,Pawsome
2,Bella,Parrot,Seeds,,
3,Shadow,Hamster,Pellets,58g,
4,Milo,Rabbit,Vegetables,124g,
...,...,...,...,...,...
486,Luna,Hamster,Seeds,53g,
489,Rocky,Lizard,Insects,135g,ChowTime
491,Mocha,Turtle,Pellets,249g,
493,Shadow,Dog,Wet Food,176g,


In [19]:
# Row count when duplicates are defined by the (name, pet_type) pair
df.drop_duplicates(subset=['name', 'pet_type']).shape

(323, 5)

### Specify which duplicate to keep

In [20]:
# Default keep='first': the earliest row for each name is the one that survives.
# Note: Zoe is a ferret in the row with id=5.
df.drop_duplicates(subset='name')

Unnamed: 0,name,pet_type,food_type,amount,brand
0,Simba,Lizard,Insects,212g,
1,Winston,Ferret,Dry Food,,Pawsome
2,Bella,Parrot,Seeds,,
3,Shadow,Hamster,Pellets,58g,
4,Milo,Rabbit,Vegetables,124g,
5,Zoe,Ferret,Dry Food,114g,
6,Casper,Rabbit,Vegetables,62g,
7,Gizmo,Lizard,Insects,179g,
8,Lily,Hamster,Seeds,149g,
9,Tiger,Fish,Pellets,173g,ChowTime


In [21]:
# keep='last': the final row for each name survives instead of the first.
# Note: Zoe is a hamster in the row with id=429.
df.drop_duplicates(subset='name', keep='last')

Unnamed: 0,name,pet_type,food_type,amount,brand
313,Tucker,Cat,Dry Kibble,,
338,Milo,Rabbit,,,
344,Zeus,Dog,Wet Food,147g,
360,Lulu,Turtle,Pellets,143g,
382,Leo,Fish,Pellets,66g,
405,Rosie,Guinea Pig,Vegetables,96g,
415,Daisy,Fish,Flakes,,SnackLand
418,Molly,Parrot,Seeds,243g,
422,Duke,Parrot,,,
429,Zoe,Hamster,Seeds,190g,


In [22]:
# keep=False drops every row whose name occurs more than once;
# here, that's all the animals, so the result is empty.
df.drop_duplicates(subset='name', keep=False)

Unnamed: 0,name,pet_type,food_type,amount,brand


### Make permanent changes

In [23]:
# Shape before making the change permanent: the earlier drop_duplicates() calls left `df` untouched
df.shape

(496, 5)

In [24]:
# Shape of the de-duplicated result (still not stored back into `df`)
df.drop_duplicates().shape

(492, 5)

In [25]:
# inplace=True modifies `df` directly and returns None;
# the assignment form `df = df.drop_duplicates()` gives the same result
df.drop_duplicates(inplace=True)

In [26]:
# `df` itself now has the duplicate rows removed
df.shape

(492, 5)