# Dealing with duplicates
## Identifying duplicates

- Basic method: `df.duplicated()` returns a boolean Series (`True` marks a row that repeats an earlier row)
- `df.duplicated().sum()` returns the number of duplicate rows

In [27]:
import pandas as pd

# Toy gradebook used throughout this section.
# Rows 4 and 6 repeat rows 2 and 3 exactly (same name and same score), while
# rows 0 and 1 share a name but differ in score, so they are not exact duplicates.
df = pd.DataFrame(
    data=dict(
        student_name=["Alice", "Alice", "Bob", "Charlie", "Bob", "David", "Charlie"],
        exam_score=[85, 86, 90, 88, 90, 92, 88],
    )
)

# Header line followed by the full frame, each on its own line.
print("Original DataFrame:", df, sep="\n")


Original DataFrame:
  student_name  exam_score
0        Alice          85
1        Alice          86
2          Bob          90
3      Charlie          88
4          Bob          90
5        David          92
6      Charlie          88


In [15]:
# Flag each row that exactly repeats an earlier row (all columns are compared).
# Experiment: pass `subset=` or `keep=` to duplicated() to change what is flagged.
print(
    "Duplicate check (True means this row is a duplicate of a previous row):",
    df.duplicated(),
    sep="\n",
)


Duplicate check (True means this row is a duplicate of a previous row):
0    False
1    False
2    False
3    False
4     True
5    False
6     True
dtype: bool


In [20]:
# Count the duplicate rows: summing the boolean flags counts the True entries.
print("Number of duplicate rows:", df.duplicated().sum(), sep="\n")


Number of duplicate rows:
2


In [21]:
# Show the actual duplicate rows by using the duplicate flags as a boolean mask.
# keep="first" (the default) leaves the first occurrence unflagged, so only the
# later repeats are selected.
duplicate_rows = df.loc[df.duplicated(keep="first")]

print("Actual duplicate rows:", duplicate_rows, sep="\n")


Actual duplicate rows:
  student_name  exam_score
4          Bob          90
6      Charlie          88


## Removing duplicates
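`df.drop_duplicates()` returns a new DataFrame with the duplicate rows removed; by default the first occurrence of each row is kept.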

In [22]:
# Drop every exact repeat, keeping only the first occurrence of each row.
# The result is bound to a new name, so `df` itself is left unchanged.
cleaned_df = df.drop_duplicates(keep="first")

print("After removing exact duplicates:", cleaned_df, sep="\n")


After removing exact duplicates:
  student_name  exam_score
0        Alice          85
1        Alice          86
2          Bob          90
3      Charlie          88
5        David          92


In [29]:
# Two ways to drop the duplicates from the original DataFrame itself:
#   (1) rebind the cleaned result to the original variable name, or
#   (2) call df.drop_duplicates(inplace=True) to modify the frame in place.
#
# The demo runs on a copy so that `df` is not altered by this cell.
tmp_df = df.copy(deep=True)

# Option (1) would be: tmp_df = tmp_df.drop_duplicates()
# Option (2), shown here:
tmp_df.drop_duplicates(keep="first", inplace=True)

print(tmp_df)


  student_name  exam_score
0        Alice          85
1        Alice          86
2          Bob          90
3      Charlie          88
5        David          92
