# Data Cleaning & Preprocessing

In [3]:
import pandas as pd
df = pd.read_csv("1745501008647-data_cleaning_sample.csv")

In [4]:
df

Unnamed: 0,Name,Age,City,Gender,Email,Join Date
0,Alice,25.0,New York,F,alice@example.com,01-05-2021
1,Charlie,,Delhi,M,charlie@example,20-07-2021
2,Bob,30.0,Los Angeles,M,bob@example.com,15-06-2020
3,Charlie,,Delhi,M,charlie@example,20-07-2021
4,David,22.0,Mumbai,M,david@example.com,12-11-2019
5,,28.0,Delhi,F,eve@domain.com,
6,Alice,25.0,New York,F,alice@example.com,01-05-2021
7,Alice,25.0,New York,F,alice@example.com,01-05-2021
8,Charlie,,Delhi,M,charlie@example,20-07-2021


## Handling Missing Values
Check for Missing Data

In [6]:
df.isnull()              # True for NaNs

Unnamed: 0,Name,Age,City,Gender,Email,Join Date
0,False,False,False,False,False,False
1,False,True,False,False,False,False
2,False,False,False,False,False,False
3,False,True,False,False,False,False
4,False,False,False,False,False,False
5,True,False,False,False,False,True
6,False,False,False,False,False,False
7,False,False,False,False,False,False
8,False,True,False,False,False,False


In [7]:
df.isnull().sum()        # Count missing per column

Name         1
Age          3
City         0
Gender       0
Email        0
Join Date    1
dtype: int64

## Drop Missing Data

In [8]:
df.dropna()              # Drop rows with *any* missing values

Unnamed: 0,Name,Age,City,Gender,Email,Join Date
0,Alice,25.0,New York,F,alice@example.com,01-05-2021
2,Bob,30.0,Los Angeles,M,bob@example.com,15-06-2020
4,David,22.0,Mumbai,M,david@example.com,12-11-2019
6,Alice,25.0,New York,F,alice@example.com,01-05-2021
7,Alice,25.0,New York,F,alice@example.com,01-05-2021


In [9]:
df.dropna(axis=1)        # Drop columns with missing values

Unnamed: 0,City,Gender,Email
0,New York,F,alice@example.com
1,Delhi,M,charlie@example
2,Los Angeles,M,bob@example.com
3,Delhi,M,charlie@example
4,Mumbai,M,david@example.com
5,Delhi,F,eve@domain.com
6,New York,F,alice@example.com
7,New York,F,alice@example.com
8,Delhi,M,charlie@example


## Fill Missing Data
In pandas, fillna is used to fill unknown values. ffill and bfill are methods used to fill missing values (like NaN, None, or pd.NA) by propagating values forward or backward.

In [11]:
df.fillna(0)                     # Replace NaN with 0

Unnamed: 0,Name,Age,City,Gender,Email,Join Date
0,Alice,25.0,New York,F,alice@example.com,01-05-2021
1,Charlie,0.0,Delhi,M,charlie@example,20-07-2021
2,Bob,30.0,Los Angeles,M,bob@example.com,15-06-2020
3,Charlie,0.0,Delhi,M,charlie@example,20-07-2021
4,David,22.0,Mumbai,M,david@example.com,12-11-2019
5,0,28.0,Delhi,F,eve@domain.com,0
6,Alice,25.0,New York,F,alice@example.com,01-05-2021
7,Alice,25.0,New York,F,alice@example.com,01-05-2021
8,Charlie,0.0,Delhi,M,charlie@example,20-07-2021


In [12]:
df["Age"].fillna(df["Age"].mean())  # Replace with mean

0    25.000000
1    25.833333
2    30.000000
3    25.833333
4    22.000000
5    28.000000
6    25.000000
7    25.000000
8    25.833333
Name: Age, dtype: float64

In [13]:
df.ffill()      # Forward fill

Unnamed: 0,Name,Age,City,Gender,Email,Join Date
0,Alice,25.0,New York,F,alice@example.com,01-05-2021
1,Charlie,25.0,Delhi,M,charlie@example,20-07-2021
2,Bob,30.0,Los Angeles,M,bob@example.com,15-06-2020
3,Charlie,30.0,Delhi,M,charlie@example,20-07-2021
4,David,22.0,Mumbai,M,david@example.com,12-11-2019
5,David,28.0,Delhi,F,eve@domain.com,12-11-2019
6,Alice,25.0,New York,F,alice@example.com,01-05-2021
7,Alice,25.0,New York,F,alice@example.com,01-05-2021
8,Charlie,25.0,Delhi,M,charlie@example,20-07-2021


In [14]:
df.bfill()      # Backward fill

Unnamed: 0,Name,Age,City,Gender,Email,Join Date
0,Alice,25.0,New York,F,alice@example.com,01-05-2021
1,Charlie,30.0,Delhi,M,charlie@example,20-07-2021
2,Bob,30.0,Los Angeles,M,bob@example.com,15-06-2020
3,Charlie,22.0,Delhi,M,charlie@example,20-07-2021
4,David,22.0,Mumbai,M,david@example.com,12-11-2019
5,Alice,28.0,Delhi,F,eve@domain.com,01-05-2021
6,Alice,25.0,New York,F,alice@example.com,01-05-2021
7,Alice,25.0,New York,F,alice@example.com,01-05-2021
8,Charlie,,Delhi,M,charlie@example,20-07-2021


## Detecting & Removing Duplicates
df.duplicated() returns a boolean Series where: True means that row is a duplicate of a previous row. False means it's the first occurrence (not a duplicate yet).

In [15]:
df.duplicated()          # True for duplicates

0    False
1    False
2    False
3     True
4    False
5    False
6     True
7     True
8     True
dtype: bool

In [16]:
df.drop_duplicates()     # Remove duplicate rows

Unnamed: 0,Name,Age,City,Gender,Email,Join Date
0,Alice,25.0,New York,F,alice@example.com,01-05-2021
1,Charlie,,Delhi,M,charlie@example,20-07-2021
2,Bob,30.0,Los Angeles,M,bob@example.com,15-06-2020
4,David,22.0,Mumbai,M,david@example.com,12-11-2019
5,,28.0,Delhi,F,eve@domain.com,


### Check based on specific columns:

In [17]:
df.duplicated(subset=["Name", "Age"])

0    False
1    False
2    False
3     True
4    False
5    False
6     True
7     True
8     True
dtype: bool

# String Operations with .str
Works like vectorized string methods and returns a pandas Series:

In [18]:
df["Name"].str.lower() # Converts all names to lowercase.

0      alice
1    charlie
2        bob
3    charlie
4      david
5        NaN
6      alice
7      alice
8    charlie
Name: Name, dtype: object

In [19]:
df["City"].str.contains("delhi", case=False) # Checks if 'delhi' is in the city name, case-insensitive.

0    False
1     True
2    False
3     True
4    False
5     True
6    False
7    False
8     True
Name: City, dtype: bool

In [20]:
df["Email"].str.split("@") # Outputs a pandas Series where each element is a list of strings (the split parts). This is where a Python list comes into play, but the outer object is still a pandas Series.

0    [alice, example.com]
1      [charlie, example]
2      [bob, example.com]
3      [charlie, example]
4    [david, example.com]
5       [eve, domain.com]
6    [alice, example.com]
7    [alice, example.com]
8      [charlie, example]
Name: Email, dtype: object

# Type Conversions with .astype()

Convert column data types:

In [22]:
df["Age"] = df["Age"].fillna(0).astype(int)

In [23]:
df

Unnamed: 0,Name,Age,City,Gender,Email,Join Date
0,Alice,25,New York,F,alice@example.com,01-05-2021
1,Charlie,0,Delhi,M,charlie@example,20-07-2021
2,Bob,30,Los Angeles,M,bob@example.com,15-06-2020
3,Charlie,0,Delhi,M,charlie@example,20-07-2021
4,David,22,Mumbai,M,david@example.com,12-11-2019
5,,28,Delhi,F,eve@domain.com,
6,Alice,25,New York,F,alice@example.com,01-05-2021
7,Alice,25,New York,F,alice@example.com,01-05-2021
8,Charlie,0,Delhi,M,charlie@example,20-07-2021


# Applying Functions

- .apply() → Apply any function to rows or columns

In [27]:
df["Age Group"] = df["Age"].apply(lambda x: "Adult" if x >= 18 else "Minor")

In [28]:
df

Unnamed: 0,Name,Age,City,Gender,Email,Join Date,Age Group
0,Alice,25,New York,F,alice@example.com,01-05-2021,Adult
1,Charlie,0,Delhi,M,charlie@example,20-07-2021,Minor
2,Bob,30,Los Angeles,M,bob@example.com,15-06-2020,Adult
3,Charlie,0,Delhi,M,charlie@example,20-07-2021,Minor
4,David,22,Mumbai,M,david@example.com,12-11-2019,Adult
5,,28,Delhi,F,eve@domain.com,,Adult
6,Alice,25,New York,F,alice@example.com,01-05-2021,Adult
7,Alice,25,New York,F,alice@example.com,01-05-2021,Adult
8,Charlie,0,Delhi,M,charlie@example,20-07-2021,Minor


- .map() → Element-wise mapping for Series

In [29]:
gender_map = {"M": "Male", "F": "Female"}
df["Gender"] = df["Gender"].map(gender_map)

In [30]:
df

Unnamed: 0,Name,Age,City,Gender,Email,Join Date,Age Group
0,Alice,25,New York,Female,alice@example.com,01-05-2021,Adult
1,Charlie,0,Delhi,Male,charlie@example,20-07-2021,Minor
2,Bob,30,Los Angeles,Male,bob@example.com,15-06-2020,Adult
3,Charlie,0,Delhi,Male,charlie@example,20-07-2021,Minor
4,David,22,Mumbai,Male,david@example.com,12-11-2019,Adult
5,,28,Delhi,Female,eve@domain.com,,Adult
6,Alice,25,New York,Female,alice@example.com,01-05-2021,Adult
7,Alice,25,New York,Female,alice@example.com,01-05-2021,Adult
8,Charlie,0,Delhi,Male,charlie@example,20-07-2021,Minor


- .replace() → Replace specific values

In [31]:
df["City"].replace({"Del": "Delhi", "Mum": "Mumbai"})

0       New York
1          Delhi
2    Los Angeles
3          Delhi
4         Mumbai
5          Delhi
6       New York
7       New York
8          Delhi
Name: City, dtype: object

In [32]:
df

Unnamed: 0,Name,Age,City,Gender,Email,Join Date,Age Group
0,Alice,25,New York,Female,alice@example.com,01-05-2021,Adult
1,Charlie,0,Delhi,Male,charlie@example,20-07-2021,Minor
2,Bob,30,Los Angeles,Male,bob@example.com,15-06-2020,Adult
3,Charlie,0,Delhi,Male,charlie@example,20-07-2021,Minor
4,David,22,Mumbai,Male,david@example.com,12-11-2019,Adult
5,,28,Delhi,Female,eve@domain.com,,Adult
6,Alice,25,New York,Female,alice@example.com,01-05-2021,Adult
7,Alice,25,New York,Female,alice@example.com,01-05-2021,Adult
8,Charlie,0,Delhi,Male,charlie@example,20-07-2021,Minor
