# 🧹 Data Cleaning with Pandas
This notebook demonstrates a complete data validation and cleansing pipeline in pandas, applied to a small sample dataset.

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Sample dataset with intentionally inconsistent values for the cleansing
# steps to work on: a repeated id/name, padded names, an unparseable and a
# missing date, 'N/A' placeholders, several spellings of the same country,
# mixed phone formats, a missing and an infinite weight, and a negative and
# a missing age.
data = dict(
    id=[1, 2, 3, 4, 5, 5],
    name=[' Alice ', 'Bob', 'Charlie', 'David', 'Eve', 'Eve'],
    dob=['1990-01-01', '1985-05-12', 'bad_date', '2000-07-19', None, '2001-03-04'],
    salary=['50000', '60000', '70000', 'N/A', '80000', '90000'],
    department=['HR', None, 'Engineering', 'Sales', 'HR', 'HR'],
    country=['UK', 'U.K.', 'united kingdom', 'USA', 'UK ', ' UK'],
    phone=['+44-123-456-789', '1234567890', '(020)1234567', 'N/A', '44123456789', '44123456789'],
    weight_lbs=[150, 180, 200, None, 130, np.inf],
    age=[25, -1, 35, 40, np.nan, 30],
)

# One column per key, in the order listed above.
df = pd.DataFrame(data)
df

## 🔧 Data Cleansing Steps

In [None]:
# Clean the sample frame `df` column by column, then derive filtered views.
# The per-column fixes are written back to `df`; the row filters at the end
# build new frames (`df_drop_nulls`, `df_cleaned`) and do not modify `df`.

# Kilograms per pound, used for the weight conversion below.
LBS_TO_KG = 0.453592

# Strip leading/trailing whitespace from names
df['name'] = df['name'].str.strip()

# Standardize country text: lowercase, remove literal dots, trim whitespace.
# NOTE(review): this only unifies spellings that differ by case, dots or
# padding ('UK', 'U.K.', ' UK'); 'united kingdom' is left as a separate value.
df['country'] = df['country'].str.lower().str.replace('.', '', regex=False).str.strip()

# Keep only the digits of each phone number. An entry with no digits at all
# (e.g. the 'N/A' placeholder) collapses to '', which dropna() would not
# treat as missing, so such entries are turned into NaN explicitly.
phone_digits = df['phone'].str.replace(r'[^0-9]', '', regex=True)
df['phone'] = phone_digits.mask(phone_digits == '')

# Convert date of birth to datetime; unparseable values become NaT
df['dob'] = pd.to_datetime(df['dob'], errors='coerce')

# Convert salary to numeric; non-numeric values (e.g. 'N/A') become NaN
df['salary'] = pd.to_numeric(df['salary'], errors='coerce')

# Replace invalid (negative) ages with NaN. NaN < 0 is False, so ages that
# are already missing are left untouched.
df['age'] = df['age'].mask(df['age'] < 0)

# Unit conversion: lbs to kg (missing and infinite weights carry through)
df['weight_kg'] = df['weight_lbs'] * LBS_TO_KG

# Drop rows with any NaNs (rows containing inf are still present here)
df_drop_nulls = df.dropna()

# Drop rows containing +/-inf in any column, then rows with any NaNs
has_inf = df.isin([np.inf, -np.inf]).any(axis=1)
df_cleaned = df.loc[~has_inf].dropna()

df_cleaned