# ✅ Data Validation with Pandas
This notebook demonstrates how to perform schema checks, data type validations, null checks, and duplicate detection.

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Toy records that deliberately contain validation problems:
# a repeated id (5), a missing dob (None), a non-numeric salary
# ('not_available') and a missing age (np.nan).
data = dict(
    id=[1, 2, 3, 4, 5, 5],
    name=['Alice', 'Bob', 'Charlie', 'David', 'Eve', 'Eve'],
    dob=['1990-01-01', '1985-05-12', '1992-03-25', '2000-07-19', None, '2001-03-04'],
    salary=['50000', '60000', '70000', 'not_available', '80000', '90000'],
    age=[25, 30, 35, 40, np.nan, 30],
)
df = pd.DataFrame(data)
df

## 🔍 Schema Validation

In [None]:
# Expected schema: every column listed here must be present in df.
expected_columns = ['id', 'name', 'dob', 'salary', 'age']

# Name the offending columns so a failing check is actionable instead of
# just reporting that "something" is wrong.
missing_columns = [col for col in expected_columns if col not in df.columns]
assert not missing_columns, f'Schema mismatch! Missing columns: {missing_columns}'

# Extra columns do not fail the check (same as the subset test this replaces),
# but they are surfaced so unexpected schema drift is visible.
unexpected_columns = [col for col in df.columns if col not in expected_columns]
if unexpected_columns:
    print(f'Note: unexpected extra columns: {unexpected_columns}')
print('Schema validation passed.')

## 🔍 Data Type Validation

In [None]:
# Convert 'dob' and 'salary' to appropriate types.
# errors='coerce' replaces anything unparseable with NaT/NaN instead of raising.
dob_converted = pd.to_datetime(df['dob'], errors='coerce')
salary_converted = pd.to_numeric(df['salary'], errors='coerce')

# Because of the coercion, the dtype assertions below cannot tell us whether
# any value was invalid. Capture the entries that were present before the
# conversion but became null afterwards, so bad values are reported rather
# than silently turned into missing data.
invalid_dob = df.loc[df['dob'].notna() & dob_converted.isna(), 'dob']
invalid_salary = df.loc[df['salary'].notna() & salary_converted.isna(), 'salary']

df['dob'] = dob_converted
df['salary'] = salary_converted

# Validate types
assert pd.api.types.is_numeric_dtype(df['salary']), 'Salary column is not numeric'
assert pd.api.types.is_datetime64_any_dtype(df['dob']), 'DOB column is not datetime'
print('Data type validation passed.')

# Report values that could not be parsed (empty lists mean every value was valid).
print(f'Unparseable dob values coerced to NaT: {invalid_dob.tolist()}')
print(f'Unparseable salary values coerced to NaN: {invalid_salary.tolist()}')

## 🔍 Null Value Detection

In [None]:
# Tally the null entries in each column and print the per-column counts
missing_values = df.isna().sum()
print('Missing values per column:', missing_values, sep='\n')

## 🔍 Duplicate Detection

In [None]:
# Check for duplicate rows
# Exact duplicates: rows where every column matches an earlier row.
duplicate_rows = df[df.duplicated()]
print(f'Total duplicate rows: {len(duplicate_rows)}')

# Key duplicates: rows that share an 'id' with another row. A whole-row
# comparison misses records that repeat a key but differ in other columns.
# keep=False marks every member of a duplicated group, not just the repeats.
# NOTE(review): assumes 'id' is meant to be the unique key - confirm.
duplicate_ids = df[df.duplicated(subset='id', keep=False)]
print(f'Rows sharing a duplicated id: {len(duplicate_ids)}')

# Displayed last: this frame also contains any exact duplicates, since
# identical rows necessarily share an id.
duplicate_ids