# Data Cleaning - Data Science Koans

Master data cleaning techniques!

## What You Will Learn
- Handling missing values
- Removing duplicates
- Data type conversions
- String cleaning
- Date/time parsing

## How to Use
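1. Read each koan's objective
2. Complete the TODO in each function so that it returns the cleaned result
3. Run the cell to validate your answer
4. Iterate until every koan passes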

In [None]:
# Setup
import sys
sys.path.append('../..')
import numpy as np
import pandas as pd
from koans.core.validator import KoanValidator
from koans.core.progress import ProgressTracker

# Create the validator for this notebook's koans and the progress tracker;
# the cells below refer to both names.
validator = KoanValidator('04_data_cleaning')
tracker = ProgressTracker()
print('Setup complete!')
# Show the progress value the tracker currently reports for this notebook.
print(f"Progress: {tracker.get_notebook_progress('04_data_cleaning')}%")

## KOAN 4.1: Drop Missing Values
**Objective**: Remove nulls
**Difficulty**: Beginner

In [None]:
def drop_nulls():
    """Koan 4.1 exercise: drop the rows of `df` that contain a null value.

    Replace `pass` with code that returns the cleaned result. The validator
    in this cell asserts that the returned object has length 1.
    """
    df = pd.DataFrame({'A': [1, None, 3], 'B': [4, 5, None]})
    # TODO: Drop rows with any null values
    pass

@validator.koan(1, "Drop Missing Values", difficulty="Beginner")
def validate():
    """Check that drop_nulls() leaves exactly one row.

    Only the first row of the sample frame has no missing value in either
    column, so a correct solution returns a single-row result.
    """
    result = drop_nulls()
    # The assertion must live inside validate(): at module level `result`
    # is undefined and the cell would stop with NameError before the koan
    # is ever run.
    assert len(result) == 1
validate()

## KOAN 4.2: Fill Missing Values
**Objective**: Impute nulls
**Difficulty**: Beginner

In [None]:
def fill_nulls():
    """Koan 4.2 exercise: fill the missing values of `df` with the mean.

    Replace `pass` with code that returns the filled result. The validator
    in this cell asserts that the returned object has no missing values.
    """
    df = pd.DataFrame({'A': [1, None, 3, None, 5]})
    # TODO: Fill nulls with mean value
    pass

@validator.koan(2, "Fill Missing Values", difficulty="Beginner")
def validate():
    """Check that fill_nulls() returns data with no missing values left.

    Accepts either a Series or a DataFrame as the returned object.
    """
    result = fill_nulls()
    # The assertion must live inside validate(): at module level `result`
    # is undefined and the cell would stop with NameError.
    #
    # Reduce over the raw boolean array rather than comparing `.sum()` to 0:
    # for a DataFrame `.isna().sum()` is a Series, and asserting on a Series
    # raises "truth value of a Series is ambiguous".
    assert not result.isna().to_numpy().any()
validate()

## KOAN 4.3: Remove Duplicates
**Objective**: Drop duplicate rows
**Difficulty**: Beginner

In [None]:
def remove_dupes():
    """Koan 4.3 exercise: remove the duplicate rows of `df`.

    Replace `pass` with code that returns the de-duplicated result. The
    validator in this cell asserts that the returned object has length 3.
    """
    df = pd.DataFrame({'A': [1, 2, 2, 3, 3, 3]})
    # TODO: Remove duplicate rows
    pass

@validator.koan(3, "Remove Duplicates", difficulty="Beginner")
def validate():
    """Check that remove_dupes() keeps one row per distinct value.

    The sample column holds the distinct values 1, 2 and 3, so three rows
    should remain.
    """
    result = remove_dupes()
    # Kept inside validate(): at module level `result` is undefined and the
    # cell would stop with NameError.
    assert len(result) == 3
validate()

## KOAN 4.4: Convert Data Types
**Objective**: Cast columns
**Difficulty**: Beginner

In [None]:
def convert_types():
    """Koan 4.4 exercise: convert the string column 'nums' to integers.

    Replace `pass` with code that returns the converted frame. The validator
    in this cell inspects the dtype of the returned object's 'nums' column.
    """
    df = pd.DataFrame({'nums': ['1', '2', '3']})
    # TODO: Convert 'nums' column to int
    pass

@validator.koan(4, "Convert Data Types", difficulty="Beginner")
def validate():
    """Check that convert_types() returns 'nums' as an integer column."""
    result = convert_types()
    # Kept inside validate(): at module level `result` is undefined and the
    # cell would stop with NameError.
    #
    # Accept any integer dtype instead of exactly 'int64': a plain
    # `astype(int)` yields int32 on platforms whose default integer is
    # 32-bit, which is still a correct answer to the koan.
    assert pd.api.types.is_integer_dtype(result['nums'])
validate()

## KOAN 4.5: Strip Whitespace
**Objective**: Clean strings
**Difficulty**: Beginner

In [None]:
def strip_spaces():
    """Koan 4.5 exercise: strip surrounding whitespace from the 'text' column.

    Replace `pass` with code that returns the cleaned frame. The validator
    in this cell asserts that the first 'text' value equals 'hello'.
    """
    df = pd.DataFrame({'text': ['  hello  ', '  world  ']})
    # TODO: Strip whitespace from 'text' column
    pass

@validator.koan(5, "Strip Whitespace", difficulty="Beginner")
def validate():
    """Check that strip_spaces() turned the first value into 'hello'."""
    result = strip_spaces()
    # Kept inside validate(): at module level `result` is undefined and the
    # cell would stop with NameError.
    assert result['text'].iloc[0] == 'hello'
validate()

## KOAN 4.6: Lowercase Strings
**Objective**: Normalize case
**Difficulty**: Beginner

In [None]:
def lower_case():
    """Koan 4.6 exercise: convert the 'text' column to lowercase.

    Replace `pass` with code that returns the normalized frame. The
    validator in this cell asserts that the first 'text' value equals
    'hello'.
    """
    df = pd.DataFrame({'text': ['HELLO', 'WORLD']})
    # TODO: Convert 'text' column to lowercase
    pass

@validator.koan(6, "Lowercase Strings", difficulty="Beginner")
def validate():
    """Check that lower_case() turned the first value into 'hello'."""
    result = lower_case()
    # Kept inside validate(): at module level `result` is undefined and the
    # cell would stop with NameError.
    assert result['text'].iloc[0] == 'hello'
validate()

## KOAN 4.7: Replace Values
**Objective**: Substitute data
**Difficulty**: Beginner

In [None]:
def replace_vals():
    """Koan 4.7 exercise: map 'yes' to 1 and 'no' to 0 in the 'status' column.

    Replace `pass` with code that returns the recoded frame. The validator
    in this cell asserts that the first 'status' value equals 1.
    """
    df = pd.DataFrame({'status': ['yes', 'no', 'yes', 'no']})
    # TODO: Replace 'yes' with 1 and 'no' with 0
    pass

@validator.koan(7, "Replace Values", difficulty="Beginner")
def validate():
    """Check that replace_vals() recoded the first 'yes' as 1."""
    result = replace_vals()
    # Kept inside validate(): at module level `result` is undefined and the
    # cell would stop with NameError.
    assert result['status'].iloc[0] == 1
validate()

## KOAN 4.8: Parse Dates
**Objective**: Convert to datetime
**Difficulty**: Beginner

In [None]:
def parse_dates():
    """Koan 4.8 exercise: convert the string column 'date' to datetime.

    Replace `pass` with code that returns the converted frame. The validator
    in this cell asserts that the returned 'date' column has a datetime64
    dtype.
    """
    df = pd.DataFrame({'date': ['2023-01-01', '2023-01-02']})
    # TODO: Convert 'date' column to datetime
    pass

@validator.koan(8, "Parse Dates", difficulty="Beginner")
def validate():
    """Check that parse_dates() returns 'date' as a datetime64 column."""
    result = parse_dates()
    # Kept inside validate(): at module level `result` is undefined and the
    # cell would stop with NameError.
    assert pd.api.types.is_datetime64_any_dtype(result['date'])
validate()

## KOAN 4.9: Handle Outliers
**Objective**: Cap extremes
**Difficulty**: Beginner

In [None]:
def cap_outliers():
    """Koan 4.9 exercise: cap every 'val' entry above 10 at 10.

    Replace `pass` with code that returns the capped frame. The validator
    in this cell asserts that the maximum of the returned 'val' column is 10.
    """
    df = pd.DataFrame({'val': [1, 2, 3, 100, 4, 5]})
    # TODO: Cap values above 10 to 10
    pass

@validator.koan(9, "Handle Outliers", difficulty="Beginner")
def validate():
    """Check that cap_outliers() reduced the outlier so the maximum is 10."""
    result = cap_outliers()
    # Kept inside validate(): at module level `result` is undefined and the
    # cell would stop with NameError.
    assert result['val'].max() == 10
validate()

## KOAN 4.10: Rename Columns
**Objective**: Fix names
**Difficulty**: Beginner

In [None]:
def rename_cols():
    """Koan 4.10 exercise: rename the column 'Old Name' to 'new_name'.

    Replace `pass` with code that returns the renamed frame. The validator
    in this cell asserts that 'new_name' is among the returned columns.
    """
    df = pd.DataFrame({'Old Name': [1, 2, 3]})
    # TODO: Rename 'Old Name' to 'new_name'
    pass

@validator.koan(10, "Rename Columns", difficulty="Beginner")
def validate():
    """Check that rename_cols() returns a frame with a 'new_name' column."""
    result = rename_cols()
    # Kept inside validate(): at module level `result` is undefined and the
    # cell would stop with NameError.
    assert 'new_name' in result.columns
validate()

## Congratulations!

You completed Data Cleaning!

In [None]:
# Read the value the tracker reports for this notebook and print it as a
# percentage.
progress = tracker.get_notebook_progress('04_data_cleaning')
print(f'Final Progress: {progress}%')
# The congratulation is printed only when the reported value equals 100.
if progress == 100:
    print('Excellent! You mastered Data Cleaning!')