# 🧹 BÀI 2: DATA CLEANING WITH PYTHON

## Mục tiêu:
- Handle missing values
- Remove duplicates
- Standardize data formats
- Handle outliers
- Data type conversions

In [None]:
# Setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
sys.path.append('/home/jovyan/week-03-04-python-etl/scripts')

from db_connector import DatabaseConnector
from data_cleaner import DataCleaner, quick_clean

# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

print("âœ… Setup complete!")

---
## 📊 PART 1: Load Dirty Data

In [None]:
# Create sample dirty data
# A small hand-built customer table that deliberately contains data-quality
# problems; the exercises below clean it step by step. All seven columns
# have 11 entries, one per row.
dirty_data = pd.DataFrame({
    'customer_id': [1, 2, 3, 4, 5, 5, 6, 7, 8, 9, 10],
    'customer_name': [
        '  John Doe  ',
        'jane    SMITH',
        'JOHN DOE',
        'Bob   Wilson',
        None,
        'Alice Brown',
        'Charlie  Davis',
        'Eve   White',
        'Frank   Black',
        'Grace   Green',
        'Henry   Blue'
    ],
    'email': [
        'john@example.com',
        'invalid-email',
        'jane@test.com',
        'bob@company.com',
        None,
        'alice@test.com',
        'charlie@example',
        'eve@test.com',
        'frank@company.com',
        'grace@test.com',
        'henry@example.com'
    ],
    'age': [25, 30, 25, np.nan, 35, 35, 40, -5, 200, 28, 32],
    'salary': [50000, 60000, 50000, 75000, np.nan, 80000, 90000, 55000, 1000000, 65000, 70000],
    # Dates are kept as strings here; Exercise 7 converts them to datetime.
    'join_date': [
        '2020-01-15',
        '2020-02-20',
        '2020-01-15',
        '2020-03-10',
        '2020-04-05',
        '2020-04-05',
        'invalid-date',
        '2020-05-12',
        '2020-06-18',
        '2020-07-22',
        '2020-08-30'
    ],
    'country': ['Vietnam', 'vietnam', 'VIETNAM', 'USA', 'usa', 'USA', 'UK', 'uk', 'Vietnam', 'USA', 'UK']
})

# Quick profile of the raw frame: dimensions, inferred dtypes, per-column
# null counts, and the number of rows that repeat an earlier row.
# duplicated() is called without `subset`, so it compares every column.
print(f"Original shape: {dirty_data.shape}")
print(f"\nData types:\n{dirty_data.dtypes}")
print(f"\nMissing values:\n{dirty_data.isna().sum()}")
print(f"\nDuplicates: {dirty_data.duplicated().sum()}")

# Last expression of the cell, so the notebook renders the whole frame.
dirty_data

---
## 🎯 EXERCISE 1: Identify Data Quality Issues

In [None]:
# TODO: List all data quality issues you can find
# Write your observations here:

"""
Issues found:
1. 
2. 
3. 
4. 
5. 
"""

---
## 🎯 EXERCISE 2: Remove Duplicates

In [None]:
# TODO: Check for duplicate rows
# YOUR CODE HERE

# Two different notions of "duplicate" are counted here:
#   - duplicated() with no subset flags rows that repeat an earlier row in
#     every column;
#   - duplicated() on the customer_id column flags repeated ids even when
#     the other columns differ.
print(f"Duplicate rows: {dirty_data.duplicated().sum()}")
print(f"\nDuplicate customer_ids: {dirty_data['customer_id'].duplicated().sum()}")

# Show duplicates
# keep=False marks every member of a duplicate group, not only the repeats.
# NOTE(review): this is the full-row check, so rows that merely share a
# customer_id are not shown; a `subset=['customer_id']` variant may be what
# the next exercise needs -- confirm the intent.
dirty_data[dirty_data.duplicated(keep=False)]

In [None]:
# TODO: Remove duplicate rows based on customer_id
# Keep the first occurrence
# YOUR CODE HERE

# Placeholder: replace None with the de-duplicated DataFrame. While it is
# still None the print below fails, because None has no `.shape` attribute.
cleaned_df = None

print(f"Shape after removing duplicates: {cleaned_df.shape}")
# Last expression of the cell: the notebook displays the result.
cleaned_df

---
## 🎯 EXERCISE 3: Handle Missing Values

In [None]:
# TODO: Analyze missing values
# YOUR CODE HERE

# Placeholder: replace None with the missing-value summary. The code below
# calls `.plot(kind='bar')` on it and labels the y axis "Count", so it
# presumably should be a per-column count of nulls -- confirm.
missing_summary = None

print("Missing Values Summary:")
print(missing_summary)

# Visualize
# Fails while missing_summary is still None (None has no `.plot`).
missing_summary.plot(kind='bar', figsize=(10, 5))
plt.title('Missing Values by Column')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.show()

In [None]:
# TODO: Fill missing values with appropriate strategies:
# - customer_name: Fill with 'Unknown'
# - email: Fill with 'no-email@unknown.com'
# - age: Fill with median
# - salary: Fill with median

# YOUR CODE HERE

# Operates on cleaned_df from the duplicate-removal exercise, so that cell
# must have been completed and run first.
# Verification: every count printed here should be 0 for the four columns
# listed above once the fills are in place.
print(f"Missing values after filling:\n{cleaned_df.isna().sum()}")
cleaned_df

---
## 🎯 EXERCISE 4: Standardize Text Data

In [None]:
# TODO: Standardize customer_name:
# - Remove leading/trailing spaces
# - Remove extra spaces
# - Convert to title case

# YOUR CODE HERE

print("Standardized names:")
# Double brackets select a one-column DataFrame (not a Series), which the
# notebook renders as a table.
cleaned_df[['customer_name']]

In [None]:
# TODO: Standardize country names to uppercase
# YOUR CODE HERE

# value_counts() is case-sensitive, so this tally only merges spellings such
# as 'Vietnam' / 'vietnam' after the column has been upper-cased.
print("Country distribution:")
print(cleaned_df['country'].value_counts())

---
## 🎯 EXERCISE 5: Validate and Clean Email

In [None]:
# TODO: Create email validation function
import re

def is_valid_email(email):
    """Return True if *email* looks like a syntactically valid address.

    The check is a pragmatic ``local@domain.tld`` format test, not full
    RFC 5322 validation: the local part may contain letters, digits and
    ``._%+-``; the domain may contain letters, digits, dots and hyphens and
    must end in a dot followed by at least two letters.

    Args:
        email: Value to test. Anything that is not a ``str`` (``None``,
            ``NaN``, numbers, ...) is reported as invalid.

    Returns:
        bool: True when the whole string matches the pattern.
    """
    # Missing values reach this function as None / NaN; reject them
    # explicitly instead of relying on their str() form not matching.
    if not isinstance(email, str):
        return False
    pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
    # fullmatch, not match: with re.match the trailing `$` also matches just
    # before a final newline, so 'a@b.com\n' would be accepted.
    return re.fullmatch(pattern, email) is not None

# Test
test_emails = ['john@test.com', 'invalid-email', 'test@example', 'valid@test.co.uk']
for email in test_emails:
    print(f"{email}: {is_valid_email(email)}")

In [None]:
# TODO: Add email_valid column
# YOUR CODE HERE

# Placeholder: replace None with the per-row validation result (one boolean
# per email). The counts below use `.sum()` and `~`, which are only
# meaningful once the column actually holds booleans.
cleaned_df['email_valid'] = None

print("Email validation results:")
print(cleaned_df[['email', 'email_valid']])
# Summing a boolean column counts the True values; `~` inverts it first to
# count the False values.
print(f"\nValid emails: {cleaned_df['email_valid'].sum()}")
print(f"Invalid emails: {(~cleaned_df['email_valid']).sum()}")

---
## 🎯 EXERCISE 6: Handle Outliers

In [None]:
# TODO: Visualize age distribution
# YOUR CODE HERE

# One figure with two side-by-side axes: box plot on the left, histogram on
# the right, both drawn from cleaned_df['age'].
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Box plot
cleaned_df['age'].plot(kind='box', ax=axes[0])
axes[0].set_title('Age Distribution (Box Plot)')
axes[0].set_ylabel('Age')

# Histogram
cleaned_df['age'].plot(kind='hist', bins=20, ax=axes[1])
axes[1].set_title('Age Distribution (Histogram)')
axes[1].set_xlabel('Age')

plt.tight_layout()
plt.show()

# Numeric summary (count, mean, std, min, quartiles, max) to read alongside
# the plots.
print(f"Age statistics:\n{cleaned_df['age'].describe()}")

In [None]:
# TODO: Detect age outliers using the IQR method
# (values below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR, as in the solution
# cell at the end of this notebook).
# NOTE(review): the original note here also said "Outliers: age < 18 or
# age > 100". Those fixed limits are a separate rule and are not what the
# IQR formula produces -- confirm which definition the exercise intends.
# YOUR CODE HERE

# Calculate IQR
# Placeholders: replace each None with the computed value.
Q1 = None
Q3 = None
IQR = None

lower_bound = None
upper_bound = None

# The `.1f` format spec requires numbers, so this line fails until both
# bounds have been computed.
print(f"Age bounds: [{lower_bound:.1f}, {upper_bound:.1f}]")

# Filter outliers
# Placeholder: replace None with the rows of cleaned_df whose age falls
# outside the bounds; the code below takes its len() and selects two columns.
outliers = None
print(f"\nOutliers found: {len(outliers)}")
print(outliers[['customer_name', 'age']])

In [None]:
# TODO: Handle outliers - replace with median or remove
# YOUR CODE HERE

# The row count only changes if outliers are removed; replacing them with
# the median keeps the shape and changes only the statistics below.
print(f"Shape after handling outliers: {cleaned_df.shape}")
print(f"\nAge statistics after cleaning:\n{cleaned_df['age'].describe()}")

---
## 🎯 EXERCISE 7: Data Type Conversions

In [None]:
# TODO: Convert join_date to datetime
# Handle invalid dates
# YOUR CODE HERE

# Placeholder: as written this overwrites the whole join_date column with
# None. Replace the right-hand side with the converted datetime column
# before running the cell, otherwise the original date strings are lost.
cleaned_df['join_date'] = None

print(f"Data types after conversion:\n{cleaned_df.dtypes}")
print(f"\nJoin date range: {cleaned_df['join_date'].min()} to {cleaned_df['join_date'].max()}")

In [None]:
# TODO: Convert country to categorical
# YOUR CODE HERE

# Placeholder: as written this overwrites the country column with None.
# Replace the right-hand side with the categorical conversion; the `.cat`
# accessor used below is only available on a categorical column.
cleaned_df['country'] = None

print(f"Country categories: {cleaned_df['country'].cat.categories.tolist()}")
# NOTE(review): "before" is measured on dirty_data (the original frame,
# before any rows were dropped) and "after" on cleaned_df, so the two
# numbers may cover different row counts -- confirm this is acceptable.
print(f"Memory usage before: {dirty_data['country'].memory_usage(deep=True)} bytes")
print(f"Memory usage after: {cleaned_df['country'].memory_usage(deep=True)} bytes")

---
## 🎯 EXERCISE 8: Using DataCleaner Class

In [None]:
# TODO: Use DataCleaner class to clean the original dirty data
# YOUR CODE HERE

# DataCleaner is imported from the project's data_cleaner module in the
# setup cell; its API is not shown in this notebook.
# NOTE(review): dirty_data is passed directly -- whether DataCleaner copies
# it or modifies it in place is not visible here; confirm.
cleaner = DataCleaner(dirty_data)

# Method chain: cleaning steps go between `cleaner` and `.get_cleaned_data()`.
# Presumably each step returns the cleaner so calls can be chained -- verify
# against data_cleaner.py.
cleaned_with_class = (
    cleaner
    # Add your cleaning steps here
    .get_cleaned_data()
)

print("Cleaning Report:")
print(cleaner.get_cleaning_report())

print("\nCleaned Data:")
cleaned_with_class

---
## 🎯 CHALLENGE: Complete Data Cleaning Pipeline

In [None]:
# TODO: Create a complete cleaning function
def clean_customer_data(df):
    """Run the complete customer-data cleaning pipeline (exercise stub).

    As written, the function performs no cleaning and returns ``df``
    unchanged; the steps below describe what the finished implementation
    is expected to do.

    Steps:
    1. Remove duplicates
    2. Handle missing values
    3. Standardize text
    4. Validate emails
    5. Handle outliers
    6. Convert data types
    7. Add derived columns

    Args:
        df: Customer DataFrame to clean.

    Returns:
        The cleaned DataFrame (currently the same object that was passed in).
    """
    # YOUR CODE HERE

    return df

# Test your function
# Run the pipeline on a copy so dirty_data stays untouched for comparison.
final_cleaned = clean_customer_data(dirty_data.copy())

print(f"Original shape: {dirty_data.shape}")
print(f"Cleaned shape: {final_cleaned.shape}")
print("\nCleaned data info:")
# DataFrame.info() writes its report to stdout and returns None, so it must
# not be wrapped in print() -- that would add a stray "None" line.
final_cleaned.info()

# Last expression of the cell: the notebook displays the cleaned frame.
final_cleaned

---
## 🎯 REAL-WORLD EXERCISE: Clean Database Data

In [None]:
# TODO: Load customers from database and clean
# DatabaseConnector comes from the project's db_connector module (imported
# in the setup cell); it is constructed with no arguments here.
# NOTE(review): nothing in this cell closes or disposes the connection --
# check whether DatabaseConnector needs explicit cleanup.
db = DatabaseConnector()

# Load data
# NOTE(review): LIMIT without ORDER BY -- which 1000 rows come back is up to
# the database, so results may differ between runs; confirm that is fine.
customers = db.read_sql("SELECT * FROM analytics.customers LIMIT 1000")

# Same quick profile as for the sample data: row count, per-column null
# counts, and number of fully repeated rows.
print(f"Loaded {len(customers)} customers")
print(f"\nData quality issues:")
print(f"Missing values:\n{customers.isna().sum()}")
print(f"\nDuplicates: {customers.duplicated().sum()}")

# YOUR CLEANING CODE HERE

customers.head()

---
## 📚 SOLUTIONS (Uncomment to view)

In [1]:
# SOLUTION: Remove duplicates (add .copy() to avoid a pandas warning on the
# column assignments that follow)
cleaned_df = dirty_data.drop_duplicates(subset=['customer_id'], keep='first')
cleaned_df = cleaned_df.copy()

# SOLUTION: Fill missing values
# One fill value per column: fixed placeholders for the text columns, the
# column median for the numeric ones.
cleaned_df = cleaned_df.fillna({
    'customer_name': 'Unknown',
    'email': 'no-email@unknown.com',
    'age': cleaned_df['age'].median(),
    'salary': cleaned_df['salary'].median(),
})

# SOLUTION: Standardize text
# Names: trim the ends, collapse inner whitespace runs to one space, then
# title-case. Countries: upper-case.
cleaned_df['customer_name'] = (
    cleaned_df['customer_name']
    .str.strip()
    .str.replace(r'\s+', ' ', regex=True)
    .str.title()
)
cleaned_df['country'] = cleaned_df['country'].str.upper()

# SOLUTION: Validate email (define the function first)
import re

def is_valid_email(email):
    """Return True if *email* looks like a syntactically valid address.

    The check is a pragmatic ``local@domain.tld`` format test, not full
    RFC 5322 validation.

    Args:
        email: Value to test. Anything that is not a ``str`` (``None``,
            ``NaN``, numbers, ...) is reported as invalid.

    Returns:
        bool: True when the whole string matches the pattern.
    """
    # Reject missing / non-text values explicitly instead of relying on
    # their str() form not matching the pattern.
    if not isinstance(email, str):
        return False
    pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
    # fullmatch, not match: with re.match the trailing `$` also matches just
    # before a final newline, so 'a@b.com\n' would be accepted.
    return re.fullmatch(pattern, email) is not None

# Flag every address using the validator defined above.
cleaned_df['email_valid'] = cleaned_df['email'].apply(is_valid_email)

# SOLUTION: Handle outliers
# Keep only the rows whose age lies inside the 1.5 * IQR fences (inclusive).
Q1, Q3 = (cleaned_df['age'].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
cleaned_df = cleaned_df.loc[
    (cleaned_df['age'] >= lower_bound) & (cleaned_df['age'] <= upper_bound)
].copy()

# SOLUTION: Convert data types
# errors='coerce' turns unparseable date strings into NaT instead of raising.
cleaned_df['join_date'] = pd.to_datetime(cleaned_df['join_date'], errors='coerce')
cleaned_df['country'] = cleaned_df['country'].astype('category')

# Show data
cleaned_df.head()

NameError: name 'dirty_data' is not defined