# Setup: Generate Sample Dataset

This cell creates the required folder structure (`data/raw/` and `data/processed/`) relative to the notebook, and generates the sample CSV dataset with missing values. 
This ensures the dataset is ready for cleaning functions and saves it to `data/raw/sample_data.csv`.

In [7]:
import os
import pandas as pd
import numpy as np

# Define folder paths relative to this notebook
raw_dir = '../data/raw'
processed_dir = '../data/processed'

# Create folders if they don't exist
os.makedirs(raw_dir, exist_ok=True)
os.makedirs(processed_dir, exist_ok=True)

# Define the sample data
data = {
    'age': [34, 45, 29, 50, 38, np.nan, 41],
    'income': [55000, np.nan, 42000, 58000, np.nan, np.nan, 49000],
    'score': [0.82, 0.91, np.nan, 0.76, 0.88, 0.65, 0.79],
    'zipcode': ['90210', '10001', '60614', '94103', '73301', '12345', '94105'],
    'city': ['Beverly', 'New York', 'Chicago', 'SF', 'Austin', 'Unknown', 'San Francisco'],
    'extra_data': [np.nan, 42, np.nan, np.nan, np.nan, 5, np.nan]
}
# Create DataFrame
df = pd.DataFrame(data)

# Save to CSV in raw data folder
csv_path = os.path.join(raw_dir, 'sample_data.csv')
if not os.path.exists(csv_path):
    df.to_csv(csv_path, index=False)
    print(f'Sample dataset created and saved to {csv_path}')
else:
    print(f'File already exists at {csv_path}. Skipping CSV creation to avoid overwrite.')


File already exists at ../data/raw\sample_data.csv. Skipping CSV creation to avoid overwrite.


# Homework Starter — Stage 6: Data Preprocessing
Use this notebook to apply your cleaning functions and save processed data.

In [26]:
import sys
import os
import pandas as pd

# Determine project root
try:
    # Works if running as a script
    project_root = os.path.abspath(os.path.dirname(__file__))
except NameError:
    # Works in Jupyter notebooks
    project_root = os.path.abspath(os.getcwd())

# Ensure project root is in sys.path
if project_root not in sys.path:
    sys.path.append(project_root)

# Import cleaning function
from src.cleaning import clean_data

# Load CSV
file_path = os.path.join(project_root, 'data/raw/sample_data.csv')
try:
    df = pd.read_csv(file_path)
    print("Data loaded successfully!")
except FileNotFoundError:
    print(f"Error: File not found at {file_path}")
    exit(1)

# Show original data
print("Original Data:")
print(df.head())

# Clean data
df_cleaned = clean_data(df)
print("\nCleaned Data:")
print(df_cleaned.head())

# Optionally save cleaned data
cleaned_file = os.path.join(project_root, 'data/raw/sample_data_cleaned.csv')
df_cleaned.to_csv(cleaned_file, index=False)
print(f"\nCleaned data saved to {cleaned_file}")


Data loaded successfully!
Original Data:
    age   income  score  zipcode      city  extra_data
0  34.0  55000.0   0.82    90210   Beverly         NaN
1  45.0      NaN   0.91    10001  New York        42.0
2  29.0  42000.0    NaN    60614   Chicago         NaN
3  50.0  58000.0   0.76    94103        SF         NaN
4  38.0      NaN   0.88    73301    Austin         NaN

Cleaned Data:
    age   income  score  zipcode      city  extra_data
0  34.0  55000.0  0.820    90210   beverly        23.5
1  45.0  52000.0  0.910    10001  new york        42.0
2  29.0  42000.0  0.805    60614   chicago        23.5
3  50.0  58000.0  0.760    94103        sf        23.5
4  38.0  52000.0  0.880    73301    austin        23.5

Cleaned data saved to c:\Users\kavin\bootcamp_Kavin_Dhanasekaran\homework\homework6\data/raw/sample_data_cleaned.csv


## Load Raw Dataset

## Apply Cleaning Functions

In [27]:
def fill_missing_median(df, columns):
    """
    Fills missing values in specified columns with the column median.
    
    Parameters:
        df (DataFrame): Input pandas DataFrame.
        columns (list): List of column names to fill.
    Returns:
        DataFrame: DataFrame with missing values filled.
    """
    for col in columns:
        if col in df.columns:
            median = df[col].median()
            df[col].fillna(median, inplace=True)
    return df


In [29]:
def drop_missing(df, threshold=0.5):
    """
    Drops rows with a proportion of missing values exceeding the threshold.
    """
    return df[df.isnull().mean(axis=1) <= threshold]


In [30]:
def normalize_data(df, columns):
    """
    Normalizes specified columns using min-max scaling.
    """
    for col in columns:
        if col in df.columns:
            min_val = df[col].min()
            max_val = df[col].max()
            # Avoid division by zero
            if pd.notnull(min_val) and pd.notnull(max_val) and max_val != min_val:
                df[col] = (df[col] - min_val) / (max_val - min_val)
    return df

In [46]:

import pandas as pd

def drop_missing(df: pd.DataFrame, how: str = 'all', axis: int = 0, threshold: float = None) -> pd.DataFrame:
    """
    Drop rows or columns with missing values.

    Parameters:
        df: pd.DataFrame
        how: 'all' or 'any' (used if threshold is None)
        axis: 0 = rows, 1 = columns
        threshold: float (0-1), minimum proportion of non-NA values required to keep row/column

    Returns:
        pd.DataFrame with rows/columns dropped
    """
    if threshold is not None:
        min_count = int((1 - threshold) * df.shape[axis])
        return df.dropna(axis=axis, thresh=min_count)
    else:
        return df.dropna(how=how, axis=axis)


def fill_missing_median(df: pd.DataFrame, columns=None) -> pd.DataFrame:
    """
    Fill missing numeric values with the median of their column.
    
    Parameters:
        df: pd.DataFrame
        columns: list of numeric columns to fill. If None, fill all numeric columns.
    """
    if columns is None:
        columns = [col for col in df.columns if pd.api.types.is_numeric_dtype(df[col])]
    
    for col in columns:
        if col in df.columns and pd.api.types.is_numeric_dtype(df[col]):
            df[col] = df[col].fillna(df[col].median())
    
    return df


def normalize_data(df: pd.DataFrame, columns=None) -> pd.DataFrame:
    """
    Normalize string/categorical columns by stripping whitespace
    and converting to lowercase. Fill missing string values with 'Unknown'.
    
    Parameters:
        df: pd.DataFrame
        columns: list of columns to normalize. If None, normalize all non-numeric columns.
    """
    if columns is None:
        columns = [col for col in df.columns if not pd.api.types.is_numeric_dtype(df[col])]
    
    for col in columns:
        if col in df.columns:
            df[col] = df[col].fillna('Unknown').astype(str).str.strip().str.lower()
    
    return df


def remove_duplicates(df: pd.DataFrame) -> pd.DataFrame:
    """
    Remove duplicate rows from the DataFrame.
    """
    return df.drop_duplicates()


def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Full cleaning pipeline:
      1. Drop rows where all values are missing
      2. Remove duplicates
      3. Fill missing numeric values
      4. Normalize text columns
    """
    df = drop_missing(df, how='all')
    df = remove_duplicates(df)
    df = fill_missing_median(df)
    df = normalize_data(df)
    return df


In [40]:
import pandas as pd

# Load CSV
df = pd.read_csv(r"C:\Users\kavin\bootcamp_Kavin_Dhanasekaran\homework\homework6\data\raw\sample_data.csv")

# Individual operations
df = drop_missing(df, how='all')
df = remove_duplicates(df)
df = fill_missing_median(df)
df = normalize_data(df)

# Or run full pipeline
df_cleaned = clean_data(df)

print(df_cleaned.head())



    age   income  score  zipcode      city  extra_data
0  34.0  55000.0  0.820    90210   beverly        23.5
1  45.0  52000.0  0.910    10001  new york        42.0
2  29.0  42000.0  0.805    60614   chicago        23.5
3  50.0  58000.0  0.760    94103        sf        23.5
4  38.0  52000.0  0.880    73301    austin        23.5


In [None]:
# TODO: Apply your functions here
# Example:
# df = cleaning.fill_missing_median(df, ['col1','col2'])
# df = cleaning.drop_missing(df, threshold=0.5)
# df = cleaning.normalize_data(df, ['col1','col2'])

## Save Cleaned Dataset

In [None]:
# df.to_csv('../data/processed/sample_data_cleaned.csv', index=False)

In [50]:
import os

# Ensure the output directory exists
output_dir = '../data/processed'
os.makedirs(output_dir, exist_ok=True)  # Creates the folder if it doesn't exist

# Save processed dataset
output_file = os.path.join(output_dir, 'sample_data_cleaned.csv')
df.to_csv(output_file, index=False)
print(f"Data cleaning and processing complete. Cleaned data saved to '{output_file}'.")


Data cleaning and processing complete. Cleaned data saved to '../data/processed\sample_data_cleaned.csv'.
