# Measuring Completeness

**Activity Overview**: Evaluate data completeness by checking missing data rates and handling partially available records.

## Title: Customer Profiles

**Task**: Calculate the missing data rate for customer profiles.

**Steps**:
1. List all required fields for a complete customer profile (e.g., name, address, email, phone number).
2. Analyze the dataset to count how many profiles have missing fields.
3. Calculate the percentage of missing values for each required field, and the percentage of profiles that have at least one missing field.

In [2]:
import pandas as pd
import os

def calculate_missing_data_rate(file_path, required_fields):
    """
    Calculates record-level and field-level missing data rates
    from a CSV file containing customer profiles.

    A value counts as missing when pandas reports it as null after
    reading the CSV with the default `read_csv` settings.

    Parameters:
        file_path (str): Path to the CSV file.
        required_fields (list): Required column names. Any iterable of
            names is accepted; a single string is treated as one name.

    Returns:
        dict: Missing data stats, using plain Python values:
            'total_profiles' (int): Number of rows read from the file.
            'incomplete_profiles' (int): Rows with at least one required
                field missing.
            'record_missing_rate' (float): Percentage of incomplete rows.
            'field_missing_rates' (dict): Column name -> percentage of
                rows in which that column is missing.

    Raises:
        FileNotFoundError: If file_path is not an existing file.
        ValueError: If the CSV cannot be read, the resulting DataFrame
            is empty, or a required column is absent.
    """
    # Check if file exists
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    try:
        df = pd.read_csv(file_path)
    except Exception as e:
        # Chain the cause so the underlying pandas error stays attached.
        raise ValueError(f"Error reading CSV file: {e}") from e

    # Check if DataFrame is empty
    if df.empty:
        raise ValueError("The CSV file is empty.")

    # Normalize to a list: a bare string would otherwise be iterated
    # character by character, and a tuple would be treated by pandas as a
    # single column key instead of a selection of columns.
    if isinstance(required_fields, str):
        required_fields = [required_fields]
    else:
        required_fields = list(required_fields)

    # Check if all required columns exist
    missing_columns = [col for col in required_fields if col not in df.columns]
    if missing_columns:
        raise ValueError(f"Missing required columns in data: {missing_columns}")

    # Compute the null mask once and derive both metrics from it.
    missing_mask = df[required_fields].isnull()
    total_profiles = len(df)
    # Convert the numpy scalar to a built-in int so the result can be
    # serialized (e.g. with json.dumps).
    incomplete_profiles = int(missing_mask.any(axis=1).sum())
    record_missing_rate = (incomplete_profiles / total_profiles) * 100
    field_missing_rates = missing_mask.mean() * 100

    return {
        'total_profiles': total_profiles,
        'incomplete_profiles': incomplete_profiles,
        'record_missing_rate': record_missing_rate,
        'field_missing_rates': {
            field: float(rate) for field, rate in field_missing_rates.items()
        },
    }

# Example usage:
if __name__ == "__main__":
    # Demo run: report completeness stats for a sample CSV that is
    # expected to sit in the current working directory.
    file = 'customer_data.csv'
    fields = ['Name', 'Address', 'Email', 'Phone Number']

    try:
        results = calculate_missing_data_rate(file, fields)

        # Assemble the report lines first, then emit them in order.
        report = [
            f"Total Profiles: {results['total_profiles']}",
            f"Incomplete Profiles: {results['incomplete_profiles']}",
            f"Record-Level Missing Rate: {results['record_missing_rate']:.2f}%\n",
            "Field-Level Missing Rates:",
        ]
        report += [
            f"  {column}: {percent:.2f}%"
            for column, percent in results['field_missing_rates'].items()
        ]
        for line in report:
            print(line)
    except Exception as e:
        # Script boundary: show any failure as a one-line message.
        print(f"Error: {e}")


Error: File not found: customer_data.csv
