In [1]:
import pandas as pd
import numpy as np

# Sample dataset for demonstration
data = {
    'CustomerID': [1, 2, 3, 4, 4],
    'Name': ['Alice', 'Bob', 'Charlie', None, 'David'],
    'Email': ['alice@example.com', 'bob@example.com', 'charlie@example', None, 'david@example.com'],
    'JoinDate': ['2023-01-15', '2022-12-01', '2023-03-10', 'invalid_date', '2023-03-10'],
    'PurchaseAmount': [120.50, 80.0, None, 40.25, 40.25]
}
df = pd.DataFrame(data)

# Task 1: Completeness, Uniqueness, Consistency
def data_quality_score_task1(df):
    total_cells = df.size
    missing_cells = df.isnull().sum().sum()
    completeness_score = 1 - (missing_cells / total_cells)

    total_rows = len(df)
    unique_rows = len(df.drop_duplicates())
    uniqueness_score = unique_rows / total_rows

    email_valid = df['Email'].apply(lambda x: isinstance(x, str) and '@' in x and '.' in x)
    date_valid = pd.to_datetime(df['JoinDate'], errors='coerce').notna()
    consistency_score = (email_valid & date_valid).sum() / total_rows

    overall_score = (completeness_score + uniqueness_score + consistency_score) / 3
    return {
        "Completeness": round(completeness_score, 2),
        "Uniqueness": round(uniqueness_score, 2),
        "Consistency": round(consistency_score, 2),
        "Overall Score": round(overall_score, 2)
    }

# Task 2: Accuracy, Timeliness, Integrity
def data_quality_score_task2(df):
    accuracy_score = df['PurchaseAmount'].dropna().apply(lambda x: x >= 0).mean()

    df['JoinDateParsed'] = pd.to_datetime(df['JoinDate'], errors='coerce')
    recent_date_cutoff = pd.Timestamp.now() - pd.DateOffset(years=2)
    timeliness_score = (df['JoinDateParsed'] >= recent_date_cutoff).mean()

    integrity_score = df['CustomerID'].is_unique

    overall_score = (accuracy_score + timeliness_score + integrity_score) / 3
    return {
        "Accuracy": round(accuracy_score, 2),
        "Timeliness": round(timeliness_score, 2),
        "Integrity": round(integrity_score, 2),
        "Overall Score": round(overall_score, 2)
    }

# Task 3: Validity, Precision, Accessibility
def data_quality_score_task3(df):
    validity_score = df['PurchaseAmount'].dropna().apply(lambda x: isinstance(x, (float, int)) and x >= 0).mean()

    precision_score = df['PurchaseAmount'].dropna().apply(lambda x: len(str(x).split('.')[-1]) == 2).mean()

    accessibility_score = df[['Name', 'Email']].notnull().all(axis=1).mean()

    overall_score = (validity_score + precision_score + accessibility_score) / 3
    return {
        "Validity": round(validity_score, 2),
        "Precision": round(precision_score, 2),
        "Accessibility": round(accessibility_score, 2),
        "Overall Score": round(overall_score, 2)
    }

# Run and print results
print("Task 1 - Completeness, Uniqueness, Consistency:", data_quality_score_task1(df))
print("Task 2 - Accuracy, Timeliness, Integrity:", data_quality_score_task2(df))
print("Task 3 - Validity, Precision, Accessibility:", data_quality_score_task3(df))


Task 1 - Completeness, Uniqueness, Consistency: {'Completeness': np.float64(0.88), 'Uniqueness': 1.0, 'Consistency': np.float64(0.6), 'Overall Score': np.float64(0.83)}
Task 2 - Accuracy, Timeliness, Integrity: {'Accuracy': np.float64(1.0), 'Timeliness': np.float64(0.0), 'Integrity': 0, 'Overall Score': np.float64(0.33)}
Task 3 - Validity, Precision, Accessibility: {'Validity': np.float64(1.0), 'Precision': np.float64(0.5), 'Accessibility': np.float64(0.8), 'Overall Score': np.float64(0.77)}
