## Defining Data Quality SLAs
### Data Completeness
**Description**: Set an SLA that ensures that at least 95% of the data fields in your dataset are filled (non-null values). Practice by checking a dataset of your choice and calculating its completeness.

In [None]:
# write your code from here
import pandas as pd

def check_data_completeness(df, threshold=0.95):
    """Report whether the share of non-null cells in ``df`` meets ``threshold``.

    Completeness is the number of non-null cells divided by the total number
    of cells. Counting cells directly (instead of averaging per-column
    averages) avoids accumulated floating-point rounding that can push a
    dataset sitting exactly on the threshold just below it.

    Args:
        df: pandas DataFrame (or Series) to inspect.
        threshold: Minimum acceptable fraction of non-null cells, 0..1.

    Returns:
        A human-readable string stating whether the SLA was met. A dataset
        with no cells cannot be evaluated and is reported as "NOT met".
    """
    total_cells = df.size
    if total_cells == 0:
        # Without this guard the ratio would be NaN and print as "nan%".
        return "Data completeness SLA NOT met: dataset has no data fields to evaluate."

    filled_cells = int(df.notnull().to_numpy().sum())
    completeness = filled_cells / total_cells

    if completeness >= threshold:
        return f"Data completeness SLA met: {completeness:.2%} ≥ {threshold:.2%}"
    return f"Data completeness SLA NOT met: {completeness:.2%} < {threshold:.2%}"

# Sample dataset: five rows, three columns, one missing value per column.
data = {
    'Name': [
        'Alice', 'Bob', None, 'David', 'Eva',
    ],
    'Age': [
        25, None, 30, 22, 28,
    ],
    'Email': [
        'alice@example.com',
        'bob@example.com',
        'charlie@example.com',
        None,
        'eva@example.com',
    ],
}

# Evaluate the sample against the default completeness threshold and show the verdict.
df = pd.DataFrame(data=data)
result = check_data_completeness(df)
print(result)


### Data Timeliness:
**Description**: Establish an SLA that specifies that data should be integrated and processed within 24 hours of acquisition. Monitor the data pipeline for timeliness.

In [None]:
# write your code from here
from datetime import datetime, timedelta

def check_data_timeliness(data_acquisition_time, processing_time, max_delay_hours=24):
    """Report whether data was processed within the allowed delay after acquisition.

    Both timestamps are parsed with ``pd.to_datetime`` and the delay is
    ``processing_time - data_acquisition_time``.

    Args:
        data_acquisition_time: When the data was acquired (anything
            ``pd.to_datetime`` accepts).
        processing_time: When processing finished (same accepted forms).
        max_delay_hours: Maximum allowed delay in hours; a delay exactly
            equal to this limit still meets the SLA.

    Returns:
        A human-readable string: SLA met, SLA NOT met, or an
        "Error in checking timeliness: ..." message when the inputs cannot be
        parsed, are missing, or are in the wrong order. The function reports
        failures as strings rather than raising.
    """
    try:
        acquisition_dt = pd.to_datetime(data_acquisition_time)
        processing_dt = pd.to_datetime(processing_time)
        delay = processing_dt - acquisition_dt

        # A missing timestamp (NaT) makes the delay NaT; comparing NaT is
        # always False and would otherwise be reported as a real SLA breach.
        if pd.isna(delay):
            return "Error in checking timeliness: acquisition or processing time is missing."

        # A negative delay means the timestamps are swapped or wrong; it must
        # not be counted as "processed in time".
        if delay < timedelta(0):
            return (
                "Error in checking timeliness: processing time "
                f"{processing_dt} precedes acquisition time {acquisition_dt}."
            )

        if delay <= timedelta(hours=max_delay_hours):
            return f"Data timeliness SLA met: processed within {delay}."
        return f"Data timeliness SLA NOT met: processing delayed by {delay}."
    except Exception as e:
        # Deliberately broad: callers rely on always getting a string back.
        return f"Error in checking timeliness: {str(e)}"

# Example: data acquired on 16 May at 10:00 and processed the next morning at 08:30.
data_acquisition_time = "2025-05-16 10:00:00"
processing_time = "2025-05-17 08:30:00"

# Check the pair against the default maximum delay and show the verdict.
result = check_data_timeliness(
    data_acquisition_time=data_acquisition_time,
    processing_time=processing_time,
)
print(result)


### Data Consistency:
**Description**: Define an SLA for maintaining consistency across various related datasets. Implement a check to ensure that 99% of data entries are consistent.

In [None]:
# write your code from here
import pandas as pd

def check_data_consistency(df1, df2, key_columns, threshold=0.99):
    """Report whether two datasets agree on their key values often enough.

    The frames are outer-merged on ``key_columns``. A merged row counts as
    consistent when its key is present in both frames; the ratio of such rows
    to all merged rows is compared with ``threshold``. Only key presence is
    checked: values in non-key columns are not compared.

    Args:
        df1: First pandas DataFrame.
        df2: Second pandas DataFrame.
        key_columns: Column label, or iterable of column labels, that
            identifies an entry in both frames.
        threshold: Minimum acceptable fraction of consistent rows, 0..1.

    Returns:
        A human-readable string: SLA met, SLA NOT met, or an error message
        when key columns are missing or the check itself fails. The function
        reports failures as strings rather than raising.
    """
    try:
        # A bare string would otherwise be iterated character by character
        # and reported as missing; a one-shot iterable would be exhausted by
        # the first membership check. Normalise to a list once.
        if isinstance(key_columns, str):
            keys = [key_columns]
        else:
            keys = list(key_columns)

        if not all(col in df1.columns and col in df2.columns for col in keys):
            return "Error: One or more key columns missing in the datasets."

        # The indicator column (_merge) tells which side each row came from.
        merged = df1.merge(df2, on=keys, how='outer', indicator=True)

        # Rows whose key exists in both frames are the consistent ones.
        consistent_count = (merged['_merge'] == 'both').sum()
        total_count = len(merged)

        consistency_ratio = consistent_count / total_count if total_count > 0 else 0

        if consistency_ratio >= threshold:
            return f"Data consistency SLA met: {consistency_ratio:.2%} consistent entries."
        return f"Data consistency SLA NOT met: only {consistency_ratio:.2%} consistent entries."
    except Exception as e:
        # Deliberately broad: callers rely on always getting a string back.
        return f"Error during consistency check: {str(e)}"


# Example usage: both frames contain IDs 1-3; ID 4 appears only in df1 and ID 5 only in df2.
key_cols = ['ID']
df1 = pd.DataFrame({
    'ID': [1, 2, 3, 4],
    'Value': [10, 20, 30, 40],
})
df2 = pd.DataFrame({
    'ID': [1, 2, 3, 5],
    'Value': [10, 20, 30, 50],
})

# Compare the two frames on the key column and show the verdict.
result = check_data_consistency(df1, df2, key_columns=key_cols)
print(result)
