### Task 1: Understanding and Defining Data Quality Metrics
**Description**: Learn how to define basic data quality metrics such as completeness, validity, and uniqueness for a simple dataset.

**Steps**:
1. Dataset: Use a CSV with columns such as `Name`, `Email`, and `Age`.
2. Metric Definitions:
    - Completeness: Percentage of non-null values.
    - Validity: Percentage of email fields containing `@`.
    - Uniqueness: Number of distinct entries in the `Email` column.

In [None]:
# Write your code from here
import pandas as pd

def calculate_completeness(df):
    """Return the percentage of non-null values for every column.

    Args:
        df: DataFrame to inspect.

    Returns:
        A dict mapping each column name to its non-null percentage.
    """
    non_null_share = df.notna().mean()
    return non_null_share.mul(100).to_dict()

def calculate_validity_email(df, email_col='Email'):
    """Return the percentage of rows whose email value contains '@'.

    Missing values count as invalid because the denominator is the total
    number of rows, not the number of non-null emails.

    Args:
        df: DataFrame to inspect.
        email_col: Name of the column holding email addresses.

    Returns:
        The percentage as a float, or None when ``email_col`` is not a
        column of ``df``. A DataFrame with no rows yields 0.0.
    """
    if email_col not in df.columns:
        return None
    total_rows = len(df)
    if total_rows == 0:
        # Nothing to validate; avoid dividing by zero.
        return 0.0
    # Literal (non-regex) substring test on the string form of each value.
    has_at = df[email_col].dropna().astype(str).str.contains('@', regex=False)
    return (float(has_at.sum()) / total_rows) * 100

def calculate_uniqueness(df, col):
    """Count the distinct non-null values in a column.

    Args:
        df: DataFrame to inspect.
        col: Name of the column to count distinct values in.

    Returns:
        The number of distinct values, or None when ``col`` is not a column
        of ``df``.
    """
    return df[col].nunique() if col in df.columns else None

if __name__ == "__main__":
    # Load the sample dataset and report the three basic quality metrics.
    df = pd.read_csv("data.csv")

    completeness = calculate_completeness(df)
    validity = calculate_validity_email(df)
    uniqueness = calculate_uniqueness(df, 'Email')

    # calculate_validity_email returns None when the Email column is absent;
    # formatting None with ".2f" raises TypeError, so report "N/A" instead.
    validity_text = "N/A" if validity is None else f"{validity:.2f}%"

    print(f"Completeness (% non-null per column): {completeness}")
    print(f"Validity (% emails containing '@'): {validity_text}")
    print(f"Uniqueness (distinct emails count): {uniqueness}")


### Task 2: Calculating Data Quality Score
**Description**: Aggregate multiple metrics to calculate an overall data quality score.

**Steps**:
1. Formula: Simple average of the metrics defined in Task 1, each expressed as a percentage (uniqueness becomes distinct values divided by total rows).

In [None]:
# Write your code from here
import pandas as pd

def calculate_completeness(df):
    """Return the mean non-null percentage across all columns.

    Args:
        df: DataFrame to inspect.

    Returns:
        The average of the per-column non-null percentages as a float.
        A DataFrame with no rows or no columns yields 0.0 rather than NaN,
        so the value can be averaged with the other metrics.
    """
    if df.empty:
        # mean() over an empty axis is NaN, which would poison the overall score.
        return 0.0
    per_column = df.notnull().mean() * 100
    return float(per_column.mean())

def calculate_validity_email(df, email_col='Email'):
    """Return the percentage of rows whose email value contains '@'.

    Missing values count as invalid because the denominator is the total
    number of rows, not the number of non-null emails.

    Args:
        df: DataFrame to inspect.
        email_col: Name of the column holding email addresses.

    Returns:
        The percentage as a float. Returns 0.0 when ``email_col`` is not a
        column of ``df`` or when ``df`` has no rows.
    """
    if email_col not in df.columns:
        return 0.0
    total_rows = len(df)
    if total_rows == 0:
        # Nothing to validate; avoid dividing by zero.
        return 0.0
    # Literal (non-regex) substring test on the string form of each value.
    has_at = df[email_col].dropna().astype(str).str.contains('@', regex=False)
    return (float(has_at.sum()) / total_rows) * 100

def calculate_uniqueness(df, col):
    """Return distinct values in a column as a percentage of total rows.

    Args:
        df: DataFrame to inspect.
        col: Name of the column to measure.

    Returns:
        Distinct non-null values divided by the row count, times 100.
        Returns 0.0 when ``col`` is not a column of ``df`` or when ``df``
        has no rows.
    """
    if col not in df.columns:
        return 0.0
    n_rows = len(df)
    if n_rows <= 0:
        return 0.0
    distinct = df[col].nunique()
    return (distinct / n_rows) * 100

def calculate_overall_dqi(df):
    """Return the overall data quality score for a DataFrame.

    The score is the unweighted mean of three percentages: completeness,
    email validity, and uniqueness of the 'Email' column.

    Args:
        df: DataFrame to score.

    Returns:
        The average of the three metric values.
    """
    metrics = (
        calculate_completeness(df),
        calculate_validity_email(df),
        calculate_uniqueness(df, 'Email'),
    )
    return sum(metrics) / len(metrics)

if __name__ == "__main__":
    # Score the sample dataset and print the result with two decimals.
    df = pd.read_csv("data.csv")
    overall_dqi = calculate_overall_dqi(df)
    report_line = f"Overall Data Quality Score: {overall_dqi:.2f}%"
    print(report_line)


### Task 3: Creating Expectations for a CSV
**Description**: Develop basic data quality expectations using Great Expectations.

**Steps**:
1. Expectation Suite
2. Define Expectations for Completeness

In [None]:
# Write your code from here
import great_expectations as ge

# Load the CSV as a legacy Great Expectations pandas dataset.
df = ge.read_csv("data.csv")

# The suite is persisted to a file with exactly this name. The legacy dataset
# API loads a string passed to validate(expectation_suite=...) as a path to a
# JSON suite file, not as a registered suite name.
suite_name = "basic_completeness_suite"

# Completeness expectations: every listed column must be free of nulls.
# Each call is recorded on the dataset's own expectation suite.
columns_to_check = ['Name', 'Email', 'Age']
for col in columns_to_check:
    df.expect_column_values_to_not_be_null(column=col)

# Write the recorded expectations to disk. Failed expectations are kept so
# that validation reports them instead of silently dropping them.
df.save_expectation_suite(filepath=suite_name, discard_failed_expectations=False)

# Validate the dataset against the suite file that was just written.
results = df.validate(expectation_suite=suite_name)

print(results)


### Task 4: Running and Validating Expectations
**Description**: Run the created expectations and generate an output report.

**Steps**:
1. Validate
2. Generate HTML Report

In [None]:
# Write your code from here
import great_expectations as ge
from great_expectations.render.renderer import ValidationResultsPageRenderer
from great_expectations.render.view import DefaultJinjaPageView
from great_expectations.data_context import DataContext

# Load the CSV as a legacy Great Expectations pandas dataset.
df = ge.read_csv("data.csv")

# Validate against the saved suite. A string passed as expectation_suite is
# loaded as a path to a JSON suite file, so a file with this name must exist.
suite_name = "basic_completeness_suite"
results = df.validate(expectation_suite=suite_name)

# Two-step rendering: the page renderer turns the validation result into a
# document model, and the Jinja view turns that model into an HTML string.
document_model = ValidationResultsPageRenderer().render(results)
html_report = DefaultJinjaPageView().render(document_model)

# Save the report; an explicit encoding keeps the output independent of the
# platform's default locale.
with open("validation_report.html", "w", encoding="utf-8") as f:
    f.write(html_report)

print("Validation complete. Report saved as validation_report.html")


### Task 5: Automating Data Quality Score Calculation
**Description**: Automate the data quality score via a script that integrates with Great
Expectations.

In [None]:
# Write your code from here
import great_expectations as ge

def calculate_data_quality_score(csv_file, suite_name):
    """Validate a CSV file and return the share of expectations that passed.

    Args:
        csv_file: Path of the CSV file to load with ``ge.read_csv``.
        suite_name: Value forwarded to ``validate`` as ``expectation_suite``.
            NOTE(review): the legacy dataset API appears to treat a string
            here as a path to a suite JSON file - confirm against the
            installed Great Expectations version.

    Returns:
        The percentage of expectation results whose 'success' entry is
        truthy, or 0.0 when the validation produced no results.
    """
    dataset = ge.read_csv(csv_file)
    outcome = dataset.validate(expectation_suite=suite_name)

    checks = outcome['results']
    total = len(checks)
    if total <= 0:
        return 0.0
    passed = sum(check['success'] for check in checks)
    return (passed / total) * 100

# Example usage: score the sample CSV against the completeness suite.
csv_path, expectation_suite = "data.csv", "basic_completeness_suite"

score = calculate_data_quality_score(csv_file=csv_path, suite_name=expectation_suite)
print(f"Automated Data Quality Score: {score:.2f}%")


### Task 6: Leveraging Data Quality Metrics for Automated Data Cleaning
**Description**: Implement a system where if data quality metrics fall below a threshold,
automated data cleaning scripts are triggered.

**Steps**:
1. Define Cleaning Logic
2. Integrate with Great Expectations:
    - Use an action within the Great Expectations action list that only triggers if quality score is below a threshold, automating the cleaning.

In [None]:
# Write your code from here
import great_expectations as ge
import pandas as pd

def clean_data(df):
    """Return a copy of ``df`` without the rows that contain any null value.

    Args:
        df: DataFrame to clean.

    Returns:
        A new DataFrame holding only the rows with no missing values.
    """
    return df.dropna(axis=0, how='any')

def calculate_data_quality_score(results):
    """Return the percentage of expectation results that succeeded.

    Args:
        results: Mapping whose 'results' entry is a sequence of items, each
            exposing a 'success' entry.

    Returns:
        Successful items divided by all items, times 100, or 0.0 when the
        sequence is empty.
    """
    checks = results['results']
    total = len(checks)
    if total <= 0:
        return 0.0
    passed = sum(check['success'] for check in checks)
    return (passed / total) * 100

def automate_cleaning_if_needed(csv_file, suite_name, threshold=90):
    """Validate a CSV file and clean it in place when its score is too low.

    The file is loaded, validated with Great Expectations, and scored. When
    the score is below ``threshold`` the data is passed through
    ``clean_data`` and written back to ``csv_file``, overwriting it.

    Args:
        csv_file: Path of the CSV file to read and, if needed, overwrite.
        suite_name: Value forwarded to ``validate`` as ``expectation_suite``.
            NOTE(review): the legacy dataset API appears to treat a string
            here as a path to a suite JSON file - confirm against the
            installed Great Expectations version.
        threshold: Minimum acceptable score, as a percentage.
    """
    raw = pd.read_csv(csv_file)
    outcome = ge.from_pandas(raw).validate(expectation_suite=suite_name)
    dqi_score = calculate_data_quality_score(outcome)

    print(f"Current Data Quality Score: {dqi_score:.2f}%")
    if not dqi_score < threshold:
        print("Data quality above threshold. No cleaning needed.")
        return

    print("Data quality below threshold. Triggering automated cleaning...")
    clean_data(raw).to_csv(csv_file, index=False)
    print("Data cleaning completed and saved.")

# Example usage: clean data.csv in place if fewer than 90% of the suite's
# expectations pass.
csv_path, expectation_suite, quality_threshold = (
    "data.csv",
    "basic_completeness_suite",
    90,
)

automate_cleaning_if_needed(csv_path, expectation_suite, threshold=quality_threshold)
