In [None]:
# Make sure to run this in a terminal or Jupyter Notebook environment

# -----------------------------------
# 19. Setting Up Expectations
# -----------------------------------
# Step 1: Install Great Expectations (if not already installed).
# Use the %pip magic in a notebook so the package lands in the kernel's own environment:
# %pip install great_expectations

import great_expectations as ge
import pandas as pd

# Step 2: Create a sample dataset
# Five rows; "age" and "salary" each contain exactly one missing value (None),
# so the null-related checks below have something to report.
df = pd.DataFrame({
    "age": [25, 30, 45, None, 22],
    "salary": [50000, 60000, 80000, 55000, None]
})

# Convert to GE DataFrame
# NOTE(review): ge.from_pandas belongs to the legacy Pandas dataset API — confirm
# the installed Great Expectations version still provides it.
ge_df = ge.from_pandas(df)

# Step 3: Create expectations
# Each expect_* call is made on ge_df itself; the validate() calls below are
# presumably evaluating everything registered so far — verify against the GE docs.
# "age" holds one None, so the not-null expectation is expected to report a
# failure on this sample — presumably intentional for the demo.
ge_df.expect_column_values_to_not_be_null("age")
ge_df.expect_column_values_to_be_between("age", min_value=18, max_value=65)

# Step 4: Validate data
# Validate the frame and print the result object as-is.
results = ge_df.validate()
print("Validation Results:\n", results)

# -----------------------------------
# 20. Testing for Expectation
# -----------------------------------
# Add an expectation: salary must lie between 30000 and 100000.
# NOTE(review): no strict_min/strict_max is passed, so the bounds are presumably
# inclusive (30000 and 100000 themselves pass) — confirm against the GE docs.
ge_df.expect_column_values_to_be_between("salary", min_value=30000, max_value=100000)

# Revalidate with the new expectation
# `results` is rebound here; the first validation result is no longer reachable
# by name after this line.
results = ge_df.validate()
print("\nValidation After Adding Salary Range Expectation:\n", results)

# -----------------------------------
# 21. Generating Data Docs
# -----------------------------------
# Initialize GE project (run this in your terminal inside your project directory;
# prefix the command with "!" only if you run it from a notebook cell instead):
# great_expectations init

# Save your expectations to a suite (in terminal or script)
# Example command:
# great_expectations suite new

# To manually generate docs after validation:
# !great_expectations docs build

# Data Docs will be saved in: great_expectations/uncommitted/data_docs/local_site/index.html

# Optional: View your docs
# import webbrowser
# webbrowser.open("great_expectations/uncommitted/data_docs/local_site/index.html")


In [None]:
import pandas as pd

# Load sample dataset
# NOTE(review): 'your_dataset.csv' looks like a placeholder path — point it at a
# real CSV before running; pd.read_csv raises if the file cannot be found.
# This rebinds the module-level name `df`, which generate_report(df) consumes
# at the bottom of this cell.
df = pd.read_csv('your_dataset.csv')

# 1. Data Profiling: Basic Summary
def profile_data(df):
    """Build a basic profile of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to summarise.

    Returns
    -------
    dict
        A dictionary with the keys:

        - ``'num_rows'``: number of rows (``int``).
        - ``'num_columns'``: number of columns (``int``).
        - ``'missing_per_column'``: mapping of column label to its count of
          null values (``int``).
        - ``'duplicates_count'``: number of rows flagged by
          ``df.duplicated()``, i.e. repeats of an earlier row; the first
          occurrence is not counted (``int``).
        - ``'column_types'``: mapping of column label to dtype name (``str``).

        All counts are plain Python ints so the profile can be passed
        straight to ``json.dumps`` or compared without NumPy scalar types.
    """
    missing_counts = df.isnull().sum()
    profile = {
        'num_rows': len(df),
        'num_columns': len(df.columns),
        # Explicit int() keeps the values plain Python ints on every pandas version.
        'missing_per_column': {col: int(count) for col, count in missing_counts.items()},
        # Series.sum() returns a NumPy integer, which json.dumps rejects; cast it.
        'duplicates_count': int(df.duplicated().sum()),
        'column_types': df.dtypes.astype(str).to_dict(),
    }
    return profile

# 2. Quality Rule: Check for duplicates in key columns
def check_duplicates(df, subset_cols=None):
    """Return every row that has at least one duplicate.

    Rows are compared on ``subset_cols`` (all columns when ``None``). Because
    ``keep=False`` is used, each member of a duplicate group is returned,
    including its first occurrence.
    """
    is_repeated = df.duplicated(subset=subset_cols, keep=False)
    return df.loc[is_repeated]

# 3. Quality Rule: Missing value threshold per column
def check_missing_threshold(df, threshold=0.1):
    """List the columns whose fraction of null values is strictly above ``threshold``.

    The fraction is the per-column mean of ``df.isnull()``; a column sitting
    exactly at the threshold is not reported. Columns are returned in the
    frame's column order.
    """
    null_share = df.isnull().mean()
    return [col for col, share in null_share.items() if share > threshold]

# 4. Format Enforcement: Example regex check for email column
import re

def check_email_format(df, column='email'):
    """Find values in ``column`` that do not look like an e-mail address.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    column : str, default 'email'
        Name of the column holding e-mail addresses.

    Returns
    -------
    list of tuple
        ``(index_label, value)`` pairs for every non-null value that fails the
        format check, in row order. An empty list is returned when ``column``
        is not present in ``df``.

    Notes
    -----
    Null values are skipped, and non-string values are converted with
    ``str()`` before matching. The whole value must match the pattern:
    ``fullmatch`` is used because ``match`` with a trailing ``$`` would also
    accept a value that ends in a newline (e.g. ``"a@b.com\\n"``).
    """
    if column not in df.columns:
        return []

    # Compile once rather than re-resolving the pattern for every row.
    email_re = re.compile(r'^[\w\.-]+@[\w\.-]+\.\w+$')
    return [
        (idx, val)
        for idx, val in df[column].dropna().items()
        if not email_re.fullmatch(str(val))
    ]

# 5. Generate report (print summary)
def generate_report(df, missing_threshold=0.2, email_column='email'):
    """Print a data-quality report for ``df`` to standard output.

    The report has four parts: the basic profile from ``profile_data``, the
    duplicate rows from ``check_duplicates``, the columns whose missing ratio
    exceeds ``missing_threshold`` (``check_missing_threshold``), and the
    badly formatted values in ``email_column`` (``check_email_format``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to report on.
    missing_threshold : float, default 0.2
        Fraction of missing values above which a column is listed. The
        percentage shown in the output is derived from this value.
    email_column : str, default 'email'
        Column checked for e-mail formatting. If the column is absent the
        check finds nothing and the report shows zero invalid e-mails.

    Returns
    -------
    None
    """
    profile = profile_data(df)
    print("=== Data Profile ===")
    print(f"Rows: {profile['num_rows']}, Columns: {profile['num_columns']}")
    print(f"Missing Values per Column: {profile['missing_per_column']}")
    print(f"Total Duplicate Rows: {profile['duplicates_count']}")
    print(f"Column Data Types: {profile['column_types']}\n")

    # This count can differ from "Total Duplicate Rows" above: check_duplicates
    # uses keep=False, so it includes the first occurrence of each repeated row.
    duplicates = check_duplicates(df)
    print(f"Duplicate rows found: {len(duplicates)}")
    if len(duplicates) > 0:
        print(duplicates.head())

    missing_cols = check_missing_threshold(df, threshold=missing_threshold)
    # ':g' renders 0.2 as "20" and 0.125 as "12.5", so the label always
    # reflects the threshold actually applied.
    print(f"Columns exceeding {missing_threshold * 100:g}% missing values: {missing_cols}")

    invalid_emails = check_email_format(df, column=email_column)
    print(f"Invalid emails found: {len(invalid_emails)}")
    if invalid_emails:
        # Only a sample of the first five offenders is shown.
        print(invalid_emails[:5])

# Run report
# Guarded so the report is produced only when this code runs as the main
# module, not when it is imported from elsewhere. Uses the module-level `df`
# loaded from CSV earlier in this cell.
if __name__ == "__main__":
    generate_report(df)
