### Task 1: Automated Data Profiling

**Steps**:
1. Using Pandas-Profiling
    - Generate a profile report for an existing CSV file.
    - Customize the profile report to include correlations.
    - Profile a specific subset of columns.
2. Using Great Expectations
    - Create a basic expectation suite for your data.
    - Validate data against an expectation suite.
    - Add multiple expectations to a suite.

In [None]:
# Write your code from here
import pandas as pd
from pandas_profiling import ProfileReport
import great_expectations as ge

# 1. Pandas-Profiling

# Read the dataset to be profiled into a DataFrame
df = pd.read_csv('data.csv')

# Full report over every column, with the Pearson correlation explicitly enabled
profile_full = ProfileReport(
    df,
    title="Full Data Profile with Correlations",
    correlations={"pearson": {"calculate": True}},
)
profile_full.to_file("full_profile_report.html")

# Second report restricted to a hand-picked subset of columns
subset_cols = ['column1', 'column2', 'column3']  # replace with your actual column names
subset_frame = df[subset_cols]
profile_subset = ProfileReport(subset_frame, title="Subset Data Profile")
profile_subset.to_file("subset_profile_report.html")


# 2. Great Expectations

# Wrap the DataFrame in a legacy Great Expectations dataset so the expect_*
# methods become available. No Data Context is created here.
gdf = ge.from_pandas(df)

# Name reused below as the file the suite is saved to and reloaded from
suite_name = "basic_expectations"

# Add expectations (each call is evaluated against gdf and recorded on it)
gdf.expect_column_values_to_not_be_null('column1')
gdf.expect_column_values_to_be_in_type_list('column2', ['int64', 'float64'])
gdf.expect_column_mean_to_be_between('column3', min_value=0, max_value=100)

# Collect the recorded expectations into a suite object.
# The legacy Dataset.get_expectation_suite() takes only discard_*/suppress_*
# flags; it has no suite-name or overwrite_existing parameter, so it is called
# here without arguments, after the expectations have been added.
suite = gdf.get_expectation_suite()

# Save expectation suite.
# NOTE(review): in the legacy Dataset API the first argument is a file path, so
# this writes the suite to a file literally named "basic_expectations", and by
# default expectations that failed on this data are left out — confirm this is
# what you want.
gdf.save_expectation_suite(suite_name)

# Validate data against the saved expectation suite (reloaded from that file)
results = gdf.validate(expectation_suite=suite_name)
print(results)


### Task 2: Real-time Monitoring of Data Quality

**Steps**:
1. Setting up Alerts for Quality Drops
    - Use the logging library to set up a basic alert on failed expectations.
    - Implement alerts using email notifications.
    - Use a dashboard such as Grafana for visual alerts.
        - Note: Example assumes integration with a monitoring system
        - Alert setup would involve creating a data source and alert rule in Grafana

In [None]:
# Write your code from here
import great_expectations as ge
import logging
import smtplib
from email.message import EmailMessage

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def send_email_alert(subject, body, to_emails):
    """Send a plain-text alert email, logging (not raising) on failure.

    Delivery is best-effort: any error while building or sending the message
    is logged and swallowed so that a broken mail setup never crashes the
    monitoring job that triggered the alert.

    Args:
        subject: Subject line of the email.
        body: Plain-text body of the email.
        to_emails: A single recipient address, or an iterable of addresses.

    Returns:
        None.
    """
    try:
        # A bare string would otherwise be joined character by character.
        if isinstance(to_emails, str):
            to_emails = [to_emails]
        recipients = [addr for addr in to_emails if addr]
        if not recipients:
            # Nothing to deliver; avoid opening an SMTP session for no one.
            logging.warning("Email alert not sent: no recipients provided.")
            return

        msg = EmailMessage()
        msg.set_content(body)
        msg['Subject'] = subject
        msg['From'] = 'your_email@example.com'  # replace with sender email
        msg['To'] = ', '.join(recipients)

        # Configure SMTP server (example for Gmail). The timeout keeps an
        # unreachable mail server from blocking the caller indefinitely.
        with smtplib.SMTP_SSL('smtp.gmail.com', 465, timeout=30) as smtp:
            smtp.login('your_email@example.com', 'your_email_password')  # replace with credentials
            smtp.send_message(msg)
        logging.info("Email alert sent successfully.")
    except Exception as e:
        logging.error("Failed to send email alert: %s", e)

def monitor_data_quality(df, expectation_suite_name):
    """Validate a DataFrame against an expectation suite and alert on failure.

    A successful validation is logged at INFO level. When validation fails, a
    warning is logged and an email listing every failed expectation is sent
    via send_email_alert.

    Args:
        df: DataFrame to validate.
        expectation_suite_name: Value passed to validate() as expectation_suite.
    """
    dataset = ge.from_pandas(df)
    outcome = dataset.validate(expectation_suite=expectation_suite_name)

    # Happy path first: nothing to report beyond a log line.
    if outcome["success"]:
        logging.info("Data quality check passed successfully.")
        return

    logging.warning("Data quality dropped: Expectation(s) failed.")

    # Build the alert body: a header followed by two lines per failed expectation.
    failures = [item for item in outcome['results'] if not item['success']]
    parts = [f"Data quality alert! {len(failures)} expectations failed.\nDetails:\n"]
    for item in failures:
        parts.append(f"- Expectation: {item['expectation_config']['expectation_type']}\n")
        parts.append(f"  Result: {item['result']}\n")

    send_email_alert("Data Quality Alert", "".join(parts), ["recipient@example.com"])  # replace with actual recipient(s)

# Example usage: run one data-quality check when this code is executed as a script.
if __name__ == "__main__":
    import pandas as pd
    # Load or create your dataframe here
    df = pd.read_csv('data.csv')  # Replace with your data source
    # NOTE(review): this string is handed to validate(expectation_suite=...)
    # inside monitor_data_quality; presumably it must resolve to a previously
    # saved suite — confirm how your Great Expectations version interprets it.
    expectation_suite_name = 'basic_expectations'

    # Logs the outcome and sends an email alert if any expectation fails
    monitor_data_quality(df, expectation_suite_name)

# Notes:
# - For Grafana: set up a Prometheus data source or push metrics to a time series DB
# - Create alert rules in Grafana dashboard based on metrics from your monitoring system


### Task 3: Using AI for Data Quality Monitoring
**Steps**:
1. Basic AI Models for Monitoring
    - Train a simple anomaly detection model using Isolation Forest.
    - Use simple custom-function-based logic for outlier detection.
    - Create a monitoring function that utilizes a pre-trained machine learning model.

In [None]:
# Write your code from here
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest

def train_isolation_forest(data, contamination=0.1, random_state=42):
    """Fit an IsolationForest on the given data.

    Args:
        data: Training samples passed straight to IsolationForest.fit.
        contamination: Forwarded to the IsolationForest constructor.
        random_state: Forwarded to the IsolationForest constructor.

    Returns:
        The fitted model, or None if construction or fitting raised (the error
        is printed rather than propagated).
    """
    try:
        forest = IsolationForest(
            contamination=contamination,
            random_state=random_state,
        )
        forest.fit(data)
    except Exception as err:
        print(f"Error training Isolation Forest: {err}")
        return None
    else:
        return forest

def detect_anomalies(model, data):
    """Return the rows of data that the model labels as anomalous.

    Args:
        model: Object exposing predict(data) that yields one label per row,
            with -1 marking an anomaly and 1 a normal point.
        data: Samples to score; must support boolean-mask indexing.

    Returns:
        The subset of data whose predicted label is -1. If prediction or
        indexing raises, the error is printed and an empty DataFrame is
        returned instead.
    """
    try:
        labels = model.predict(data)
        # Keep only the rows flagged with the anomaly label (-1).
        is_anomaly = labels == -1
        return data[is_anomaly]
    except Exception as err:
        print(f"Error detecting anomalies: {err}")
        return pd.DataFrame()

def custom_outlier_detection(data, threshold=3):
    """Return the rows of data containing at least one z-score outlier.

    Each column is standardised with its own mean and standard deviation
    (as computed by np.std along axis 0); a row is flagged when the absolute
    z-score of any of its values exceeds threshold.

    Args:
        data: 2-D numeric samples (rows x columns), e.g. an ndarray or
            DataFrame supporting boolean-mask row selection.
        threshold: Absolute z-score above which a value counts as an outlier.

    Returns:
        The subset of rows flagged as outliers. If the computation raises,
        the error is printed and an empty ndarray is returned.
    """
    try:
        mean = np.mean(data, axis=0)
        std = np.std(data, axis=0)
        # A constant column has zero spread; dividing by it would yield NaN/inf
        # and a RuntimeWarning. Substituting 1 gives such a column z-scores of
        # exactly 0, so it can never flag a row.
        safe_std = np.where(std == 0, 1.0, std)
        z_scores = (data - mean) / safe_std
        # Identify outliers where absolute z-score > threshold
        outliers_mask = (np.abs(z_scores) > threshold).any(axis=1)
        return data[outliers_mask]
    except Exception as e:
        print(f"Error in custom outlier detection: {e}")
        return np.array([])

def monitoring_function(data, model=None, use_ai=True):
    """Check data for anomalies with either a trained model or z-score logic.

    Args:
        data: Samples to check (DataFrame or ndarray).
        model: Pre-trained model used when use_ai is True; it is passed to
            detect_anomalies.
        use_ai: When True, use the model; otherwise fall back to
            custom_outlier_detection.

    Returns:
        The flagged rows, or None when use_ai is True but no model was given.
        A summary line is printed in every case.
    """
    if use_ai:
        if model is None:
            print("No pre-trained model provided.")
            return None
        anomalies = detect_anomalies(model, data)
        # len() counts rows for DataFrames and ndarrays alike, whereas
        # `.empty` exists only on pandas objects and would fail for arrays.
        if len(anomalies) > 0:
            print(f"Detected {len(anomalies)} anomalies using AI model.")
        else:
            print("No anomalies detected by AI model.")
        return anomalies

    outliers = custom_outlier_detection(data)
    if len(outliers) > 0:
        print(f"Detected {len(outliers)} outliers using custom logic.")
    else:
        print("No outliers detected by custom logic.")
    return outliers

# Example usage: exercise both monitoring paths on a tiny in-memory dataset.
if __name__ == "__main__":
    # Sample data: columns could be e.g. age and income
    # The fifth row (120, 500) differs sharply from the other five rows.
    sample_data = np.array([[25, 50000], [30, 60000], [35, 75000], [40, 100000], [120, 500], [28, 62000]])
    df = pd.DataFrame(sample_data, columns=['age', 'income'])

    # Train model (uses the function's default contamination and random_state)
    isolation_forest_model = train_isolation_forest(df)

    # Monitor data with AI: the DataFrame is scored by the model trained above
    anomalies = monitoring_function(df, model=isolation_forest_model, use_ai=True)

    # Monitor data with custom logic: the raw ndarray goes through the z-score check.
    # NOTE(review): with six rows and the default threshold of 3, a population
    # z-score cannot exceed sqrt(5) (about 2.24), so this call is expected to
    # report no outliers — lower the threshold or use more data to see a hit.
    outliers = monitoring_function(sample_data, use_ai=False)
