# Detox usecase Notebook

In [0]:
import os
os.environ["SPARK_VERSION"] = "3.5"  # Change this to your version if needed

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType
from pyspark.sql import functions as F
import pydeequ
from pydeequ.checks import Check, CheckLevel
from pydeequ.verification import VerificationSuite, VerificationResult
from pyspark.sql import SparkSession
import boto3
import json
import time
import random

### Randomization functions
The following functions generate a dataset with random properties and then randomly corrupt it for the simulation phase.

In [0]:
def create_data_table(spark):
    """
    Generate a DataFrame containing randomly generated data.

    The number of rows is picked at random from 95, 200 or 300. The DataFrame
    has the following columns:
    - 'Scores': Random integers between 0 and 10 (inclusive).
    - 'Temperature': Random integers between -10 and 50 (inclusive).
    - 'RandomValues': Random floating-point numbers between 0 and 1. No
      missing values are produced here.
    - 'NormallyDistributed': Random values drawn from a normal distribution
      with a mean of 0 and a standard deviation of 1.

    Parameters:
    spark: The Spark session object; only its `createDataFrame` method is used.

    Returns:
    DataFrame: A PySpark DataFrame containing the generated data with four columns.
    """
    num_rows = random.choice([95, 200, 300])

    # Columns are generated one after another (not row by row) so the random
    # stream is consumed in the same order regardless of how rows are built.
    scores = [random.randint(0, 10) for _ in range(num_rows)]
    temperature = [random.randint(-10, 50) for _ in range(num_rows)]
    random_values = [random.uniform(0, 1) for _ in range(num_rows)]
    normally_distributed = [random.gauss(0, 1) for _ in range(num_rows)]

    column_names = ['Scores', 'Temperature', 'RandomValues', 'NormallyDistributed']

    # One tuple per row, in the same order as `column_names`.
    rows = list(zip(scores, temperature, random_values, normally_distributed))
    return spark.createDataFrame(rows, schema=column_names)


def bump_df(df, spark):
    """
    Randomly corrupt a Spark DataFrame by altering values in specific columns.

    Three columns are rewritten, each using its own `rand()` draw per row and
    a 10% replacement threshold:

    1. 'Scores' is replaced with `rand() * 20 - 10`, a floating-point value
       that may fall below 0.
    2. 'RandomValues' is replaced with null.
    3. 'NormallyDistributed' is replaced with `rand() * 10`.

    All other columns (including 'Temperature') are left untouched.

    Parameters:
    df (pyspark.sql.DataFrame): The input Spark DataFrame to be modified.
    spark (pyspark.sql.SparkSession): The Spark session object (not used in the current implementation).

    Returns:
    pyspark.sql.DataFrame: The modified Spark DataFrame with potential changes in the
                           'Scores', 'RandomValues', and 'NormallyDistributed' columns.
    """
    # Only the column helpers actually used below are imported.
    from pyspark.sql.functions import col, when, rand

    # Replace some 'Scores' with values that can fall outside the expected 0..10 range
    df = df.withColumn(
        'Scores',
        when(rand() < 0.10, (rand() * 20) - 10).otherwise(col('Scores'))
    )

    # Blank out some 'RandomValues' so completeness checks can fail
    df = df.withColumn(
        'RandomValues',
        when(rand() < 0.10, None).otherwise(col('RandomValues'))
    )

    # Replace some 'NormallyDistributed' values to shift the column's mean and spread
    df = df.withColumn(
        'NormallyDistributed',
        when(rand() < 0.10, (rand() * 10)).otherwise(col('NormallyDistributed'))
    )

    return df

# Quick visual check: build one random table, corrupt it, and display the result.
# NOTE(review): `spark` is not defined before this point in the visible code;
# presumably the notebook runtime provides it — confirm before running standalone.
df = create_data_table(spark)
bdf = bump_df(df, spark)
bdf.show()


### Pydeequ step
In this second phase, we run PyDeequ checks over some of the DataFrame's columns and return the results as JSON records.

In [0]:
def pydeequ_check(df, spark):
    """
    Performs data quality checks on a Spark DataFrame using PyDeequ.

    The following constraints are evaluated against `df`, all at warning level
    under the check name "Data Quality Checks":

    - the DataFrame has exactly 300 rows;
    - the minimum of 'Scores' is exactly 0 and its maximum is exactly 10;
    - 'RandomValues' is complete;
    - 'Scores' is non-negative;
    - the mean of 'NormallyDistributed' is greater than 0;
    - the standard deviation of 'NormallyDistributed' is greater than 1.

    Parameters:
    df (pyspark.sql.DataFrame): The input Spark DataFrame to be validated.
    spark (pyspark.sql.SparkSession): The Spark session object used for executing checks.

    Returns:
    The value produced by `VerificationResult.checkResultsAsJson`. The
    CloudWatch step in this notebook iterates over it as a sequence of
    records with keys such as 'check_status' and 'constraint_status'.
    """
    # Build the check and attach every constraint in a single chain
    check = (
        Check(spark, CheckLevel.Warning, "Data Quality Checks")
        .hasSize(lambda x: x == 300)  # Check for DataFrame size
        .hasMin("Scores", lambda x: x == 0.0)  # Scores should have a minimum of 0
        .hasMax("Scores", lambda x: x == 10.0)  # Scores should have a maximum of 10
        .isComplete("RandomValues")  # Check for completeness in RandomValues
        .isNonNegative("Scores")  # Scores must not be negative
        .hasMean("NormallyDistributed", lambda x: x > 0.0)  # Mean must be above 0
        .hasStandardDeviation("NormallyDistributed", lambda x: x > 1.0)  # Std dev must be above 1
    )

    # Run the verification suite over the DataFrame
    result = (
        VerificationSuite(spark)
        .onData(df)
        .addCheck(check)
        .run()
    )

    # VerificationResult.checkResultsAsDataFrame(spark, result, pandas=True)
    # is an alternative when a tabular view is preferred.
    return VerificationResult.checkResultsAsJson(spark, result)

### Cloudwatch step

This cell defines the handler that sends the PyDeequ data quality results from the previous step to CloudWatch Logs.

In [0]:
def log_to_cloudwatch(json_result, client, log_group_name='your-log-group', log_stream_name='your-log-stream'):
    """
    Send data quality check records to a CloudWatch Logs stream.

    The log group and log stream are created on demand; an "already exists"
    error from either call is ignored. Each record in `json_result` becomes
    one log event whose message is a JSON string holding the record's
    'check_status', 'check_level', 'constraint_status', 'check',
    'constraint_message' and 'constraint' fields. Events are uploaded in
    batches of at most 100, with a one-second pause between batches.

    Parameters:
    json_result: Iterable of mappings containing the six keys listed above.
    client: CloudWatch Logs client (e.g. `boto3.Session(...).client('logs')`).
    log_group_name (str): Name of the target log group.
    log_stream_name (str): Name of the target log stream.

    Returns:
    None

    Raises:
    KeyError: If a record is missing one of the expected keys.
    """
    # Create log group if it doesn't exist
    try:
        client.create_log_group(logGroupName=log_group_name)
    except client.exceptions.ResourceAlreadyExistsException:
        pass

    # Create log stream if it doesn't exist
    try:
        client.create_log_stream(logGroupName=log_group_name, logStreamName=log_stream_name)
    except client.exceptions.ResourceAlreadyExistsException:
        pass

    # Only these fields of each record are forwarded, in this order
    fields = (
        'check_status',
        'check_level',
        'constraint_status',
        'check',
        'constraint_message',
        'constraint',
    )
    log_events = [
        {
            'timestamp': int(time.time() * 1000),  # Current timestamp in milliseconds
            'message': json.dumps({name: log[name] for name in fields}),
        }
        for log in json_result
    ]

    batch_size = 100  # Limit to 100 log events per request
    for start in range(0, len(log_events), batch_size):
        if start:
            # Pause between batches only; no need to wait after the last one
            time.sleep(1)

        # Look up the stream by exact name: the API filters by prefix, so the
        # response may include other streams (or none at all).
        response = client.describe_log_streams(
            logGroupName=log_group_name,
            logStreamNamePrefix=log_stream_name,
        )
        stream = next(
            (s for s in response.get('logStreams', [])
             if s.get('logStreamName') == log_stream_name),
            {},
        )
        sequence_token = stream.get('uploadSequenceToken')

        request = {
            'logGroupName': log_group_name,
            'logStreamName': log_stream_name,
            'logEvents': log_events[start:start + batch_size],
        }
        if sequence_token:
            request['sequenceToken'] = sequence_token
        client.put_log_events(**request)

### Simulation main
This cell:
1. Sets up the PySpark environment and Spark session
2. Creates a boto3 session and a CloudWatch Logs client
3. Runs a loop of 9,999 simulation steps, each generating a random DataFrame, applying the PyDeequ quality checks, logging the results to CloudWatch, and then waiting a random 1–180 seconds


In [0]:
    
# Main entry point. Make sure boto3 can find AWS credentials (for example via
# a shared credentials file or environment variables).
# Runs 9,999 rounds of: generate data -> corrupt it -> PyDeequ checks -> CloudWatch.
if __name__ == "__main__":

    # Initialize Spark session with the Deequ package.
    # Deequ itself goes under spark.jars.packages, while the f2j artifact is
    # the one to exclude (this is the setup shown in the PyDeequ documentation).
    spark = (SparkSession
        .builder
        .config("spark.jars.packages", pydeequ.deequ_maven_coord)
        .config("spark.jars.excludes", pydeequ.f2j_maven_coord)
        .getOrCreate())

    # Create a boto3 session. To use a specific credentials file, set the
    # AWS_SHARED_CREDENTIALS_FILE environment variable before this line; to
    # use a named profile, pass profile_name=... to boto3.Session.
    session = boto3.Session(region_name='us-east-1')

    # Create a CloudWatch Logs client
    client = session.client('logs')

    # Set log group and log stream for our experiment
    log_group = 'hackathon'
    log_stream = 'detox_usecase'

    # Run the simulations (i = 1 .. 9999) and log results to CloudWatch
    for i in range(1,10000):
        df = create_data_table(spark)
        bdf = bump_df(df, spark)
        json_result = pydeequ_check(bdf, spark)
        log_to_cloudwatch(json_result, client, log_group, log_stream)

        # Waits between 1 and 180 seconds until next simulation
        delay = random.randint(1,180)
        print(f"Simulation {i}, next step = {delay}s")
        time.sleep(delay)
        
