# SageMaker Clarify Bias Report Demo

## Import necessary libraries and create SageMaker session

In [49]:
import boto3
import sagemaker
import pandas as pd
import numpy as np
import botocore
from sagemaker import clarify
import awswrangler as wr

# Create one SageMaker session and derive the bucket, role and region from it.
sess = sagemaker.Session()
bucket = sess.default_bucket()
# Pass the session explicitly: when no session is supplied,
# get_execution_role() constructs a second Session internally.
role = sagemaker.get_execution_role(sagemaker_session=sess)
region = sess.boto_region_name

## Create SageMakerClarifyProcessor Object for Scaling

In [50]:
# Processor object used to launch the Clarify job further down.
clarify_processor = clarify.SageMakerClarifyProcessor(
    sagemaker_session=sess,
    role=role,
    instance_type='ml.c5.2xlarge',  # processing capacity of each node in the cluster
    instance_count=1,  # number of nodes in the cluster
)

INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: 1.0.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


## Specify Data Configuration in DataConfig Object

In [None]:
# Where the bias report is written in S3
bias_report_output_path = "s3://blueberries-dsc/blueberries/output_clarify"

# S3 location of the training data. Defined once and reused below so that the
# file the headers are read from is always the same file Clarify analyzes.
s3_data_input_path = "s3://blueberries-dsc/blueberry.csv"

# Load the training data from S3; its column names are passed to the
# configuration as the dataset headers.
train_data = wr.s3.read_csv(s3_data_input_path)

# Creating DataConfig Object
data_config = clarify.DataConfig(
    s3_data_input_path=s3_data_input_path,
    s3_output_path=bias_report_output_path,
    label='yield',  # this is our target variable
    headers=train_data.columns.to_list(),  # column/feature names
    dataset_type='text/csv',
)

## Specify Bias Configuration in BiasConfig Object
#### Since our target variable 'yield' is continuous, we define a threshold (5000): values above it count as a positive outcome and values below it as a negative outcome.
#### We choose the feature (facet) 'AverageRainingDays' to be analyzed for bias. Because it is also continuous, it gets a threshold as well: the analysis compares rows with average raining days above 0.1 against rows below 0.1.

In [32]:
# Bias settings: the facet to examine and the thresholds for facet and label.
bias_config = clarify.BiasConfig(
    facet_name='AverageRainingDays',
    facet_values_or_threshold=[0.1],
    label_values_or_threshold=[5000],
)

## Run Pre-training Bias
### For this demo, we're analyzing bias in the training data, but SageMaker Clarify also allows us to analyze bias in model predictions
### To run the pre-training analysis, we use the SageMakerClarifyProcessor object created earlier, passing in our data configuration, our bias configuration, and the bias metrics we want to check

In [33]:
# Launch the pre-training bias analysis with two metrics:
#   CI  - Class Imbalance
#   DPL - Difference in Positive Proportions of Labels
clarify_processor.run_pre_training_bias(
    data_bias_config=bias_config,
    data_config=data_config,
    methods=["CI", "DPL"],
)

INFO:sagemaker.clarify:Analysis Config: {'dataset_type': 'text/csv', 'headers': ['id', 'clonesize', 'honeybee', 'bumbles', 'andrena', 'osmia', 'MaxOfUpperTRange', 'MinOfUpperTRange', 'AverageOfUpperTRange', 'MaxOfLowerTRange', 'MinOfLowerTRange', 'AverageOfLowerTRange', 'RainingDays', 'AverageRainingDays', 'fruitset', 'fruitmass', 'seeds', 'yield'], 'label': 'yield', 'label_values_or_threshold': [5000], 'facet': [{'name_or_index': 'AverageRainingDays', 'value_or_threshold': [0.1]}], 'methods': {'report': {'name': 'report', 'title': 'Analysis Report'}, 'pre_training_bias': {'methods': ['CI', 'DPL']}}}
INFO:sagemaker:Creating processing-job with name Clarify-Pretraining-Bias-2023-06-15-05-22-48-650


............................[34m2023-06-15 05:27:25,507 logging.conf not found when configuring logging, using default logging configuration.[0m
[34m2023-06-15 05:27:25,508 Starting SageMaker Clarify Processing job[0m
[34m2023-06-15 05:27:25,508 Analysis config path: /opt/ml/processing/input/config/analysis_config.json[0m
[34m2023-06-15 05:27:25,508 Analysis result path: /opt/ml/processing/output[0m
[34m2023-06-15 05:27:25,508 This host is algo-1.[0m
[34m2023-06-15 05:27:25,508 This host is the leader.[0m
[34m2023-06-15 05:27:25,508 Number of hosts in the cluster is 1.[0m
[34m2023-06-15 05:27:25,510 Running Python / Pandas based analyzer.[0m
[34m2023-06-15 05:27:25,510 Dataset type: text/csv uri: /opt/ml/processing/input/data[0m
[34m2023-06-15 05:27:25,519 Loading dataset...[0m
  df = df.append(df_tmp, ignore_index=True)[0m
[34m2023-06-15 05:27:25,548 Loaded dataset. Dataset info:[0m
[34m<class 'pandas.core.frame.DataFrame'>[0m
[34mRangeIndex: 15289 entries, 0