In [1]:
import os

%load_ext dotenv
%dotenv

FEATURE_COLUMNS = ['player_rating_home_player_1', 'player_rating_home_player_2', 'player_rating_home_player_3',
                   'player_rating_home_player_4', 'player_rating_home_player_5',
                   'player_rating_home_player_6', 'player_rating_home_player_7', 'player_rating_home_player_8',
                   'player_rating_home_player_9', 'player_rating_home_player_10',
                   'player_rating_home_player_11', 'player_rating_away_player_1', 'player_rating_away_player_2',
                   'player_rating_away_player_3', 'player_rating_away_player_4',
                   'player_rating_away_player_5', 'player_rating_away_player_6', 'player_rating_away_player_7',
                   'player_rating_away_player_8', 'player_rating_away_player_9',
                   'player_rating_away_player_10', 'player_rating_away_player_11', 'ewm_home_team_goals',
                   'ewm_away_team_goals', 'ewm_home_team_goals_conceded', 'ewm_away_team_goals_conceded',
                   'points_home', 'points_away', 'home_weighted_wins', 'away_weighted_wins', 'avg_home_team_rating',
                   'avg_away_team_rating', 'home_streak_wins', 'away_streak_wins', 'ewm_shoton_home',
                   'ewm_shoton_away', 'ewm_possession_home', 'ewm_possession_away', 'avg_home_rating_attack',
                   'avg_away_rating_attack', 'avg_away_rating_defence', 'avg_home_rating_defence',
                   'average_rating_home', 'average_rating_away', 'num_top_players_home', 'num_top_players_away',
                   'ewm_home_team_goals_conceded_x_ewm_shoton_home', 'attacking_strength_home',
                   'attacking_strength_away', 'attacking_strength_diff', 'result_match']

In [2]:
import boto3
from sagemaker.session import Session
from sagemaker import clarify

default_prefix = "sagemaker/DEMO-sagemaker-clarify"
region = os.environ.get("AWS_REGION")
default_bucket = os.environ["BUCKET"]
execution_role = os.environ["ROLE"]
sagemaker_session = Session()

clarify_processor = clarify.SageMakerClarifyProcessor(
    role=execution_role,
    instance_count=1,
    instance_type="ml.m5.xlarge",
    sagemaker_session=sagemaker_session
)

sagemaker.config INFO - Not applying SDK defaults from location: C:\ProgramData\sagemaker\sagemaker\config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: C:\Users\kamil\AppData\Local\sagemaker\sagemaker\config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: C:\ProgramData\sagemaker\sagemaker\config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: C:\Users\kamil\AppData\Local\sagemaker\sagemaker\config.yaml


In [3]:
from io import StringIO
import pandas as pd

s3_uri = os.environ.get("S3_TRAIN_PATH")

s3_components = s3_uri.replace("s3://", "").split("/")
bucket = s3_components[0]
key = "/".join(s3_components[1:])

s3_client = boto3.client('s3')

response = s3_client.get_object(Bucket=bucket, Key=key)
data = response['Body'].read().decode('utf-8')

train_df = pd.read_csv(StringIO(data))
train_df.columns = FEATURE_COLUMNS

mapping = {1.0: 'home_win', 0.0: 'home_not_win'}
train_df['result_match'] = train_df['result_match'].map(mapping)
train_df.head()

Unnamed: 0,player_rating_home_player_1,player_rating_home_player_2,player_rating_home_player_3,player_rating_home_player_4,player_rating_home_player_5,player_rating_home_player_6,player_rating_home_player_7,player_rating_home_player_8,player_rating_home_player_9,player_rating_home_player_10,...,avg_home_rating_defence,average_rating_home,average_rating_away,num_top_players_home,num_top_players_away,ewm_home_team_goals_conceded_x_ewm_shoton_home,attacking_strength_home,attacking_strength_away,attacking_strength_diff,result_match
0,73.0,73.0,73.0,73.0,73.0,73.0,75.0,73.0,74.0,75.0,...,73.0,73.363636,82.181818,0.0,3.0,9.77076,31.216102,41.378109,-10.162008,home_not_win
1,74.0,74.0,74.0,74.0,74.0,74.0,74.0,72.0,73.0,74.0,...,74.0,74.0,84.818182,0.0,10.0,0.0,38.541667,27.056962,11.484705,home_not_win
2,86.0,86.0,86.0,86.0,86.0,86.0,85.0,86.0,81.0,87.0,...,86.0,85.363636,74.363636,10.0,0.0,5.1908,16.898406,38.219895,-21.321489,home_win
3,68.0,68.0,68.0,68.0,68.0,68.0,75.0,73.0,77.0,73.0,...,68.0,70.818182,76.727273,0.0,0.0,0.2142,43.814371,34.459459,9.354912,home_not_win
4,79.0,79.0,79.0,79.0,79.0,79.0,80.0,81.0,77.0,79.0,...,79.0,79.454545,86.272727,0.0,10.0,2.29738,38.565217,29.040678,9.524539,home_not_win


In [4]:
from sagemaker.s3 import S3Uploader

S3_LOCATION = f"s3://{default_bucket}/football"

df_local_path = "train_df_clarify.csv"
train_df.to_csv(df_local_path, index=False)

s3_data_input_path = S3Uploader.upload(local_path=df_local_path, desired_s3_uri=f"{S3_LOCATION}/clarify", sagemaker_session=sagemaker_session)

os.remove(df_local_path)

In [5]:
bias_report_output_path = "s3://{}/{}/clarify-bias".format(default_bucket, default_prefix)
bias_data_config = clarify.DataConfig(
    s3_data_input_path=s3_data_input_path,
    s3_output_path=bias_report_output_path,
    label="result_match",
    headers=train_df.columns.to_list(),
    dataset_type="text/csv",
)

In [6]:
MODEL_PACKAGE_GROUP = os.environ["MODEL_PACKAGE_GROUP"]

sm = boto3.client("sagemaker")
model_packages = sm.list_model_packages(ModelPackageGroupName=MODEL_PACKAGE_GROUP, SortBy="CreationTime", SortOrder="Descending")

model_package = [pk for pk in model_packages["ModelPackageSummaryList"] 
                 if pk["ModelApprovalStatus"] == "Approved"][0]

model_package_arn = model_package["ModelPackageArn"]

models = sm.search(
    Resource='Model',
    SearchExpression={
        'Filters': [
            {
                'Name': 'Model.Containers.ModelPackageName',
                'Operator': 'Equals',
                'Value': model_package_arn
            },
        ]
    }
)["Results"]

model_name = models[0]["Model"]["Model"]["ModelName"]
print(model_name)

football-endpoint-model-shadow-0616192632


In [7]:
content_template = '{"confidence":$features}'

model_config = clarify.ModelConfig(
    model_name=model_name,
    instance_type="ml.m5.xlarge",
    instance_count=1,
    accept_type="application/jsonlines",
    content_type="text/csv",
)

In [8]:
probability = 0.65
predictions_config = clarify.ModelPredictedLabelConfig(
    label='prediction',
    probability='confidence',
    probability_threshold=probability,
)

In [9]:
bias_config = clarify.BiasConfig(
    label_values_or_threshold=['home_win'], facet_name="attacking_strength_diff", facet_values_or_threshold=[0], group_name="num_top_players_home"
)

In [10]:
from sagemaker.experiments import Run

with Run(
    experiment_name='tracking-bias-explainability',
    run_name="bias-only",
    sagemaker_session=sagemaker_session,
) as run:
    clarify_processor.run_bias(
        data_config=bias_data_config,
        bias_config=bias_config,
        model_config=model_config,
        model_predicted_label_config=predictions_config,
        pre_training_methods="all",
        post_training_methods="all",
    )

INFO:sagemaker.experiments.run:The run (bias-only) under experiment (tracking-bias-explainability) already exists. Loading it.
INFO:sagemaker.clarify:Analysis Config: {'dataset_type': 'text/csv', 'headers': ['player_rating_home_player_1', 'player_rating_home_player_2', 'player_rating_home_player_3', 'player_rating_home_player_4', 'player_rating_home_player_5', 'player_rating_home_player_6', 'player_rating_home_player_7', 'player_rating_home_player_8', 'player_rating_home_player_9', 'player_rating_home_player_10', 'player_rating_home_player_11', 'player_rating_away_player_1', 'player_rating_away_player_2', 'player_rating_away_player_3', 'player_rating_away_player_4', 'player_rating_away_player_5', 'player_rating_away_player_6', 'player_rating_away_player_7', 'player_rating_away_player_8', 'player_rating_away_player_9', 'player_rating_away_player_10', 'player_rating_away_player_11', 'ewm_home_team_goals', 'ewm_away_team_goals', 'ewm_home_team_goals_conceded', 'ewm_away_team_goals_concede

[34mINFO:sagemaker-clarify-processing:Starting SageMaker Clarify Processing job[0m
[34mINFO:analyzer.data_loading.data_loader_util:Analysis config path: /opt/ml/processing/input/config/analysis_config.json[0m
[34mINFO:analyzer.data_loading.data_loader_util:Analysis result path: /opt/ml/processing/output[0m
[34mINFO:analyzer.data_loading.data_loader_util:This host is algo-1.[0m
[34mINFO:analyzer.data_loading.data_loader_util:This host is the leader.[0m
[34mINFO:analyzer.data_loading.data_loader_util:Number of hosts in the cluster is 1.[0m
[34mINFO:sagemaker-clarify-processing:Running Python / Pandas based analyzer.[0m
[34mINFO:analyzer.data_loading.data_loader_factory:Dataset type: text/csv uri: /opt/ml/processing/input/data[0m
  df = df.append(df_tmp, ignore_index=True)[0m
  df = df.append(df_tmp, ignore_index=True)[0m
[34mINFO:sagemaker-clarify-processing:Loading dataset...[0m
  df = df.append(df_tmp, ignore_index=True)[0m
[34mINFO:sagemaker-clarify-processing:Lo

In [22]:
baseline_df = train_df.drop(columns=["result_match"])

baseline = baseline_df.values.tolist()

shap_config = clarify.SHAPConfig(
    baseline=baseline,
    num_samples=15,
    agg_method="mean_abs",
    save_local_shap_values=False,
)

In [23]:
explainability_output_path = "s3://{}/{}/clarify-explainability".format(bucket, default_prefix)
explainability_data_config = clarify.DataConfig(
    s3_data_input_path=s3_data_input_path,
    s3_output_path=explainability_output_path,
    label="result_match",
    headers=train_df.columns.to_list(),
    dataset_type="text/csv",
)

In [24]:
train_df.columns.to_list()

['player_rating_home_player_1',
 'player_rating_home_player_2',
 'player_rating_home_player_3',
 'player_rating_home_player_4',
 'player_rating_home_player_5',
 'player_rating_home_player_6',
 'player_rating_home_player_7',
 'player_rating_home_player_8',
 'player_rating_home_player_9',
 'player_rating_home_player_10',
 'player_rating_home_player_11',
 'player_rating_away_player_1',
 'player_rating_away_player_2',
 'player_rating_away_player_3',
 'player_rating_away_player_4',
 'player_rating_away_player_5',
 'player_rating_away_player_6',
 'player_rating_away_player_7',
 'player_rating_away_player_8',
 'player_rating_away_player_9',
 'player_rating_away_player_10',
 'player_rating_away_player_11',
 'ewm_home_team_goals',
 'ewm_away_team_goals',
 'ewm_home_team_goals_conceded',
 'ewm_away_team_goals_conceded',
 'points_home',
 'points_away',
 'home_weighted_wins',
 'away_weighted_wins',
 'avg_home_team_rating',
 'avg_away_team_rating',
 'home_streak_wins',
 'away_streak_wins',
 'ewm_sh

In [25]:
clarify_processor.run_explainability(
    data_config=explainability_data_config,
    model_config=model_config,
    explainability_config=shap_config,
    model_scores='confidence',
)

INFO:sagemaker.clarify:Analysis Config: {'dataset_type': 'text/csv', 'headers': ['player_rating_home_player_1', 'player_rating_home_player_2', 'player_rating_home_player_3', 'player_rating_home_player_4', 'player_rating_home_player_5', 'player_rating_home_player_6', 'player_rating_home_player_7', 'player_rating_home_player_8', 'player_rating_home_player_9', 'player_rating_home_player_10', 'player_rating_home_player_11', 'player_rating_away_player_1', 'player_rating_away_player_2', 'player_rating_away_player_3', 'player_rating_away_player_4', 'player_rating_away_player_5', 'player_rating_away_player_6', 'player_rating_away_player_7', 'player_rating_away_player_8', 'player_rating_away_player_9', 'player_rating_away_player_10', 'player_rating_away_player_11', 'ewm_home_team_goals', 'ewm_away_team_goals', 'ewm_home_team_goals_conceded', 'ewm_away_team_goals_conceded', 'points_home', 'points_away', 'home_weighted_wins', 'away_weighted_wins', 'avg_home_team_rating', 'avg_away_team_rating', '

[34mINFO:sagemaker-clarify-processing:Starting SageMaker Clarify Processing job[0m
[34mINFO:analyzer.data_loading.data_loader_util:Analysis config path: /opt/ml/processing/input/config/analysis_config.json[0m
[34mINFO:analyzer.data_loading.data_loader_util:Analysis result path: /opt/ml/processing/output[0m
[34mINFO:analyzer.data_loading.data_loader_util:This host is algo-1.[0m
[34mINFO:analyzer.data_loading.data_loader_util:This host is the leader.[0m
[34mINFO:analyzer.data_loading.data_loader_util:Number of hosts in the cluster is 1.[0m
[34mINFO:sagemaker-clarify-processing:Running Python / Pandas based analyzer.[0m
[34mINFO:analyzer.data_loading.data_loader_factory:Dataset type: text/csv uri: /opt/ml/processing/input/data[0m
  df = df.append(df_tmp, ignore_index=True)[0m
  df = df.append(df_tmp, ignore_index=True)[0m
[34mINFO:sagemaker-clarify-processing:Loading dataset...[0m
  df = df.append(df_tmp, ignore_index=True)[0m
[34mINFO:sagemaker-clarify-processing:Lo

KeyboardInterrupt: 

In [None]:
explainability_output_path

In [None]:
!aws s3 cp {explainability_output_path}/report.html ./explainability-report.html

In [23]:
local_explanations_out = pd.read_csv(explainability_output_path + "/explanations_shap/out.csv")
feature_names = [str.replace(c, "_label0", "") for c in local_explanations_out.columns.to_series()]
local_explanations_out.columns = feature_names

selected_example = 111
print(
    "Example number:",
    selected_example,
    "\nwith model prediction:",
    sum(local_explanations_out.iloc[selected_example]) > 0,
)
print("\nFeature values -- Label", train_df.iloc[selected_example])
local_explanations_out.iloc[selected_example].plot(
    kind="bar", title="Local explanation for the example number " + str(selected_example), rot=90
)

ImportError: Install s3fs to access S3