In [None]:
# Test cell

# The variables below have to be included in the test cell in order to query the cube.
from init_sisense import sisense_conn
# Cube queried by this notebook; also passed to get_logical_sql when loading data.
cube_name = "bank_churn"
# Load the extra parameters registered for this cube/table and attach them to the
# connection (presumably what populates sisense_conn.add_param, which is read
# further down — confirm against init_sisense).
additional_parameters = sisense_conn.load_additional_parameters(cube_name, table_name="bank_churn_train_sagemaker_ml533")
sisense_conn.set_parameters(cube_name=cube_name, additional_parameters=additional_parameters)

# AWS Sagemaker AutoPilot

### Install library

In [None]:
!pip install boto3 sagemaker

In [None]:
from time import gmtime, strftime, sleep , time
import os
import json
import tarfile
from datetime import datetime
import yaml
import pandas as pd
import boto3
import io
import sagemaker
from sagemaker import get_execution_role

### Load input parameters from custom code

In [None]:
# Read every user-supplied setting from the custom-code parameter mapping.
custom_params = sisense_conn.add_param

# What to model: source table, label column and columns to exclude.
table = custom_params["Dataset"]["table"]
target_column = custom_params["Target Column"]["column"]
drop_column = custom_params["Drop Feature"]

# Where the AWS credentials live on disk (paths, not the secrets themselves).
aws_access_key_path = custom_params["aws_access_key_path"]
aws_secret_access_path = custom_params["aws_secret_access_path"]

# AWS targets: region, staging bucket and the role passed to SageMaker.
region_name = custom_params["region_name"]
S3_bucket_name = custom_params["S3_bucket_name"]
aws_arn_role = custom_params["aws_arn_role"]

### Load data from Elasticube

In [None]:
# Build the statement that selects every column of the training table.
logical_sql1 = (f'SELECT * from "{table}"')
print("SQL Statement:\n" + logical_sql1)

# Execute the SQL statement against the cube.
logical_sql_res1 = sisense_conn.get_logical_sql(query=logical_sql1,
                                               cube_name=cube_name,  # passed to notebook from build / Test Cell
                                               count=None)  # limit the rows fetched
# check for errors in logical sql execution
# if "error" in logical_sql_res1:
#     raise err.CustomCodeException(*err.ERROR_IN_LOGICAL_SQL, description=logical_sql_res1.get("details"))
column_name = logical_sql_res1['headers']
values = logical_sql_res1['values']

# Materialise the result set as a DataFrame.
df = pd.DataFrame(values, columns=column_name)

# drop_column is either the sentinel '0' (drop nothing) or a comma-separated
# list of column names.
if drop_column != '0':
    columns_to_drop = []
    for raw_name in drop_column.split(','):
        # Accept "a, b" as well as "a,b": if the raw token is not a column,
        # fall back to its whitespace-trimmed form. Exact matches win, so a
        # column whose real name carries spaces is still honoured.
        name = raw_name if raw_name in df.columns else raw_name.strip()
        if name:  # ignore empty tokens such as a trailing comma
            columns_to_drop.append(name)
    # Drop the specified columns
    df = df.drop(columns_to_drop, axis=1)

In [None]:
df.head()

In [None]:
# Keep only the part of the table name before the first '.'; a name without
# a dot is used unchanged.
dataset = table.partition('.')[0]

In [None]:
dataset

In [None]:
# Every column except the target is a model feature (original order preserved).
all_columns = df.columns.tolist()
features = list(filter(lambda column: column != target_column, all_columns))

In [None]:
features

In [None]:
## Feature list used to build the dashboard widget
widget_features = df.columns.tolist()

# Take the target column out of the list; it is fine if it is not there.
try:
    widget_features.remove(target_column)
except ValueError:
    pass
widget_features

### Create directory in Sisense to store files and model

In [None]:
# One folder per run, named after the table and the current local timestamp.
current_timestamp = datetime.now().strftime('%Y%m%d%H%M%S')
folder_path = (
    "/opt/sisense/storage/notebooks/custom_code_notebooks/notebooks/sagemaker/"
    f"{table.split('.')[0]}/{current_timestamp}"
)
os.makedirs(folder_path, exist_ok=True)
folder_path

In [None]:
## Persist the feature list next to the other run artifacts
json_file_path = f'{folder_path}/features.json'

# Serialise first, then write the resulting text in one go.
features_json = json.dumps(features, indent=4)
with open(json_file_path, 'w') as features_file:
    features_file.write(features_json)

print(f"Features list saved to {json_file_path}")

### To use the SageMaker Autopilot library, the data must first be uploaded to S3

### Create S3 Bucket

#### Get AWS Keys

In [None]:
# Each credential is stored in its own file; read it and trim surrounding whitespace.
with open(aws_access_key_path) as access_key_file:
    aws_access_key_id = access_key_file.read().strip()

with open(aws_secret_access_path) as secret_key_file:
    aws_secret_access_key = secret_key_file.read().strip()

In [None]:
# Initialize the S3 client with the user-supplied credentials and region.
s3 = boto3.client('s3', aws_access_key_id=aws_access_key_id, aws_secret_access_key=aws_secret_access_key, region_name=region_name)

# Create the S3 bucket. Outside us-east-1 the region has to be passed as a
# LocationConstraint; us-east-1 is the default and takes no constraint.
create_bucket_kwargs = {'Bucket': S3_bucket_name}
if region_name and region_name != 'us-east-1':
    create_bucket_kwargs['CreateBucketConfiguration'] = {'LocationConstraint': region_name}
try:
    s3.create_bucket(**create_bucket_kwargs)
except s3.exceptions.BucketAlreadyOwnedByYou:
    # Re-running the notebook against a bucket we already own is not an error.
    print(f"Bucket {S3_bucket_name} already exists and is owned by this account; reusing it.")

# Convert the DataFrame to a CSV file in memory
csv_buffer = io.StringIO()
df.to_csv(csv_buffer, index=False)

# S3 key (object name) for the CSV file; reused later as the AutoML input URI.
s3_key = f"{dataset}/input-data.csv"

# Upload the CSV file to the S3 bucket
s3.put_object(Bucket=S3_bucket_name, Key=s3_key, Body=csv_buffer.getvalue())

# s3_key already starts with the dataset prefix, so it is not repeated here.
print(f"Uploaded the Csv to s3://{S3_bucket_name}/{s3_key}")


### Set up AWS Configuration locally

In [None]:
# Register the supplied credentials and region as boto3's default session, so
# clients created later without explicit credentials (for example
# boto3.client('sagemaker')) use them.
boto3.setup_default_session(aws_access_key_id=aws_access_key_id,
                            aws_secret_access_key=aws_secret_access_key,
                            region_name=region_name)
# Region name as reported by a freshly created boto3 session.
region = boto3.Session().region_name
# SageMaker SDK session, created after the default session is configured
# (presumably so it inherits those credentials — confirm). Used further down
# to download the explainability report.
session = sagemaker.Session()

### Initialize the SageMaker client

In [None]:
# Low-level SageMaker client; every AutoML, job-description and model call
# below goes through it. No credentials are passed here, so it relies on the
# default boto3 session configured earlier.
autopilot = boto3.client('sagemaker')

### Define parameters for the AutoML job
### Customize your parameters using the AWS [documentation](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/sagemaker/client/create_auto_ml_job_v2.html)
##### For MODE:AUTO If your dataset is larger than 100 MB, Autopilot chooses HPO.

#### ENSEMBLING vs HYPERPARAMETER_TUNING
##### Data Characteristics: For large or complex datasets, AUTO mode might lean towards ENSEMBLING because it can leverage diverse algorithms to capture various data patterns, potentially improving overall model performance.
##### Performance vs. Time: If the primary goal is quick model training with reasonable performance, HYPERPARAMETER_TUNING might be preferred for smaller datasets. Conversely, for larger datasets where computation resources allow, ENSEMBLING can provide robust models at the expense of longer training times.
##### Resource Allocation: The choice between these modes can also impact the types of AWS resources used, such as processing power and storage. Ensembling might require more resources to handle multiple models simultaneously.

In [None]:
# One training channel pointing at the CSV (with header row) uploaded to S3.
# ChannelType may be 'training' or 'validation'.
auto_ml_job_input_data_config = [
    dict(
        ChannelType='training',
        ContentType='text/csv;header=present',
        DataSource=dict(
            S3DataSource=dict(
                S3DataType='S3Prefix',
                S3Uri=f's3://{S3_bucket_name}/{s3_key}',
            )
        ),
    )
]

# Autopilot writes all of its artifacts under this prefix.
output_data_config = dict(
    S3OutputPath=f's3://{S3_bucket_name}/{dataset}/autopilot-output/'
)

## Mode 'AUTO': no CandidateGenerationConfig is needed - SageMaker chooses
## ENSEMBLING or HYPERPARAMETER_TUNING based on the data size.
## Optional keys deliberately left out of TabularJobConfig:
##   'CandidateGenerationConfig': {'AlgorithmsConfig': [...]}
##   'ProblemType': 'BinaryClassification'
##   'SampleWeightAttributeName': 'string'
auto_ml_problem_type_config = dict(
    TabularJobConfig=dict(
        CompletionCriteria=dict(
            MaxCandidates=10,
            MaxRuntimePerTrainingJobInSeconds=3600,
            MaxAutoMLJobRuntimeInSeconds=9000,
        ),
        Mode='AUTO',
        TargetAttributeName=target_column,
    )
)

## If using Mode:Ensembling
# auto_ml_problem_type_config = {
#     'TabularJobConfig': {
#         'CandidateGenerationConfig': {
#             'AlgorithmsConfig': [
#                 {
#                     'AutoMLAlgorithms': [
#                         'catboost',
#                         'extra-trees',
#                         'fastai',
#                         'lightgbm',
#                         'linear-learner',
#                         'nn-torch',
#                         'randomforest',
#                         'xgboost'
#                     ]
#                 }
#             ]
#         },
#         'CompletionCriteria': {
#             'MaxCandidates': 10,
#             'MaxRuntimePerTrainingJobInSeconds': 1200,
#             'MaxAutoMLJobRuntimeInSeconds': 9000
#         },
#         'Mode': 'ENSEMBLING',
#         'TargetAttributeName': target_column,
#         'SampleWeightAttributeName': 'SampleWeight' ## SampleWeight is a custom column in your datatset. Ex have values like 0 for class '0' if this class has more records and value like 4 for class '1' which has underrepresented row.
#     }
# }

# If using HYPERPARAMETER_TUNING
# auto_ml_problem_type_config = {
#     'TabularJobConfig': {
#         'CandidateGenerationConfig': {
#             'AlgorithmsConfig': [
#                 {
#                     'AutoMLAlgorithms': [
#                         'linear-learner',
#                         'mlp',
#                         'xgboost'
#                     ]
#                 }
#             ]
#         },
#         'CompletionCriteria': {
#             'MaxCandidates': 10,
#             'MaxRuntimePerTrainingJobInSeconds': 1200,
#             'MaxAutoMLJobRuntimeInSeconds': 9000
#         },
#         'Mode': 'HYPERPARAMETER_TUNING',
#         'TargetAttributeName': target_column
#     }
# }


# Role ARN taken from the input parameters; passed as RoleArn when the AutoML
# job is created.
role_arn = aws_arn_role

# Regression: MSE | Binary classification: F1 | Multiclass classification: Accuracy.
# auto_ml_job_objective = { 
#     'MetricName': 'Accuracy'
# }


# model_deploy_config = {
#     'AutoGenerateEndpointName': False,
#     'EndpointName': 'string'
# }

# Fraction of the input data held out for validation; change to your desired value.
data_split_config = dict(ValidationFraction=0.2)



### Call the create_auto_ml_job_v2 API

In [None]:
# AutoML job names must be unique, so the suffix goes down to the second;
# with minute resolution a re-run inside the same minute collided with the
# previous job. Prefix (17) + suffix (14) = 31 characters, within the
# 32-character limit on AutoMLJobName.
timestamp_suffix = strftime("%Y%m%d%H%M%S", gmtime())
auto_ml_job_name = "sisense-autopilot" + timestamp_suffix
print("AutoMLJobName: " + auto_ml_job_name)

# Start the Autopilot job with the configuration assembled above.
autopilot.create_auto_ml_job_v2(
    AutoMLJobName=auto_ml_job_name,
    AutoMLJobInputDataConfig=auto_ml_job_input_data_config,
    OutputDataConfig=output_data_config,
    AutoMLProblemTypeConfig=auto_ml_problem_type_config,
    RoleArn=role_arn,
#     Tags=tags,
#     SecurityConfig=security_config,
#     AutoMLJobObjective=auto_ml_job_objective,
#     ModelDeployConfig=model_deploy_config,
    DataSplitConfig=data_split_config
)


In [None]:

print("JobStatus - Secondary Status")
print("------------------------------")

# Initial status check.
describe_response = autopilot.describe_auto_ml_job_v2(AutoMLJobName=auto_ml_job_name)
job_run_status = describe_response["AutoMLJobStatus"]
print(job_run_status + " - " + describe_response["AutoMLJobSecondaryStatus"])

# Poll once a minute until the job reaches a terminal state. Sleeping at the
# top of the loop avoids an immediate duplicate describe call and the wasted
# 60 seconds after the terminal status has already been seen.
while job_run_status not in ("Failed", "Completed", "Stopped"):
    sleep(60)
    describe_response = autopilot.describe_auto_ml_job_v2(AutoMLJobName=auto_ml_job_name)
    job_run_status = describe_response["AutoMLJobStatus"]
    print(job_run_status + " - " + describe_response["AutoMLJobSecondaryStatus"])

# Surface an unsuccessful outcome here rather than as a confusing KeyError on
# "BestCandidate" in a later cell.
if job_run_status != "Completed":
    failure_reason = describe_response.get("FailureReason", "no failure reason reported")
    print(f"WARNING: AutoML job {auto_ml_job_name} ended with status {job_run_status}: {failure_reason}")

In [None]:
# Full description of the AutoML job; later cells read "BestCandidate" and
# "ResolvedAttributes" from it.
job_desc = autopilot.describe_auto_ml_job_v2(AutoMLJobName=auto_ml_job_name)
job_desc

In [None]:
job_desc["BestCandidate"]

#### You can use the describe_auto_ml_job_v2 API to look up the best candidate selected by the SageMaker Autopilot job.

In [None]:
# Best candidate chosen by the AutoML job, and its name.
best_candidate = job_desc["BestCandidate"]
best_candidate_name = best_candidate["CandidateName"]

print("\n")
print("CandidateName: " + best_candidate_name)

# Objective metric (name and value) the candidate was ranked on.
objective_metric = best_candidate["FinalAutoMLJobObjectiveMetric"]
print("FinalAutoMLJobObjectiveMetricName: " + objective_metric["MetricName"])
print("FinalAutoMLJobObjectiveMetricValue: " + str(objective_metric["Value"]))

print("\nBest candidate details:: " + str(best_candidate))

### To explore the performance of other algorithms that Autopilot explored, you can enumerate them via list_candidates_for_auto_ml_job API call

In [None]:
# One page of candidates explored by the job.
autopilot_dict = autopilot.list_candidates_for_auto_ml_job(AutoMLJobName=auto_ml_job_name)

# For each candidate show its objective metric and the image of its first
# inference container.
for item in autopilot_dict["Candidates"]:
    candidate_name = item["CandidateName"]
    print(candidate_name, item["FinalAutoMLJobObjectiveMetric"])
    first_container = item["InferenceContainers"][0]
    print(first_container["Image"], "\n")

### Autopilot also automatically generates a feature importance report; download it and store it locally

In [None]:
best_candidate["CandidateProperties"]

In [None]:
def split_s3_path(s3_path):
    """Split an S3 URI into its bucket and key.

    Args:
        s3_path: String such as ``"s3://bucket/some/key"``; the ``s3://``
            marker is removed before splitting.

    Returns:
        ``(bucket, key)``, where ``key`` is everything after the first ``/``
        (``""`` when there is none), or ``(None, None)`` when ``s3_path`` is
        empty or contains only whitespace.
    """
    # Nothing usable in an empty / whitespace-only path.
    if s3_path.strip() == "":
        return None, None

    without_scheme = s3_path.replace("s3://", "")
    # Text up to the first '/' is the bucket, the remainder is the key.
    bucket, _, key = without_scheme.partition("/")
    return bucket, key

# Look up the explainability artifact location, tolerating missing keys.
candidate_artifact_locations = (
    best_candidate
    .get("CandidateProperties", {})
    .get("CandidateArtifactLocations", {})
)
explainability_prefix = candidate_artifact_locations.get("Explainability")

if not explainability_prefix or not explainability_prefix.strip():
    # No location reported for this candidate.
    print("Explainability prefix is not provided or missing.")
else:
    s3_bucket, explainability_dir = split_s3_path(explainability_prefix)

    if not (s3_bucket and explainability_dir):
        # A location was reported but it has no bucket or no key part.
        print("Explainability prefix is invalid or empty.")
    else:
        # Copy the report files into the local run folder.
        session.download_data(path=folder_path, bucket=s3_bucket, key_prefix=explainability_dir)


In [None]:
# from IPython.display import IFrame

# # Display HTML report
# IFrame(src="report.html", width=700, height=600)

In [None]:
# The transform job covers data processing / feature engineering and the
# tuning job covers model tuning / fitting. Collect the step name for each
# job kind; a kind that does not appear stays "" and, when a kind appears
# more than once, the last step wins.
step_names = {
    "AWS::SageMaker::TransformJob": "",
    "AWS::SageMaker::TuningJob": "",
    "AWS::SageMaker::ProcessingJob": "",
    "AWS::SageMaker::TrainingJob": "",
}

for step in best_candidate["CandidateSteps"]:
    step_type = step["CandidateStepType"]
    if step_type in step_names:
        step_names[step_type] = step["CandidateStepName"]

transform_job = step_names["AWS::SageMaker::TransformJob"]
tuning_job = step_names["AWS::SageMaker::TuningJob"]
processing_job = step_names["AWS::SageMaker::ProcessingJob"]
training_job = step_names["AWS::SageMaker::TrainingJob"]

print(f"transform_job:{transform_job}, tuning_job:{tuning_job}, training_job:{training_job}, processing_job:{processing_job}")

### You can describe the job to see the details

In [None]:

# Pair each job kind with the step name found above ("" when the candidate
# has no step of that kind).
job_list = [
    ('transform_job', transform_job),
    ('tuning_job', tuning_job),
    ('processing_job', processing_job),
    ('training_job', training_job)
]

# SageMaker client call that describes each job kind. Tuning jobs are
# described with describe_hyper_parameter_tuning_job; the client has no
# describe_tuning_job method, so the previous call raised AttributeError
# whenever the candidate had a tuning step.
describe_calls = {
    'transform_job': lambda name: autopilot.describe_transform_job(TransformJobName=name),
    'tuning_job': lambda name: autopilot.describe_hyper_parameter_tuning_job(HyperParameterTuningJobName=name),
    'training_job': lambda name: autopilot.describe_training_job(TrainingJobName=name),
    'processing_job': lambda name: autopilot.describe_processing_job(ProcessingJobName=name),
}

# Describe every job that actually exists, in job_list order.
response = []
for job_type, job_name in job_list:
    if job_name:  # skip job kinds the candidate did not use
        response.append(describe_calls[job_type](job_name))

# Access the first response
first_response = response[0] if response else None

# Print the responses
for i, res in enumerate(response, start=1):
    print(f"Response {i}: {res}")

In [None]:
##### OLD LOGIC
# job_list = [transform_job,tuning_job,processing_job,training_job]
# response = []
# for job in job_list:
#     if job != '':
#         if job == 'transform_job':
#             response.append(autopilot.describe_transform_job(TransformJobName=job))
#         elif job == 'tuning_job':
#             response.append(autopilot.describe_tuning_job(TuningJobName=job))
#         elif job == 'training_job':
#             response.append(autopilot.describe_training_job(TrainingJobName=job))
#         else:
#             response.append(autopilot.describe_processing_job(ProcessingJobName=job))
            
# response[0]

In [None]:
auto_ml_job_name, best_candidate_name

In [None]:
# best_candidate = job_desc["BestCandidate"]
# best_candidate["InferenceContainers"]

In [None]:
best_candidate["InferenceContainers"]

# Create Model

In [None]:
# Name the model after the dataset ('_' swapped for '-') plus the current UTC
# day of the month.
timestamp_suffix = strftime("%d", gmtime())
dataset = dataset.replace('_', '-')
model_name = f"sisense-autopilot-{dataset}{timestamp_suffix}-model"

# Register a SageMaker model backed by the best candidate's inference containers.
model = autopilot.create_model(
    ModelName=model_name,
    Containers=best_candidate["InferenceContainers"],
    ExecutionRoleArn=aws_arn_role,
)
model

In [None]:
def describe_autopilot_model(response):
    """Report where the artifacts of an Autopilot model are stored.

    Args:
        response: Dict shaped like a SageMaker ``describe_model`` response.
            Must contain ``'Containers'`` (a list of container dicts) and, for
            the non-ensemble case, ``'ModelName'``.

    Returns:
        ``(model_locations, final_model_name)``:

        * Ensemble (a container environment defines ``MODEL_NAME``):
          ``{'ensemble': <ModelDataUrl>}`` and that ``MODEL_NAME`` value.
        * HPO with 2 or 3 containers: one entry per container, keyed by a
          normalised role label, and ``response['ModelName']``.
        * Any other container count: ``({}, None)``.

    Raises:
        KeyError: If a container that is inspected has no ``'ModelDataUrl'``.
    """
    model_locations = {}  # role -> S3 location of that container's artifact
    final_model_name = None
    containers = response['Containers']

    # The ensemble case is recognised by a MODEL_NAME environment variable.
    for container in containers:
        # 'Environment' may be absent (or null) on a container; treat that as
        # "no variables" instead of raising KeyError.
        environment = container.get('Environment') or {}
        if 'MODEL_NAME' in environment:
            model_location = container['ModelDataUrl']
            final_model_name = environment['MODEL_NAME']
            print("Ensemble Model Detected.")
            print(f"Final Ensemble Model Name: {final_model_name}")
            print(f"Model Location: {model_location}\n")

            model_locations['ensemble'] = model_location
            return model_locations, final_model_name

    # HPO case: the number of containers decides regression vs classification.
    num_containers = len(containers)

    if num_containers == 2:
        # Pre-processing + inference
        container_labels = [
            "Feature Engineering Pre-processing",
            "Trained Model (Regression)"
        ]
    elif num_containers == 3:
        # Pre-processing + inference + post-processing
        container_labels = [
            "Feature Engineering Pre-processing",
            "Trained Model (Classification)",
            "Post-processing"
        ]
    else:
        print(f"Unexpected number of containers: {num_containers}")
        return model_locations, final_model_name

    final_model_name = response['ModelName']
    print(f"HPO Model Detected.\nFinal HPO Model Name: {final_model_name}")

    # Labels and containers have the same length here, so pair them directly.
    for label, container in zip(container_labels, containers):
        model_location = container['ModelDataUrl']
        print(f"{label} Container")
        print(f"Model Location: {model_location}\n")

        # Normalised label (spaces -> underscores, lower case) is the key.
        model_locations[label.replace(" ", "_").lower()] = model_location

    return model_locations, final_model_name

# Describe the model created above and work out where its artifacts live.
# Note: `response` is rebound here from the list of job descriptions built
# earlier to the describe_model result.
response = autopilot.describe_model(ModelName=model_name)
model_locations, final_model_name = describe_autopilot_model(response)
print(f"Final Model Name: {final_model_name}")


In [None]:
final_model_name,model_locations

### Download Trained Models from S3

In [None]:

# # Extract the bucket name and key from the S3 path
# bucket_name, s3_key = model_location.replace('s3://', '').split('/', 1)
# print(f"Bucket: {bucket_name}, Key: {s3_key}")

# # Specify the local file path to save the downloaded tar file
# local_tar_file_path = os.path.join(folder_path, os.path.basename(s3_key))
# print(f"Local tar file path: {local_tar_file_path}")

# # Download the model artifacts from Amazon S3
# s3 = boto3.resource('s3')
# s3.Bucket(bucket_name).download_file(s3_key, local_tar_file_path)
# print("Downloaded the model tar file from S3.")

# # Untar the downloaded tar file
# with tarfile.open(local_tar_file_path, 'r:gz') as tar:
#     # List the contents of the tar file
#     tar_members = tar.getmembers()

#     # Debug: Print all member names
#     print("Contents of the tar file:")
#     for member in tar_members:
#         print(f" - {member.name}")

#     # Check if a top-level directory named 'models' exists
#     models_folder_exists = any(member.name.startswith('./models/') and member.isdir() for member in tar_members)

#     if models_folder_exists:
#         print("Found 'models/' directory in the tar file. Extracting as is.")
#         # Extract directly to folder_path since 'models' folder is already present
#         tar.extractall(path=folder_path)
#     else:
#         print("No 'models/' directory found. Creating 'model' folder and extracting contents.")
#         # Create a 'model' directory only if 'models' folder is not present in the tar
#         model_path = os.path.join(folder_path, 'model')
#         os.makedirs(model_path, exist_ok=True)
#         for member in tar_members:
#             # Strip leading directories to ensure extraction directly into model_path
#             member.name = os.path.basename(member.name)
#             tar.extract(member, path=model_path)

# # Remove the tar file after extraction (optional)
# os.remove(local_tar_file_path)
# print("Removed the tar file after extraction.")

# print("Model files extracted successfully.")


In [None]:
import os
import boto3
import tarfile

def download_and_extract_model(model_location, folder_path, role, final_model_name=None, is_trained_model=False):
    """Download one model tarball from S3 and unpack it under ``folder_path``.

    Args:
        model_location: ``s3://bucket/key`` URI of a gzip-compressed tar file.
        folder_path: Local base directory; created if it does not exist.
        role: Label for this artifact. Normalised (spaces and hyphens to
            underscores, lower case) it names the temporary tar file and the
            default extraction sub-directory.
        final_model_name: Optional model name. When given together with
            ``is_trained_model=True`` it (hyphens replaced by underscores)
            names the extraction sub-directory instead of ``role``.
        is_trained_model: Whether this artifact is the trained model.

    Behaviour:
        * If the archive contains a directory member whose name starts with
          ``models/``, the whole archive is extracted into ``folder_path``.
        * Otherwise every member is flattened to its base name and extracted
          into ``<folder_path>/model/<sub-directory>``.
        * The downloaded tar file is deleted after extraction.

    Raises:
        ValueError: If ``model_location`` has no ``/`` after the bucket name.
    """
    # final_model_name is optional (default None); only normalise it when a
    # name was actually supplied, otherwise the default would crash here.
    if final_model_name:
        final_model_name = final_model_name.replace('-', '_')
    role_slug = role.replace(" ", "_").replace("-", "_").lower()

    # Ensure the base folder and its 'model' parent directory exist.
    os.makedirs(folder_path, exist_ok=True)
    model_base_path = os.path.join(folder_path, 'model')
    os.makedirs(model_base_path, exist_ok=True)

    # The trained model is stored under its own name; everything else under
    # its role.
    if is_trained_model and final_model_name:
        model_role_path = os.path.join(model_base_path, final_model_name)
    else:
        model_role_path = os.path.join(model_base_path, role_slug)

    # Extract the bucket name and key from the S3 path
    bucket_name, s3_key = model_location.replace('s3://', '').split('/', 1)
    print(f"Bucket: {bucket_name}, Key: {s3_key}")

    # Local file the tarball is downloaded to before extraction.
    local_tar_file_path = os.path.join(folder_path, f"{role_slug}_model.tar.gz")
    print(f"Local tar file path: {local_tar_file_path}")

    # Download the model artifacts from Amazon S3 (uses the default boto3 session).
    s3 = boto3.resource('s3')
    s3.Bucket(bucket_name).download_file(s3_key, local_tar_file_path)
    print(f"Downloaded the {role} model tar file from S3.")

    # NOTE(review): members are extracted without path filtering; this is only
    # safe for archives from a trusted bucket.
    with tarfile.open(local_tar_file_path, 'r:gz') as tar:
        tar_members = tar.getmembers()

        # Debug: Print all member names
        print("Contents of the tar file:")
        for member in tar_members:
            print(f" - {member.name}")

        # Does the archive carry a directory under 'models/'?
        models_folder_exists = any(member.name.startswith('models/') and member.isdir() for member in tar_members)

        if models_folder_exists:
            print("Found 'models/' directory in the tar file. Extracting directly to folder_path.")
            tar.extractall(path=folder_path)
        else:
            print(f"No 'models/' directory found. Creating '{model_role_path}' folder and extracting contents.")
            os.makedirs(model_role_path, exist_ok=True)

            for member in tar_members:
                # Strip leading directories so files land directly in model_role_path.
                member.name = os.path.basename(member.name)
                tar.extract(member, path=model_role_path)

    # The tarball is no longer needed once its contents are on disk.
    os.remove(local_tar_file_path)
    print(f"Removed the tar file after extraction for {role}.")

    print(f"{role.capitalize().replace('_', ' ')} model files extracted successfully.")

# Download and extract every artifact listed in model_locations, one per role.
# NOTE(review): the trained model is assumed to be the SECOND entry. When
# model_locations holds a single entry (as describe_autopilot_model returns for
# the ensemble case) no entry has index 1, so nothing is stored under
# final_model_name — confirm this is intended.
for i, (role, model_location) in enumerate(model_locations.items()):
    # Only the item at index 1 is saved under final_model_name.
    is_trained_model = (i == 1)  # Assuming the second item is the trained model
    download_and_extract_model(model_location, folder_path, role, final_model_name=final_model_name, is_trained_model=is_trained_model)


### Get model local path

In [None]:
# ## OLD LOGIC
# directory_name = final_model_name.replace('-', '_')


# # Check if folder_path exists
# if os.path.exists(folder_path):
#     # Variable to store the full path if directory is found
#     found_path = None
    
#     # Walk through all subdirectories under folder_path
#     for root, dirs, files in os.walk(folder_path):
#         # Check if the target directory exists in the current root
#         if directory_name in dirs:
#             # Construct the full path
#             found_path = os.path.join(root, directory_name)
#             break  # Exit loop once directory is found

#     if found_path:
#         print("Directory found at:", found_path)
#     else:
#         print(f"Directory '{directory_name}' not found under '{folder_path}'.")
# else:
#     print(f"Folder path '{folder_path}' does not exist.")

In [None]:
model_folder_path = folder_path+'/model'
# Directory name to look for: the model name with hyphens replaced by underscores.
directory_name = final_model_name.replace('-', '_')

# Defined up front so the name exists on every path, including when
# model_folder_path is missing.
found_path = None

if os.path.exists(model_folder_path):
    # 1) Look for a directory named after the model anywhere under model_folder_path.
    for root, dirs, files in os.walk(model_folder_path):
        if directory_name in dirs:
            found_path = os.path.join(root, directory_name)
            break  # stop at the first match

    # 2) No such directory: fall back to the HPO layouts.
    if not found_path:
        # A 'models' directory holding exactly one file.
        models_folder = os.path.join(model_folder_path, 'models')
        if os.path.exists(models_folder):
            model_files = [f for f in os.listdir(models_folder) if os.path.isfile(os.path.join(models_folder, f))]
            if len(model_files) == 1:
                found_path = models_folder
            else:
                # Covers both "no files" and "more than one file".
                print(f"Expected exactly one model file in '{models_folder}' but found {len(model_files)}. Unable to determine final model path.")

        # A 'model' directory holding at least one file.
        if not found_path:
            model_folder = os.path.join(model_folder_path, 'model')
            if os.path.exists(model_folder):
                model_files = [f for f in os.listdir(model_folder) if os.path.isfile(os.path.join(model_folder, f))]
                if model_files:
                    found_path = model_folder
                    print(f"Model files found in '{model_folder}': {model_files}")

    if found_path:
        # Report the directory that was actually located, not its parent.
        print("Model directory found at:", found_path)
    else:
        print(f"Directory or model '{directory_name}' not found under '{model_folder_path}'.")
else:
    print(f"Folder path '{model_folder_path}' does not exist.")


In [None]:
# score = {}

# # Iterate over the candidate metrics
# for metric in best_candidate["CandidateProperties"]["CandidateMetrics"]:
#     # Check if the metric is RMSE or F1 and store the corresponding value
#     if metric['MetricName'] == 'RMSE':
#         score['RMSE'] = metric['Value']
#     elif metric['MetricName'] == 'F1':
#         score['F1'] = metric['Value']

# Headline metric of the best candidate, keyed by metric name.
score = {}

# Problem type Autopilot resolved for this job.
problem_type = job_desc['ResolvedAttributes']['AutoMLProblemTypeResolvedAttributes']['TabularResolvedAttributes']['ProblemType']
print(problem_type)

# Metric to keep for each supported problem type; other types keep nothing.
metric_for_problem = {
    'Regression': 'RMSE',
    'BinaryClassification': 'F1',
    'MulticlassClassification': 'F1macro',
}
wanted_metric = metric_for_problem.get(problem_type)

# When the metric is reported more than once, the last entry wins.
for metric in best_candidate["CandidateProperties"]["CandidateMetrics"]:
    if wanted_metric is not None and metric['MetricName'] == wanted_metric:
        score[wanted_metric] = metric['Value']

# Print or return the score dictionary
print(score)



### Saving All Model Metric in Local Folder

In [None]:
import pandas as pd
# Write every metric of the best candidate to a CSV in the run folder.
file_path = f'{folder_path}/model_metrics.csv'
candidate_metrics = best_candidate["CandidateProperties"]["CandidateMetrics"]
model_metrics = pd.DataFrame(candidate_metrics)
model_metrics.to_csv(file_path, index=False)

### Create Dashboard for Online Prediction

In [None]:
# Dashboard title: <cube>_<table name before the first '.'>_sagemaker
dashboard_name = f"{cube_name}_{table.split('.')[0]}_sagemaker"
dashboard_name

In [None]:
## Check if the dashboard exists
# Base URL of the API gateway, taken from the service environment variables.
url = 'http://' + os.environ['API_GATEWAY_EXTERNAL_SERVICE_HOST'] + ':' + os.environ['API_GATEWAY_EXTERNAL_SERVICE_PORT']
endpoint = f'/api/v1/dashboards?datasourceTitle={cube_name}'

# List the dashboards built on this cube.
response = sisense_conn.call_api_custom('GET',url,endpoint, params=None, payload=None)
res = response.json()
dashboard_exists = False
dash_id = ''

# Reuse a dashboard with the expected title if there is one.
for dash in res:
    if dash['title'] == dashboard_name:
        dashboard_exists = True
        dash_id = dash['oid']
        print(f"Dashboard '{dashboard_name}' - '{dash_id}' Already Exists")
        break

# If the dashboard does not exist, create it
if not dashboard_exists:
    print(f'Creating Dashboard as {dashboard_name}')
    payload = {
        "title": dashboard_name,
        "datasource": {
            "fullname": f"localhost/{cube_name}",
            "id": f"localhost_{cube_name}",
            "address": "LocalHost",
            "database": cube_name,
            "live": False,
            "title": cube_name
        },
        "type": "dashboard",
        "desc": "",
        "filters": [],
        "style": {},
        "editing": True
    }

    dash_endpoint = '/api/dashboards/'
    response = sisense_conn.call_api_custom('POST',url,dash_endpoint, params=None, payload=payload)

    # Check the status BEFORE reading the body: an error response is not
    # guaranteed to have the res[0]['oid'] shape, and indexing it first made
    # the failure branch below unreachable in practice.
    if response.status_code == 200:
        res = response.json()
        dash_id = res[0]['oid']
        print(f"Dashboard '{dashboard_name}' - '{dash_id}' created successfully")
    else:
        print(f'Failed to create dashboard {dashboard_name}. Status code: {response.status_code}')

In [None]:
## Create Widget Payload 
def generate_payload(features, num_columns=3):
    """Build the BloX column layout holding one text input per feature.

    The features are spread over ``num_columns`` columns as evenly as possible,
    keeping their order; when the count does not divide evenly, the leftmost
    columns each receive one extra input.

    Args:
        features: Iterable of feature names. Each name becomes the placeholder
            of an ``Input.Text`` item whose id is ``data.<name>``.
        num_columns: Number of columns to lay the inputs out in (default 3).

    Returns:
        dict: ``{"columns": [...]}`` with exactly ``num_columns`` column
        definitions. Columns that receive no feature have an empty ``items`` list.

    Raises:
        ValueError: If ``num_columns`` is smaller than 1.
    """
    if num_columns < 1:
        raise ValueError("num_columns must be at least 1")

    # Materialise once so any iterable (list, tuple, pandas Index, ...) can be sliced
    features = list(features)
    items_per_column, extra_items = divmod(len(features), num_columns)

    columns = []
    start = 0
    for col in range(num_columns):
        # The first `extra_items` columns absorb the remainder, one item each
        size = items_per_column + (1 if col < extra_items else 0)
        items = [
            {
                "spacing": "large",
                "type": "Input.Text",
                "id": f"data.{feature}",
                "placeholder": feature,
                "defaultValue": "",
                "isMultiline": True,
                "rows": "2",
                "borderRadius": "14px",
                "borderStyle": "none",
                "backgroundColor": "lightgrey"
            }
            for feature in features[start:start + size]
        ]
        columns.append({
            "type": "Column",
            "separator": col == 0,  # Only the first column has separator set to True
            "spacing": "large",
            "items": items
        })
        start += size

    return {"columns": columns}



In [None]:
## Check if the widget exists; create it when it does not
# Title that identifies the online-prediction widget of this table
widget_title = table.split('.')[0] + '_sagemaker_online_prediction'
widget_endpoint = f'/api/v1/dashboards/{dash_id}/widgets'
response = sisense_conn.call_api_custom('GET', url, widget_endpoint, params=None, payload=None)
# Fail loudly on a bad listing response (e.g. unknown or empty dashboard id): an error
# body is not a list of widgets, and iterating over it would raise a confusing TypeError.
if response.status_code != 200:
    raise RuntimeError(
        f"Failed to list widgets of dashboard '{dash_id}'. Status code: {response.status_code}"
    )
res = response.json()
widget_id = ''
widget_exists = False

# Check if the widget already exists
for widget in res:
    # .get(): a widget without a title simply does not match
    if widget.get('title') == widget_title:
        widget_exists = True
        widget_id = widget["oid"]
        print(f"Widget {widget_id} 'Online-Prediction' in '{dashboard_name}' Already Exists")
        break

# If the widget does not exist, create it
if not widget_exists:
    print(f'Creating Widget in {dashboard_name}')
    # One text input per feature, laid out in columns; placed directly into the
    # ColumnSet of the card body below instead of being patched in by index afterwards.
    feature_columns = generate_payload(widget_features)['columns']
    payload = {
        "title": widget_title,
        "type": "BloX",
        "subtype": "BloX",
        "desc": None,
        "source": None,
        "datasource": {
            "title": cube_name,
            "fullname": f'LocalHost/{cube_name}',
            "id": f'LOCALHOST_{cube_name}',
            "address": "LocalHost",
            "database": cube_name
        },
        "selection": None,
        "metadata": {
            "ignore": {
                "dimensions": [],
                "ids": [],
                "all": False
            },
            "panels": [
                {
                    "name": "Items",
                    "items": []
                },
                {
                    "name": "Values",
                    "items": []
                },
                {
                    "name": "filters",
                    "items": []
                }
            ],
            "usedFormulasMapping": {}
        },
        "style": {
            "currentCard": {
                "style": "",
                "script": "",
                "title": "",
                "showCarousel": True,
                "backgroundImage": "",
                "body": [
                    {
                        "type": "Container",
                        "items": [
                            {
                                "type": "TextBlock",
                                "text": "Sisense AutoML",
                                "size": "extraLarge",
                                "color": "yellow",
                                "weight": "bold",
                                "horizontalAlignment": "center"
                            },
                            {
                                "type": "ColumnSet",
                                "spacing": "extraLarge",
                                "columns": feature_columns
                            }
                        ]
                    },
                    {
                        "type": "Container",
                        "separator": False,
                        "id": "outputContainer",
                        "size": "Large",
                        "items": [
                            {
                                "type": "ActionSet",
                                "actions": [
                                    {
                                        "type": "sagemaker_prediction",
                                        "title": "Predict",
                                        "data": {
                                            "question": "",
                                            "table": ""
                                        }
                                    }
                                ]
                            },
                            {
                                "spacing": "extraLarge",
                                "type": "TextBlock",
                                "text": "Output ",
                                "color": "green"
                            }
                        ]
                    }
                ]
            },
            "currentConfig": {
                "fontFamily": "Open Sans",
                "fontSizes": {
                    "default": 16,
                    "small": 12,
                    "medium": 22,
                    "large": 32,
                    "extraLarge": 50
                },
                "fontWeights": {
                    "default": 500,
                    "light": 100,
                    "bold": 900
                },
                "containerStyles": {
                    "default": {
                        "backgroundColor": "black",
                        "foregroundColors": {
                            "default": {
                                "normal": "#3A4356"
                            },
                            "white": {
                                "normal": "#ffffff"
                            },
                            "grey": {
                                "normal": "#dcdcdc"
                            },
                            "orange": {
                                "normal": "#f2B900"
                            },
                            "yellow": {
                                "normal": "#ffcb05"
                            },
                            "black": {
                                "normal": "#000000"
                            },
                            "lightGreen": {
                                "normal": "#93c0c0"
                            },
                            "green": {
                                "normal": "#54a254"
                            },
                            "red": {
                                "normal": "#dd1111"
                            },
                            "accent": {
                                "normal": "#2E89FC"
                            },
                            "good": {
                                "normal": "#54a254"
                            },
                            "warning": {
                                "normal": "#e69500"
                            },
                            "attention": {
                                "normal": "#cc3300"
                            }
                        }
                    }
                },
                "imageSizes": {
                    "default": 40,
                    "small": 40,
                    "medium": 80,
                    "large": 160
                },
                "imageSet": {
                    "imageSize": "medium",
                    "maxImageHeight": 100
                },
                "actions": {
                    "color": "",
                    "backgroundColor": "red",
                    "maxActions": 5,
                    "spacing": "extraLarge",
                    "buttonSpacing": 20,
                    "actionsOrientation": "horizontal",
                    "actionAlignment": "center",
                    "showCard": {
                        "actionMode": "inline",
                        "inlineTopMargin": 16,
                        "style": "default"
                    }
                },
                "spacing": {
                    "default": 5,
                    "small": 5,
                    "medium": 10,
                    "large": 20,
                    "extraLarge": 40,
                    "padding": 20
                },
                "separator": {
                    "lineThickness": 1,
                    "lineColor": "#eeeeee"
                },
                "factSet": {
                    "title": {
                        "size": "default",
                        "color": "default",
                        "weight": "bold",
                        "warp": True
                    },
                    "value": {
                        "size": "default",
                        "color": "default",
                        "weight": "default",
                        "warp": True
                    },
                    "spacing": 20
                },
                "supportsInteractivity": True,
                "imageBaseUrl": "",
                "height": 743
            },
            "currentCardName": "Multiple Indicator",
            "narration": {
                "enabled": False,
                "display": "above",
                "format": "bullets",
                "verbosity": "medium",
                "up_sentiment": "good",
                "aggregation": "sum",
                "labels": []
            }
        }
    }
    response = sisense_conn.call_api_custom('POST', url, widget_endpoint, params=None, payload=payload)

    # Check the status BEFORE touching the body: a failed request does not carry an
    # 'oid', so parsing first would crash before the failure could be reported.
    if 200 <= response.status_code < 300:
        widget_id = response.json()["oid"]
        print(f"Widget '{widget_id}' - '_sagemaker_online_prediction' in '{dashboard_name}' created successfully")
    else:
        # widget_id stays '' so later cells can tell that no widget was created
        print(f'Failed to create Widget in "{dashboard_name}". Status code: {response.status_code}')




### Create result dataframe for model name and accuracy score

In [None]:
# Only the first metric in `score` is used: its name becomes `key`, its value `value`.
key = next(iter(score))
value = score[key]

# One-row summary of the selected model; the score is truncated (not rounded)
# to two decimal places.
df = pd.DataFrame(
    {
        'model_name': [model_name],
        'metric_name': [key],
        'score': [int(value * 100) / 100.0],
        'path': [found_path],
    }
)
df

### Save model information in CSV

In [None]:
# folder_path = f"/opt/sisense/storage/notebooks/custom_code_notebooks/notebooks/automl/models/"
# os.makedirs(folder_path, exist_ok=True)

# Copy of the model summary, extended with the provider, the creation time and the
# ids of the dashboard and widget handled earlier in the notebook.
new_df = df.assign(
    provider='Sagemaker',
    created_date=datetime.now(),
    dash_id=dash_id,
    widget_id=widget_id,
)
new_df

In [None]:
# Append this run's model record to the shared models.csv registry.
models_file_path = "/opt/sisense/storage/notebooks/custom_code_notebooks/notebooks/automl/models/models.csv"

# Make sure the target directory exists; without it the write below fails on a
# fresh installation.
os.makedirs(os.path.dirname(models_file_path), exist_ok=True)

# Load the existing registry. A missing or zero-byte file is treated as an empty
# registry (pd.read_csv raises on a file with no header line).
if os.path.exists(models_file_path) and os.path.getsize(models_file_path) > 0:
    old_df = pd.read_csv(models_file_path)
else:
    print(f"No existing model registry found; creating {models_file_path}")
    old_df = pd.DataFrame(columns=new_df.columns)

# Skip the concat when there is nothing to append to, so the new record keeps its dtypes.
if old_df.empty:
    combined_df = new_df
else:
    combined_df = pd.concat([old_df, new_df], axis=0)

combined_df.to_csv(models_file_path, index=False)

# Create AWS Sagemaker Endpoint

In [None]:
# UTC timestamp (YYYYMMDDHHMMSS) appended to the model name so that every run
# gets its own endpoint-config name.
timestamp_suffix = strftime("%Y%m%d%H%M%S", gmtime())
endpoint_config_name = f"{model_name}-{timestamp_suffix}-epc"
endpoint_config_name

In [None]:
# SageMaker limits endpoint-config names to 63 characters.
MAX_SAGEMAKER_NAME_LENGTH = 63

if len(endpoint_config_name) > MAX_SAGEMAKER_NAME_LENGTH:
    # Number of characters that have to be dropped from model_name to fit the limit
    excess_length = len(endpoint_config_name) - MAX_SAGEMAKER_NAME_LENGTH
    # Drop them from the FRONT of model_name (the tail is kept). The cut can land on
    # a hyphen; SageMaker names must start with an alphanumeric character, so strip
    # any leading hyphen the cut exposes.
    model_name_trimmed = model_name[excess_length:].lstrip("-")
    # Recreate the endpoint_config_name with the trimmed model_name
    endpoint_config_name = model_name_trimmed + "-" + timestamp_suffix + "-epc"

### Create Endpoint Configuration

#### Check whether SageMaker used Ensemble or HPO mode (chosen by AutoPilot based on dataset size)

In [None]:
# A best candidate with more than one inference container is treated as HPO mode;
# a single container is treated as Ensemble mode.
container_count = len(best_candidate["InferenceContainers"])
mode_selected = 'HPO' if container_count > 1 else 'Ensemble'
mode_selected

In [None]:
# response = s3.head_object(Bucket=S3_bucket_name, Key=s3_key)
# file_size = response['ContentLength']
# file_size_mb = file_size / (1024 * 1024)
# if file_size_mb > 100:
#     mode_selected = 'HPO'
# else:
#     mode_selected = 'Ensemble'
# mode_selected

In [None]:
# Single production variant that serves all traffic for the selected model.
production_variant = {
    'VariantName': 'AllTraffic',
    'ModelName': model_name,
}

if mode_selected == 'HPO':
    # HPO mode: provisioned endpoint backed by one ml.m5.large instance
    production_variant['InstanceType'] = 'ml.m5.large'
    production_variant['InitialInstanceCount'] = 1
else:
    # Any other mode: serverless endpoint
    serverless_config = {
        'MemorySizeInMB': 1024,
        'MaxConcurrency': 5
    }
    production_variant['ServerlessConfig'] = serverless_config

endpoint_config = autopilot.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[production_variant],
)
endpoint_config

### Create Endpoint

In [None]:
# Derive the endpoint name from the endpoint-config name by swapping ONLY the trailing
# '-epc' suffix for '-ep'. A plain str.replace would also rewrite any '-epc' that
# happens to occur inside the model-name part of the string.
config_suffix = '-epc'
if endpoint_config_name.endswith(config_suffix):
    endpoint_name = endpoint_config_name[:-len(config_suffix)] + '-ep'
else:
    # Unexpected name shape: fall back to the previous replace-everywhere behavior
    endpoint_name = endpoint_config_name.replace(config_suffix, '-ep')
create_endpoint_response = autopilot.create_endpoint(EndpointName=endpoint_name, EndpointConfigName=endpoint_config_name)

In [None]:
## Block until the "endpoint_in_service" waiter reports the endpoint as ready
endpoint_waiter = autopilot.get_waiter("endpoint_in_service")
endpoint_waiter.wait(EndpointName=endpoint_name)

In [None]:
# Fetch the endpoint description and keep only its current status
endpoint_description = autopilot.describe_endpoint(EndpointName=endpoint_name)
status = endpoint_description["EndpointStatus"]
status

In [None]:
# Display the name of the endpoint created above
endpoint_name

### Test Endpoint

In [None]:
# # once endpoint status is InService, you can invoke the endpoint for inferencing
# # Define your input data as a CSV string
# input_data = "1000,101113,2019011,9039658401,1,1,ZA,2019-11-26T00:00:00,2019-11-26T00:00:00,2020-01-25T00:00:00,2020-01-25T00:00:00,AA,USD,888005002.69,Open,NET060,GL001"
# # input_data = "Lucchese,586,France,Female,23.0,2,0.0,2,0.0,1.0,160976.75"
# if status == "InService":
#     # region_name = 'us-east-1'
#     sm_runtime = boto3.client('sagemaker-runtime', region_name=region_name)
#     inference_result = sm_runtime.invoke_endpoint(EndpointName=endpoint_name, ContentType='text/csv', Body=input_data)
# inference_result

In [None]:
# inference_result = inference_result['Body'].read().decode('utf-8').strip()
# inference_result

In [None]:
# # Check if inference_result is not empty
# if inference_result:
#     try:
#         # Convert the prediction to a float
#         prediction_value = float(inference_result)

#         # Check if the float is very close to an integer
#         if prediction_value.is_integer():
#             # Convert to int if it is close enough to an integer
#             prediction = int(prediction_value)
#         else:
#             # Otherwise, keep as float
#             prediction = prediction_value

#         print("Prediction:", prediction)
#     except ValueError as e:
#         print(f"Error converting prediction to float: {e}")
# else:
#     print("Error: Inference result is empty.")

In [None]:
###### OLD METHOD FOR INFERENCE ######

# from io import StringIO

# if sagemaker.__version__ < "2":
#     from sagemaker.predictor import RealTimePredictor
#     from sagemaker.content_types import CONTENT_TYPE_CSV

#     predictor = RealTimePredictor(
#         endpoint=endpoint_name,
#         sagemaker_session=session,
#         content_type=CONTENT_TYPE_CSV,
#         accept=CONTENT_TYPE_CSV,
#     )

#     # Obtain predictions from SageMaker endpoint
#     prediction = predictor.predict(
#         open_ar.to_csv(sep=",", header=False, index=False)
#     ).decode("utf-8")

#     # Load prediction in pandas and compare to ground truth
#     prediction_df = pd.read_csv(StringIO(prediction), header=None)

# else:
#     from sagemaker.predictor import Predictor
#     from sagemaker.serializers import CSVSerializer
#     from sagemaker.deserializers import CSVDeserializer
    
#     predictor = Predictor(
#         endpoint_name=endpoint_name,
#         sagemaker_session=session,
#         serializer=CSVSerializer(),
#         deserializer=CSVDeserializer(),
#     )
    
#     # Obtain predictions from SageMaker endpoint
#     single_row_values = "-122.23,37.88,45,500,120,400,135,8.20,NEAR BAY"
#     prediction = predictor.predict(single_row_values)
#     # Load prediction in pandas
#     prediction_df = pd.DataFrame(prediction)
# prediction_df

### Save Model and Endpoint Name information

In [None]:
# Fall back to an empty endpoint name when no endpoint was created in this session
if "endpoint_name" not in globals():
    endpoint_name = ""

# One-row record tying the model, its metric and its storage locations to the endpoint.
# The score is truncated (not rounded) to two decimal places.
model_record = {
    'model_name': [model_name],
    'metric_name': [key],
    'score': [int(value * 100) / 100.0],
    'local_path': [found_path],
    'model_s3_location': [model_location],
    'aws_model_name': [final_model_name],
    'endpoint_name': [endpoint_name],
}
fdf = pd.DataFrame(model_record)
fdf

### Save endpoint information locally

In [None]:
# Destination file for the endpoint name, inside the model folder
endpoint_file_path = os.path.join(folder_path, "endpoint.txt")

# Scalar endpoint name taken from the first row of fdf
endpoint_value = fdf["endpoint_name"].iat[0]

# Overwrite the file with the endpoint name
with open(endpoint_file_path, "w") as file:
    file.write(endpoint_value)

In [None]:
# Build df_result as a DataFrame constructed from fdf and display it.
# NOTE(review): presumably df_result is the frame this custom-code notebook hands
# back to Sisense as its output table — confirm against the notebook's output settings.
df_result = pd.DataFrame(fdf)
df_result

# Delete Model, Endpoint Config, Endpoint

In [None]:
# autopilot.delete_endpoint(EndpointName=endpoint_name) 
# autopilot.delete_endpoint_config(EndpointConfigName=endpoint_config_name)
# autopilot.delete_model(ModelName=model_name)

In [None]:
# response = autopilot.list_models()

# # Print out all models and delete them
# for model in response["Models"]:
#     print(f"Model Name: {model['ModelName']}, Creation Time: {model['CreationTime']}")
#     autopilot.delete_model(ModelName=model['ModelName'])
#     print(f"{model['ModelName']} deleted")