In [4]:
!pip install -qU pip transformers sagemaker boto3

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tf-models-official 2.11.0 requires tensorflow~=2.11.0, which is not installed.
tf-models-official 2.11.0 requires pyyaml<6.0,>=5.1, but you have pyyaml 6.0.1 which is incompatible.
awscli 1.27.50 requires botocore==1.29.50, but you have botocore 1.34.44 which is incompatible.
awscli 1.27.50 requires PyYAML<5.5,>=3.10, but you have pyyaml 6.0.1 which is incompatible.
awscli 1.27.50 requires s3transfer<0.7.0,>=0.6.0, but you have s3transfer 0.10.0 which is incompatible.[0m[31m
[0m

In [35]:
import boto3
import boto3.session
import pandas as pd
from sklearn.model_selection import train_test_split
import io
import numpy as np
import os
import ast
import random
import json

import sagemaker
from sagemaker.session import Session
from sagemaker.feature_store.feature_group import FeatureGroup
from sagemaker.workflow.parameters import (
    ParameterInteger,
    ParameterString,
    ParameterFloat,
)
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput, ScriptProcessor
from sagemaker.inputs import TrainingInput, CreateModelInput

from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.model_metrics import MetricsSource, ModelMetrics
from sagemaker.huggingface import HuggingFace, HuggingFaceModel

from sagemaker.workflow.steps import ProcessingStep, TrainingStep
from sagemaker.workflow.lambda_step import LambdaStep
from sagemaker.lambda_helper import Lambda
from sagemaker.workflow.model_step import ModelStep
from sagemaker.workflow.properties import PropertyFile
from sagemaker.workflow.fail_step import FailStep
from sagemaker.workflow.functions import Join, JsonGet
from sagemaker.workflow.conditions import ConditionLessThanOrEqualTo
from sagemaker.workflow.condition_step import ConditionStep

from transformers import TFBertForSequenceClassification, BertTokenizer, BertConfig, TrainingArguments, Trainer, TextClassificationPipeline
import tensorflow as tf
from tensorflow.data import Dataset

In [40]:
sagemaker_session = sagemaker.session.Session()
region_name = 'us-west-2'
role = sagemaker.get_execution_role()
pipeline_session = PipelineSession()
model_package_group_name = 'EmotionAIModelPackage'

In [41]:
data_path = 'data'
if not os.path.exists(data_path):
    os.makedirs(data_path)
else:
    print(f'The directory {data_path} already exists')

The directory data already exists


In [42]:
local_paths = ['goemotions_1.csv', 'goemotions_2.csv', 'goemotions_3.csv']

my_session = boto3.session.Session(region_name=region_name)
s3 = my_session.client('s3')

bucket_name = 'aai-540-final-data'
base_uri = 'data/'

for local_path in local_paths:
    s3_key = f'{base_uri}{local_path}'
    s3.upload_file(base_uri + local_path, bucket_name, s3_key)
    
input_uri = f's3://{bucket_name}/{base_uri}'

In [43]:
input_data = ParameterString(
    name='RawDataUri1',
    default_value=input_uri,
)

In [44]:
processing_instance_count = ParameterInteger(name='ProcessingInstanceCount', default_value=1)
instance_type = ParameterString(name='TrainingInstanceType', default_value='ml.p2.xlarge')  
model_approval_status = ParameterString(
    name='ModelApprovalStatus', default_value='PendingManualApproval'
)
f1_threshold = ParameterFloat(name='F1Threshold', default_value=0.4)

In [45]:
code_path = 'code'
if not os.path.exists(code_path):
    os.makedirs(code_path)
else:
    print(f'The directory {code_path} already exists')

The directory code already exists


In [131]:
%%writefile code/preprocessing.py
import pandas as pd
from sklearn.model_selection import train_test_split
import random
import os

region_name = 'us-west-2'
bucket_name = 'aai-540-final-data'

emotion_labels = [
    'admiration',
    'amusement',
    'anger',
    'annoyance',
    'approval',
    'caring',
    'confusion',
    'curiosity',
    'desire',
    'disappointment',
    'disapproval',
    'disgust',
    'embarrassment',
    'excitement',
    'fear',
    'gratitude',
    'grief',
    'joy',
    'love',
    'nervousness',
    'optimism',
    'pride',
    'realization',
    'relief',
    'remorse',
    'sadness',
    'surprise',
    'neutral'
]

emotion_categories = {
	'anger': ['anger', 'annoyance', 'disapproval'],
	'disgust': ['disgust'],
	'fear': ['fear', 'nervousness'],
	'happy': ['joy', 'amusement', 'approval', 'gratitude'],
	'optimistic': ['optimism', 'relief', 'pride', 'excitement'],
	'affectionate': [ 'love', 'caring', 'admiration',  'desire'],
	'sadness': ['sadness', 'disappointment', 'embarrassment', 'grief',  'remorse'],
	'surprise': ['surprise', 'realization', 'confusion', 'curiosity'],
	'neutral': ['neutral']
}

emotion_to_category = {}
for category, emotions in emotion_categories.items():
	for emotion in emotions:
		emotion_to_category[emotion] = category
        
category_names = list(emotion_categories.keys())
category_to_index = {category: index for index, category in enumerate(category_names)}

if __name__ == '__main__':

    base_dir = '/opt/ml/processing'

    df_1 = pd.read_csv(f'{base_dir}/input/goemotions_1.csv')
    df_2 = pd.read_csv(f'{base_dir}/input/goemotions_2.csv')
    df_3 = pd.read_csv(f'{base_dir}/input/goemotions_3.csv')

    dfs = [df_1, df_2, df_3]

    df_full = pd.concat(dfs, ignore_index=True)

    df_grouped = df_full.groupby('id', as_index=False)[emotion_labels].sum(numeric_only=True)
    df_filtered = df_grouped[(df_grouped[emotion_labels] > 1).any(axis=1)].copy()

    def random_emotion(row):
        emotions_with_agreement = [emotion for emotion in emotion_labels if row[emotion] > 1]
        random.shuffle(emotions_with_agreement)
        return emotions_with_agreement[0] if emotions_with_agreement else None


    df_filtered['selected_emotions'] = df_filtered.apply(random_emotion, axis=1)

    final_df = pd.merge(df_filtered, df_full[['id', 'text']], on='id').drop_duplicates()

    final_df['emotions'] = final_df['selected_emotions'].apply(lambda x: category_to_index.get(emotion_to_category.get(x, 'unknown'), None))
    final_df = final_df.drop(columns=emotion_labels + ['selected_emotions', 'id'])

    # Split the dataset into training and test sets initially
    df_train, df_test = train_test_split(final_df, test_size=0.1, random_state=42)

    # Split the training set further into training and validation sets
    df_train, df_val = train_test_split(df_train, test_size=0.125, random_state=42)

    df_train = df_train.reset_index(drop=True)
    df_val = df_val.reset_index(drop=True)
    df_test = df_test.reset_index(drop=True)

    output_dirs = ['output/train', 'output/validation', 'output/test']
    for output_dir in output_dirs:
        full_dir = f'{base_dir}/{output_dir}'
        if not os.path.exists(full_dir):
            os.makedirs(full_dir)

    df_train.to_csv(f'{base_dir}/output/train/train.csv', index=False)
    df_val.to_csv(f'{base_dir}/output/validation/validation.csv', index=False)
    df_test.to_csv(f'{base_dir}/output/test/test.csv', index=False)

Overwriting code/preprocessing.py


In [132]:
framework_version = '1.2-1'
instance_type = 'ml.t3.medium'
instance_count = 1

sklearn_processor = SKLearnProcessor(
    framework_version=framework_version,
    role=role,
    instance_type=instance_type,
    instance_count=instance_count,
    sagemaker_session=pipeline_session
)

INFO:sagemaker.image_uris:Defaulting to only available Python version: py3


In [99]:
processor_args = sklearn_processor.run(
    inputs=[
        ProcessingInput(
            source=input_data,
            destination='/opt/ml/processing/input',
        ),
    ],
    outputs=[
        ProcessingOutput(output_name='train', source='/opt/ml/processing/output/train'),
        ProcessingOutput(output_name='validation', source='/opt/ml/processing/output/validation'),
        ProcessingOutput(output_name='test', source='/opt/ml/processing/output/test'),
    ],
    code='code/preprocessing.py',
)

step_process = ProcessingStep(
    name='EmotionAIPreProcess',
    step_args=processor_args
)

In [101]:
%%writefile code/tokenization_script.py
from transformers import BertTokenizer
import pandas as pd
import tensorflow as tf
import os

base_dir = '/opt/ml/processing'


def serialize_example(token_ids, attention_mask, label):
    feature = {
        'input_ids': tf.train.Feature(int64_list=tf.train.Int64List(value=token_ids)),
        'attention_mask': tf.train.Feature(int64_list=tf.train.Int64List(value=attention_mask)),
        'label': tf.train.Feature(int64_list=tf.train.Int64List(value=[label]))
    }
    example_proto = tf.train.Example(features=tf.train.Features(feature=feature))
    return example_proto.SerializeToString()


def save_data(texts, labels, tokenizer, destination):
    serialized_examples = []
    for text, label in zip(texts, labels):
        encoding = tokenizer.encode_plus(text, truncation=True, padding=True, max_length=50)
        serialized_example = serialize_example(encoding['input_ids'], encoding['attention_mask'], label)
        serialized_examples.append(serialized_example)
    
    with tf.io.TFRecordWriter(f'{base_dir}/{destination}/tokenized_data.tfrecord') as writer:
        for example in serialized_examples:
            writer.write(example)

            
if __name__ == '__main__':
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    df_train = pd.read_csv(f'{base_dir}/output/train/train.csv')
    df_val = pd.read_csv(f'{base_dir}/output/validation/validation.csv')
    df_test = pd.read_csv(f'{base_dir}/output/test/test.csv')

    output_dirs = ['output/tokenized/train', 'output/tokenized/validation', 'output/tokenized/test']
    for output_dir in output_dirs:
        full_dir = f'{base_dir}/{output_dir}'
        if not os.path.exists(full_dir):
            os.makedirs(full_dir)

    save_data(df_train['text'].tolist(), df_train['emotions'].tolist(), tokenizer, 'output/tokenized/train')
    save_data(df_val['text'].tolist(), df_val['emotions'].tolist(), tokenizer, 'output/tokenized/validation')
    save_data(df_test['text'].tolist(), df_test['emotions'].tolist(), tokenizer, 'output/tokenized/test')

Overwriting code/tokenization_script.py


In [102]:
tensorflow_version = '2.6.3'
transformers_version = '4.17.0'
py_version = 'py38'
huggingface_instance_type = 'ml.p2.xlarge'
huggingface_instance_count = 1
huggingface_uri = '763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-tensorflow-training:2.6.3-transformers4.17.0-gpu-py38-cu110-ubuntu18.04'

huggingface_processor = ScriptProcessor(
    image_uri=huggingface_uri,
    command=['python3'],
    instance_type=huggingface_instance_type,
    instance_count=huggingface_instance_count,
    base_job_name='huggingface-preprocessing',
    role=role,
)

In [103]:
step_tokenize = ProcessingStep(
    name='EmotionAITokenize',
    processor=huggingface_processor,
    inputs=[
        ProcessingInput(
            source=step_process.properties.ProcessingOutputConfig.Outputs['train'].S3Output.S3Uri,
            destination='/opt/ml/processing/input/train'
        ),
        ProcessingInput(
            source=step_process.properties.ProcessingOutputConfig.Outputs['validation'].S3Output.S3Uri,
            destination='/opt/ml/processing/input/validation'
        ),
        ProcessingInput(
            source=step_process.properties.ProcessingOutputConfig.Outputs['test'].S3Output.S3Uri,
            destination='/opt/ml/processing/input/test'
        ),
    ],
    outputs=[
        ProcessingOutput(
            output_name='tokenized_train', 
            source='/opt/ml/processing/output/tokenized/train'),
        ProcessingOutput(
            output_name='tokenized_validation',
            source='/opt/ml/processing/output/tokenized/validation'),
        ProcessingOutput(
            output_name='tokenized_test', 
            source='/opt/ml/processing/output/tokenized/test'),
    ],
    code='code/tokenization_script.py',
)

In [104]:
%%writefile code/train_script.py
import argparse
import tensorflow as tf
from transformers import TFBertForSequenceClassification, BertConfig


base_dir = '/opt/ml/processing'


def model_fn(num_labels, dropout_prob):
    config = BertConfig.from_pretrained(
        'bert-base-uncased',
        num_labels=num_labels,
        hidden_dropout_prob=dropout_prob,
        attention_probs_dropout_prob=dropout_prob)
    return TFBertForSequenceClassification.from_pretrained('bert-base-uncased', config=config)


def _parse_function(proto):
    # Define the feature description dictionary for `tf.io.parse_single_example`
    feature_description = {
        'input_ids': tf.io.FixedLenFeature([50], tf.int64),
        'attention_mask': tf.io.FixedLenFeature([50], tf.int64),
        'label': tf.io.FixedLenFeature([], tf.int64),
    }
    # Parse the input `tf.train.Example` proto using the dictionary above
    parsed_features = tf.io.parse_single_example(proto, feature_description)
    labels = parsed_features.pop('label')
    return parsed_features, labels


def load_dataset(file_path):
    raw_dataset = tf.data.TFRecordDataset(file_path)
    parsed_dataset = raw_dataset.map(_parse_function)
    return parsed_dataset


if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    # Hyperparameters
    parser.add_argument('--num_layers_to_freeze', type=int, default=0)
    parser.add_argument('--dropout_prob', type=float, default=0.1)
    parser.add_argument('--batch_size', type=int, default=16)
    parser.add_argument('--epochs', type=int, default=3)
    parser.add_argument('--initial_learning_rate', type=float, default=2e-5)
    parser.add_argument('--lr_scheduler', type=str, default='PolynomialDecay')
    parser.add_argument('--decay_steps', type=int, default=5000)
    parser.add_argument('--power_exp', type=float, default=1.0)

    args = parser.parse_args()
    
    train_dataset = load_dataset(f'{base_dir}/output/tokenized/train/tokenized_data.tfrecord')
    val_dataset = load_dataset(f'{base_dir}/output/tokenized/validation/tokenized_data.tfrecord')
    
    train_dataset = train_dataset.shuffle(buffer_size=10000).batch(args.batch_size).prefetch(tf.data.experimental.AUTOTUNE)
    val_dataset = val_dataset.batch(args.batch_size).prefetch(tf.data.experimental.AUTOTUNE)

    # Initialize the model
    model = model_fn(num_labels=9, dropout_prob=args.dropout_prob)

    # Compile and train your model here
    if args.lr_scheduler == 'PolynomialDecay':
        lr_scheduler = tf.keras.optimizers.schedules.PolynomialDecay(
            initial_learning_rate=args.initial_learning_rate,
            decay_steps=args.decay_steps,
            power=args.power_exp
        )
    elif args.lr_scheduler == 'CosineDecay':
        lr_scheduler = tf.keras.experimental.CosineDecay(
            initial_learning_rate=args.initial_learning_rate,
            decay_steps=args.decay_steps,
            alphas=0.0
        )
    else:
        raise ValueError('Invalid learning rate scheduler')

    optimizer = tf.keras.optimizers.Adam(learning_rate=lr_scheduler)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

    model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
    model.fit(train_dataset, epochs=args.epochs, validation_data=val_dataset)

Overwriting code/train_script.py


In [105]:
model_path = f's3://{bucket_name}/models/tuned_model'
# Define hyperparameters
hyperparameters = {
    'epochs': 4,  
    'initial_learning_rate': 9.4e-06,
    'batch_size': 12,
    'num_layers_to_freeze': 0,
    'dropout_prob': 0.1,
    'lr_scheduler': 'CosineDecay',
    'decay_steps': 7639
}

huggingface_estimator = HuggingFace(entry_point='train_script.py',
                                    source_dir='code',
                                    role=role,
                                    output_path=model_path,
                                    instance_type=huggingface_instance_type,  
                                    instance_count=1,
                                    transformers_version=transformers_version,
                                    tensorflow_version=tensorflow_version,
                                    py_version=py_version,
                                    hyperparameters=hyperparameters)

In [106]:
training_inputs = {
    'train': TrainingInput(
        s3_data=step_tokenize.properties.ProcessingOutputConfig.Outputs['tokenized_train'].S3Output.S3Uri,
        content_type='application/x-tfrecord'
    ),
    'validation': TrainingInput(
        s3_data=step_tokenize.properties.ProcessingOutputConfig.Outputs['tokenized_validation'].S3Output.S3Uri,
        content_type='application/x-tfrecord'
    )
}

In [107]:
step_train = TrainingStep(
    name='EmotionAITrain',
    estimator=huggingface_estimator,
    inputs=training_inputs
)

In [108]:
%%writefile code/evaluation.py
import json
import pathlib
import tarfile
import tensorflow as tf
from transformers import TFBertForSequenceClassification, BertTokenizer
import numpy as np
from sklearn.metrics import f1_score

base_dir = '/opt/ml/processing'


def extract_model(model_path=f'{base_dir}/model/model.tar.gz', extract_path='model'):
    with tarfile.open(model_path) as tar:
        tar.extractall(path=extract_path)
    return extract_path


def load_model(model_dir='model'):
    model = TFBertForSequenceClassification.from_pretrained(model_dir)
    return model


def _parse_function(proto):
    # Define the feature description dictionary for `tf.io.parse_single_example`
    feature_description = {
        'input_ids': tf.io.FixedLenFeature([50], tf.int64),
        'attention_mask': tf.io.FixedLenFeature([50], tf.int64),
        'label': tf.io.FixedLenFeature([], tf.int64),
    }
    # Parse the input `tf.train.Example` proto using the dictionary above
    parsed_features = tf.io.parse_single_example(proto, feature_description)
    labels = parsed_features.pop('label')
    return parsed_features, labels


def load_dataset(file_path):
    raw_dataset = tf.data.TFRecordDataset(file_path)
    parsed_dataset = raw_dataset.map(_parse_function)
    return parsed_dataset


if __name__ == '__main__':
    model_path = f'{base_dir}/models/tuned_model'
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    
    test_dataset = load_dataset(f'{base_dir}/output/tokenized/test/tokenized_data.tfrecord')
    test_dataset = test_dataset.batch(16).prefetch(tf.data.experimental.AUTOTUNE)
    
    model_extracted_path = extract_model()
    model = load_model(model_extracted_path)
   
    # Evaluate the model
    loss, accuracy = model.evaluate(test_dataset, return_dict=True)
    predictions = np.argmax(model.predict(test_dataset).logits, axis=1)
    true_labels = np.array([label for _, label in test_dataset])
    
    f1 = f1_score(true_labels, predictions, average='weighted')
    
    report_dict = {
        'classification_metrics': {
            'accuracy': {'value': accuracy},
            'f1_score': {'value': f1},
        },
    }

    output_dir = f'{base_dir}/evaluation'
    pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)

    evaluation_path = f'{output_dir}/evaluation.json'
    with open(evaluation_path, 'w') as f:
        f.write(json.dumps(report_dict))


Overwriting code/evaluation.py


In [109]:
huggingface_evaluation_processor = ScriptProcessor(
    image_uri=huggingface_uri,
    command=['python3'],
    instance_type=huggingface_instance_type,
    instance_count=huggingface_instance_count,
    base_job_name='huggingface-evaluation',
    role=role,
)

In [110]:
evaluation_report = PropertyFile(
    name='EvaluationReport', output_name='evaluation', path='evaluation.json'
)

step_eval = ProcessingStep(
    name='EmotionAIEvaluation',
    processor=huggingface_evaluation_processor,
    inputs=[
        ProcessingInput(
            source=step_train.properties.ModelArtifacts.S3ModelArtifacts,
            destination='/opt/ml/processing/model'
        ),
        ProcessingInput(
            source=step_tokenize.properties.ProcessingOutputConfig.Outputs['tokenized_test'].S3Output.S3Uri,
            destination='/opt/ml/processing/test'
        ),
    ],
    outputs=[
        ProcessingOutput(
            output_name='evaluation', 
            source='/opt/ml/processing/evaluation'),
    ],
    property_files=[evaluation_report],
    code='code/evaluation.py',
)

In [112]:
%%writefile code/inference.py
import tensorflow as tf
import json
from transformers import BertTokenizer, TFBertForSequenceClassification, AutoConfig


def model_fn(model_dir):
	config = AutoConfig.from_pretrained(model_dir)
	model = TFBertForSequenceClassification.from_pretrained(model_dir, config=config)
	return model


def input_fn(request_body, request_content_type):
	# Process and tokenize the incoming request
	if request_content_type == 'application/json':
		input_data = json.loads(request_body)
		raw_text = input_data['text']
		# Tokenize the input text
		tokenizer = BertTokenizer.from_pretrained('/opt/ml/model')
		tokens = tokenizer.encode_plus(raw_text, truncation=True, padding=True, max_length=50, return_tensors='tf')
		return {'input_ids': tokens['input_ids'], 'attention_mask': tokens['attention_mask']}
	else:
		raise ValueError('This model only supports application/json input')


def predict_fn(input_data, model):
	predictions = model(**input_data)
	return predictions


def logits_to_top_n_prob(logits, class_names, top_n=3):
	if hasattr(logits, 'logits'):
		logits = logits.logits
	# Convert the logits to probabilities
	probabilities = tf.nn.softmax(logits, axis=-1)
	probabilities = probabilities.numpy().flatten()
	# Get the top n probabilities
	top_n_indices = probabilities.argsort()[-top_n:][::-1]
	top_n_probabilities = {class_names[i]: float(probabilities[i]) for i in top_n_indices}

	return top_n_probabilities


def output_fn(prediction, content_type):
	classes = ['anger', 'disgust', 'fear', 'happy', 'optimistic', 'affectionate', 'sad', 'surprised', 'neutral']
	# Convert the logits to probabilities
	probabilities = logits_to_top_n_prob(prediction, classes, top_n=3)
	if content_type == 'application/json':
		return json.dumps(probabilities), 'application/json'
	else:
		raise ValueError('This model only supports application/json output')

Overwriting code/inference.py


In [113]:
bert_model = HuggingFaceModel(
    model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
    role=role,
    transformers_version=transformers_version,
    tensorflow_version=tensorflow_version,
    sagemaker_session=pipeline_session,
    py_version=py_version,
    source_dir='code',
    entry_point='inference.py'
)

step_create_model = ModelStep(
    name='EmotionAIModel',
    step_args=bert_model.create(instance_type='ml.cd.large'),
)

In [114]:
inference_instances = ['ml.c5.large', 'ml.c5.xlarge']
transform_instances = ['ml.g4dn.xlarge']

model_metrics = ModelMetrics(
    model_statistics=MetricsSource(
        s3_uri='{}/evaluation.json'.format(
            step_eval.arguments['ProcessingOutputConfig']['Outputs'][0]['S3Output']['S3Uri']
        ),
        content_type='application/json',
    )
)

register_args = bert_model.register(
    content_types=['application/json'],
    response_types=['application/json'],
    inference_instances=inference_instances,
    transform_instances=transform_instances,
    model_package_group_name=model_package_group_name,
    approval_status=model_approval_status,
    model_metrics=model_metrics,
)
step_register = ModelStep(name='EmotionAIRegisterModel', step_args=register_args)



In [121]:
%%writefile code/deploy_script.py
import boto3
import json
import time

def lambda_handler(event, context):
    current_time = time.strftime("%m-%d-%H-%M-%S", time.localtime())
    client = boto3.client('sagemaker')
    
    model_name = event['model_name']
    endpoint_config_name = f'{event["endpoint_config_name"]}-{current_time}'
    endpoint_name = event['endpoint_name']
    
    instance_type = event['endpoint_instance_type']

    # Create an endpoint configuration
    endpoint_config_response = client.create_endpoint_config(
        EndpointConfigName=endpoint_config_name,
        ProductionVariants=[
            {
                'VariantName': 'AllTraffic',
                'ModelName': model_name,
                'InitialInstanceCount': 1,
                'InstanceType': instance_type,
            }
        ]
    )
    print(f'Endpoint Config Arn: {endpoint_config_response["EndpointConfigArn"]}')

    list_endpoints_response = client.list_endpoints(
        SortBy="CreationTime",
        SortOrder="Descending",
        NameContains=endpoint_name,
    )
    print(f"list_endpoints_response: {list_endpoints_response}")

    if len(list_endpoints_response["Endpoints"]) > 0:
        print("Updating Endpoint with new Endpoint Configuration")
        update_endpoint_response = client.update_endpoint(
            EndpointName=endpoint_name, EndpointConfigName=endpoint_config_name
        )
        print(f"update_endpoint_response: {update_endpoint_response}")
    else:
        print("Creating Endpoint")
        create_endpoint_response = client.create_endpoint(
            EndpointName=endpoint_name, EndpointConfigName=endpoint_config_name
        )
        print(f"create_endpoint_response: {create_endpoint_response}")
    
    return {'statusCode': 200, 'body': json.dumps('Endpoint Created Successfully')}

Overwriting code/deploy_script.py


In [122]:
endpoint_config_name = 'emotion-ai-endpoint'
endpoint_name = 'emotion-ai-endpoint'

deploy_model_lambda_function_name = 'sagemaker-deploy-model-lambda'

deploy_lambda = Lambda(
    function_name=deploy_model_lambda_function_name,
    execution_role_arn=role,
    script='code/deploy_script.py',
    handler='deploy_script.lambda_handler',
    session=pipeline_session,
)

lambda_inputs = {
    'model_name': step_create_model.properties.ModelName,
    'endpoint_config_name': endpoint_config_name,
    'endpoint_name': endpoint_name,
    'endpoint_instance_type': inference_instances[0],
}

# Define the LambdaStep in the pipeline
step_deploy_model = LambdaStep(
    name='DeployModel',
    lambda_func=deploy_lambda,
    inputs=lambda_inputs
)

In [123]:
step_fail = FailStep(
    name='EmotionAIF1Fail',
    error_message=Join(on=' ', values=['Execution failed due to F1 <', f1_threshold]),
)

In [124]:
cond_lte = ConditionLessThanOrEqualTo(
    left=JsonGet(
        step_name=step_eval.name,
        property_file=evaluation_report,
        json_path='classification_metrics.f1.value',
    ),
    right=f1_threshold,
)

step_cond = ConditionStep(
    name='EmotionAIF1Cond',
    conditions=[cond_lte],
    if_steps=[step_register, step_create_model, step_deploy_model],
    else_steps=[step_fail],
)

In [125]:
pipeline_name = 'EmotionAIPipeline'
pipeline = Pipeline(
    name=pipeline_name,
    parameters=[
        processing_instance_count,
        instance_type,
        model_approval_status,
        input_data,
        f1_threshold,
    ],
    steps=[step_process, step_tokenize, step_train, step_eval, step_cond],
)

In [126]:
definition = json.loads(pipeline.definition())
definition

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.


{'Version': '2020-12-01',
 'Metadata': {},
 'Parameters': [{'Name': 'ProcessingInstanceCount',
   'Type': 'Integer',
   'DefaultValue': 1},
  {'Name': 'ModelApprovalStatus',
   'Type': 'String',
   'DefaultValue': 'PendingManualApproval'},
  {'Name': 'RawDataUri1',
   'Type': 'String',
   'DefaultValue': 's3://aai-540-final-data/data/'},
  {'Name': 'F1Threshold', 'Type': 'Float', 'DefaultValue': 0.4}],
 'PipelineExperimentConfig': {'ExperimentName': {'Get': 'Execution.PipelineName'},
  'TrialName': {'Get': 'Execution.PipelineExecutionId'}},
 'Steps': [{'Name': 'EmotionAIPreProcess',
   'Type': 'Processing',
   'Arguments': {'ProcessingResources': {'ClusterConfig': {'InstanceType': 'ml.t3.medium',
      'InstanceCount': 1,
      'VolumeSizeInGB': 30}},
    'AppSpecification': {'ImageUri': '246618743249.dkr.ecr.us-west-2.amazonaws.com/sagemaker-scikit-learn:1.2-1-cpu-py3',
     'ContainerEntrypoint': ['python3',
      '/opt/ml/processing/input/code/preprocessing.py']},
    'RoleArn': 'ar

In [127]:
pipeline.upsert(role_arn=role)

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.


{'PipelineArn': 'arn:aws:sagemaker:us-west-2:542526735114:pipeline/EmotionAIPipeline',
 'ResponseMetadata': {'RequestId': 'e7860199-2782-47e0-b5c7-08912f38606b',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'e7860199-2782-47e0-b5c7-08912f38606b',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '85',
   'date': 'Sun, 18 Feb 2024 20:06:39 GMT'},
  'RetryAttempts': 0}}

In [128]:
execution = pipeline.start()

In [129]:
execution.describe()

{'PipelineArn': 'arn:aws:sagemaker:us-west-2:542526735114:pipeline/EmotionAIPipeline',
 'PipelineExecutionArn': 'arn:aws:sagemaker:us-west-2:542526735114:pipeline/EmotionAIPipeline/execution/e4d3n5iuwnkn',
 'PipelineExecutionDisplayName': 'execution-1708286801590',
 'PipelineExecutionStatus': 'Executing',
 'PipelineExperimentConfig': {'ExperimentName': 'emotionaipipeline',
  'TrialName': 'e4d3n5iuwnkn'},
 'CreationTime': datetime.datetime(2024, 2, 18, 20, 6, 41, 507000, tzinfo=tzlocal()),
 'LastModifiedTime': datetime.datetime(2024, 2, 18, 20, 6, 41, 507000, tzinfo=tzlocal()),
 'CreatedBy': {'UserProfileArn': 'arn:aws:sagemaker:us-west-2:542526735114:user-profile/d-4hbkixuafatz/jeffreythomas',
  'UserProfileName': 'jeffreythomas',
  'DomainId': 'd-4hbkixuafatz'},
 'LastModifiedBy': {'UserProfileArn': 'arn:aws:sagemaker:us-west-2:542526735114:user-profile/d-4hbkixuafatz/jeffreythomas',
  'UserProfileName': 'jeffreythomas',
  'DomainId': 'd-4hbkixuafatz'},
 'ResponseMetadata': {'RequestI

In [130]:
execution.wait()

WaiterError: Waiter PipelineExecutionComplete failed: Waiter encountered a terminal failure state: For expression "PipelineExecutionStatus" we matched expected path: "Failed"