# Diabetes Pipeline Implementation

## Diabetes Pipeline - V1 (Without Model Registry and Automated Approval)

In [22]:

import boto3
import sagemaker
import pandas as pd
import numpy as np
from sagemaker import get_execution_role
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.processing import ScriptProcessor, ProcessingInput, ProcessingOutput
from sagemaker.model import Model
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.steps import ProcessingStep, TrainingStep
from sagemaker.workflow.model_step import ModelStep
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.inputs import TrainingInput
import time
import os
from sklearn.model_selection import train_test_split

# Set up SageMaker session: the region and default bucket are read from the
# classic (non-pipeline) session, the role from the notebook's execution role.
sagemaker_session = sagemaker.Session()
region = sagemaker_session.boto_region_name
role = get_execution_role()
bucket = sagemaker_session.default_bucket()
s3_prefix = 'simple-ml-pipeline-fixed'

print(f"SageMaker Role: {role}")
print(f"SageMaker Session Region: {region}")
print(f"Default S3 bucket: {bucket}")

# Step 1: Load the diabetes dataset from S3 and rename the label column
# ("Outcome" -> "target") for consistency with the training script.
print("Loading diabetes dataset from S3...")
df = pd.read_csv("s3://s3-mlpipeline/diabetes-prediction/diabetes.csv").rename(
    columns={"Outcome": "target"}
)
print("\nDataset columns:", df.columns.tolist())
print("\nSample of the data:")
print(df.head())

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Write both splits to local CSV files (header kept, index dropped).
train_file, test_file = 'train.csv', 'test.csv'
train_df.to_csv(train_file, index=False)
test_df.to_csv(test_file, index=False)

# Both splits live under <prefix>/data/ in the default bucket.
data_key_prefix = f"{s3_prefix}/data"
s3_train_data = f's3://{bucket}/{data_key_prefix}/{train_file}'
s3_test_data = f's3://{bucket}/{data_key_prefix}/{test_file}'

print("\nUploading training data to S3...")
boto3.Session().resource('s3').Bucket(bucket).upload_file(train_file, f"{data_key_prefix}/{train_file}")

print("\nUploading test data to S3...")
boto3.Session().resource('s3').Bucket(bucket).upload_file(test_file, f"{data_key_prefix}/{test_file}")

print(f"Training data uploaded to: {s3_train_data}")
print(f"Test data uploaded to: {s3_test_data}")

# Step 2: Create pipeline session (used by every pipeline step defined later)
pipeline_session = PipelineSession()

# Step 3: Create training script
training_script = """
import argparse
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import joblib
import tarfile

if __name__ == '__main__':
    # Training entry point: fit a RandomForestClassifier on <train>/train.csv
    # and write model.joblib plus feature_columns.txt into <model_dir>.
    cli = argparse.ArgumentParser()

    # Hyperparameters (all integers)
    for flag, fallback in (('--n_estimators', 10), ('--min_samples_leaf', 2), ('--max_depth', 5)):
        cli.add_argument(flag, type=int, default=fallback)

    # SageMaker paths: defaults are read from the SM_* environment variables
    for flag, env_name in (('--model_dir', 'SM_MODEL_DIR'), ('--train', 'SM_CHANNEL_TRAIN')):
        cli.add_argument(flag, type=str, default=os.environ.get(env_name))

    args = cli.parse_args()

    print(f"Training directory contents: {os.listdir(args.train)}")

    # Load data
    training_data_path = os.path.join(args.train, 'train.csv')
    print(f"Loading training data from: {training_data_path}")
    train_data = pd.read_csv(training_data_path)
    print(f"Training data shape: {train_data.shape}")

    # Everything except the 'target' column is a feature
    X = train_data.drop(columns=['target'])
    y = train_data['target']

    print("Training RandomForest model...")
    forest_settings = {
        'n_estimators': args.n_estimators,
        'min_samples_leaf': args.min_samples_leaf,
        'max_depth': args.max_depth,
        'random_state': 42,
    }
    model = RandomForestClassifier(**forest_settings)
    model.fit(X, y)
    print("Model training completed")

    # Save model
    model_path = os.path.join(args.model_dir, 'model.joblib')
    print(f"Saving model to: {model_path}")
    joblib.dump(model, model_path)

    # Save the feature names as one comma-separated line so that the
    # evaluation and inference scripts can select the same columns.
    feature_columns_path = os.path.join(args.model_dir, 'feature_columns.txt')
    print(f"Saving feature columns to: {feature_columns_path}")
    with open(feature_columns_path, 'w') as columns_file:
        columns_file.write(','.join(X.columns.tolist()))

    print(f"Model directory contents: {os.listdir(args.model_dir)}")
    print("Training complete")
"""

# Materialise the generated training script on local disk.
with open('train.py', 'w') as script_file:
    script_file.write(training_script)

# Upload training script to S3 under <prefix>/scripts/ and keep its URI.
train_script_key = f"{s3_prefix}/scripts/train.py"
training_script_s3_uri = f's3://{bucket}/{train_script_key}'
boto3.Session().resource('s3').Bucket(bucket).upload_file('train.py', train_script_key)

# Step 4: Create evaluation script
evaluation_script = """
import argparse
import os
import sys
import glob
import pandas as pd
import numpy as np
import json
import joblib
import traceback
import tarfile
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def extract_model_if_needed(model_path):
    '''Unpack a *.tar.gz sitting next to model_path and locate the model file.

    The first archive found in the directory of model_path is extracted in
    place. Returns <dir>/model.joblib when it exists after extraction,
    otherwise the first *.joblib in that directory, otherwise model_path
    unchanged (also when there is no archive at all).
    '''
    parent_dir = os.path.dirname(model_path)
    print(f"Model directory contents: {os.listdir(parent_dir)}")

    archives = glob.glob(os.path.join(parent_dir, "*.tar.gz"))
    if not archives:
        return model_path

    archive = archives[0]
    print(f"Found tar file: {archive}")
    with tarfile.open(archive, "r:gz") as bundle:
        bundle.extractall(path=parent_dir)
    print(f"Extracted tar file. New contents: {os.listdir(parent_dir)}")

    preferred = os.path.join(parent_dir, 'model.joblib')
    if os.path.exists(preferred):
        return preferred

    candidates = glob.glob(os.path.join(parent_dir, "*.joblib"))
    return candidates[0] if candidates else model_path

def find_model_file(model_dir):
    '''Locate a serialized model given a directory (or a direct file path).

    Search order inside a directory: model.joblib, any top-level *.joblib,
    then the contents of the first *.tar.gz (extracted into <dir>/extracted).
    An existing non-directory path is returned as-is. Returns None when
    nothing is found.
    '''
    print(f"Searching for model in: {model_dir}")

    # Not a directory: either the caller passed the model file itself, or
    # the path does not exist at all.
    if not os.path.isdir(model_dir):
        return model_dir if os.path.exists(model_dir) else None

    print(f"Directory contents: {os.listdir(model_dir)}")

    default_model = os.path.join(model_dir, 'model.joblib')
    if os.path.exists(default_model):
        return default_model

    top_level = glob.glob(os.path.join(model_dir, "*.joblib"))
    if top_level:
        return top_level[0]

    archives = glob.glob(os.path.join(model_dir, "*.tar.gz"))
    if not archives:
        return None

    # Unpack into a dedicated sub-directory so the archive contents do not
    # mix with the files already present in model_dir.
    extract_dir = os.path.join(model_dir, "extracted")
    os.makedirs(extract_dir, exist_ok=True)
    with tarfile.open(archives[0], "r:gz") as bundle:
        bundle.extractall(path=extract_dir)
    print(f"Extracted tar file to {extract_dir}. Contents: {os.listdir(extract_dir)}")

    unpacked_default = os.path.join(extract_dir, 'model.joblib')
    if os.path.exists(unpacked_default):
        return unpacked_default

    unpacked = glob.glob(os.path.join(extract_dir, '**/*.joblib'), recursive=True)
    return unpacked[0] if unpacked else None

def find_feature_columns(model_dir, model_path):
    '''Return the path of the feature-columns file, or None if there is none.

    Checks <model_dir>/feature_columns.txt first, then the same file name
    next to model_path, and finally falls back to the first path under
    model_dir whose name contains 'feature'.
    '''
    fixed_locations = (
        os.path.join(model_dir, 'feature_columns.txt'),
        os.path.join(os.path.dirname(model_path), 'feature_columns.txt'),
    )
    for location in fixed_locations:
        if os.path.exists(location):
            return location

    loose_matches = glob.glob(os.path.join(model_dir, "**/*feature*"), recursive=True)
    return loose_matches[0] if loose_matches else None

if __name__ == '__main__':
    # Evaluation entry point: load the trained model and <test_data>/test.csv,
    # compute classification metrics and write them to
    # <output_dir>/evaluation.json. Any exception is logged, recorded in
    # error.txt and turned into exit status 1.
    try:
        parser = argparse.ArgumentParser()
        parser.add_argument('--model_path', type=str, required=True)
        parser.add_argument('--test_data', type=str, required=True)
        parser.add_argument('--output_dir', type=str, required=True)
        
        args = parser.parse_args()
        
        os.makedirs(args.output_dir, exist_ok=True)
        
        # Only the directory part of --model_path is used; find_model_file
        # searches that directory (unpacking a *.tar.gz it finds there).
        model_dir = os.path.dirname(args.model_path)
        model_file_path = find_model_file(model_dir)
        if not model_file_path:
            raise FileNotFoundError(f"Could not find model file in {model_dir}")
        print(f"Using model file: {model_file_path}")
        
        model = joblib.load(model_file_path)
        print("Model loaded successfully")
        
        # The columns file is a single comma-separated line of feature names.
        feature_columns_path = find_feature_columns(model_dir, model_file_path)
        if feature_columns_path:
            print(f"Using feature columns file: {feature_columns_path}")
            with open(feature_columns_path, 'r') as f:
                feature_columns = f.read().strip().split(',')
            print(f"Feature columns loaded: {feature_columns}")
        else:
            # feature_columns stays unbound here; it is assigned below once
            # the test data has been read.
            print("WARNING: Could not find feature columns file. Will try to infer from test data.")
        
        test_data_path = os.path.join(args.test_data, 'test.csv')
        print(f"Loading test data from: {test_data_path}")
        test_data = pd.read_csv(test_data_path)
        print(f"Test data loaded successfully, shape: {test_data.shape}")
        print(f"Test data columns: {test_data.columns.tolist()}")
        
        # Fallback: treat every column except 'target' as a feature.
        if not feature_columns_path:
            feature_columns = [col for col in test_data.columns if col != 'target']
            print(f"Inferred feature columns: {feature_columns}")
        
        # Features absent from the test data are dropped with a warning
        # instead of aborting the evaluation.
        missing_cols = [col for col in feature_columns if col not in test_data.columns]
        if missing_cols:
            print(f"WARNING: Missing columns in test data: {missing_cols}")
            feature_columns = [col for col in feature_columns if col in test_data.columns]
            print(f"Using available columns: {feature_columns}")
        
        # Resolve the label column: 'target' if present, otherwise the first
        # column whose lower-cased name is one of the known aliases.
        if 'target' not in test_data.columns:
            print("WARNING: 'target' column not found in test data.")
            potential_targets = [col for col in test_data.columns if col.lower() in ['label', 'class', 'y', 'output']]
            if potential_targets:
                target_col = potential_targets[0]
                print(f"Using '{target_col}' as target column.")
            else:
                raise ValueError("No suitable target column found in test data.")
        else:
            target_col = 'target'
        
        X_test = test_data[feature_columns]
        y_test = test_data[target_col]
        
        print("Making predictions...")
        y_pred = model.predict(X_test)
        # Column 1 of predict_proba is used as the score for ROC AUC;
        # presumably the positive class of a binary target (confirm).
        y_prob = model.predict_proba(X_test)[:, 1]
        print("Predictions generated successfully")
        
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        roc_auc = roc_auc_score(y_test, y_prob)
        
        # Cast to plain floats so the values are JSON serializable.
        metrics = {
            'accuracy': float(accuracy),
            'precision': float(precision),
            'recall': float(recall),
            'f1_score': float(f1),
            'roc_auc': float(roc_auc)
        }
        
        metrics_path = os.path.join(args.output_dir, 'evaluation.json')
        print(f"Saving metrics to: {metrics_path}")
        with open(metrics_path, 'w') as f:
            json.dump(metrics, f)
        
        print("Evaluation metrics:")
        for metric_name, metric_value in metrics.items():
            print(f"{metric_name}: {metric_value:.4f}")
            
    except Exception as e:
        print(f"ERROR: An exception occurred: {str(e)}")
        print("Traceback:")
        traceback.print_exc()
        # 'args' may not be bound if the failure happened before parse_args()
        # returned; in that case the error report goes to /tmp instead.
        with open(os.path.join(args.output_dir if 'args' in locals() else '/tmp', 'error.txt'), 'w') as f:
            f.write(f"Error: {str(e)}")
            f.write(traceback.format_exc())
        sys.exit(1)
"""

# Materialise the generated evaluation script on local disk.
with open('evaluate.py', 'w') as script_file:
    script_file.write(evaluation_script)

# Upload it under <prefix>/scripts/; the processing step runs it from this URI.
evaluate_script_key = f"{s3_prefix}/scripts/evaluate.py"
evaluation_script_s3_uri = f's3://{bucket}/{evaluate_script_key}'
boto3.Session().resource('s3').Bucket(bucket).upload_file('evaluate.py', evaluate_script_key)

# Step 5: Create inference script
inference_script = """
import os
import json
import pandas as pd
import joblib
import numpy as np
import glob
import tarfile

# Module-level state shared by the handlers below: model_fn assigns both
# names via `global`, and input_fn reads feature_columns to select columns.
model = None
feature_columns = None

def find_model_file(model_dir):
    '''Return the path of the serialized model inside model_dir.

    Search order: model_dir/model.joblib, any *.joblib below model_dir
    (recursive), then the contents of the first *.tar.gz in model_dir
    (extracted into model_dir/extracted).

    Raises:
        FileNotFoundError: if no model file can be located.
    '''
    print(f"Searching for model in: {model_dir}")

    default_model = os.path.join(model_dir, 'model.joblib')
    if os.path.exists(default_model):
        return default_model

    nested_models = glob.glob(os.path.join(model_dir, '**/*.joblib'), recursive=True)
    if nested_models:
        return nested_models[0]

    archives = glob.glob(os.path.join(model_dir, '*.tar.gz'))
    if archives:
        extract_dir = os.path.join(model_dir, 'extracted')
        os.makedirs(extract_dir, exist_ok=True)
        with tarfile.open(archives[0], 'r:gz') as bundle:
            bundle.extractall(path=extract_dir)
        print(f"Extracted tar file to {extract_dir}. Contents: {os.listdir(extract_dir)}")

        unpacked_default = os.path.join(extract_dir, 'model.joblib')
        if os.path.exists(unpacked_default):
            return unpacked_default
        unpacked_models = glob.glob(os.path.join(extract_dir, '**/*.joblib'), recursive=True)
        if unpacked_models:
            return unpacked_models[0]

    raise FileNotFoundError(f"Could not find model file in {model_dir}")

def find_feature_columns(model_dir, model_path):
    '''Return the path of the feature-columns file, or None if there is none.

    Looks for feature_columns.txt in model_dir, then next to model_path,
    then falls back to the first path under model_dir whose name contains
    'feature'.
    '''
    for folder in (model_dir, os.path.dirname(model_path)):
        candidate = os.path.join(folder, 'feature_columns.txt')
        if os.path.exists(candidate):
            return candidate

    loose_matches = glob.glob(os.path.join(model_dir, '**/*feature*'), recursive=True)
    if not loose_matches:
        return None
    return loose_matches[0]

def model_fn(model_dir):
    '''Load the model (and, if present, its feature-column list) from model_dir.

    Both results are also stored in the module-level names `model` and
    `feature_columns`. Returns the loaded model.
    '''
    global model, feature_columns

    print(f"Model directory contents: {os.listdir(model_dir)}")
    model_path = find_model_file(model_dir)
    print(f"Loading model from: {model_path}")
    model = joblib.load(model_path)

    columns_file = find_feature_columns(model_dir, model_path)
    if not columns_file:
        print("WARNING: Could not find feature columns file.")
        return model

    print(f"Loading feature columns from: {columns_file}")
    with open(columns_file, 'r') as handle:
        # The file holds a single comma-separated line of column names.
        feature_columns = handle.read().strip().split(',')
    print(f"Feature columns: {feature_columns}")
    return model

def input_fn(request_body, request_content_type):
    '''Deserialize a request payload into the DataFrame the model expects.

    Args:
        request_body: JSON or CSV payload, as str or bytes. CSV must carry
            a header row because columns are selected by name.
        request_content_type: 'application/json' or 'text/csv'.

    Returns:
        DataFrame restricted to the trained feature columns when they are
        known; otherwise every column except common label names.

    Raises:
        ValueError: for an unsupported content type, or when a trained
            feature column is missing from the payload.
    '''
    # Local import: the script's import header is outside this function.
    import io

    if request_content_type == 'application/json':
        input_data = json.loads(request_body)
        df = pd.DataFrame(input_data)
    elif request_content_type == 'text/csv':
        # pandas has no `pd.io.StringIO`; use the standard library class.
        # StringIO needs text, so decode a bytes payload first.
        if isinstance(request_body, (bytes, bytearray)):
            request_body = request_body.decode('utf-8')
        df = pd.read_csv(io.StringIO(request_body))
    else:
        raise ValueError(f"Unsupported content type: {request_content_type}")

    if feature_columns is not None:
        missing_cols = [col for col in feature_columns if col not in df.columns]
        if missing_cols:
            raise ValueError(f"Missing required features: {missing_cols}")
        return df[feature_columns]

    # No column list was loaded: drop anything that looks like a label.
    exclude_cols = ['target', 'label', 'class', 'y', 'output']
    input_cols = [col for col in df.columns if col not in exclude_cols]
    return df[input_cols]

def predict_fn(input_data, model):
    '''Score input_data with model.

    Returns column 1 of predict_proba when the model offers that method,
    otherwise the output of predict.
    '''
    if not hasattr(model, 'predict_proba'):
        return model.predict(input_data)
    probabilities = model.predict_proba(input_data)
    return probabilities[:, 1]

def output_fn(predictions, response_content_type):
    '''Serialize predictions as a JSON object with a 'predictions' list.

    Raises:
        ValueError: if response_content_type is not 'application/json'.
    '''
    if response_content_type != 'application/json':
        raise ValueError(f"Unsupported response content type: {response_content_type}")
    payload = {'predictions': predictions.tolist()}
    return json.dumps(payload)
"""

# Materialise the generated inference script on local disk; the model step
# refers to this local file through entry_point='inference.py'.
with open('inference.py', 'w') as script_file:
    script_file.write(inference_script)

# Also keep a copy in S3 under <prefix>/scripts/.
inference_script_key = f"{s3_prefix}/scripts/inference.py"
inference_script_s3_uri = f's3://{bucket}/{inference_script_key}'
boto3.Session().resource('s3').Bucket(bucket).upload_file('inference.py', inference_script_key)

# Step 6: Define the training step
print("\nSetting up training step...")

# The estimator runs the local train.py written above (entry_point).
# `script_uri` is not a documented SKLearn estimator argument; it appears to
# have been absorbed by **kwargs without effect, so it is no longer passed.
sklearn_estimator = SKLearn(
    entry_point='train.py',
    source_dir=None,
    role=role,
    instance_type='ml.m5.xlarge',
    instance_count=1,
    framework_version='0.23-1',
    # Forwarded to train.py as --n_estimators / --min_samples_leaf / --max_depth.
    hyperparameters={
        'n_estimators': 10,
        'min_samples_leaf': 2,
        'max_depth': 5
    },
    sagemaker_session=pipeline_session
)

# The "train" channel is what train.py reads through SM_CHANNEL_TRAIN.
train_step = TrainingStep(
    name="ModelTraining",
    estimator=sklearn_estimator,
    inputs={
        "train": TrainingInput(
            s3_data=s3_train_data,
            content_type="text/csv"
        )
    }
)

# Step 7: Define the evaluation step
print("\nSetting up evaluation step...")

# Same scikit-learn image version as the training estimator.
evaluation_image_uri = sagemaker.image_uris.retrieve(
    framework='sklearn',
    region=region,
    version='0.23-1'
)

sklearn_processor = ScriptProcessor(
    image_uri=evaluation_image_uri,
    role=role,
    instance_type='ml.m5.xlarge',
    instance_count=1,
    base_job_name='model-evaluation',
    command=['python3'],
    sagemaker_session=pipeline_session
)

# Container paths: the trained artifact and the test split are mounted as
# inputs, the metrics directory is collected as the "evaluation" output.
evaluation_inputs = [
    ProcessingInput(
        source=train_step.properties.ModelArtifacts.S3ModelArtifacts,
        destination="/opt/ml/processing/model"
    ),
    ProcessingInput(
        source=s3_test_data,
        destination="/opt/ml/processing/test"
    ),
]
evaluation_outputs = [
    ProcessingOutput(
        output_name="evaluation",
        source="/opt/ml/processing/evaluation"
    ),
]
# Arguments parsed by evaluate.py; they mirror the container paths above.
evaluation_arguments = [
    '--model_path', '/opt/ml/processing/model/model.joblib',
    '--test_data', '/opt/ml/processing/test',
    '--output_dir', '/opt/ml/processing/evaluation'
]

evaluation_step = ProcessingStep(
    name="ModelEvaluation",
    processor=sklearn_processor,
    inputs=evaluation_inputs,
    outputs=evaluation_outputs,
    code=evaluation_script_s3_uri,
    job_arguments=evaluation_arguments
)

# Step 8: Define the model creation step
print("\nSetting up model creation step...")

inference_image_uri = sagemaker.image_uris.retrieve(
    framework='sklearn',
    region=region,
    version='0.23-1'
)

# The model wraps the training step's artifact and the local inference.py.
model = Model(
    image_uri=inference_image_uri,
    model_data=train_step.properties.ModelArtifacts.S3ModelArtifacts,
    role=role,
    sagemaker_session=pipeline_session,
    entry_point='inference.py',
    source_dir=None,
    code_location=f's3://{bucket}/{s3_prefix}/code'
)

# With a PipelineSession, create() yields step arguments for the ModelStep.
model_step_args = model.create(instance_type="ml.m5.large")
create_model_step = ModelStep(
    name="DiabetesPrediction-V1",
    step_args=model_step_args
)

# Step 9: Create the pipeline
pipeline_name = "DiabetesMLPipeline-V1"
print(f"\nCreating pipeline: {pipeline_name}")

pipeline = Pipeline(
    name=pipeline_name,
    parameters=[],
    steps=[train_step, evaluation_step, create_model_step],
    sagemaker_session=pipeline_session
)

# Step 10: Create the pipeline in SageMaker
# upsert() creates the pipeline on the first run and updates it afterwards,
# so re-running this cell no longer fails because the name already exists.
print("\nCreating the pipeline in SageMaker...")
try:
    response = pipeline.upsert(role_arn=role)
except Exception as e:
    # Report the failure, then re-raise so the cell does not go on to claim
    # that the pipeline setup completed.
    print(f"Error creating pipeline: {e}")
    raise
print(f"Pipeline ARN: {response}")

# Uncomment to start the pipeline execution
# execution = pipeline.start()
# print(f"Pipeline execution started with ARN: {execution.arn}")

print("\nIMPORTANT: The pipeline has been created but not started.")
print("To start the pipeline execution, run:")
print("execution = pipeline.start()")

# NOTE(review): the snippet below is only printed as guidance. It references
# step properties (create_model_step.properties, train_step.properties) that
# presumably resolve only inside a pipeline execution; confirm before running it.
print("\nAfter pipeline execution completes, a model can be deployed as follows:")
print("""
    model_name = create_model_step.properties.ModelName
    
    model = Model(
        model_data=train_step.properties.ModelArtifacts.S3ModelArtifacts,
        image_uri=sagemaker.image_uris.retrieve('sklearn', region, '0.23-1'),
        role=role,
        name=model_name,
        entry_point='inference.py'
    )
    
    predictor = model.deploy(
        initial_instance_count=1,
        instance_type='ml.m5.large',
        endpoint_name='sklearn-endpoint-' + time.strftime('%Y-%m-%d-%H-%M-%S')
    )
    
    sample_input = test_df.drop('target', axis=1).head(5)
    predictions = predictor.predict(sample_input.to_csv(index=False).encode('utf-8'))
    print(f"Predictions: {predictions}")
    """)

print("\nPipeline setup complete!")


SageMaker Role: arn:aws:iam::533267367615:role/CFN-SM-IM-Lambda-catalog-SageMakerExecutionRole-SytEij7wtq5A
SageMaker Session Region: us-east-1
Default S3 bucket: sagemaker-us-east-1-533267367615
Loading diabetes dataset from S3...

Dataset columns: ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'target']

Sample of the data:
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  target  
0                     0.627   50       1  
1                     0.351   31       0  
2                     0.672   32       1  



Setting up model creation step...



Creating pipeline: DiabetesMLPipeline-V1

Creating the pipeline in SageMaker...




Pipeline ARN: {'PipelineArn': 'arn:aws:sagemaker:us-east-1:533267367615:pipeline/DiabetesMLPipeline-V1', 'ResponseMetadata': {'RequestId': 'cc40c684-8022-47de-89ad-de8fce6a605c', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'cc40c684-8022-47de-89ad-de8fce6a605c', 'content-type': 'application/x-amz-json-1.1', 'content-length': '89', 'date': 'Tue, 08 Apr 2025 17:53:35 GMT'}, 'RetryAttempts': 0}}

IMPORTANT: The pipeline has been created but not started.
To start the pipeline execution, run:
execution = pipeline.start()

After pipeline execution completes, a model can be deployed as follows:

    model_name = create_model_step.properties.ModelName
    
    model = Model(
        model_data=train_step.properties.ModelArtifacts.S3ModelArtifacts,
        image_uri=sagemaker.image_uris.retrieve('sklearn', region, '0.23-1'),
        role=role,
        name=model_name,
        entry_point='inference.py'
    )
    
    predictor = model.deploy(
        initial_instance_count=1,
   

## Diabetes Pipeline - V2 (With Model Registration)

In [9]:
# Fixed SageMaker ML Pipeline with Model Registry publishing and automated approval
# This version uses the diabetes dataset, registers the model in the Model Registry, but separates deployment

import boto3
import sagemaker
import pandas as pd
import numpy as np
from sagemaker import get_execution_role
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.processing import ScriptProcessor, ProcessingInput, ProcessingOutput
from sagemaker.model import Model
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.steps import ProcessingStep, TrainingStep
from sagemaker.workflow.model_step import ModelStep
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.inputs import TrainingInput
import time
import os
from sklearn.model_selection import train_test_split

# IMPORTS for Model Registry
from sagemaker.workflow.step_collections import RegisterModel
from sagemaker.model_metrics import ModelMetrics, MetricsSource
from sagemaker.workflow.properties import PropertyFile
from sagemaker.workflow.functions import Join
from sagemaker.model import ModelPackage

# Set up SageMaker session: the region and default bucket are read from the
# classic (non-pipeline) session, the role from the notebook's execution role.
sagemaker_session = sagemaker.Session()
region = sagemaker_session.boto_region_name
role = get_execution_role()
bucket = sagemaker_session.default_bucket()
s3_prefix = 'simple-ml-pipeline-fixed'

print(f"SageMaker Role: {role}")
print(f"SageMaker Session Region: {region}")
print(f"Default S3 bucket: {bucket}")

# Step 1: Load the diabetes dataset from S3 and rename Outcome to target.
print("Loading diabetes dataset from S3...")
df = pd.read_csv("s3://s3-mlpipeline/diabetes-prediction/diabetes.csv").rename(
    columns={"Outcome": "target"}
)
print("\nDataset columns:", df.columns.tolist())
print("\nSample of the data:")
print(df.head())

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Save train and test datasets locally (header kept, index dropped).
train_file, test_file = 'train.csv', 'test.csv'
train_df.to_csv(train_file, index=False)
test_df.to_csv(test_file, index=False)

# Both splits live under <prefix>/data/ in the default bucket.
data_key_prefix = f"{s3_prefix}/data"
s3_train_data = f's3://{bucket}/{data_key_prefix}/{train_file}'
s3_test_data = f's3://{bucket}/{data_key_prefix}/{test_file}'
print("\nUploading training data to S3...")
boto3.Session().resource('s3').Bucket(bucket).upload_file(train_file, f"{data_key_prefix}/{train_file}")
print("\nUploading test data to S3...")
boto3.Session().resource('s3').Bucket(bucket).upload_file(test_file, f"{data_key_prefix}/{test_file}")
print(f"Training data uploaded to: {s3_train_data}")
print(f"Test data uploaded to: {s3_test_data}")

# Step 2: Create pipeline session (used by every pipeline step defined later)
pipeline_session = PipelineSession()

# Step 3: Create training script
training_script = """
import argparse
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import joblib

if __name__ == '__main__':
    # Training entry point: fit a RandomForestClassifier on <train>/train.csv
    # and write model.joblib plus feature_columns.txt into <model_dir>.
    cli = argparse.ArgumentParser()

    # Hyperparameters (all integers)
    for flag, fallback in (('--n_estimators', 10), ('--min_samples_leaf', 2), ('--max_depth', 5)):
        cli.add_argument(flag, type=int, default=fallback)

    # SageMaker paths: defaults are read from the SM_* environment variables
    for flag, env_name in (('--model_dir', 'SM_MODEL_DIR'), ('--train', 'SM_CHANNEL_TRAIN')):
        cli.add_argument(flag, type=str, default=os.environ.get(env_name))

    args = cli.parse_args()

    print(f"Training directory contents: {os.listdir(args.train)}")

    # Load data
    training_data_path = os.path.join(args.train, 'train.csv')
    print(f"Loading training data from: {training_data_path}")
    train_data = pd.read_csv(training_data_path)
    print(f"Training data shape: {train_data.shape}")

    # Everything except the 'target' column is a feature
    X = train_data.drop(columns=['target'])
    y = train_data['target']

    print("Training RandomForest model...")
    forest_settings = {
        'n_estimators': args.n_estimators,
        'min_samples_leaf': args.min_samples_leaf,
        'max_depth': args.max_depth,
        'random_state': 42,
    }
    model = RandomForestClassifier(**forest_settings)
    model.fit(X, y)
    print("Model training completed")

    # Save model
    model_path = os.path.join(args.model_dir, 'model.joblib')
    print(f"Saving model to: {model_path}")
    joblib.dump(model, model_path)

    # Save the feature names as one comma-separated line so that the
    # evaluation and inference scripts can select the same columns.
    feature_columns_path = os.path.join(args.model_dir, 'feature_columns.txt')
    print(f"Saving feature columns to: {feature_columns_path}")
    with open(feature_columns_path, 'w') as columns_file:
        columns_file.write(','.join(X.columns.tolist()))

    print(f"Model directory contents: {os.listdir(args.model_dir)}")
    print("Training complete")
"""

# Materialise the generated training script on local disk.
with open('train.py', 'w') as script_file:
    script_file.write(training_script)

# Upload it under <prefix>/scripts/ and keep its URI.
train_script_key = f"{s3_prefix}/scripts/train.py"
training_script_s3_uri = f's3://{bucket}/{train_script_key}'
boto3.Session().resource('s3').Bucket(bucket).upload_file('train.py', train_script_key)

# Step 4: Create evaluation script
evaluation_script = """
import argparse
import os
import sys
import glob
import pandas as pd
import numpy as np
import json
import joblib
import traceback
import tarfile
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def find_model_file(model_dir):
    '''Return the path of the serialized model for model_dir.

    A path that is not a directory is returned unchanged (the caller is
    assumed to have passed the model file itself). Inside a directory the
    search order is: model.joblib, the alphabetically first top-level
    *.joblib, then the *.joblib files inside the alphabetically first
    *.tar.gz (extracted into <model_dir>/extracted).

    Raises:
        FileNotFoundError: if model_dir is a directory without any model
            file. Previously the directory itself was returned, which made
            joblib.load fail with an unrelated error.
    '''
    if not os.path.isdir(model_dir):
        return model_dir

    preferred = os.path.join(model_dir, 'model.joblib')
    if os.path.exists(preferred):
        return preferred

    # sorted() makes the choice deterministic; glob order is unspecified.
    joblib_files = sorted(glob.glob(os.path.join(model_dir, '*.joblib')))
    if joblib_files:
        return joblib_files[0]

    tar_files = sorted(glob.glob(os.path.join(model_dir, '*.tar.gz')))
    if tar_files:
        extract_dir = os.path.join(model_dir, 'extracted')
        os.makedirs(extract_dir, exist_ok=True)
        with tarfile.open(tar_files[0], 'r:gz') as tar:
            tar.extractall(path=extract_dir)
        unpacked_preferred = os.path.join(extract_dir, 'model.joblib')
        if os.path.exists(unpacked_preferred):
            return unpacked_preferred
        joblib_files = sorted(glob.glob(os.path.join(extract_dir, '**/*.joblib'), recursive=True))
        if joblib_files:
            return joblib_files[0]

    raise FileNotFoundError(f"Could not find model file in {model_dir}")

def find_feature_columns(model_dir, model_path):
    '''Return the path of feature_columns.txt, or None when it is absent.

    model_dir is checked first, then the directory that contains model_path.
    '''
    for folder in (model_dir, os.path.dirname(model_path)):
        candidate = os.path.join(folder, 'feature_columns.txt')
        if os.path.exists(candidate):
            return candidate
    return None

if __name__ == '__main__':
    # Evaluation entry point: score <test_data>/test.csv with the trained
    # model and write the metrics to <output_dir>/evaluation.json. Any
    # exception is printed with its traceback and turned into exit status 1.
    try:
        cli = argparse.ArgumentParser()
        for flag in ('--model_path', '--test_data', '--output_dir'):
            cli.add_argument(flag, type=str, required=True)
        args = cli.parse_args()

        os.makedirs(args.output_dir, exist_ok=True)

        # Only the directory part of --model_path is searched for the model.
        model_dir = os.path.dirname(args.model_path)
        model_file = find_model_file(model_dir)
        model = joblib.load(model_file)
        print("Model loaded successfully.")

        # Feature names come from the columns file when one is available.
        feature_columns = None
        feature_columns_path = find_feature_columns(model_dir, model_file)
        if feature_columns_path:
            with open(feature_columns_path, 'r') as columns_file:
                feature_columns = columns_file.read().strip().split(',')

        test_data = pd.read_csv(os.path.join(args.test_data, 'test.csv'))

        # Fallback: every column except 'target' is treated as a feature.
        if not feature_columns:
            feature_columns = [name for name in test_data.columns if name != 'target']

        X_test = test_data[feature_columns]
        y_test = test_data['target']

        y_pred = model.predict(X_test)
        y_prob = model.predict_proba(X_test)[:, 1]

        # Cast to plain floats so the values are JSON serializable.
        metrics = {
            'accuracy': float(accuracy_score(y_test, y_pred)),
            'precision': float(precision_score(y_test, y_pred)),
            'recall': float(recall_score(y_test, y_pred)),
            'f1_score': float(f1_score(y_test, y_pred)),
            'roc_auc': float(roc_auc_score(y_test, y_prob))
        }

        # Save to evaluation.json
        eval_path = os.path.join(args.output_dir, 'evaluation.json')
        with open(eval_path, 'w') as report:
            json.dump(metrics, report)

        print("Evaluation metrics:", metrics)

    except Exception as e:
        print("Error during evaluation:", str(e))
        traceback.print_exc()
        sys.exit(1)
"""

# Materialise the generated evaluation script on local disk.
with open('evaluate.py', 'w') as script_file:
    script_file.write(evaluation_script)

# Upload it under <prefix>/scripts/ and keep its URI.
evaluate_script_key = f"{s3_prefix}/scripts/evaluate.py"
evaluation_script_s3_uri = f's3://{bucket}/{evaluate_script_key}'
boto3.Session().resource('s3').Bucket(bucket).upload_file('evaluate.py', evaluate_script_key)

# Step 5: Create inference script
inference_script = """
import os
import json
import pandas as pd
import joblib
import numpy as np
import glob
import tarfile

# Module-level state shared by the handlers below: model_fn assigns both
# names via `global`, and input_fn reads feature_columns to select columns.
model = None
feature_columns = None

def find_model_file(model_dir):
    '''Return the path of a *.joblib model in model_dir.

    Top-level *.joblib files are tried first; otherwise the first *.tar.gz
    in model_dir is extracted into model_dir/extracted and searched
    recursively.

    Raises:
        FileNotFoundError: if no model file is found.
    '''
    top_level = glob.glob(os.path.join(model_dir, '*.joblib'))
    if top_level:
        return top_level[0]

    archives = glob.glob(os.path.join(model_dir, '*.tar.gz'))
    if archives:
        extract_dir = os.path.join(model_dir, 'extracted')
        os.makedirs(extract_dir, exist_ok=True)
        with tarfile.open(archives[0], 'r:gz') as bundle:
            bundle.extractall(path=extract_dir)
        unpacked = glob.glob(os.path.join(extract_dir, '**/*.joblib'), recursive=True)
        if unpacked:
            return unpacked[0]

    raise FileNotFoundError("No model file found.")

def find_feature_columns(model_dir, model_path):
    '''Locate ``feature_columns.txt`` and return its path, or None if absent.

    ``model_dir`` is checked first, then the directory that contains
    ``model_path`` (which may differ when the model was unpacked from an archive).
    '''
    search_dirs = (model_dir, os.path.dirname(model_path))
    for folder in search_dirs:
        candidate = os.path.join(folder, 'feature_columns.txt')
        if os.path.exists(candidate):
            return candidate
    return None

def model_fn(model_dir):
    '''Load the model from ``model_dir`` and cache it in module globals.

    Also populates the module-level ``feature_columns`` list when a
    ``feature_columns.txt`` file (comma-separated column names) is found;
    otherwise ``feature_columns`` is left unchanged.

    Returns the loaded model object.
    '''
    global model, feature_columns
    artifact_path = find_model_file(model_dir)
    model = joblib.load(artifact_path)

    columns_file = find_feature_columns(model_dir, artifact_path)
    if not columns_file:
        return model

    with open(columns_file, 'r') as handle:
        raw_columns = handle.read()
    feature_columns = raw_columns.strip().split(',')
    return model

def input_fn(request_body, request_content_type):
    '''Parse a request payload into a DataFrame of model features.

    Parameters
    ----------
    request_body : str or bytes
        Raw payload. Bytes are decoded as UTF-8 before parsing.
    request_content_type : str
        Either 'application/json' (anything ``pd.DataFrame`` accepts once
        JSON-decoded) or 'text/csv' (first row is read as the header).

    Returns
    -------
    pandas.DataFrame
        Only the columns listed in the module-level ``feature_columns`` when
        that list is set; otherwise every column except common label names.

    Raises
    ------
    ValueError
        If the content type is neither JSON nor CSV.
    '''
    # Local import: the script's import block does not bring in ``io``.
    import io

    # NOTE(review): the payload may arrive as bytes depending on the serving
    # stack; decoding here keeps both parsers working with either type.
    if isinstance(request_body, (bytes, bytearray)):
        request_body = request_body.decode('utf-8')

    if request_content_type == 'application/json':
        df = pd.DataFrame(json.loads(request_body))
    elif request_content_type == 'text/csv':
        # pandas exposes no ``pd.io.StringIO``; use the standard-library buffer.
        df = pd.read_csv(io.StringIO(request_body))
    else:
        raise ValueError("Unsupported content type.")

    if feature_columns:
        return df[feature_columns]

    # No recorded feature list: drop anything that looks like a label column.
    exclude_cols = ['target', 'label', 'class', 'y', 'output']
    return df[[col for col in df.columns if col not in exclude_cols]]

def predict_fn(input_data, model):
    '''Score ``input_data`` with ``model``.

    When the model exposes ``predict_proba``, column 1 of its output is
    returned (presumably the positive-class probability; confirm against the
    trained estimator). Otherwise the result of ``predict`` is returned.
    '''
    if not hasattr(model, 'predict_proba'):
        return model.predict(input_data)
    probabilities = model.predict_proba(input_data)
    return probabilities[:, 1]

def output_fn(prediction, response_content_type):
    '''Serialize ``prediction`` as a JSON document.

    ``prediction`` must provide ``tolist()`` (e.g. a NumPy array). The result
    has the form {'predictions': [...]}. Only 'application/json' is
    supported; any other content type raises ValueError.
    '''
    if response_content_type != 'application/json':
        raise ValueError("Unsupported content type.")
    payload = {'predictions': prediction.tolist()}
    return json.dumps(payload)
"""

# Write the inference script to the working directory so it can be uploaded.
with open('inference.py', 'w') as f:
    f.write(inference_script)

# Upload the script under the pipeline's S3 prefix and record its URI.
inference_script_key = f"{s3_prefix}/scripts/inference.py"
inference_script_s3_uri = f's3://{bucket}/{inference_script_key}'
inference_script_bucket = boto3.Session().resource('s3').Bucket(bucket)
inference_script_bucket.Object(inference_script_key).upload_file('inference.py')

# Step 6: Define the training step
print("\nSetting up training step...")

# Hyperparameters handed to the estimator for the train.py entry point.
training_hyperparameters = {
    'n_estimators': 10,
    'min_samples_leaf': 2,
    'max_depth': 5,
}

# NOTE(review): `script_uri` is passed through as an extra keyword argument;
# confirm the SKLearn estimator actually consumes it (the entry point itself
# is the local train.py).
sklearn_estimator = SKLearn(
    entry_point='train.py',
    source_dir=None,
    script_uri=training_script_s3_uri,
    role=role,
    instance_type='ml.m5.xlarge',
    instance_count=1,
    framework_version='0.23-1',
    hyperparameters=training_hyperparameters,
    sagemaker_session=pipeline_session
)

# Single "train" channel fed from the CSV uploaded earlier.
train_channel = TrainingInput(s3_data=s3_train_data, content_type="text/csv")

train_step = TrainingStep(
    name="ModelTraining",
    estimator=sklearn_estimator,
    inputs={"train": train_channel}
)

# Step 7: Define the evaluation step with a property file for metrics
print("\nSetting up evaluation step...")

# Makes evaluation.json from the "evaluation" output addressable by the pipeline.
evaluation_report = PropertyFile(
    name="EvaluationReport",
    output_name="evaluation",
    path="evaluation.json"
)

evaluation_image_uri = sagemaker.image_uris.retrieve(
    framework='sklearn',
    region=region,
    version='0.23-1'
)

sklearn_processor = ScriptProcessor(
    image_uri=evaluation_image_uri,
    role=role,
    instance_type='ml.m5.xlarge',
    instance_count=1,
    base_job_name='model-evaluation',
    command=['python3'],
    sagemaker_session=pipeline_session
)

# The trained model artifact and the held-out test set are mounted into the container.
evaluation_inputs = [
    ProcessingInput(
        source=train_step.properties.ModelArtifacts.S3ModelArtifacts,
        destination="/opt/ml/processing/model"
    ),
    ProcessingInput(
        source=s3_test_data,
        destination="/opt/ml/processing/test"
    ),
]

# Whatever the script writes to this directory becomes the "evaluation" output.
evaluation_outputs = [
    ProcessingOutput(
        output_name="evaluation",
        source="/opt/ml/processing/evaluation"
    ),
]

# NOTE(review): the training artifact is typically a model.tar.gz archive, so
# model.joblib may not exist at this path unless evaluate.py unpacks it; confirm
# against the script.
evaluation_arguments = [
    '--model_path', '/opt/ml/processing/model/model.joblib',
    '--test_data', '/opt/ml/processing/test',
    '--output_dir', '/opt/ml/processing/evaluation',
]

evaluation_step = ProcessingStep(
    name="ModelEvaluation",
    processor=sklearn_processor,
    inputs=evaluation_inputs,
    outputs=evaluation_outputs,
    code=evaluation_script_s3_uri,
    job_arguments=evaluation_arguments,
    property_files=[evaluation_report]
)

# Step 8: Define the model creation step (used if you want an immediate model object)
print("\nSetting up model creation step...")

inference_image_uri = sagemaker.image_uris.retrieve(
    framework='sklearn',
    region=region,
    version='0.23-1'
)

# Resolved at execution time to the artifact produced by the training step.
trained_model_artifacts = train_step.properties.ModelArtifacts.S3ModelArtifacts

model = Model(
    image_uri=inference_image_uri,
    model_data=trained_model_artifacts,
    role=role,
    sagemaker_session=pipeline_session,
    entry_point='inference.py',
    source_dir=None,
    code_location=f's3://{bucket}/{s3_prefix}/code'
)

# With a pipeline session, create() yields step arguments instead of creating the model now.
model_create_args = model.create(instance_type="ml.m5.large")

create_model_step = ModelStep(
    name="DiabetesPrediction-V2",
    step_args=model_create_args
)

# Step 9: Register the model in the Model Registry (automatically approved here)
print("\nSetting up RegisterModel step...")

# S3 location of the evaluation step's "evaluation" output (resolved at run time).
evaluation_output_uri = (
    evaluation_step.properties
    .ProcessingOutputConfig.Outputs["evaluation"]
    .S3Output.S3Uri
)
print(evaluation_output_uri.expr)

# Full URI of evaluation.json inside that output location.
evaluation_json_uri = Join(on='/', values=[evaluation_output_uri, "evaluation.json"])

model_metrics = ModelMetrics(
    model_statistics=MetricsSource(
        s3_uri=evaluation_json_uri,
        content_type="application/json"
    )
)

register_model_step = RegisterModel(
    name="RegisterModelStep",
    estimator=sklearn_estimator,
    model_data=train_step.properties.ModelArtifacts.S3ModelArtifacts,
    content_types=["text/csv", "application/json"],
    response_types=["text/csv", "application/json"],
    inference_instances=["ml.m5.large"],
    transform_instances=["ml.m5.large"],
    model_package_group_name="DiabetesModelPackageGroup",
    model_metrics=model_metrics,
    # Every registered version is approved immediately, with no manual gate.
    approval_status="Approved"
)

# Assemble the pipeline steps (train, evaluate, create model, register).
# Deployment is deliberately not a pipeline step; it is run separately once an
# execution has succeeded.
pipeline_name = "DiabetesMLPipeline-V2"
print(f"\nCreating pipeline: {pipeline_name}")

pipeline = Pipeline(
    name=pipeline_name,
    parameters=[],
    steps=[train_step, evaluation_step, create_model_step, register_model_step],
    sagemaker_session=pipeline_session
)

# Create the pipeline in SageMaker, or update it if it already exists.
# upsert() keeps this cell re-runnable: create() fails once a pipeline with
# this name exists, which would leave a stale definition behind.
print("\nCreating the pipeline in SageMaker...")
try:
    response = pipeline.upsert(role_arn=role)
except Exception as e:
    # Report the failure without claiming that setup succeeded.
    print(f"Error creating pipeline: {e}")
    print("\nPipeline setup did NOT complete; fix the error above and re-run this cell.")
else:
    print(f"Pipeline ARN: {response}")

    # The pipeline is only defined here; starting it is left as an explicit step.
    print("\nIMPORTANT: The pipeline has been created but not started.")
    print("To start the pipeline execution, run:\n")
    print("execution = pipeline.start()")
    print("\nThe steps will train the model, evaluate it, and register it (with auto-approval).")

    print("\nPipeline setup complete!")

# SEPARATE DEPLOYMENT CODE
# This code should be run AFTER the pipeline has completed successfully
# It will deploy the latest approved model from the model registry

def deploy_latest_approved_model(model_package_group_name="DiabetesModelPackageGroup",
                                 endpoint_name=None,
                                 instance_type="ml.m5.large",
                                 initial_instance_count=1):
    """
    Deploy the newest approved model package of a group to a real-time endpoint.

    The most recently created package with approval status 'Approved' is looked
    up in the model registry and deployed using the notebook's execution role.

    Parameters:
    -----------
    model_package_group_name : str
        Model package group to pick the model from.
    endpoint_name : str, optional
        Endpoint name to use. When None, a name of the form
        ``diabetes-endpoint-<timestamp>`` is generated.
    instance_type : str
        SageMaker instance type backing the endpoint.
    initial_instance_count : int
        Number of instances to start the endpoint with.

    Returns:
    --------
    str
        Name of the endpoint the model was deployed to.

    Raises:
    -------
    Exception
        If the group contains no approved model package.
    """
    registry_client = boto3.client('sagemaker')

    # Newest approved package first; only that single entry is needed.
    listing = registry_client.list_model_packages(
        ModelPackageGroupName=model_package_group_name,
        ModelApprovalStatus='Approved',
        SortBy='CreationTime',
        SortOrder='Descending',
        MaxResults=1
    )
    approved_packages = listing['ModelPackageSummaryList']
    if not approved_packages:
        raise Exception(f"No approved models found in model package group: {model_package_group_name}")

    model_package_arn = approved_packages[0]['ModelPackageArn']
    print(f"Latest approved model package ARN: {model_package_arn}")

    # Fall back to a timestamped name so repeated deployments do not collide.
    endpoint_name = (
        endpoint_name
        if endpoint_name is not None
        else f"diabetes-endpoint-{time.strftime('%Y-%m-%d-%H-%M-%S')}"
    )

    # A regular (non-pipeline) session, so deploy() creates resources immediately.
    deployable = ModelPackage(
        role=role,
        model_package_arn=model_package_arn,
        sagemaker_session=sagemaker.Session()
    )

    print(f"Deploying model to endpoint: {endpoint_name}")
    deployable.deploy(
        initial_instance_count=initial_instance_count,
        instance_type=instance_type,
        endpoint_name=endpoint_name
    )

    print(f"Model deployed successfully to endpoint: {endpoint_name}")
    return endpoint_name

# Example usage (uncomment when ready to deploy).
# These examples are kept as comments rather than bare string literals: a
# string as the last expression of a cell is echoed back as the cell's output.
#
# # Start the pipeline and wait for it to complete
# execution = pipeline.start()
# execution.wait()
#
# # Check if the pipeline execution succeeded
# if execution.describe()['PipelineExecutionStatus'] == 'Succeeded':
#     # Deploy the latest approved model
#     endpoint_name = deploy_latest_approved_model()
#     print(f"Model deployed to endpoint: {endpoint_name}")
# else:
#     print("Pipeline execution failed. Check the SageMaker console for more details.")

# Alternatively, if you already have a successful pipeline execution:
#
# endpoint_name = deploy_latest_approved_model()
# print(f"Model deployed to endpoint: {endpoint_name}")

SageMaker Role: arn:aws:iam::533267367615:role/CFN-SM-IM-Lambda-catalog-SageMakerExecutionRole-SytEij7wtq5A
SageMaker Session Region: us-east-1
Default S3 bucket: sagemaker-us-east-1-533267367615
Loading diabetes dataset from S3...

Dataset columns: ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'target']

Sample of the data:
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  target  
0                     0.627   50       1  
1                     0.351   31       0  
2                     0.672   32       1  



Setting up model creation step...



Setting up RegisterModel step...
{'Get': "Steps.ModelEvaluation.ProcessingOutputConfig.Outputs['evaluation'].S3Output.S3Uri"}

Creating pipeline: DiabetesMLPipeline-V2

Creating the pipeline in SageMaker...




Pipeline ARN: {'PipelineArn': 'arn:aws:sagemaker:us-east-1:533267367615:pipeline/DiabetesMLPipeline-V2', 'ResponseMetadata': {'RequestId': '1374221d-b001-4735-8ef7-16053928d6ec', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '1374221d-b001-4735-8ef7-16053928d6ec', 'content-type': 'application/x-amz-json-1.1', 'content-length': '89', 'date': 'Tue, 08 Apr 2025 18:35:49 GMT'}, 'RetryAttempts': 0}}

IMPORTANT: The pipeline has been created but not started.
To start the pipeline execution, run:

execution = pipeline.start()

The steps will train the model, evaluate it, and register it (with auto-approval).

Pipeline setup complete!


'\nendpoint_name = deploy_latest_approved_model()\nprint(f"Model deployed to endpoint: {endpoint_name}")\n'

**Once the pipeline execution has completed successfully, run the following cell to deploy the latest approved model from the Model Registry to a new endpoint:**

In [None]:
# Deploy the newest approved model package; with no endpoint name given,
# a timestamped endpoint name is generated by the helper.
endpoint_name = deploy_latest_approved_model()
print(f"Model deployed to endpoint: {endpoint_name}")

Latest approved model package ARN: arn:aws:sagemaker:us-east-1:533267367615:model-package/DiabetesModelPackageGroup/1
Deploying model to endpoint: diabetes-endpoint-2025-04-08-18-42-32


---