In [None]:
# ============================================================
# Environment Setup
# ============================================================

import sys
import os
import warnings
warnings.filterwarnings('ignore')

project_root = os.path.abspath('../..')
if project_root not in sys.path:
    sys.path.append(project_root)

import sagemaker
import boto3
from sagemaker import get_execution_role
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.steps import ProcessingStep, TrainingStep, CreateModelStep
from sagemaker.workflow.step_collections import RegisterModel
from sagemaker.workflow.conditions import ConditionGreaterThanOrEqualTo
from sagemaker.workflow.condition_step import ConditionStep
from sagemaker.workflow.functions import JsonGet, Join
from sagemaker.workflow.parameters import ParameterString, ParameterFloat, ParameterInteger
from sagemaker.workflow.properties import PropertyFile
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.model_metrics import MetricsSource, ModelMetrics
import pandas as pd
import json
import time

# Resolve the execution role, SageMaker session, S3 bucket and region.
# The project-local helper is used when it can be imported; otherwise the
# values come from the SageMaker SDK defaults.
try:
    from utils.sagemaker_config import get_sagemaker_config

    config = get_sagemaker_config(s3_prefix='lab7-pipelines')
    role, session, bucket, region = (
        config['role'],
        config['session'],
        config['bucket'],
        config['region'],
    )
except ImportError:
    role = get_execution_role()
    session = sagemaker.Session()
    bucket, region = session.default_bucket(), session.boto_region_name

# Echo the resolved configuration, one item per line.
for summary_line in (
    "Configuration complete",
    f"Region: {region}",
    f"S3 Bucket: s3://{bucket}",
    f"Role: {role}",
):
    print(summary_line)

# boto3 SageMaker client bound to the resolved region; later cells use it
# for the Model Registry and pipeline-execution API calls.
sm_client = boto3.client('sagemaker', region_name=region)


---

## Section 1: Préparer les Scripts de Pipeline


In [None]:
# ============================================================
# Create the processing and training scripts
# ============================================================

# Preprocessing script, kept as a string so it can be written to
# pipeline_scripts/preprocessing.py further down in this cell.
# When run, the script:
#   - builds a seeded (np.random.seed(42)) synthetic dataset of 5000 rows:
#     10 random features plus 5 noise features scaled by 0.1, with a binary
#     target derived from the first three features;
#   - splits it with train_test_split using --test-size (default 0.2);
#   - writes train.csv and test.csv under /opt/ml/processing/train and
#     /opt/ml/processing/test.
# NOTE(review): the script imports StandardScaler and joblib but never uses them.
preprocessing_script = """
import argparse
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import joblib

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--test-size', type=float, default=0.2)
    args = parser.parse_args()
    
    # Generate synthetic data
    np.random.seed(42)
    n_samples = 5000
    
    X = np.random.randn(n_samples, 10)
    y = (X[:, 0] + X[:, 1] - X[:, 2] > 0).astype(int)
    
    # Add noise features
    noise = np.random.randn(n_samples, 5) * 0.1
    X = np.hstack([X, noise])
    
    # Create DataFrame
    feature_cols = [f'feature_{i}' for i in range(X.shape[1])]
    df = pd.DataFrame(X, columns=feature_cols)
    df['target'] = y
    
    # Split data
    train_df, test_df = train_test_split(df, test_size=args.test_size, random_state=42)
    
    print(f"Train size: {len(train_df)}")
    print(f"Test size: {len(test_df)}")
    
    # Save processed data
    os.makedirs('/opt/ml/processing/train', exist_ok=True)
    os.makedirs('/opt/ml/processing/test', exist_ok=True)
    
    train_df.to_csv('/opt/ml/processing/train/train.csv', index=False)
    test_df.to_csv('/opt/ml/processing/test/test.csv', index=False)
    
    print("Preprocessing complete")
"""

# Training script, kept as a string so it can be written to
# pipeline_scripts/training.py further down in this cell.
# When run, the script:
#   - reads train.csv / test.csv from the directories given by --train and
#     --test (defaults taken from SM_CHANNEL_TRAIN / SM_CHANNEL_TEST);
#   - fits a RandomForestClassifier with --n-estimators and --max-depth;
#   - saves the fitted model as model.pkl under --model-dir
#     (default SM_MODEL_DIR);
#   - writes train/test metrics to evaluation.json under --output-data-dir
#     (default SM_OUTPUT_DATA_DIR).
training_script = """
import argparse
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import joblib
import json

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--n-estimators', type=int, default=100)
    parser.add_argument('--max-depth', type=int, default=10)
    parser.add_argument('--model-dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
    parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN'))
    parser.add_argument('--test', type=str, default=os.environ.get('SM_CHANNEL_TEST'))
    parser.add_argument('--output-data-dir', type=str, default=os.environ.get('SM_OUTPUT_DATA_DIR'))
    
    args = parser.parse_args()
    
    # Load data
    train_df = pd.read_csv(os.path.join(args.train, 'train.csv'))
    test_df = pd.read_csv(os.path.join(args.test, 'test.csv'))
    
    X_train = train_df.drop('target', axis=1)
    y_train = train_df['target']
    X_test = test_df.drop('target', axis=1)
    y_test = test_df['target']
    
    # Train model
    print(f"Training with n_estimators={args.n_estimators}, max_depth={args.max_depth}")
    model = RandomForestClassifier(
        n_estimators=args.n_estimators,
        max_depth=args.max_depth,
        random_state=42,
        n_jobs=-1
    )
    model.fit(X_train, y_train)
    
    # Evaluate
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)
    
    metrics = {
        'train_accuracy': float(accuracy_score(y_train, train_pred)),
        'test_accuracy': float(accuracy_score(y_test, test_pred)),
        'test_precision': float(precision_score(y_test, test_pred)),
        'test_recall': float(recall_score(y_test, test_pred)),
        'test_f1': float(f1_score(y_test, test_pred))
    }
    
    print(f"Metrics: {metrics}")
    
    # Save model
    joblib.dump(model, os.path.join(args.model_dir, 'model.pkl'))
    
    # Save metrics for pipeline evaluation
    os.makedirs(args.output_data_dir, exist_ok=True)
    with open(os.path.join(args.output_data_dir, 'evaluation.json'), 'w') as f:
        json.dump(metrics, f)
    
    print("Training complete")
"""

# Evaluation script, kept as a string so it can be written to
# pipeline_scripts/evaluation.py further down in this cell.
# When run, the script:
#   - extracts model.tar.gz found under --model-dir into /tmp/model and loads
#     model.pkl from it;
#   - predicts on test.csv read from --test-data;
#   - writes evaluation.json under --output-dir. Its layout
#     (classification_metrics.accuracy.value) is the JSON path read by the
#     JsonGet in the conditional-step cell, so the two must stay in sync.
evaluation_script = """
import argparse
import os
import pandas as pd
import json
import joblib
import tarfile
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--model-dir', type=str, default='/opt/ml/processing/model')
    parser.add_argument('--test-data', type=str, default='/opt/ml/processing/test')
    parser.add_argument('--output-dir', type=str, default='/opt/ml/processing/evaluation')
    
    args = parser.parse_args()
    
    # Extract model from tar.gz
    model_path = os.path.join(args.model_dir, 'model.tar.gz')
    extract_dir = '/tmp/model'
    os.makedirs(extract_dir, exist_ok=True)
    
    with tarfile.open(model_path, 'r:gz') as tar:
        tar.extractall(path=extract_dir)
    
    # Load model
    model = joblib.load(os.path.join(extract_dir, 'model.pkl'))
    
    # Load test data
    test_df = pd.read_csv(os.path.join(args.test_data, 'test.csv'))
    X_test = test_df.drop('target', axis=1)
    y_test = test_df['target']
    
    # Predict
    predictions = model.predict(X_test)
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, predictions)
    report = classification_report(y_test, predictions, output_dict=True)
    conf_matrix = confusion_matrix(y_test, predictions).tolist()
    
    evaluation_metrics = {
        'classification_metrics': {
            'accuracy': {'value': accuracy},
            'precision': {'value': report['1']['precision']},
            'recall': {'value': report['1']['recall']},
            'f1_score': {'value': report['1']['f1-score']}
        },
        'confusion_matrix': conf_matrix
    }
    
    print(f"Evaluation metrics: {evaluation_metrics}")
    
    # Save evaluation report
    os.makedirs(args.output_dir, exist_ok=True)
    with open(os.path.join(args.output_dir, 'evaluation.json'), 'w') as f:
        json.dump(evaluation_metrics, f)
    
    print("Evaluation complete")
"""

# Write the three script sources to disk so the pipeline steps can reference
# them by path.
os.makedirs('pipeline_scripts', exist_ok=True)

script_sources = (
    ('pipeline_scripts/preprocessing.py', preprocessing_script),
    ('pipeline_scripts/training.py', training_script),
    ('pipeline_scripts/evaluation.py', evaluation_script),
)

for script_path, script_source in script_sources:
    with open(script_path, 'w') as script_file:
        script_file.write(script_source)

# Summary of the files that were written.
print("Scripts cr√©√©s:")
for script_path, _ in script_sources:
    print(f"- {script_path}")

---

## Section 2: Définir les Paramètres du Pipeline


In [None]:
# ============================================================
# Pipeline parameters
# ============================================================

# Input parameters of the pipeline; each one can be overridden when an
# execution is started.
processing_instance_type = ParameterString(
    name="ProcessingInstanceType",
    default_value="ml.t3.medium",  # Changed from ml.m5.large due to quota limits
)
training_instance_type = ParameterString(
    name="TrainingInstanceType",
    default_value="ml.m5.xlarge",  # Training often has higher quotas
)

# Random-forest hyperparameters forwarded to the training script.
n_estimators = ParameterInteger(name="NEstimators", default_value=100)
max_depth = ParameterInteger(name="MaxDepth", default_value=10)

# Train/test split ratio and the accuracy gate used by the condition step.
test_size = ParameterFloat(name="TestSize", default_value=0.2)
accuracy_threshold = ParameterFloat(name="AccuracyThreshold", default_value=0.75)

# Approval status given to the model package at registration time.
model_approval_status = ParameterString(
    name="ModelApprovalStatus",
    default_value="PendingManualApproval",
)

# Print every parameter with its default value (the printed label is the
# parameter's own name).
print("Param√®tres du pipeline d√©finis:")
for pipeline_parameter in (
    processing_instance_type,
    training_instance_type,
    n_estimators,
    max_depth,
    test_size,
    accuracy_threshold,
    model_approval_status,
):
    print(f"- {pipeline_parameter.name}: {pipeline_parameter.default_value}")


---

## Section 3: Étape de Preprocessing


In [None]:
# ============================================================
# Step 1: Processing
# ============================================================

# Processor that runs the preprocessing script in the scikit-learn container.
# The instance type is a pipeline parameter, resolved at execution time.
sklearn_processor = SKLearnProcessor(
    framework_version='1.2-1',
    role=role,
    instance_type=processing_instance_type,
    instance_count=1,
    base_job_name='pipeline-preprocessing',
    sagemaker_session=session
)

# Processing step producing the 'train' and 'test' outputs consumed by the
# training and evaluation steps.
step_process = ProcessingStep(
    name="PreprocessData",
    processor=sklearn_processor,
    code='pipeline_scripts/preprocessing.py',
    job_arguments=[
        # Container arguments must be strings. to_string() keeps the TestSize
        # parameter as a runtime reference, so an override passed when the
        # execution starts is honoured. str(test_size.default_value) would
        # freeze the default (0.2) into the pipeline definition instead.
        '--test-size', test_size.to_string()
    ],
    outputs=[
        sagemaker.processing.ProcessingOutput(
            output_name='train',
            source='/opt/ml/processing/train',
            destination=f's3://{bucket}/pipeline-data/train'
        ),
        sagemaker.processing.ProcessingOutput(
            output_name='test',
            source='/opt/ml/processing/test',
            destination=f's3://{bucket}/pipeline-data/test'
        )
    ]
)

print("√âtape de preprocessing cr√©√©e")
print(f"Nom: {step_process.name}")
print("Outputs: train, test")

---

## Section 4: √âtape de Training


In [None]:
# ============================================================
# Step 2: Training
# ============================================================

# Hyperparameters are pipeline parameters, so they can differ per execution.
training_hyperparameters = {
    'n-estimators': n_estimators,
    'max-depth': max_depth
}

# scikit-learn estimator running pipeline_scripts/training.py.
sklearn_estimator = SKLearn(
    entry_point='pipeline_scripts/training.py',
    framework_version='1.2-1',
    instance_type=training_instance_type,
    instance_count=1,
    role=role,
    base_job_name='pipeline-training',
    sagemaker_session=session,
    hyperparameters=training_hyperparameters,
    output_path=f's3://{bucket}/pipeline-models'
)

# One CSV channel per preprocessing output; the channel name matches the
# processing output name it reads from.
processing_outputs = step_process.properties.ProcessingOutputConfig.Outputs
training_channels = {
    channel_name: sagemaker.inputs.TrainingInput(
        s3_data=processing_outputs[channel_name].S3Output.S3Uri,
        content_type='text/csv'
    )
    for channel_name in ('train', 'test')
}

step_train = TrainingStep(
    name="TrainModel",
    estimator=sklearn_estimator,
    inputs=training_channels
)

print("√âtape de training cr√©√©e")
print(f"Nom: {step_train.name}")
print("Framework: scikit-learn")
print(f"Hyperparam√®tres: n_estimators={n_estimators.default_value}, max_depth={max_depth.default_value}")

---

## Section 5: Étape d'Évaluation


In [None]:
# ============================================================
# Step 3: Evaluation
# ============================================================

# Processor for the evaluation script. It uses the ProcessingInstanceType
# pipeline parameter (default 'ml.t3.medium') rather than a hard-coded type,
# so both processing jobs of the pipeline follow the same override.
evaluation_processor = SKLearnProcessor(
    framework_version='1.2-1',
    role=role,
    instance_type=processing_instance_type,
    instance_count=1,
    base_job_name='pipeline-evaluation',
    sagemaker_session=session
)

# Declares evaluation.json (from the 'evaluation' output) as a property file
# so that the condition step can read values from it with JsonGet.
evaluation_report = PropertyFile(
    name="EvaluationReport",
    output_name="evaluation",
    path="evaluation.json"
)

# Evaluation step: receives the trained model artifact and the test split,
# and publishes evaluation.json.
step_eval = ProcessingStep(
    name="EvaluateModel",
    processor=evaluation_processor,
    code='pipeline_scripts/evaluation.py',
    inputs=[
        sagemaker.processing.ProcessingInput(
            source=step_train.properties.ModelArtifacts.S3ModelArtifacts,
            destination='/opt/ml/processing/model',
            input_name='model'
        ),
        sagemaker.processing.ProcessingInput(
            source=step_process.properties.ProcessingOutputConfig.Outputs['test'].S3Output.S3Uri,
            destination='/opt/ml/processing/test',
            input_name='test'
        )
    ],
    outputs=[
        sagemaker.processing.ProcessingOutput(
            output_name='evaluation',
            source='/opt/ml/processing/evaluation',
            destination=f's3://{bucket}/pipeline-evaluation'
        )
    ],
    property_files=[evaluation_report]
)

print("√âtape d'√©valuation cr√©√©e")
print(f"Nom: {step_eval.name}")
print(f"Property file: {evaluation_report.name}")

---

## Section 6: Enregistrement dans Model Registry


In [None]:
# ============================================================
# Step 4: Register Model
# ============================================================

model_package_group_name = 'pipeline-fraud-detection-models'

# Create the Model Package Group if it does not exist yet.
# A failed create is only reported as "already exists" when the group can
# actually be described. Any other failure (permissions, throttling, ...)
# surfaces as an exception instead of being silently reported as an existing
# group, which is what the former bare `except:` did.
try:
    sm_client.create_model_package_group(
        ModelPackageGroupName=model_package_group_name,
        ModelPackageGroupDescription='Models from SageMaker Pipeline'
    )
    print(f"Model Package Group cr√©√©: {model_package_group_name}")
except sm_client.exceptions.ClientError:
    # Raises if the group is really missing or inaccessible; the original
    # create error stays visible in the chained traceback.
    sm_client.describe_model_package_group(
        ModelPackageGroupName=model_package_group_name
    )
    print(f"Model Package Group existe d√©j√†: {model_package_group_name}")

# Model metrics attached to the registered model package.
# Join() is required because the evaluation output URI is a pipeline property
# that is only known at execution time; it cannot be concatenated with "+" or
# an f-string.
model_metrics = ModelMetrics(
    model_statistics=MetricsSource(
        s3_uri=Join(
            on="/",
            values=[
                step_eval.properties.ProcessingOutputConfig.Outputs['evaluation'].S3Output.S3Uri,
                "evaluation.json"
            ]
        ),
        content_type="application/json"
    )
)

# Registration step: publishes the trained model artifact to the Model
# Package Group with the approval status given by the pipeline parameter.
step_register = RegisterModel(
    name="RegisterModel",
    estimator=sklearn_estimator,
    model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
    content_types=["text/csv"],
    response_types=["text/csv"],
    inference_instances=["ml.c5.xlarge", "ml.m5.xlarge"],
    transform_instances=["ml.c5.xlarge"],
    model_package_group_name=model_package_group_name,
    approval_status=model_approval_status,
    model_metrics=model_metrics
)

print("√âtape d'enregistrement cr√©√©e")
print(f"Model Package Group: {model_package_group_name}")
print(f"Approval status: {model_approval_status.default_value}")


---

## Section 7: Condition de Déploiement


In [None]:
# ============================================================
# Step 5: Conditional Step
# ============================================================

# Accuracy value read from the evaluation step's property file at run time.
reported_accuracy = JsonGet(
    step_name=step_eval.name,
    property_file=evaluation_report,
    json_path="classification_metrics.accuracy.value"
)

# Condition: register the model only if accuracy >= threshold.
cond_gte_threshold = ConditionGreaterThanOrEqualTo(
    left=reported_accuracy,
    right=accuracy_threshold
)

# When the condition holds, run the registration step; otherwise do nothing.
step_cond = ConditionStep(
    name="CheckAccuracyThreshold",
    conditions=[cond_gte_threshold],
    if_steps=[step_register],
    else_steps=[]
)

print(
    "Condition step cr√©√©e",
    f"Condition: accuracy >= {accuracy_threshold.default_value}",
    "If true: enregistrer le mod√®le",
    "If false: skip registration",
    sep="\n",
)

---

## Section 8: Créer et Exécuter le Pipeline


In [None]:
# ============================================================
# Build the pipeline
# ============================================================

# The name carries the current Unix timestamp, so each run of this cell
# defines a distinct pipeline.
pipeline_name = f'fraud-detection-pipeline-{int(time.time())}'

pipeline_parameters = [
    processing_instance_type,
    training_instance_type,
    n_estimators,
    max_depth,
    test_size,
    accuracy_threshold,
    model_approval_status
]

# Top-level steps only: the registration step is nested inside the condition
# step and is therefore not listed here.
top_level_steps = [
    step_process,
    step_train,
    step_eval,
    step_cond
]

pipeline = Pipeline(
    name=pipeline_name,
    parameters=pipeline_parameters,
    steps=top_level_steps,
    sagemaker_session=session
)

print(f"Pipeline cr√©√©: {pipeline_name}")
print("\n√âtapes du pipeline:")
for position, top_level_step in enumerate(top_level_steps, start=1):
    print(f"{position}. {top_level_step.name}")
print(f"   ‚îî‚îÄ {step_register.name} (si accuracy >= threshold)")

In [None]:
# ============================================================
# Create or update the pipeline
# ============================================================

# Upsert: creates the pipeline in SageMaker, or updates it if it already exists.
pipeline.upsert(role_arn=role)
print(f"\nPipeline '{pipeline_name}' cr√©√© dans SageMaker")

# Parse the generated JSON definition and report its size.
pipeline_definition = json.loads(pipeline.definition())
step_count = len(pipeline_definition['Steps'])
parameter_count = len(pipeline_definition['Parameters'])
print(f"\nNombre d'√©tapes: {step_count}")
print(f"Nombre de param√®tres: {parameter_count}")

In [None]:
# ============================================================
# Run the pipeline
# ============================================================

# Parameter overrides for this execution; parameters not listed here keep
# their default values.
first_run_overrides = {
    'NEstimators': 50,
    'MaxDepth': 10,
    'AccuracyThreshold': 0.7
}
execution = pipeline.start(parameters=first_run_overrides)

# The execution name is the last path segment of its ARN.
execution_name = execution.arn.rsplit('/', 1)[-1]

print("\nEx√©cution du pipeline d√©marr√©e")
print(f"Execution ARN: {execution.arn}")
print(f"Nom: {execution_name}")
print("\nVous pouvez suivre l'ex√©cution dans la console SageMaker:")
print(f"https://console.aws.amazon.com/sagemaker/home?region={region}#/pipelines/{pipeline_name}/executions")

In [None]:
# ============================================================
# Wait for the execution to finish (optional)
# ============================================================


def _print_failed_job_details(step_metadata):
    """Print the job name and failure reason behind a failed pipeline step.

    Handles steps backed by a processing job or a training job; other step
    kinds print nothing. The describe call is best-effort: a lookup failure
    is reported on stdout instead of being silently ignored, and it never
    interrupts the diagnostic loop.

    Args:
        step_metadata: the 'Metadata' dict of one entry returned by
            execution.list_steps().
    """
    job_kinds = (
        ('ProcessingJob', 'Processing Job', sm_client.describe_processing_job, 'ProcessingJobName'),
        ('TrainingJob', 'Training Job', sm_client.describe_training_job, 'TrainingJobName'),
    )
    for metadata_key, label, describe_job, name_argument in job_kinds:
        if metadata_key not in step_metadata:
            continue

        # The job name is the last path segment of the job ARN.
        job_name = step_metadata[metadata_key]['Arn'].split('/')[-1]
        print(f"   {label}: {job_name}")

        try:
            job_details = describe_job(**{name_argument: job_name})
        except Exception as describe_error:
            print(f"   Description du job indisponible: {describe_error}")
        else:
            if 'FailureReason' in job_details:
                print(f"   D√©tails: {job_details['FailureReason']}")
        return


print("\nAttente de la fin de l'ex√©cution...")

try:
    execution.wait()
    print("\nEx√©cution termin√©e!")
    print(f"Status: {execution.describe()['PipelineExecutionStatus']}")
except Exception as e:
    print(f"\n‚ö†Ô∏è  Erreur lors de l'ex√©cution du pipeline: {e}")
    print("\nDiagnostic des erreurs...")

    # Overall status of the execution
    execution_details = execution.describe()
    print(f"\nStatus: {execution_details['PipelineExecutionStatus']}")

    # Walk through the steps and report the ones that failed
    steps_list = execution.list_steps()

    print("\nüìã √âtat des √©tapes:")
    for step in steps_list:
        status = step['StepStatus']
        step_name = step['StepName']

        if status != 'Failed':
            print(f"‚úÖ {step_name}: {status}")
            continue

        print(f"\n‚ùå {step_name}: FAILED")

        # Failure reason recorded by the pipeline for this step
        if 'FailureReason' in step:
            print(f"   Raison: {step['FailureReason']}")

        # Failure reason recorded by the underlying job, when there is one
        if 'Metadata' in step:
            _print_failed_job_details(step['Metadata'])

    print("\nüí° Conseil: V√©rifiez les logs CloudWatch pour plus de d√©tails sur l'erreur.")


---

## Section 9: Analyser les Résultats


In [None]:
# ============================================================
# Analyse the execution
# ============================================================

# Fetch the execution details
execution_details = execution.describe()

print("D√©tails de l'ex√©cution:")
print(f"Status: {execution_details['PipelineExecutionStatus']}")
print(f"Start Time: {execution_details['CreationTime']}")

# Total duration is only printed when the last-modified timestamp is present.
if 'LastModifiedTime' in execution_details:
    duration = execution_details['LastModifiedTime'] - execution_details['CreationTime']
    print(f"Dur√©e totale: {duration}")

# List the steps
print("\n\n√âtapes ex√©cut√©es:")
steps_list = execution.list_steps()

# Metadata key -> printed label, checked in this order; only the first match
# is printed for a step.
resource_labels = (
    ('ProcessingJob', 'Processing Job'),
    ('TrainingJob', 'Training Job'),
    ('RegisterModel', 'Model Package'),
)

for step in steps_list:
    print(f"\n{step['StepName']}:")
    print(f"  Status: {step['StepStatus']}")

    if 'StartTime' in step and 'EndTime' in step:
        duration = step['EndTime'] - step['StartTime']
        print(f"  Dur√©e: {duration}")

    if 'Metadata' not in step:
        continue

    step_metadata = step['Metadata']
    for metadata_key, label in resource_labels:
        if metadata_key in step_metadata:
            # The resource name is the last path segment of its ARN.
            resource_name = step_metadata[metadata_key]['Arn'].split('/')[-1]
            print(f"  {label}: {resource_name}")
            break


In [None]:
# ============================================================
# Fetch the evaluation metrics
# ============================================================

import boto3

s3 = boto3.client('s3')

# Location where the evaluation step writes its report
evaluation_s3_path = f's3://{bucket}/pipeline-evaluation/'

print("M√©triques d'√©valuation:")

try:
    # List the objects under the evaluation prefix
    response = s3.list_objects_v2(
        Bucket=bucket,
        Prefix='pipeline-evaluation/'
    )

    if 'Contents' in response:
        # Take the most recently modified file
        latest_file = sorted(response['Contents'], key=lambda x: x['LastModified'])[-1]

        # Download and parse it
        obj = s3.get_object(Bucket=bucket, Key=latest_file['Key'])
        evaluation_metrics = json.loads(obj['Body'].read().decode('utf-8'))

        print(json.dumps(evaluation_metrics, indent=2))

        # Extract accuracy
        accuracy = evaluation_metrics['classification_metrics']['accuracy']['value']
        print(f"\n\nAccuracy du mod√®le: {accuracy:.4f}")

        # Compare against the threshold of THIS execution. The execution may
        # have been started with an AccuracyThreshold override, in which case
        # the parameter's default value would give a verdict that contradicts
        # what the pipeline actually did. Parameter values come back from the
        # API as strings; the default is used when no value is reported.
        threshold = accuracy_threshold.default_value
        execution_parameters = sm_client.list_pipeline_parameters_for_execution(
            PipelineExecutionArn=execution.arn
        )
        for execution_parameter in execution_parameters.get('PipelineParameters', []):
            if execution_parameter['Name'] == accuracy_threshold.name:
                threshold = float(execution_parameter['Value'])
                break

        if accuracy >= threshold:
            print(f"‚úÖ Mod√®le enregistr√© (accuracy >= {threshold})")
        else:
            print(f"‚ùå Mod√®le non enregistr√© (accuracy < {threshold})")
    else:
        # Nothing under the prefix yet (e.g. the evaluation step has not run).
        print(f"Aucun fichier sous {evaluation_s3_path}")

except Exception as e:
    print(f"Impossible de r√©cup√©rer les m√©triques: {e}")

In [None]:
# ============================================================
# Check the registered models
# ============================================================

# The five most recent packages of the Model Package Group, newest first.
model_packages = sm_client.list_model_packages(
    ModelPackageGroupName=model_package_group_name,
    SortBy='CreationTime',
    SortOrder='Descending',
    MaxResults=5
)
package_summaries = model_packages['ModelPackageSummaryList']

print(f"\nMod√®les dans '{model_package_group_name}':")
print(f"{'Version':<10} {'Status':<25} {'Date':<20}")
print("-" * 55)

# One table row per package
for package_summary in package_summaries:
    created_at = package_summary['CreationTime'].strftime('%Y-%m-%d %H:%M:%S')
    print(
        f"{package_summary['ModelPackageVersion']:<10} "
        f"{package_summary['ModelApprovalStatus']:<25} "
        f"{created_at:<20}"
    )

# Results are sorted newest first, so the first entry is the latest package.
if package_summaries:
    latest_package = package_summaries[0]
    print("\n\nDernier mod√®le enregistr√©:")
    print(f"ARN: {latest_package['ModelPackageArn']}")
    print(f"Status: {latest_package['ModelApprovalStatus']}")

---

## Section 10: Gestion du Pipeline


In [None]:
# ============================================================
# Re-run the pipeline with different parameters
# ============================================================

print("Ex√©cution du pipeline avec n_estimators=100...")

# Overrides for the second run; wait() is not called here, so the cell
# returns as soon as the execution has been started.
second_run_overrides = {
    'NEstimators': 100,
    'MaxDepth': 15,
    'AccuracyThreshold': 0.75
}
execution2 = pipeline.start(parameters=second_run_overrides)

# The execution name is the last path segment of its ARN.
execution2_name = execution2.arn.rsplit('/', 1)[-1]
print(f"Nouvelle ex√©cution d√©marr√©e: {execution2_name}")
print("Le pipeline s'ex√©cute en arri√®re-plan...")
print("Le pipeline s'ex√©cute en arri√®re-plan...")

In [None]:
# ============================================================
# List all executions
# ============================================================

# The ten most recent executions of this pipeline, newest first.
executions_response = sm_client.list_pipeline_executions(
    PipelineName=pipeline_name,
    SortBy='CreationTime',
    SortOrder='Descending',
    MaxResults=10
)

print(f"\nEx√©cutions du pipeline '{pipeline_name}':")
print(f"{'Nom':<40} {'Status':<20} {'Date':<20}")
print("-" * 80)

# One table row per execution; the name is the last path segment of the ARN.
for exec_summary in executions_response['PipelineExecutionSummaries']:
    short_name = exec_summary['PipelineExecutionArn'].rsplit('/', 1)[-1]
    started_at = exec_summary['StartTime'].strftime('%Y-%m-%d %H:%M:%S')
    print(f"{short_name:<40} {exec_summary['PipelineExecutionStatus']:<20} {started_at:<20}")

---

## Section 11: Cleanup


In [None]:
# ============================================================
# Cleanup (optional)
# ============================================================

# Nothing is deleted by default. Uncomment one of the options below to
# remove the resources created by this lab.

# Option 1: delete the pipeline
# sm_client.delete_pipeline(PipelineName=pipeline_name)
# print(f"Pipeline '{pipeline_name}' supprim√©")

# Option 2: delete the S3 artifacts (careful: this is irreversible!)
# s3 = boto3.resource('s3')
# bucket_obj = s3.Bucket(bucket)
# bucket_obj.objects.filter(Prefix='pipeline-data/').delete()
# bucket_obj.objects.filter(Prefix='pipeline-models/').delete()
# bucket_obj.objects.filter(Prefix='pipeline-evaluation/').delete()

# Summary of what was kept, plus the command to delete the pipeline manually.
print(
    "\nCleanup:",
    "- Pipeline conserv√© pour r√©f√©rence",
    "- Artefacts S3 conserv√©s",
    "\nPour supprimer manuellement:",
    f"  sm_client.delete_pipeline(PipelineName='{pipeline_name}')",
    sep="\n",
)

---

## Résumé

Dans ce lab, vous avez:

1. **Créé un pipeline SageMaker complet** avec 5 étapes
2. **Implémenté le preprocessing** des données
3. **Entraîné un modèle** avec hyperparamètres configurables
4. **Évalué le modèle** automatiquement
5. **Ajouté une condition** pour l'enregistrement automatique
6. **Enregistré le modèle** dans Model Registry
7. **Exécuté et monitoré** le pipeline

### Architecture du Pipeline

```
Input Parameters
      ↓
[1. PreprocessData] → train.csv, test.csv
      ↓
[2. TrainModel] → model.tar.gz
      ↓
[3. EvaluateModel] → evaluation.json
      ↓
[4. CheckAccuracyThreshold]
      ↓ (if accuracy >= threshold)
[5. RegisterModel] → Model Registry
```

### Avantages des Pipelines

- **Automatisation**: Bout-en-bout, reproductible
- **Versioning**: Tous les artefacts sont versionnés
- **Paramétrage**: Hyperparamètres configurables
- **Conditions**: Logique de décision automatique
- **Traçabilité**: Lineage complet
- **Scalabilité**: Exécution parallèle possible

### Cas d'Usage

1. **CI/CD pour ML**: Automatiser re-training
2. **Experimentation**: Tester plusieurs hyperparamètres
3. **Production**: Déploiement automatique si qualité OK
4. **Audit**: Traçabilité complète des modèles

### Next Steps

- Lab 8: Deployment Strategies (Blue/Green, Canary)
- Ajouter des tests de qualité de données
- Implémenter Model Monitor
- Intégrer avec EventBridge pour scheduling

---

**Best Practice**: Utiliser les pipelines pour tout workflow ML répétitif
