# 02 — Scikit-learn Experimentation
Local experiments using scikit-learn.

In [None]:
import sagemaker
import boto3
import pandas as pd
import numpy as np

# --- 1. SageMaker session, execution role, and S3 locations ---
# Key prefix under which this project's objects are grouped in S3.
prefix = 'titanic-ml'

# The session object is the source for both the default bucket and the region.
sagemaker_session = sagemaker.Session()

# Execution role for this notebook environment (printed below as an ARN).
role = sagemaker.get_execution_role()

# Bucket and region are read from the session rather than hard-coded.
bucket = sagemaker_session.default_bucket()
region = sagemaker_session.boto_region_name

# Echo the resolved settings so a misconfiguration is visible immediately.
print(f"SageMaker Role ARN: {role}")
print(f"S3 Bucket: {bucket}")
print(f"S3 Prefix: {prefix}")

In [None]:
# --- Configuration (Run after the Prerequisites Setup block) ---
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.inputs import TrainingInput

# S3 location of the processed dataset written by the previous notebook.
INPUT_S3_URI = f"s3://{bucket}/{prefix}/processed"

# --- 1. Configure Estimator ---
# Hyperparameters handed to the entry-point script; kept in one place so
# they are easy to tweak between runs.
training_hyperparameters = {
    'n_estimators': 200,
    'max_depth': 8
}

sklearn_estimator = SKLearn(
    # Training code: src/model_sklearn.py
    entry_point='model_sklearn.py',
    source_dir='src',
    # Permissions and hardware for the training job
    role=role,
    instance_type='ml.c5.xlarge',  # swap for another instance type if needed
    instance_count=1,
    # Runtime: must be a scikit-learn container version SageMaker supports
    framework_version='1.2-1',
    py_version='py3',
    hyperparameters=training_hyperparameters,
    base_job_name='sagemaker-titanic-sklearn'
)

# --- 2. Prepare Data Input Channel ---
# Every object under INPUT_S3_URI is offered to the job as CSV data.
train_channel = TrainingInput(
    s3_data=INPUT_S3_URI,
    s3_data_type='S3Prefix',
    content_type='text/csv',
    distribution='FullyReplicated'
)

# The channel key ('train') has to agree with what the entry-point script
# expects (model_sklearn.py is not visible here -- verify against its
# argument parser).
inputs = {'train': train_channel}

# --- 3. Launch Training Job ---
print("Launching Scikit-learn training job in SageMaker...")
# wait=False returns once the job is submitted; pass wait=True to block
# until training finishes.
sklearn_estimator.fit(inputs=inputs, wait=False)
print(f"Training job launched: {sklearn_estimator.latest_training_job.job_name}")