In [1]:
import sagemaker
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput

# Session setup: create the SageMaker session, then resolve the execution role
# and the session's default bucket. Later cells read `session`, `role` and
# `bucket` directly, so these names are part of the notebook's shared state.
session = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = session.default_bucket()

# Echo the resolved context (one item per line) so the cell output records
# which role, bucket and region this run used.
print(
    f"Role: {role}",
    f"Bucket: {bucket}",
    f"Region: {session.boto_region_name}",
    sep="\n",
)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
Role: arn:aws:iam::360138725243:role/service-role/AmazonSageMaker-ExecutionRole-20260206T133954
Bucket: sagemaker-us-east-1-360138725243
Region: us-east-1


In [2]:
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput

# Define the processor: one ml.m5.large instance, framework version 1.2-1,
# using the role and session resolved in the setup cell.
sklearn_processor = SKLearnProcessor(
    role=role,
    framework_version='1.2-1',
    instance_count=1,
    instance_type='ml.m5.large',
    base_job_name='insurance-preprocess',
    sagemaker_session=session,
)

# Input channel: the raw CSV in S3, mapped to the container's input directory.
raw_data_input = ProcessingInput(
    source=f's3://{bucket}/data/raw/motor-vehicle-insurance-data.csv',
    destination='/opt/ml/processing/input'
)

# Output channels: one per split, each mapping a container directory to the
# matching S3 prefix under data/processed/.
train_output = ProcessingOutput(
    output_name='train',
    source='/opt/ml/processing/output/train',
    destination=f's3://{bucket}/data/processed/train'
)
validation_output = ProcessingOutput(
    output_name='validation',
    source='/opt/ml/processing/output/validation',
    destination=f's3://{bucket}/data/processed/validation'
)
test_output = ProcessingOutput(
    output_name='test',
    source='/opt/ml/processing/output/test',
    destination=f's3://{bucket}/data/processed/test'
)

# Run the processing job with the preprocessing script. `run` is called with
# its default options; the message below is printed once the call returns.
sklearn_processor.run(
    code='../src/data/preprocess.py',
    inputs=[raw_data_input],
    outputs=[train_output, validation_output, test_output],
)

print("Processing job complete!")

INFO:sagemaker:Creating processing-job with name insurance-preprocess-2026-02-08-22-50-42-280


INFO:__main__:STARTING PREPROCESSING PIPELINE
INFO:__main__:Loading data from /opt/ml/processing/input/motor-vehicle-insurance-data.csv
INFO:__main__:Raw data shape: (105555, 30)
INFO:__main__:Lapse rate: 20.41%
INFO:__main__:Dropped columns: ['Date_lapse', 'Lapse']
INFO:__main__:Engineered features: Age, Years_driving, Customer_tenure_days
INFO:__main__:Handling missing values...
INFO:__main__:  Filled Length missing values with median: 4.23
INFO:__main__:  Filled Type_fuel missing values with mode: D
INFO:__main__:Encoding categorical features...
INFO:__main__:  Encoded Type_fuel: 2 categories
INFO:__main__:Remaining missing values: 0
INFO:__main__:Splitting data by customer ID...
INFO:__main__:Train set: 63,274 rows (20.22% lapse rate)
INFO:__main__:Validation set: 21,111 rows (20.69% lapse rate)
INFO:__main__:Test set: 21,170 rows (20.73% lapse rate)

In [3]:
import boto3

# Verify that the processing job wrote its outputs: list every object under
# each split's S3 prefix together with its size in MB.
s3 = boto3.client('s3')

# A single list_objects_v2 call returns only the first page of results
# (at most 1,000 keys), so use the paginator to make sure nothing under the
# prefix is silently left out of the listing.
paginator = s3.get_paginator('list_objects_v2')

for split in ['train', 'validation', 'test']:
    prefix = f'data/processed/{split}/'
    found_any = False

    for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
        # Pages for an empty prefix carry no 'Contents' key.
        for obj in page.get('Contents', []):
            found_any = True
            size_mb = obj['Size'] / (1024 * 1024)
            print(f"  {obj['Key']}  ({size_mb:.2f} MB)")

    if not found_any:
        print(f"  ⚠️ No files found in {prefix}")

  data/processed/train/train.csv  (5.55 MB)
  data/processed/validation/validation.csv  (1.85 MB)
  data/processed/test/test.csv  (1.86 MB)


In [None]:
import pandas as pd

# Sanity-check the processed splits: read each CSV back from S3 and report its
# row/column counts and lapse rate (mean of the `Lapsed` column).
#
# The column layout of the first split is kept as the reference; a later split
# whose columns (or their order) differ fails loudly here instead of going
# unnoticed until training.
expected_columns = None

for split in ['train', 'validation', 'test']:
    path = f's3://{bucket}/data/processed/{split}/{split}.csv'
    df = pd.read_csv(path)

    if expected_columns is None:
        expected_columns = list(df.columns)
    else:
        assert list(df.columns) == expected_columns, (
            f"{split} columns (or their order) differ from the first split; "
            f"mismatched names: {sorted(set(df.columns) ^ set(expected_columns))}"
        )

    print(f"{split:>12}: {df.shape[0]:>6,} rows × {df.shape[1]} cols | Lapse rate: {df['Lapsed'].mean():.2%}")

# Build the feature list from the validated reference layout rather than from
# whichever frame the loop happened to load last.
print(f"\n{'Features':>12}: {[col for col in expected_columns if col != 'Lapsed']}")

       train: 63,274 rows × 26 cols | Lapse rate: 20.22%
