In [2]:
# Cell 1: Environment Setup & Load Prepared Data
import sagemaker
import boto3
import pandas as pd

# Initialize SageMaker session and get execution role
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = sagemaker_session.default_bucket()
prefix = 'iot-intrusion-detection' # The main project folder

print("--- Loading prepared feature set from S3 ---")
print(bucket)
# Define the path to the feature set created by the first notebook
feature_path = f"s3://{bucket}/{prefix}/features/combined_features.parquet"

# Load the data into a DataFrame
combined_features_df = pd.read_parquet(feature_path)

print(f"Successfully loaded {len(combined_features_df)} records.")
display(combined_features_df.head())

--- Loading prepared feature set from S3 ---
sagemaker-us-east-2-696680564117
Successfully loaded 149 records.


Unnamed: 0,device_ip,timestamp,orig_bytes_sum,resp_bytes_sum,orig_pkts_sum,resp_pkts_sum,duration_mean,unique_dest_ips,unique_dest_ports,conn_count,alert_count,unique_alert_signatures
0,172.31.0.1,2025-08-30 21:01:00+00:00,1792.0,0.0,12,0,0.795472,1,1,2,0.0,0.0
1,172.31.0.1,2025-08-30 21:02:00+00:00,0.0,0.0,0,0,0.0,0,0,0,0.0,0.0
2,172.31.0.1,2025-08-30 21:03:00+00:00,1792.0,0.0,12,0,0.793615,1,1,2,0.0,0.0
3,172.31.0.1,2025-08-30 21:04:00+00:00,0.0,0.0,0,0,0.0,0,0,0,0.0,0.0
4,172.31.0.1,2025-08-30 21:05:00+00:00,1792.0,0.0,12,0,0.792963,1,1,2,0.0,0.0


In [5]:
# Cell 2: Configure and Launch SageMaker Training Job (Corrected)
from io import StringIO
from sagemaker.amazon.amazon_estimator import get_image_uri
import sagemaker
import boto3

# 1. Select the numeric feature columns for the model
features_for_model = [
    'orig_bytes_sum', 'resp_bytes_sum', 'orig_pkts_sum', 'resp_pkts_sum',
    'duration_mean', 'unique_dest_ips', 'unique_dest_ports', 'conn_count',
    'alert_count', 'unique_alert_signatures'
]
training_data = combined_features_df[features_for_model].astype('float32')

# 2. Convert to CSV
csv_buffer = StringIO()
training_data.to_csv(csv_buffer, header=False, index=False)
csv_content = csv_buffer.getvalue()

# --- FIX IS HERE ---
# First, define the full S3 path where the data will be uploaded
training_s3_path = f"s3://{bucket}/{prefix}/rcf-training-data/train.csv"

# Now, upload the CSV string to that single, full path
sagemaker.s3.S3Uploader.upload_string_as_file_body(csv_content, training_s3_path)
print(f"Training data uploaded to: {training_s3_path}")

# 3. Configure the SageMaker Estimator
rcf_image = get_image_uri(boto3.Session().region_name, "randomcutforest")
rcf = sagemaker.estimator.Estimator(
    image_uri=rcf_image,
    role=role, # <-- FIX: Was 'role-role'
    instance_count=1, # <-- FIX: Was 'instance_count := 1'
    instance_type='ml.m5.xlarge',
    output_path=f"s3://{bucket}/{prefix}/rcf-output",
    sagemaker_session=sagemaker_session # <-- FIX: Was missing '='
)

# 4. Set hyperparameters
rcf.set_hyperparameters(
    num_samples_per_tree=256,
    num_trees=100,
    feature_dim=len(features_for_model)
)

# 5. Launch the training job
s3_input_train = sagemaker.inputs.TrainingInput(s3_data=training_s3_path, content_type='text/csv;label_size=0', distribution='ShardedByS3Key')

rcf.fit({'train': s3_input_train})

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker.telemetry.telemetry_logging:SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.
INFO:sagemaker:Creating training-job with name: randomcutforest-2025-08-31-06-13-49-091


Training data uploaded to: s3://sagemaker-us-east-2-696680564117/iot-intrusion-detection/rcf-training-data/train.csv
2025-08-31 06:13:51 Starting - Starting the training job...
2025-08-31 06:14:21 Downloading - Downloading input data...
2025-08-31 06:14:42 Downloading - Downloading the training image.........
2025-08-31 06:16:18 Training - Training image download completed. Training in progress...Docker entrypoint called with argument(s): train
Running default environment configuration script
  if num_device is 1 and 'dist' not in kvstore:
  if cons['type'] is 'ineq':
  if len(self.X_min) is not 0:
[08/31/2025 06:16:30 INFO 140331238184768] Reading default configuration from /opt/amazon/lib/python3.8/site-packages/algorithm/resources/default-conf.json: {'num_samples_per_tree': 256, 'num_trees': 100, 'force_dense': 'true', 'eval_metrics': ['accuracy', 'precision_recall_fscore'], 'epochs': 1, 'mini_batch_size': 1000, '_log_level': 'info', '_kvstore': 'dist_async', '_num_kv_servers': 'aut

In [7]:
# Cell 3: Deploy the Trained Model
rcf_predictor = rcf.deploy(
    initial_instance_count=1,
    instance_type='ml.t2.medium'
)
print("Model endpoint is now active.")

INFO:sagemaker:Creating model with name: randomcutforest-2025-08-31-06-21-52-072
INFO:sagemaker:Creating endpoint-config with name randomcutforest-2025-08-31-06-21-52-072
INFO:sagemaker:Creating endpoint with name randomcutforest-2025-08-31-06-21-52-072


-----------------!Model endpoint is now active.


In [8]:
# Cell 4: Test the Endpoint with Sample Data
import json
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer

# Select a sample of data to test
sample_data = training_data.head(5).to_numpy()

# Set the correct serializer/deserializer for the RCF model
rcf_predictor.serializer = CSVSerializer()
rcf_predictor.deserializer = JSONDeserializer()

# Get predictions
results = rcf_predictor.predict(sample_data)
scores = [record['score'] for record in results['scores']]

print("Anomaly scores for sample data (higher is more anomalous):")
print(scores)

Anomaly scores for sample data (higher is more anomalous):
[0.0, 0.0, 0.0, 0.0, 0.0]
