In [9]:
# Cell 1: Environment Setup & Load Prepared Data
import sagemaker
import boto3
import pandas as pd

# Create the SageMaker session, then look up the execution role and the
# session's default bucket; later cells reuse all three handles.
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = sagemaker_session.default_bucket()
prefix = "iot-intrusion-detection"  # top-level project folder inside the bucket

print("--- Loading prepared feature set from S3 ---")
print(bucket)

# Parquet feature set location (per the original note, written by the first notebook).
feature_path = f"s3://{bucket}/{prefix}/features/combined_features.parquet"
combined_features_df = pd.read_parquet(feature_path)

# Report the row count and show the first rows as a quick sanity check.
print(f"Successfully loaded {len(combined_features_df)} records.")
display(combined_features_df.head())

--- Loading prepared feature set from S3 ---
sagemaker-us-east-2-696680564117
Successfully loaded 819 records.


Unnamed: 0,device_id,window_end,orig_bytes_sum,resp_bytes_sum,orig_pkts_sum,resp_pkts_sum,duration_mean,unique_dest_ips,unique_dest_ports,conn_count,alert_count,unique_alert_signatures
0,0.0.0.0,2025-09-18 18:09:00+00:00,600,0,2,0,0.001393,1,1,1,0,0
1,0.0.0.0,2025-09-18 18:15:00+00:00,0,0,1,0,0.0,1,1,1,0,0
2,0.0.0.0,2025-09-18 18:26:00+00:00,600,0,2,0,0.001184,1,1,1,0,0
3,0.0.0.0,2025-09-18 18:43:00+00:00,600,0,2,0,0.00121,1,1,1,0,0
4,0.0.0.0,2025-09-18 18:51:00+00:00,0,0,1,0,0.0,1,1,1,0,0


In [8]:
# Cell 2: Configure and Launch SageMaker Training Job (Corrected & robust)

from io import StringIO
import boto3, sagemaker
from sagemaker import image_uris

# 1) Features (match Lambda)
# NOTE(review): the column list and its order are presumably what the Lambda
# sends at inference time — confirm against the Lambda code.
features_for_model = ["orig_bytes_sum", "resp_bytes_sum", "conn_count", "alert_count"]

# Select the model columns, cast to float32, then zero out missing and
# infinite values in a single chain.
training_data = (
    combined_features_df[features_for_model]
    .astype("float32")
    .replace([pd.NA, pd.NaT], 0)
    .fillna(0)
    .replace([float("inf"), float("-inf")], 0)
)

# Keep only rows where at least one feature is non-zero.
has_nonzero_feature = training_data.ne(0).any(axis=1)
training_data = training_data[has_nonzero_feature]

# Safety net: refuse to continue with a near-empty training set.
n = len(training_data)
assert n > 10, f"Too few rows ({n}) after cleaning; gather more data."

# 2) Upload single CSV to S3
# Serialize without header or index so the file contains only feature values.
csv_payload = training_data.to_csv(header=False, index=False)

training_s3_path = f"s3://{bucket}/{prefix}/rcf-training-data/train.csv"
sagemaker.s3.S3Uploader.upload_string_as_file_body(csv_payload, training_s3_path)
print(f"Training data uploaded to: {training_s3_path}")

# 3) Estimator (modern image resolver)
# Resolve the Random Cut Forest image in the region the SageMaker session is
# bound to. A fresh boto3.Session() can report region_name=None when no default
# region is configured, or a different region than the session that owns the
# bucket and role used below, which would select the wrong (or no) image.
training_region = sagemaker_session.boto_region_name
rcf_image = image_uris.retrieve(framework="randomcutforest", region=training_region)

# Single-instance estimator; model artifacts are written under the project prefix.
rcf = sagemaker.estimator.Estimator(
    image_uri=rcf_image,
    role=role,
    instance_count=1,
    instance_type="ml.m5.large",
    output_path=f"s3://{bucket}/{prefix}/rcf-output",
    sagemaker_session=sagemaker_session,
)

# 4) Hyperparameters
# Samples per tree: n - 1, but never below 8 and never above 64.
nspt = max(8, n - 1)
if nspt > 64:
    nspt = 64
print(f"Training rows: {n}, num_samples_per_tree: {nspt}")

# feature_dim is taken from the cleaned frame so it always matches the CSV width.
rcf_hyperparameters = {
    "num_samples_per_tree": nspt,
    "num_trees": 100,
    "feature_dim": training_data.shape[1],
}
rcf.set_hyperparameters(**rcf_hyperparameters)

# 5) Fit
# The content type carries label_size=0 (flagged as critical in the original
# cell) — presumably to declare that the CSV has no label column; confirm
# against the algorithm documentation.
s3_input_train = sagemaker.inputs.TrainingInput(
    training_s3_path,
    distribution="ShardedByS3Key",
    content_type="text/csv;label_size=0",
)

# Launch the training job with the uploaded CSV as the "train" channel.
train_channels = {"train": s3_input_train}
rcf.fit(inputs=train_channels)


INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker.telemetry.telemetry_logging:SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.
INFO:sagemaker:Creating training-job with name: randomcutforest-2025-09-18-21-50-41-822


Training matrix shape: (819, 10)
Std dev per feature:
 orig_bytes_sum             3.292967
resp_bytes_sum             3.087289
orig_pkts_sum              1.251838
resp_pkts_sum              1.397090
duration_mean              1.150380
unique_dest_ips            0.829810
unique_dest_ports          0.204350
conn_count                 0.981606
alert_count                0.089902
unique_alert_signatures    0.089902
Training data uploaded to: s3://sagemaker-us-east-2-696680564117/iot-intrusion-detection/rcf-training-data/train_10f.csv
Training rows: 819, num_samples_per_tree: 64, feature_dim: 10
2025-09-18 21:50:43 Starting - Starting the training job...
2025-09-18 21:50:56 Starting - Preparing the instances for training...
2025-09-18 21:51:19 Downloading - Downloading input data...
2025-09-18 21:51:59 Downloading - Downloading the training image........Docker entrypoint called with argument(s): train
Running default environment configuration script
  if num_device is 1 and 'dist' not in kv

In [5]:
# Cell 3: Deploy the Trained Model (stable name)

# Fixed endpoint name passed to deploy() and echoed in the message below.
# NOTE(review): with a fixed name, re-running this cell while the endpoint
# already exists will likely collide — confirm whether the existing endpoint
# should be deleted or updated in place before redeploying.
endpoint_name = "iot-rcf-4feat-prod"

deploy_settings = {
    "initial_instance_count": 1,
    "instance_type": "ml.m5.large",
    "endpoint_name": endpoint_name,
}
rcf_predictor = rcf.deploy(**deploy_settings)
print("Model endpoint is now active:", endpoint_name)

INFO:sagemaker:Creating model with name: randomcutforest-2025-09-18-20-21-09-726
INFO:sagemaker:Creating endpoint-config with name iot-rcf-4feat-prod
INFO:sagemaker:Creating endpoint with name iot-rcf-4feat-prod


--------!Model endpoint is now active: iot-rcf-4feat-prod


In [11]:
# Cell 4: Test the Endpoint with Sample Data

from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer
import numpy as np

# Send CSV to the endpoint and parse its JSON reply.
rcf_predictor.serializer = CSVSerializer()
rcf_predictor.deserializer = JSONDeserializer()

# Draw up to five rows from the training frame; the fixed seed keeps the
# sample repeatable across runs.
sample_size = min(5, len(training_data))
sample_data = training_data.sample(sample_size, random_state=42).to_numpy()

# The parsed response is read as {"scores": [{"score": ...}, ...]}.
results = rcf_predictor.predict(sample_data)
scores = []
for record in results["scores"]:
    scores.append(record["score"])

print("Anomaly scores for sample data (higher is more anomalous):")
print(scores)


Anomaly scores for sample data (higher is more anomalous):
[1.7194540282, 1.7529868017, 1.2274669087, 1.2241892695, 1.2432982537]
