In [35]:
import sagemaker
import boto3
import pandas as pd
import numpy as np
from sagemaker import get_execution_role
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput
from sagemaker.deserializers import JSONDeserializer

from sagemaker.model import Model
from sagemaker.serializers import CSVSerializer
from sklearn.preprocessing import LabelEncoder
import os



# Step 1: Set up SageMaker session and role

In [3]:
sagemaker_session = sagemaker.Session()
role = get_execution_role()
region = sagemaker_session.boto_region_name
bucket = 'mybacket05'  # Replace with your S3 bucket name
prefix = 'sagemaker/houseprice-project'  # Replace with your S3 folder path
data_key = 'Housing.csv'  # Replace with your dataset file name
data_location = f's3://{bucket}/{data_key}'

In [4]:
df=pd.read_csv(data_location)

df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


# Handle categorical columns

In [8]:
categorical_cols = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea', 'furnishingstatus']
label_encoders = {}  # Store encoders for inference
for col in categorical_cols:
    df[col] = df[col].str.lower()  # Convert to lowercase for consistency
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le  # Save encoder for later use

In [9]:
df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,1,0,0,0,1,2,1,0
1,12250000,8960,4,4,4,1,0,0,0,1,3,0,0
2,12250000,9960,3,2,2,1,0,1,0,0,2,1,1
3,12215000,7500,4,2,2,1,0,1,0,1,3,1,0
4,11410000,7420,4,1,2,1,1,1,0,1,2,0,0


# Split data: 70% train, 20% validation, 10% test

In [11]:
train_data, validation_data, test_data = np.split(df.sample(frac=1, random_state=42),
    [int(0.7 * len(df)), int(0.9 * len(df))])

  return bound(*args, **kwds)


# Save splits to CSV (target as first column, no headers for SageMaker)

In [18]:
train_data = pd.concat([train_data["price"], train_data.drop("price", axis=1)], axis=1)
validation_data = pd.concat([validation_data["price"], validation_data.drop("price", axis=1)], axis=1)
test_data = pd.concat([test_data["price"], test_data.drop("price", axis=1)], axis=1)

# Save splits to CSV (no headers for SageMaker)

In [20]:
train_data.to_csv('train.csv', index=False, header=False)
validation_data.to_csv('validation.csv', index=False, header=False)
test_data.to_csv('test.csv', index=False, header=False)

# Upload to S3

In [24]:
train_location = sagemaker_session.upload_data('train.csv', bucket=bucket, key_prefix=f'{prefix}/train')
validation_location = sagemaker_session.upload_data('validation.csv', bucket=bucket, key_prefix=f'{prefix}/validation')
test_location = sagemaker_session.upload_data('test.csv', bucket=bucket, key_prefix=f'{prefix}/test')

print("Train data:", train_location)
print("Validation data:", validation_location)
print("Test data:", test_location)


Train data: s3://mybacket05/sagemaker/houseprice-project/train/train.csv
Validation data: s3://mybacket05/sagemaker/houseprice-project/validation/validation.csv
Test data: s3://mybacket05/sagemaker/houseprice-project/test/test.csv



# Step 2: Train the model (XGBoost for regression)

In [25]:
xgboost = Estimator(
    image_uri=sagemaker.image_uris.retrieve('xgboost', region, 'latest'),
    role=role,
    instance_count=1,
    instance_type='ml.m5.xlarge',
    output_path=f's3://{bucket}/{prefix}/output',
    sagemaker_session=sagemaker_session
)



# Set hyperparameters for regression

In [32]:
xgboost.set_hyperparameters(
    objective='reg:linear',  # For regression
    num_round=100,
    max_depth=5,
    eta=0.3,
    subsample=0.8
)


# Define input channels

In [29]:
train_input = TrainingInput(s3_data=train_location, content_type='csv')
validation_input = TrainingInput(s3_data=validation_location, content_type='csv')

# Train the model

In [33]:
xgboost.fit({'train': train_input, 'validation': validation_input})

INFO:sagemaker.telemetry.telemetry_logging:SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.
INFO:sagemaker:Creating training-job with name: xgboost-2025-06-05-13-59-33-938


2025-06-05 13:59:37 Starting - Starting the training job...
2025-06-05 14:00:01 Starting - Preparing the instances for training...
2025-06-05 14:00:27 Downloading - Downloading input data...
2025-06-05 14:00:52 Downloading - Downloading the training image...
2025-06-05 14:01:33 Training - Training image download completed. Training in progress..[34mArguments: train[0m
[34m[2025-06-05:14:01:40:INFO] Running standalone xgboost training.[0m
[34m[2025-06-05:14:01:40:INFO] File size need to be processed in the node: 0.02mb. Available memory size in the node: 8065.79mb[0m
[34m[2025-06-05:14:01:40:INFO] Determined delimiter of CSV input is ','[0m
[34m[14:01:40] S3DistributionType set as FullyReplicated[0m
[34m[14:01:40] 381x12 matrix with 4572 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2025-06-05:14:01:40:INFO] Determined delimiter of CSV input is ','[0m
[34m[14:01:40] S3DistributionType set as FullyReplicated[0m
[34m[14:01:40] 

# Step 3: Deploy for real-time inference

In [37]:
predictor=xgboost.deploy(initial_instance_count=1,instance_type="ml.t2.medium",
                        serializer=CSVSerializer(),
                        deserializer=JSONDeserializer())

INFO:sagemaker:Creating model with name: xgboost-2025-06-05-14-06-11-970
INFO:sagemaker:Creating endpoint-config with name xgboost-2025-06-05-14-06-11-970
INFO:sagemaker:Creating endpoint with name xgboost-2025-06-05-14-06-11-970


------------!

# Test real-time inference

In [42]:
from sagemaker.deserializers import CSVDeserializer

# Reconfigure predictor with CSVDeserializer
predictor.deserializer = CSVDeserializer()

# Test real-time inference
test_sample = test_data.drop('price', axis=1).copy()

# Apply LabelEncoder to categorical columns if they are still strings
for col in categorical_cols:
    if test_sample[col].dtype == object:
        test_sample[col] = label_encoders[col].transform(test_sample[col])

# Select first two rows and convert to CSV string
test_sample = test_sample.iloc[0:2]  # Use iloc to keep as DataFrame
test_sample_csv = test_sample.to_csv(index=False, header=False)

# Make prediction
try:
    predictions = predictor.predict(test_sample_csv)
    # CSVDeserializer returns a list of lists; extract first column for regression
    predictions = [float(pred[0]) for pred in predictions]
    print("Real-Time Predictions:", predictions)
except Exception as e:
    print(f"Error during prediction: {e}")
    # Fallback: Handle raw response
    response = sagemaker_session.sagemaker_runtime_client.invoke_endpoint(
        EndpointName=predictor.endpoint_name,
        ContentType='text/csv',
        Accept='text/csv',
        Body=test_sample_csv
    )
    raw_predictions = response['Body'].read().decode('utf-8')
    # Parse comma-separated predictions
    predictions = [float(pred) for pred in raw_predictions.strip().split(',')]
    print("Raw Predictions (CSV fallback):", predictions)

Real-Time Predictions: [3738832.75]


# Step 4 (Optional): Batch inference

In [47]:
# Step 4: Batch inference
test_data_no_target = test_data.drop('price', axis=1).copy()
for col in categorical_cols:
    if test_data_no_target[col].dtype == object:
        test_data_no_target[col] = label_encoders[col].transform(test_data_no_target[col])
test_data_no_target.to_csv('test_no_target.csv', index=False, header=False)
test_no_target_location = sagemaker_session.upload_data('test_no_target.csv', bucket=bucket, key_prefix=f'{prefix}/test')

transformer = xgboost.transformer(
    instance_count=1,
    instance_type='ml.m5.large',  # Use supported instance type
    output_path=f's3://{bucket}/{prefix}/batch_output'
)
try:
    transformer.transform(
        data=test_no_target_location,
        content_type='text/csv',
        split_type='Line'
    )
    transformer.wait()
    print("Batch predictions saved to:", transformer.output_path)
except Exception as e:
    print(f"Batch transform error: {e}")
    print("Ensure the instance type is supported and within your quota. Check quotas in AWS Service Quotas console.")

INFO:sagemaker:Creating model with name: xgboost-2025-06-05-14-22-37-369
INFO:sagemaker:Creating transform job with name: xgboost-2025-06-05-14-22-38-179


Batch transform error: An error occurred (ResourceLimitExceeded) when calling the CreateTransformJob operation: The account-level service limit 'ml.m5.large for transform job usage' is 0 Instances, with current utilization of 0 Instances and a request delta of 1 Instances. Please use AWS Service Quotas to request an increase for this quota. If AWS Service Quotas is not available, contact AWS support to request an increase for this quota.
Ensure the instance type is supported and within your quota. Check quotas in AWS Service Quotas console.


In [50]:
import boto3

# Initialize Service Quotas client
try:
    client = boto3.client('service-quotas', region_name=region)
    # Get SageMaker quotas
    response = client.list_service_quotas(ServiceCode='sagemaker')
    transform_quotas = [
        quota for quota in response['Quotas']
        if 'for transform job usage' in quota['QuotaName'].lower()
    ]
    if transform_quotas:
        print("Available Transform Job Quotas:")
        for quota in transform_quotas:
            print(f"Instance Type: {quota['QuotaName']}, Quota: {quota['Value']}")
    else:
        print("No transform job quotas found. Check AWS Service Quotas console or request a quota increase.")
except client.exceptions.AccessDeniedException:
    print("Error: IAM role lacks 'servicequotas:GetServiceQuota' permission. Add this permission to your role.")
    print("IAM Policy Example:")
    print("""
    {
        "Effect": "Allow",
        "Action": ["servicequotas:GetServiceQuota"],
        "Resource": "*"
    }
    """)
except Exception as e:
    print(f"Error fetching quotas: {e}")
    print("Manually check quotas in AWS Service Quotas console under 'Amazon SageMaker'.")

No transform job quotas found. Check AWS Service Quotas console or request a quota increase.
