In [40]:
!pip install boto3 sagemaker



In [41]:
from datasets import load_dataset, DatasetDict, Dataset
from sklearn.model_selection import train_test_split
import pandas as pd
import sagemaker
import boto3
from sagemaker.huggingface import HuggingFace
import yaml
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker import Session
import json
from sagemaker.predictor import Predictor
from sagemaker.serializers import CSVSerializer
from sagemaker.xgboost.model import XGBoostModel
from sagemaker.inputs import TrainingInput

In [42]:
# Step 6: Download a file from your S3 bucket
s3_client = boto3.client('s3')

# Step 3: Define your S3 bucket name
bucket_name = 'sagemaker-us-east-1-025066244931'
def download_file_from_s3(s3_file, local_file):
    try:
        s3_client.download_file(bucket_name, s3_file, local_file)
        print(f"Successfully downloaded {s3_file} from {bucket_name} to {local_file}")
    except Exception as e:
        print(f"Error downloading file: {e}")

# Example: Download a file from S3
s3_file_to_download = 'validation.csv'  # Replace with the name of your file in S3
local_file_to_save = 'validation.csv'  # Replace with the desired local file name
download_file_from_s3(s3_file_to_download, local_file_to_save)

Successfully downloaded validation.csv from sagemaker-us-east-1-025066244931 to validation.csv


In [43]:
s3_file_to_download = 'train.csv'  # Replace with the name of your file in S3
local_file_to_save = 'train.csv'  # Replace with the desired local file name
download_file_from_s3(s3_file_to_download, local_file_to_save)

Successfully downloaded train.csv from sagemaker-us-east-1-025066244931 to train.csv


In [45]:
data_path = "./train.csv"
df_1 = pd.read_csv(data_path)
df_1['risklevel'].value_counts()

risklevel
8    92
7    90
4    90
1    85
2    84
0    81
9    79
6    71
3    64
5    64
Name: count, dtype: int64

In [46]:
data_path = "./validation.csv"
df_1 = pd.read_csv(data_path)
df_1['risklevel'].value_counts()

risklevel
4    30
5    23
6    22
3    21
0    21
7    21
2    18
1    18
8    13
9    13
Name: count, dtype: int64

In [59]:
data_path = "../Data/clean_test_model_data.csv"
df = pd.read_csv(data_path, header=None)
df = df.iloc[1:,:]

In [60]:
df

Unnamed: 0,0,1,2
1,22306.0,677220.0,2
2,36376.0,73000.0,9
3,97304.0,389825.0,9
4,48419.0,146225.0,2
5,26362.0,268250.0,4
...,...,...,...
996,63124.0,996175.0,2
997,44049.0,187450.0,5
998,85248.0,445700.0,1
999,79381.0,416950.0,8


In [54]:
df['risklevel'].value_counts()

risklevel
4    120
7    111
8    105
1    103
0    102
2    102
6     93
9     92
5     87
3     85
Name: count, dtype: int64

In [55]:
print(df.isnull().sum())
df['risklevel'] = df['risklevel'].astype('category')


zipcode      0
price        0
risklevel    0
dtype: int64


In [56]:
print(df['risklevel'].dtype)
print(df['risklevel'].value_counts())

category
risklevel
4    120
7    111
8    105
1    103
0    102
2    102
6     93
9     92
5     87
3     85
Name: count, dtype: int64


In [47]:
s3 = boto3.client('s3')
bucket_name = 'sagemaker-us-east-1-025066244931'  # Replace with your S3 bucket name

In [67]:
new_order = [df.columns[2]] + df.columns[:2].tolist() + df.columns[3:].tolist()
df = df[new_order]

In [69]:
df.reset_index(drop=True, inplace=True)

In [78]:
df.to_csv('./total_df.csv', index=False)

In [79]:
df_another_method = pd.read_csv('./total_df.csv', skiprows=1, header=None)
df_another_method

Unnamed: 0,0,1,2
0,2,48419.0,146225.0
1,4,26362.0,268250.0
2,4,49887.0,117400.0
3,0,79830.0,271875.0
4,5,39402.0,278975.0
...,...,...,...
992,2,63124.0,996175.0
993,5,44049.0,187450.0
994,1,85248.0,445700.0
995,8,79381.0,416950.0


In [82]:
df_another_method.iloc[:,1:]

Unnamed: 0,1,2
0,48419.0,146225.0
1,26362.0,268250.0
2,49887.0,117400.0
3,79830.0,271875.0
4,39402.0,278975.0
...,...,...
992,63124.0,996175.0
993,44049.0,187450.0
994,85248.0,445700.0
995,79381.0,416950.0


## Training the model

In [84]:
# Extract features and labels
X = df_another_method.iloc[:,:1]
y = df_another_method.iloc[:,1:]

with open("./config.yaml", "r") as config_file:
    config = yaml.safe_load(config_file)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, **config["train_test_split"])

# Save the train and test sets as CSV files
train_data = pd.concat([X_train, y_train], axis=1)
test_data = pd.concat([X_test, y_test], axis=1)

train_data.to_csv('./internal_data/train.csv', index=False)
test_data.to_csv('./internal_data/validation.csv', index=False)

# Upload them to S3
s3.upload_file('./internal_data/train.csv', bucket_name, 'train.csv')
s3.upload_file('./internal_data/validation.csv', bucket_name, 'validation.csv')


In [85]:
with open("./config.yaml", "r") as config_file:
    config = yaml.safe_load(config_file)

# set an output path where the trained model will be saved
bucket = sagemaker.Session().default_bucket()
prefix = 'DEMO-xgboost-as-a-built-in-algo'

# this line automatically looks for the XGBoost image URI and builds an XGBoost container.
# specify the repo_version depending on your preference.
xgboost_container = sagemaker.image_uris.retrieve("xgboost", region, "1.7-1")

# construct a SageMaker estimator that calls the xgboost-container
estimator = sagemaker.estimator.Estimator(image_uri=xgboost_container, 
                                          hyperparameters=config['hyperparameters'],
                                          role=sagemaker.get_execution_role(),
                                          instance_count=config['sagemaker_config']['instance_count'], 
                                          instance_type=config['sagemaker_config']['instance_type'], 
                                          volume_size=config['sagemaker_config']['volume_size'],
                                          output_path=config['sagemaker_config']['output_path'])

content_type = "csv"
# define the data type and paths to the training and validation datasets
train_input = TrainingInput(
    s3_data=f's3://{bucket_name}/train',  # S3 path for training data
    content_type='csv'
)
validation_input = TrainingInput(
    s3_data=f's3://{bucket_name}/validation',  # S3 path for validation data
    content_type='csv'
)

# execute the XGBoost training job
estimator.fit({'train': train_input, 'validation': validation_input})

INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2024-10-25-03-39-12-308


2024-10-25 03:39:13 Starting - Starting the training job...
2024-10-25 03:39:27 Starting - Preparing the instances for training...
2024-10-25 03:40:06 Downloading - Downloading the training image...
2024-10-25 03:40:46 Training - Training image download completed. Training in progress....
2024-10-25 03:41:12 Uploading - Uploading generated training model[34m[2024-10-25 03:41:04.479 ip-10-0-81-47.ec2.internal:7 INFO utils.py:28] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34m[2024-10-25 03:41:04.506 ip-10-0-81-47.ec2.internal:7 INFO profiler_config_parser.py:111] User has disabled profiler.[0m
[34m[2024-10-25:03:41:04:INFO] Imported framework sagemaker_xgboost_container.training[0m
[34m[2024-10-25:03:41:04:INFO] Failed to parse hyperparameter objective value multi:softmax to Json.[0m
[34mReturning the value itself[0m
[34m[2024-10-25:03:41:04:INFO] No GPUs detected (normal if no gpus installed)[0m
[34m[2024-10-25:03:41:04:INFO] Running XGBoost Sagemaker in algorithm mode[0m
[3

## Endpoint

In [86]:
model_artifact = f's3://{bucket_name}/output/sagemaker-xgboost-2024-10-25-03-39-12-308/output/model.tar.gz'

xgb_model = XGBoostModel(
    model_data=model_artifact,  # Path to the trained model artifact from the training job
    role='real_estate_endpoint',
    entry_point=None,           # Not needed for built-in XGBoost
    framework_version='1.7-1',  # Version of XGBoost used in training (align with image_uri)
)

# Configure serverless inference
serverless_config = sagemaker.serverless.ServerlessInferenceConfig(
    memory_size_in_mb=2048,  # Memory allocated for requests
    max_concurrency=5         # Maximum concurrent invocations
)

# Deploy the model using serverless inference
predictor = xgb_model.deploy(
    serverless_inference_config=serverless_config,
    container_startup_health_check_timeout=600  # Increase timeout to 300 seconds
)

INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating model with name: sagemaker-xgboost-2024-10-25-03-48-16-520
INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2024-10-25-03-48-17-285
INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2024-10-25-03-48-17-285


-----------------------------------------!

## Test Endpoint

In [88]:
# Initialize a SageMaker session
sagemaker_session = Session()

# Replace with your endpoint name
endpoint_name = 'sagemaker-xgboost-2024-10-25-03-48-17-285'

# Create a Predictor object
predictor = Predictor(endpoint_name=endpoint_name, sagemaker_session=sagemaker_session)

# Set the serializer for the predictor to CSV
predictor.serializer = CSVSerializer()

# Prepare the input data as CSV
csv_input = "8523,384444"

# Make a prediction using CSV input
response = predictor.predict(
    data=csv_input,  # Send the CSV data
    initial_args={'ContentType': 'text/csv'}  # Specify the content type as CSV
)

# Read and decode the response
result = response.decode('utf-8')
print("Prediction result:", result)

Prediction result: 4.0

