In [6]:
# upload vgsales.csv to S3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.session import Session

# One SageMaker session and the notebook's execution role are shared by the
# cells that follow.
session = sagemaker.Session()
role = get_execution_role()

# Stage the local CSV under the "vgsales-data" prefix of the session's default
# bucket and keep the resulting URI as the training input.
s3_uri = session.upload_data(
    path="vgsales.csv",
    bucket=session.default_bucket(),
    key_prefix="vgsales-data",
)
print("Uploaded to S3:", s3_uri)


Uploaded to S3: s3://sagemaker-us-east-2-411482901950/vgsales-data/vgsales.csv


In [9]:
%%writefile vgsales_model.py
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from io import StringIO

def model_fn(model_dir):
    """Deserialize the trained classifier from the model directory.

    Args:
        model_dir: Directory that contains ``vgsales_model.pkl``.

    Returns:
        The object stored in ``vgsales_model.pkl``.
    """
    artifact_path = os.path.join(model_dir, "vgsales_model.pkl")
    return joblib.load(artifact_path)

def input_fn(request_body, request_content_type):
    """Parse a CSV request payload into a feature DataFrame.

    Args:
        request_body: Header-less CSV text (``str``) or its UTF-8 encoded
            ``bytes``; each row is ``Platform,Genre,Publisher,Year``.
        request_content_type: Media type of the request. Parameters such as
            ``; charset=utf-8`` and letter case are ignored when matching.

    Returns:
        pandas.DataFrame with columns Platform, Genre, Publisher and Year.

    Raises:
        ValueError: If the media type is anything other than ``text/csv``
            (including a missing/``None`` content type).
    """
    # Compare only the bare media type: "text/csv; charset=utf-8" is still CSV.
    media_type = str(request_content_type).split(';', 1)[0].strip().lower()
    if media_type != 'text/csv':
        # str() keeps this a ValueError even when the content type is None.
        raise ValueError("Unsupported content type: " + str(request_content_type))

    # StringIO only accepts text, so decode raw payloads first.
    if isinstance(request_body, (bytes, bytearray)):
        request_body = request_body.decode('utf-8')

    columns = ['Platform', 'Genre', 'Publisher', 'Year']
    return pd.read_csv(StringIO(request_body), names=columns)

def output_fn(prediction, content_type):
    """Serialize predictions as a single comma-separated line.

    Args:
        prediction: Iterable of predicted values; each is converted with ``str``.
        content_type: Requested response media type; only ``text/csv`` is handled.

    Returns:
        str: The predictions joined by commas.

    Raises:
        ValueError: If ``content_type`` is not ``text/csv``.
    """
    # Guard clause first: reject anything that is not CSV.
    if content_type != "text/csv":
        raise ValueError("Unsupported content type: " + content_type)
    return ','.join(map(str, prediction))

# Per-process cache of the preprocessing artifacts, filled on first use so they
# are not re-read from disk on every request.
_INFERENCE_ARTIFACTS = {}


def _load_inference_artifacts(model_dir="/opt/ml/model"):
    """Return ``(model_columns, label_encoder)``, loading them at most once per process."""
    if not _INFERENCE_ARTIFACTS:
        columns = joblib.load(os.path.join(model_dir, "model_columns.pkl"))
        encoder = joblib.load(os.path.join(model_dir, "label_encoder.pkl"))
        # Fill the cache only after both loads succeed so a failure is retried.
        _INFERENCE_ARTIFACTS.update(model_columns=columns, label_encoder=encoder)
    return _INFERENCE_ARTIFACTS["model_columns"], _INFERENCE_ARTIFACTS["label_encoder"]


def predict_fn(input_data, model):
    """Encode the request features like the training data and predict labels.

    Args:
        input_data: DataFrame of raw features (as produced by ``input_fn``).
        model: Fitted estimator exposing ``predict``.

    Returns:
        The predictions mapped back to their original labels via the saved
        label encoder's ``inverse_transform``.
    """
    model_columns, le = _load_inference_artifacts()

    # Align to the training layout in one step: dummy columns absent from this
    # request are added as 0, columns unseen during training are dropped, and
    # the column order matches what the model was fitted on.
    input_data_encoded = pd.get_dummies(input_data).reindex(
        columns=model_columns, fill_value=0
    )

    predictions = model.predict(input_data_encoded)
    return le.inverse_transform(predictions)

def main():
    """Train the top-region classifier and save it with its preprocessing artifacts.

    Reads ``vgsales.csv`` from the training channel directory, drops rows with
    a missing feature or regional sales value, labels every remaining row with
    the name of the regional sales column (``NA_Sales``, ``EU_Sales`` or
    ``JP_Sales``) holding its largest value, encodes the features with
    ``pd.get_dummies`` and fits a random forest on them.

    Three files are written to the model directory: ``vgsales_model.pkl``
    (the classifier), ``label_encoder.pkl`` (label <-> integer mapping) and
    ``model_columns.pkl`` (the encoded feature column order).

    The input and output directories are taken from the ``SM_CHANNEL_TRAINING``
    and ``SM_MODEL_DIR`` environment variables when they are set, and fall back
    to ``/opt/ml/input/data/training`` and ``/opt/ml/model`` otherwise.
    """
    feature_columns = ['Platform', 'Genre', 'Publisher', 'Year']
    region_columns = ['NA_Sales', 'EU_Sales', 'JP_Sales']

    training_dir = os.environ.get("SM_CHANNEL_TRAINING", "/opt/ml/input/data/training")
    model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")

    df = pd.read_csv(os.path.join(training_dir, "vgsales.csv"))
    df = df.dropna(subset=feature_columns + region_columns)

    # Only the columns selected here are used, so unrelated columns in the CSV
    # (for example other sales totals) need neither be present nor dropped.
    X = df[feature_columns]
    # idxmax returns the first column holding the row maximum, i.e. ties go to
    # the earliest entry of region_columns, without a per-row Python call.
    y = df[region_columns].idxmax(axis=1)

    X_encoded = pd.get_dummies(X)
    model_columns = X_encoded.columns.tolist()
    le = LabelEncoder()
    y_encoded = le.fit_transform(y)

    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_encoded, y_encoded)

    os.makedirs(model_dir, exist_ok=True)
    joblib.dump(model, os.path.join(model_dir, "vgsales_model.pkl"))
    joblib.dump(le, os.path.join(model_dir, "label_encoder.pkl"))
    joblib.dump(model_columns, os.path.join(model_dir, "model_columns.pkl"))

# Train only when this file is executed as a script; importing it as a module
# (e.g. to use the *_fn hooks defined in this file) does not trigger training.
if __name__ == "__main__":
    main()


Writing vgsales_model.py


In [11]:
from sagemaker.sklearn.estimator import SKLearn

# Settings for the scikit-learn training job; the entry point is the script
# written by the %%writefile cell and is packaged from the current directory.
estimator_settings = dict(
    entry_point="vgsales_model.py",
    source_dir=".",
    framework_version="0.23-1",
    py_version="py3",
    instance_type="ml.m5.large",
    role=role,
    sagemaker_session=session,
)
sklearn_estimator = SKLearn(**estimator_settings)

# The uploaded CSV is supplied to the job as the "training" input channel.
sklearn_estimator.fit(inputs={"training": s3_uri})


INFO:sagemaker.telemetry.telemetry_logging:SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.
INFO:sagemaker:Creating training-job with name: sagemaker-scikit-learn-2025-05-29-15-51-34-153


2025-05-29 15:51:39 Starting - Starting the training job...
2025-05-29 15:52:10 Downloading - Downloading input data...
2025-05-29 15:52:36 Downloading - Downloading the training image.....[34m2025-05-29 15:53:20,503 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2025-05-29 15:53:20,506 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2025-05-29 15:53:20,543 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2025-05-29 15:53:20,905 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2025-05-29 15:53:20,917 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2025-05-29 15:53:20,929 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2025-05-29 15:53:20,938 sagemaker-training-toolkit INFO     Invoking user script[0m
[34mTraining Env:[0m
[34m

In [12]:
from sagemaker.sklearn.model import SKLearnModel

# Wrap the artifact produced by the training job in a deployable model that
# uses the same script as its entry point.
model = SKLearnModel(
    entry_point="vgsales_model.py",
    framework_version="0.23-1",
    model_data=sklearn_estimator.model_data,
    role=role,
    sagemaker_session=session,
)

# NOTE(review): the endpoint name is fixed, so re-running this cell while the
# endpoint still exists presumably fails — confirm and delete it beforehand.
endpoint_settings = dict(
    endpoint_name="vgsales-endpoint",
    instance_type="ml.t2.medium",
    initial_instance_count=1,
)
predictor = model.deploy(**endpoint_settings)


INFO:sagemaker:Creating model with name: sagemaker-scikit-learn-2025-05-29-15-55-29-037
INFO:sagemaker:Creating endpoint-config with name vgsales-endpoint
INFO:sagemaker:Creating endpoint with name vgsales-endpoint


----------!

In [13]:
# pandas is needed here but is only imported inside vgsales_model.py, which the
# %%writefile cell writes to disk without executing; import it explicitly so
# this cell also works on a fresh kernel.
import pandas as pd
from sagemaker.predictor import Predictor

# Build a single-row, header-less CSV payload from the first complete record,
# using the same four feature columns the script's input_fn expects.
df = pd.read_csv("vgsales.csv").dropna(
    subset=['Platform', 'Genre', 'Publisher', 'Year', 'NA_Sales', 'EU_Sales', 'JP_Sales']
)
sample = df[['Platform', 'Genre', 'Publisher', 'Year']].iloc[[0]]
csv_input = sample.to_csv(index=False, header=False)

# Attach to the already-deployed endpoint by name.
predictor = Predictor(endpoint_name="vgsales-endpoint")

# Send and request CSV so the script's input_fn/output_fn accept the call.
response = predictor.predict(
    csv_input,
    initial_args={"ContentType": "text/csv", "Accept": "text/csv"}
)

print("Prediction:", response)


Prediction: b'NA_Sales'


In [14]:
import boto3

# Locate the training job's output: it is looked up under "<job name>/output"
# in the session's default bucket.
bucket = session.default_bucket()
job_name = sklearn_estimator.latest_training_job.name

key_prefix = f"{job_name}/output"
# Full URI of the model archive, kept for reference (not used by the download).
model_tar_path = f"s3://{bucket}/{key_prefix}/model.tar.gz"

# Fetch everything under the output prefix into the working directory.
session.download_data(
    path=".",
    bucket=bucket,
    key_prefix=key_prefix,
)
print("Downloaded model to current directory.")


Downloaded model to current directory.
