In [46]:
import pandas as pd

# Read both semicolon-delimited source files and tag every row with its
# origin in a new "wine_type" column: 0 = red, 1 = white.
red = pd.read_csv("winequality-red.csv", sep=';').assign(wine_type=0)
white = pd.read_csv("winequality-white.csv", sep=';').assign(wine_type=1)

# Stack red on top of white; ignore_index gives the result a fresh 0..n-1 index.
df = pd.concat((red, white), ignore_index=True)

# Persist the merged table (comma-separated, no index column) so later
# cells and the training job can read it back.
df.to_csv(path_or_buf="winequality_combined.csv", index=False)

# Show the first five rows as the cell output.
df.head(5)


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,wine_type
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,0
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0


In [57]:
import sagemaker
from sagemaker import get_execution_role
from sagemaker.session import Session

# Open a SageMaker session and look up the execution role of this notebook.
session = sagemaker.Session()
role = get_execution_role()

# Upload the combined CSV to the session's default bucket under the
# "wine-quality" prefix; upload_data returns the resulting S3 URI.
s3_uri = session.upload_data(
    path="winequality_combined.csv",
    bucket=session.default_bucket(),
    key_prefix="wine-quality",
)
print("Uploaded to S3:", s3_uri)


Uploaded to S3: s3://sagemaker-us-east-2-411482901950/wine-quality/winequality_combined.csv


In [65]:
%%writefile wine_lr_model.py
import pandas as pd
from sklearn.linear_model import LinearRegression
import joblib
import os
from io import StringIO

def model_fn(model_dir):
    """Model-loading hook for SageMaker.

    Args:
        model_dir: Directory that contains the serialized ``model.joblib``.

    Returns:
        The object deserialized by ``joblib.load`` from
        ``<model_dir>/model.joblib``.
    """
    artifact_path = os.path.join(model_dir, "model.joblib")
    return joblib.load(artifact_path)

def predict_fn(input_data, model):
    """Prediction hook for SageMaker.

    Args:
        input_data: Parsed request payload (whatever ``input_fn`` returned).
        model: The object returned by ``model_fn``; must expose ``predict``.

    Returns:
        Whatever ``model.predict(input_data)`` returns, unmodified.
    """
    predictions = model.predict(input_data)
    return predictions

def input_fn(request_body, request_content_type):
    """Input hook for SageMaker: parse a header-less CSV request into a DataFrame.

    Args:
        request_body: CSV payload, one sample per line, with no header row.
            May be ``str`` or UTF-8 encoded ``bytes``/``bytearray``.
        request_content_type: MIME type of the request; only ``'text/csv'``
            is accepted.

    Returns:
        A ``pandas.DataFrame`` whose columns carry the feature names below.

    Raises:
        ValueError: If ``request_content_type`` is not ``'text/csv'``.
    """
    if request_content_type != 'text/csv':
        raise ValueError("This model only supports CSV input")

    # StringIO only accepts text; decode raw byte payloads first so the
    # handler works whether the serving stack hands over str or bytes.
    if isinstance(request_body, (bytes, bytearray)):
        request_body = bytes(request_body).decode('utf-8')

    # Feature column names, in positional order. They are expected to match
    # the columns the model was trained on (every column except "quality").
    columns = [
        'fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
        'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
        'pH', 'sulphates', 'alcohol', 'wine_type'
    ]
    # header=None is what pandas already infers when names= is given; it is
    # spelled out so the first line is clearly treated as data.
    return pd.read_csv(StringIO(request_body), names=columns, header=None)

def output_fn(prediction, response_content_type):
    """Output hook for SageMaker: render predictions as one comma-separated line.

    Args:
        prediction: Iterable of predicted values.
        response_content_type: Requested MIME type; only ``'text/csv'`` is
            accepted.

    Returns:
        A ``str`` with ``str(value)`` of every prediction joined by commas.

    Raises:
        ValueError: If ``response_content_type`` is not ``'text/csv'``.
    """
    if response_content_type != 'text/csv':
        raise ValueError("This model only supports CSV output")
    return ','.join(map(str, prediction))

def main():
    """Train a LinearRegression on the combined wine CSV and save it.

    Reads ``winequality_combined.csv`` from the training channel directory,
    fits a ``LinearRegression`` predicting ``quality`` from every other
    column, and writes the fitted model to ``<model dir>/model.joblib``
    (the file name ``model_fn`` loads).

    Directories are taken from the SageMaker environment variables
    ``SM_CHANNEL_TRAINING`` and ``SM_MODEL_DIR`` when they are set, and fall
    back to the standard container paths otherwise, so the script behaves
    the same inside the container but can also be pointed at other
    directories for a local run.
    """
    training_dir = os.environ.get("SM_CHANNEL_TRAINING", "/opt/ml/input/data/training")
    model_dir = os.environ.get("SM_MODEL_DIR", "/opt/ml/model")

    # load training data
    df = pd.read_csv(os.path.join(training_dir, "winequality_combined.csv"))
    X = df.drop(columns=["quality"])  # features: everything except the label
    y = df["quality"]                 # label

    # train model
    model = LinearRegression()
    model.fit(X, y)

    # save model to the expected location (create it if it does not exist yet)
    os.makedirs(model_dir, exist_ok=True)
    joblib.dump(model, os.path.join(model_dir, "model.joblib"))


if __name__ == "__main__":
    main()


Overwriting wine_lr_model.py


In [66]:
from sagemaker.sklearn.estimator import SKLearn

# Describe the training job: run wine_lr_model.py (shipped from the current
# directory) in the scikit-learn 0.23-1 / Python 3 framework container on a
# single ml.m5.large instance, using the role and session created earlier.
sklearn_estimator = SKLearn(
    entry_point="wine_lr_model.py",
    source_dir=".",
    framework_version="0.23-1",
    py_version="py3",
    instance_type="ml.m5.large",
    role=role,
    sagemaker_session=session,
)

# Start training; the uploaded CSV is attached as the "training" channel.
sklearn_estimator.fit(inputs={"training": s3_uri})


INFO:sagemaker.telemetry.telemetry_logging:SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.
INFO:sagemaker:Creating training-job with name: sagemaker-scikit-learn-2025-05-24-21-35-43-567


2025-05-24 21:35:48 Starting - Starting the training job...
2025-05-24 21:36:21 Downloading - Downloading input data...
2025-05-24 21:36:46 Downloading - Downloading the training image.....[34m2025-05-24 21:37:31,680 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2025-05-24 21:37:31,683 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2025-05-24 21:37:31,724 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2025-05-24 21:37:32,031 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2025-05-24 21:37:32,044 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2025-05-24 21:37:32,056 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2025-05-24 21:37:32,065 sagemaker-training-toolkit INFO     Invoking user script[0m
[34mTraining Env:[0m
[34m

In [67]:
from sagemaker.sklearn.model import SKLearnModel

# Wrap the artifact produced by the training job in a deployable model that
# uses wine_lr_model.py for its inference hooks.
model = SKLearnModel(
    entry_point="wine_lr_model.py",
    model_data=sklearn_estimator.model_data,
    framework_version="0.23-1",
    role=role,
    sagemaker_session=session,
)

# Stand up a one-instance real-time endpoint and keep the returned predictor.
predictor = model.deploy(
    endpoint_name="wine-quality-endpoint6",
    instance_type="ml.t2.medium",
    initial_instance_count=1,
)



INFO:sagemaker:Creating model with name: sagemaker-scikit-learn-2025-05-24-21-38-25-399
INFO:sagemaker:Creating endpoint-config with name wine-quality-endpoint6
INFO:sagemaker:Creating endpoint with name wine-quality-endpoint6


--------!

In [68]:
from sagemaker.predictor import Predictor
import pandas as pd

# Build a single test sample: first row of the combined CSV with the
# "quality" label removed, serialized as one header-less CSV line.
df = pd.read_csv("winequality_combined.csv")
sample = df.iloc[[0]].drop(columns=["quality"])
csv_input = sample.to_csv(header=False, index=False)

# Attach to the already-deployed endpoint by name.
predictor = Predictor(endpoint_name="wine-quality-endpoint6")

# Send the sample as text/csv and ask for a text/csv response.
response = predictor.predict(
    data=csv_input,
    initial_args={"ContentType": "text/csv", "Accept": "text/csv"},
)

# Show what the endpoint returned.
print("Prediction:", response)


Prediction: b'4.971138098249767'


In [69]:
import os
from urllib.parse import urlparse

import boto3
import sagemaker


sess = sagemaker.Session()
job_name = sklearn_estimator.latest_training_job.name

# Path to the model artifact in S3. Take it from the estimator instead of
# rebuilding "<default bucket>/<job name>/output/model.tar.gz" by convention,
# so the cell keeps working if the job's output location is ever customized.
# NOTE(review): assumes model_data is a plain "s3://bucket/key" string.
s3_model_path = sklearn_estimator.model_data
parsed_uri = urlparse(s3_model_path)
bucket = parsed_uri.netloc
model_key = parsed_uri.path.lstrip("/")

# Download it locally. key_prefix is the artifact's full key, so only that
# object should match — NOTE(review): confirm against Session.download_data.
sess.download_data(path=".", bucket=bucket, key_prefix=model_key)

print(f"Downloaded {os.path.basename(model_key)}")


Downloaded model.tar.gz
