In [2]:
import argparse
import logging
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from datetime import datetime

logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.addHandler(logging.StreamHandler())

BUCKET_NAME = "sagemaker-strokeprediction-mlops"
BUCKET = f's3://{BUCKET_NAME}'

RAW_DATA_FOLDER = 'Dataset'
RAW_DATA_FILE = 'healthcare-dataset-stroke-data.csv'
RAW_DATA_PATH = os.path.join(BUCKET, RAW_DATA_FOLDER, RAW_DATA_FILE)


TARGET_COLUMN = 'stroke'

def extract_features_types(df, unique_threshold=10):
    numerical_features = []
    categorical_features = []

    for col in df.columns:
        if col == TARGET_COLUMN:
            continue
        if df[col].nunique() <= unique_threshold:
            categorical_features.append(col)
        else:
            numerical_features.append(col)

    return numerical_features, categorical_features

def split_dataset(dataset, target_column, test_size=0.2, validation_size=0.2, random_state=None):
    X = dataset.drop(target_column, axis=1)
    y = dataset[target_column]
    
    # Split dataset into train and test sets using StratifiedShuffleSplit
    stratified_split = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)
    
    for train_index, test_index in stratified_split.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    
    # Split the remaining data into validation and train sets using StratifiedShuffleSplit
    stratified_split = StratifiedShuffleSplit(n_splits=1, test_size=validation_size, random_state=random_state)
    
    for train_index, val_index in stratified_split.split(X_train, y_train):
        X_train, X_val = X_train.iloc[train_index], X_train.iloc[val_index]
        y_train, y_val = y_train.iloc[train_index], y_train.iloc[val_index]
    
    return (X_train, y_train), (X_val, y_val), (X_test, y_test)

def process_target(df: pd.DataFrame, col_target: str) -> pd.DataFrame:
    
    # Make sure that the 0 error type is also mapped to 1 (we do a binary classification later)
    df.loc[df[col_target] == 0, col_target] = 1
    
    df = fill_nulls(df=df, col=col_target)
    
    # Reorder columns
    colnames = list(df.columns)
    colnames.insert(0, colnames.pop(colnames.index(col_target)))
    df = df[colnames]
    
    return df



In [3]:

# Read raw input data
df = pd.read_csv('../Dataset/healthcare-dataset-stroke-data.csv')
logger.info(f"Shape of data is: {df.shape}")

numerical_features, categorical_features = extract_features_types(df)
    
numeric_transformer = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors=5)),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

logger.info("Splitting %d rows of data into train, validation, test datasets.", len(df))

(X_train, y_train), (X_val, y_val), (X_test, y_test) = split_dataset(df, TARGET_COLUMN)

# Apply preprocessor.fit_transform to train, validation and test before writing them to directories
X_train = pd.DataFrame(preprocessor.fit_transform(X_train), index=y_train.index)
X_val = pd.DataFrame(preprocessor.transform(X_val), index=y_val.index)
X_test = pd.DataFrame(preprocessor.transform(X_test), index=y_test.index)



Shape of data is: (5110, 12)
Splitting 5110 rows of data into train, validation, test datasets.


In [7]:
%ls

 Volume in drive C has no label.
 Volume Serial Number is 388D-BB99

 Directory of c:\Users\kheri\OneDrive - Carleton University\dev\Personal Coding Projects\Stroke Prediction\Notebooks

2023-07-20  06:53 PM    <DIR>          .
2023-07-20  06:53 PM    <DIR>          ..
2023-07-19  11:52 AM         5,640,938 EDA_SageMaker.ipynb
2023-07-19  12:24 PM             1,824 logs.log
2023-07-18  07:27 PM                 0 Model.ipynb
2023-07-20  06:53 PM            19,951 model.tar.gz
2023-07-20  06:44 PM    <DIR>          rubish
               4 File(s)      5,662,713 bytes
               3 Dir(s)  160,156,835,840 bytes free


In [4]:
# Create local output directories. These directories live on the container that is spun up.
os.makedirs("processing/train")
os.makedirs("processing/validation")
os.makedirs("processing/test")

# Save data locally on the container that is spun up.
try:
    pd.concat([y_train, X_train], axis=1).to_csv("processing/train/train.csv", index=False)
    pd.concat([y_val, X_val], axis=1).to_csv("processing/validation/val.csv", index=False)
    pd.concat([y_test, X_test], axis=1).to_csv("processing/test/test.csv", index=False)
    logger.info("Files Successfully Written Locally")
except Exception as e:
    logger.debug("Could Not Write the Files")
    logger.debug(e)
    pass

Files Successfully Written Locally


In [1]:
import pickle
import tarfile
import pandas as pd
import xgboost

In [2]:
model_path = "model.tar.gz"
with tarfile.open(model_path) as tar:
    tar.extractall(path="..")

model = pickle.load(open("../xgboost-model", "rb"))

  If you are loading a serialized model (like pickle in Python, RDS in R) generated by
  older XGBoost, please export the model by calling `Booster.save_model` from that version
  first, then load it back in current version. See:

    https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html

  for more details about differences between saving model and serializing.



In [15]:
# ================================================================================
# Author:      Kheri Hughes @ HMC - 2023
# Description: This script contains the evaluation logic.
# ================================================================================
import json
import logging
import os
import pickle
import tarfile

import pandas as pd
import xgboost

logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.addHandler(logging.StreamHandler())

from sklearn.metrics import (
    accuracy_score,
    classification_report,
    roc_auc_score,
    recall_score,
    precision_score
)

model_path = "model.tar.gz"
with tarfile.open(model_path) as tar:
    tar.extractall(path="..")

logger.debug("Loading xgboost model.")
model = pickle.load(open("../xgboost-model", "rb"))

print("Loading test input data")
test_path = "processing/test/test.csv"
df = pd.read_csv(test_path, header=0)

logger.debug("Reading test data.")
y_test = df.iloc[:, 0].to_numpy()
df.drop(df.columns[0], axis=1, inplace=True)

# Debug code - Check the shapes of your loaded data
print(f"Shape of y_test: {y_test.shape}")
print(f"Shape of dataframe after dropping labels: {df.shape}")

X_test = xgboost.DMatrix(df.values)

logger.info("Performing predictions against test data.")
predictions = model.predict(X_test)

# Debug code - Check the shapes of your predictions
print(f"Shape of predictions: {predictions.shape}")

# y_test = y_test.astype(int)
# predictions = predictions.round().astype(int)

print("Creating classification evaluation report")
acc = accuracy_score(y_test, predictions.round())
recall = recall_score(y_test, predictions.round())
precision = precision_score(y_test, predictions.round())

report_dict = {
    "binary_classification_metrics": {
        "accuracy": {
            "value": acc,
            "standard_deviation": "NaN",
        },
        "recall": {
            "value": recall,
            "standard_deviation": "NaN"
        },
        "precision": {
            "value": precision,
            "standard_deviation": "NaN"
        },
    },
}

print("Classification report:\n{}".format(report_dict))

evaluation_dir = "evaluation"
os.makedirs(evaluation_dir, exist_ok=True)
evaluation_output_path = os.path.join(evaluation_dir, "evaluation.json")
print("Saving classification report to {}".format(evaluation_output_path))

with open(evaluation_output_path, "w") as f:
    f.write(json.dumps(report_dict))


Performing predictions against test data.
Performing predictions against test data.
Performing predictions against test data.
Performing predictions against test data.
Performing predictions against test data.
Performing predictions against test data.
Performing predictions against test data.
Performing predictions against test data.
Performing predictions against test data.
Performing predictions against test data.
Performing predictions against test data.


  If you are loading a serialized model (like pickle in Python, RDS in R) generated by
  older XGBoost, please export the model by calling `Booster.save_model` from that version
  first, then load it back in current version. See:

    https://xgboost.readthedocs.io/en/latest/tutorials/saving_model.html

  for more details about differences between saving model and serializing.

Loading test input data
Shape of y_test: (1022,)
Shape of dataframe after dropping labels: (1022, 23)
Shape of predictions: (1022,)
Creating classification evaluation report
Classification report:
{'binary_classification_metrics': {'accuracy': {'value': 0.952054794520548, 'standard_deviation': 'NaN'}, 'recall': {'value': 0.04, 'standard_deviation': 'NaN'}, 'precision': {'value': 0.6666666666666666, 'standard_deviation': 'NaN'}}}
Saving classification report to evaluation\evaluation.json


In [10]:
print(y_test[:5])
print(predictions.round()[:5])


['stroke' '0' '0' '0' '0']
[0. 0. 0. 0. 0.]


In [3]:
import pandas as pd

test_path = "processing/test/test.csv"
df = pd.read_csv(test_path, header=0)

# choose the first row of df to test
single_row = df.iloc[0]

print(single_row


stroke    0.000000
0         1.349604
1        -1.660928
2        -0.935928
3        -1.480927
4         0.000000
5         1.000000
6         1.000000
7         0.000000
8         1.000000
9         0.000000
10        1.000000
11        0.000000
12        0.000000
13        0.000000
14        0.000000
15        0.000000
16        1.000000
17        1.000000
18        0.000000
19        1.000000
20        0.000000
21        0.000000
22        0.000000
Name: 0, dtype: float64


In [1]:
%pip install sagemaker
import sagemaker

# Specify your SageMaker endpoint name and role
endpoint_name = "your-endpoint-name"
role = "your-sagemaker-role-arn"

# Create a SageMaker predictor
predictor = sagemaker.predictor.RealTimePredictor(
    endpoint=endpoint_name,
    sagemaker_session=sagemaker.Session(),
    content_type="text/csv",  # The content type of the input data
    role=role,
)


Collecting sagemaker
  Using cached sagemaker-2.173.0.tar.gz (854 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting attrs<24,>=23.1.0
  Using cached attrs-23.1.0-py3-none-any.whl (61 kB)
Collecting boto3<2.0,>=1.26.131
  Using cached boto3-1.28.8-py3-none-any.whl (135 kB)
Collecting smdebug_rulesconfig==1.0.1
  Using cached smdebug_rulesconfig-1.0.1-py2.py3-none-any.whl (20 kB)
Collecting pathos
  Using cached pathos-0.3.0-py3-none-any.whl (79 kB)
Collecting schema
  Using cached schema-0.7.5-py2.py3-none-any.whl (17 kB)
Collecting tblib==1.7.0
  Using cached tblib-1.7.0-py2.py3-none-any.whl (12 kB)
Collecting botocore<1.32.0,>=1.31.8
  Using cached botocore-1.31.8-py3-none-any.whl (11.0 MB)
Collecting s3transfer<0.7.0,>=0.6.0
  Using cached s3transfer-0.6.1-py3-none-any.whl (79 kB)
Collecting jmespath<2.0.0,>=0.7.1
  Using cached jmespath-1.0.1-py3-none-any.whl (20 kB)
Collecting multiprocess>=0.70.14
  Using cached mu

The class RealTimePredictor has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


TypeError: Predictor.__init__() missing 1 required positional argument: 'endpoint_name'