In [1]:
# Install necessary packages
!pip install boto3 sagemaker



In [4]:
import pandas as pd
import boto3
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import sagemaker
from sagemaker import get_execution_role
from sagemaker.inputs import TrainingInput
from sagemaker.xgboost.estimator import XGBoost
from sagemaker.estimator import Estimator

In [5]:
# Resolve the execution role and open the SageMaker session that the rest of
# the notebook reuses (default bucket lookup, estimator construction).
role = get_execution_role()
sagemaker_session = sagemaker.Session()
print(role)

arn:aws:iam::241533149150:role/aws-sagemaker-role


In [6]:
# Location of the raw dataset: bucket, key prefix, and the full s3:// URL
# assembled from the two.
s3_bucket, s3_prefix = "laboratory-diagnostics-data", "SourceData/diabeticprediction"
file_path = f"s3://{s3_bucket}/{s3_prefix}/diabetes_prediction.csv"
print(file_path)

s3://laboratory-diagnostics-data/SourceData/diabeticprediction/diabetes_prediction.csv


In [7]:
pip install --upgrade s3fs

Collecting s3fs
  Downloading s3fs-2024.10.0-py3-none-any.whl.metadata (1.7 kB)
Collecting aiobotocore<3.0.0,>=2.5.4 (from s3fs)
  Downloading aiobotocore-2.15.2-py3-none-any.whl.metadata (23 kB)
Collecting aiohttp!=4.0.0a0,!=4.0.0a1 (from s3fs)
  Downloading aiohttp-3.11.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Collecting botocore<1.35.37,>=1.35.16 (from aiobotocore<3.0.0,>=2.5.4->s3fs)
  Downloading botocore-1.35.36-py3-none-any.whl.metadata (5.7 kB)
Collecting aioitertools<1.0.0,>=0.5.1 (from aiobotocore<3.0.0,>=2.5.4->s3fs)
  Downloading aioitertools-0.12.0-py3-none-any.whl.metadata (3.8 kB)
Collecting aiohappyeyeballs>=2.3.0 (from aiohttp!=4.0.0a0,!=4.0.0a1->s3fs)
  Downloading aiohappyeyeballs-2.4.3-py3-none-any.whl.metadata (6.1 kB)
Collecting aiosignal>=1.1.2 (from aiohttp!=4.0.0a0,!=4.0.0a1->s3fs)
  Downloading aiosignal-1.3.1-py3-none-any.whl.metadata (4.0 kB)
Collecting async-timeout<6.0,>=4.0 (from aiohttp!=4.0.0a0,!=4.0.0a1->s3fs)
  Do

In [7]:
# Read the raw dataset directly from its s3:// URL.
df = pd.read_csv(file_path)
# Leave the preview as the cell's last expression so the notebook renders it as
# a table instead of plain text wrapped across several column groups.
df.head()

   gender  hypertension  heart_disease smoking_history  hbA1c_level  \
0    Male             1              0  Current smoker          5.7   
1  Female             0              1   Former smoker          6.1   
2  Female             1              1    Never smoked          5.8   
3    Male             0              0  Current smoker          6.3   
4  Female             1              0   Former smoker          5.9   

   pregnancies  glucose  blood_pressure  skin_thickness  insulin   bmi  \
0            0      110              80              32     15.0  28.0   
1            1      150              90              20     20.5  31.2   
2            2      130              85              22     10.8  25.5   
3            0      180             100              25     12.4  29.1   
4            3      140              75              18     14.3  26.4   

   diabetes_pedigree_function  age  is_diabetic  
0                         0.6   45            1  
1                         0.

In [8]:
# Force the target to strict 0/1: values greater than zero become 1,
# everything else becomes 0.
df['is_diabetic'] = df['is_diabetic'].gt(0).astype(int)

In [9]:
# Data preprocessing / feature engineering.
# One-hot encode the categorical columns (gender, smoking_history); drop_first
# removes one level per column so the dummies are not redundant.
# dtype=int is set explicitly: pandas >= 2.0 produces bool dummy columns by
# default, which would be written to the training CSV as True/False instead of
# 1/0 (presumably not parseable as numeric by the XGBoost container - confirm).
df = pd.get_dummies(
    df,
    columns=['gender', 'smoking_history'],
    drop_first=True,
    dtype=int,
)

In [10]:
# Preview the encoded frame; a bare last expression gets the notebook's table
# rendering rather than a wrapped plain-text dump.
df.head()

   hypertension  heart_disease  hbA1c_level  pregnancies  glucose  \
0             1              0          5.7            0      110   
1             0              1          6.1            1      150   
2             1              1          5.8            2      130   
3             0              0          6.3            0      180   
4             1              0          5.9            3      140   

   blood_pressure  skin_thickness  insulin   bmi  diabetes_pedigree_function  \
0              80              32     15.0  28.0                         0.6   
1              90              20     20.5  31.2                         0.7   
2              85              22     10.8  25.5                         0.8   
3             100              25     12.4  29.1                         0.9   
4              75              18     14.3  26.4                         0.6   

   age  is_diabetic  gender_Male  smoking_history_Former smoker  \
0   45            1            1     

In [11]:
# Separate the target column from the predictor columns.
y = df['is_diabetic']
X = df.drop(columns=['is_diabetic'])

In [12]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
# Assemble the frames that are written to CSV and uploaded for SageMaker.
#
# The SageMaker built-in XGBoost algorithm reads header-less CSV input and
# treats the FIRST column as the label, so the target must come before the
# feature columns. Appending it as the last column would make the algorithm
# train on the first feature as the label and use is_diabetic as a feature.
#
# pd.concat builds new frames, so X_train / X_test are left untouched and can
# be reused later as feature-only inputs for inference.
train_data = pd.concat([y_train, X_train], axis=1)
test_data = pd.concat([y_test, X_test], axis=1)

In [14]:
# Write both splits to local CSV files without header row or index column.
for split_frame, local_name in ((train_data, 'train.csv'), (test_data, 'test.csv')):
    split_frame.to_csv(local_name, header=False, index=False)

In [15]:
# s3:// URLs (inside the session's default bucket) where the two splits live.
test_s3_path = f's3://{sagemaker_session.default_bucket()}/diabetes-prediction/test/test.csv'
train_s3_path = f's3://{sagemaker_session.default_bucket()}/diabetes-prediction/train/train.csv'

# One print call, one path per line (train first, then test).
print(train_s3_path, test_s3_path, sep='\n')

s3://sagemaker-us-east-2-241533149150/diabetes-prediction/train/train.csv
s3://sagemaker-us-east-2-241533149150/diabetes-prediction/test/test.csv


In [16]:
# Push the local CSV files to the session's default bucket under the
# diabetes-prediction/ prefix.
s3_client = boto3.client('s3')
upload_bucket = sagemaker_session.default_bucket()
for local_file, object_key in (
    ('train.csv', 'diabetes-prediction/train/train.csv'),
    ('test.csv', 'diabetes-prediction/test/test.csv'),
):
    s3_client.upload_file(local_file, upload_bucket, object_key)


In [17]:
# Look up the XGBoost container image (version 1.2-1) for the region of the
# current boto3 session.
xgboost_container = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=boto3.Session().region_name,
    version="1.2-1",
)

In [18]:
# Estimator describing the training job: which image to run, under which role,
# on what hardware, and where the output goes in S3.
xgb = Estimator(
    image_uri=xgboost_container,
    role=role,
    instance_count=1,
    instance_type="ml.m5.xlarge",
    output_path=f's3://{sagemaker_session.default_bucket()}/diabetes-prediction/output',
    sagemaker_session=sagemaker_session,
)

In [19]:
# XGBoost hyperparameters, collected in one mapping and then applied to the
# estimator in a single call.
xgb_hyperparameters = {
    "objective": "binary:logistic",
    "num_round": 10,
    "max_depth": 3,
    "eta": 0.2,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
}
xgb.set_hyperparameters(**xgb_hyperparameters)

In [20]:
# Describe the training channel: the uploaded train CSV, declared as CSV content.
train_input = TrainingInput(
    train_s3_path,
    content_type="csv",
)

In [21]:
# Launch the training job with the single "train" channel.
training_channels = {"train": train_input}
xgb.fit(inputs=training_channels)

INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2024-11-29-03-12-30-012


2024-11-29 03:12:31 Starting - Starting the training job...
2024-11-29 03:12:46 Starting - Preparing the instances for training...
2024-11-29 03:13:30 Downloading - Downloading the training image......
2024-11-29 03:14:31 Training - Training image download completed. Training in progress.
2024-11-29 03:14:31 Uploading - Uploading generated training model[34m[2024-11-29 03:14:26.537 ip-10-0-204-208.us-east-2.compute.internal:7 INFO utils.py:27] RULE_JOB_STOP_SIGNAL_FILENAME: None[0m
[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of C

In [22]:
# Stand up a real-time endpoint for the trained model on one ml.m5.xlarge
# instance and keep the returned predictor handle.
predictor = xgb.deploy(
    instance_type="ml.m5.xlarge",
    initial_instance_count=1,
)

INFO:sagemaker:Creating model with name: sagemaker-xgboost-2024-11-29-03-15-17-038
INFO:sagemaker:Creating endpoint-config with name sagemaker-xgboost-2024-11-29-03-15-17-038
INFO:sagemaker:Creating endpoint with name sagemaker-xgboost-2024-11-29-03-15-17-038


-----!

In [23]:
# Transform test set for predictions
import io
from sklearn.metrics import accuracy_score

In [24]:
# In-memory text buffer that will hold the CSV payload for the endpoint.
csv_buffer = io.StringIO()
# Feature-only test frame, restricted to (and ordered by) the columns of X.
test_data_df = pd.DataFrame(X_test, columns=X.columns)
# Serialise without header row or index column.
test_data_df.to_csv(csv_buffer, index=False, header=False)

In [25]:
# Preview the inference features; a bare last expression gets the notebook's
# table rendering rather than a wrapped plain-text dump.
test_data_df.head()

    hypertension  heart_disease  hbA1c_level  pregnancies  glucose  \
0              1              0          5.7            0      110   
17             0              1          6.2            1      140   
15             0              0          6.1            0      155   
1              0              1          6.1            1      150   

    blood_pressure  skin_thickness  insulin   bmi  diabetes_pedigree_function  \
0               80              32     15.0  28.0                         0.6   
17              90              23     18.8  28.4                         0.6   
15              93              25     13.1  29.5                         0.7   
1               90              20     20.5  31.2                         0.7   

    age  gender_Male  smoking_history_Former smoker  \
0    45            1                              0   
17   50            1                              0   
15   48            1                              0   
1    52            0   

In [26]:
# Score the held-out rows on the deployed endpoint and report accuracy.

# Probability above which a row is classified as diabetic (1).
DECISION_THRESHOLD = 0.4

# Encode the CSV string to bytes for the request body.
csv_bytes = csv_buffer.getvalue().encode('utf-8')

# Make predictions, explicitly declaring the payload as CSV.
raw_response = predictor.predict(csv_bytes, initial_args={'ContentType': 'text/csv'})

# The scores may come back separated by commas or by newlines (this appears to
# vary with container version / mode - confirm for the image in use), so both
# separators are accepted and empty tokens are skipped.
response_text = raw_response.decode('utf-8')
scores = [
    float(token)
    for token in response_text.replace('\n', ',').split(',')
    if token.strip()
]

# Fail loudly if the response does not contain exactly one score per test row.
if len(scores) != len(y_test):
    raise ValueError(
        f"Expected {len(y_test)} predictions from the endpoint, got {len(scores)}"
    )

# Threshold predictions to get binary outcomes.
predictions = (pd.Series(scores) > DECISION_THRESHOLD).astype(int)

# Evaluate the model.
accuracy = accuracy_score(y_test, predictions)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

# NOTE(review): nothing in the visible cells deletes the endpoint; call
# predictor.delete_endpoint() once it is no longer needed so it does not keep
# running (and billing).

Model Accuracy: 100.00%
