In [1]:
import io
import requests
from zipfile import ZipFile
from pathlib import Path

import pickle
import joblib

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [4]:
import boto3
import sagemaker

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [5]:
from sagemaker.amazon.amazon_estimator import get_image_uri

from sagemaker.tuner import HyperparameterTuner, ContinuousParameter, IntegerParameter

In [6]:
role = sagemaker.get_execution_role()
role

'arn:aws:iam::166664655187:role/service-role/AmazonSageMaker-ExecutionRole-20241104T113063'

In [7]:
sess = sagemaker.Session()
sess

<sagemaker.session.Session at 0x7fb31d9d4b50>

In [8]:
s3bucket = sess.default_bucket()
s3bucket

'sagemaker-us-east-1-166664655187'

In [9]:
# Source: UCI Machine Learning Repository, "Bank Marketing" dataset
# https://archive.ics.uci.edu/dataset/222/bank+marketing
data_url = 'https://archive.ics.uci.edu/static/public/222/bank+marketing.zip'

# `requests` applies no timeout by default, so a stalled connection would block
# the notebook forever. The tuple is (connect timeout, read timeout) in seconds.
resp = requests.get(data_url, timeout=(10, 60))
resp.raise_for_status()  # fail fast on any non-2xx response

In [10]:
# Local directory that receives the contents of the downloaded archive.
data_dir = Path('bank_marketing_data')

# The download is held in memory (`resp.content`); wrap it in a file-like
# object so ZipFile can read it without writing the archive to disk first.
downloaded_bytes = io.BytesIO(resp.content)
with ZipFile(downloaded_bytes) as outer_archive:
    outer_archive.extractall(path=data_dir)

In [11]:
# Unpack the nested `bank.zip` (produced by the previous extraction) into the
# same directory.
nested_zip_path = data_dir / 'bank.zip'
with ZipFile(nested_zip_path) as nested_archive:
    nested_archive.extractall(path=data_dir)

In [12]:
df = pd.read_csv(data_dir / 'bank-full.csv', sep=';')
df.shape

(45211, 17)

In [13]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [14]:
# Binary yes/no columns (including the target `y`) are encoded as 1.0 / 0.0.
yesno_cols = ['default', 'housing', 'loan', 'y']

for col in yesno_cols:
    # Guard against re-running this cell: once a column is numeric, comparing
    # it with 'yes' would be False everywhere and silently reset every flag
    # (target included) to 0.0. Only encode columns that still hold raw text.
    if not pd.api.types.is_numeric_dtype(df[col]):
        df[col] = (df[col] == 'yes').astype('float')


In [15]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,0.0,2143,1.0,0.0,unknown,5,may,261,1,-1,0,unknown,0.0
1,44,technician,single,secondary,0.0,29,1.0,0.0,unknown,5,may,151,1,-1,0,unknown,0.0
2,33,entrepreneur,married,secondary,0.0,2,1.0,1.0,unknown,5,may,76,1,-1,0,unknown,0.0
3,47,blue-collar,married,unknown,0.0,1506,1.0,0.0,unknown,5,may,92,1,-1,0,unknown,0.0
4,33,unknown,single,unknown,0.0,1,0.0,0.0,unknown,5,may,198,1,-1,0,unknown,0.0


In [16]:
df['balance'].describe()

count     45211.000000
mean       1362.272058
std        3044.765829
min       -8019.000000
25%          72.000000
50%         448.000000
75%        1428.000000
max      102127.000000
Name: balance, dtype: float64

In [17]:
df['month'].value_counts()

may    13766
jul     6895
aug     6247
jun     5341
nov     3970
apr     2932
feb     2649
jan     1403
oct      738
sep      579
mar      477
dec      214
Name: month, dtype: int64

In [18]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split
# repeatable across runs.
X = df.drop(columns=['y'])
y = df['y'].values
X_trn, X_tst, y_trn, y_tst = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=19,
)

In [19]:
X_trn.shape, X_tst.shape

((36168, 16), (9043, 16))

In [20]:
sample_input = X_trn.iloc[0].values.tolist()
','.join([str(x) for x in sample_input])

'30,admin.,married,secondary,0.0,692,1.0,0.0,cellular,11,may,290,1,-1,0,unknown'

In [21]:
','.join(X_trn.columns)

'age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome'

In [22]:
# ColumnTransformer variant that reports output feature names without the
# `<transformer name>__` prefix (for example `age` rather than `num__age`).
class CustomColumnTransformer(ColumnTransformer):
    """ColumnTransformer whose ``get_feature_names_out`` drops name prefixes.

    Only the naming of output features differs from ``ColumnTransformer``;
    fitting and transforming are inherited unchanged.

    The implementation goes through the public ``get_feature_names_out`` of
    the parent class instead of the private ``_iter`` helper, whose keyword
    arguments differ between scikit-learn releases.
    """

    def get_feature_names_out(self, input_features=None):
        """Return output feature names with the transformer prefix removed.

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Forwarded unchanged to ``ColumnTransformer.get_feature_names_out``.

        Returns
        -------
        numpy.ndarray of str
            One name per output column, in output-column order.

        Raises
        ------
        Whatever the parent implementation raises, e.g. when the transformer
        is not fitted or a sub-transformer cannot report feature names.
        """
        names = super().get_feature_names_out(input_features)

        # The parent prepends `<transformer name>__` only when
        # `verbose_feature_names_out` is exactly True. In every other mode
        # there is nothing to strip, so the names are passed through as-is.
        if getattr(self, 'verbose_feature_names_out', True) is not True:
            return np.array(list(names))

        # Names of the fitted transformers; 'remainder' is the name used for
        # the remainder columns.
        transformer_names = {name for name, _, _ in self.transformers_}
        transformer_names.add('remainder')

        def _strip_prefix(feature_name):
            # Split on the FIRST '__' only and drop the left part only when it
            # is a known transformer name, so a feature whose own name happens
            # to contain 'cat_', 'num_' or '__' is left intact.
            owner, separator, bare_name = feature_name.partition('__')
            if separator and owner in transformer_names:
                return bare_name
            return feature_name

        return np.array([_strip_prefix(name) for name in names])

In [23]:
# Categorical columns are one-hot encoded, numeric columns are standardised,
# and every remaining column is passed through unchanged.
cols_cat = ['job', 'marital', 'education', 'contact', 'poutcome', 'month']
cols_num = ['age', 'balance', 'duration']

preprocessor = CustomColumnTransformer(
    transformers=[
        ('num', StandardScaler(), cols_num),
        # handle_unknown='ignore': a category that was never seen during fit
        # is encoded as all zeros at inference time instead of raising.
        ('cat', OneHotEncoder(handle_unknown='ignore'), cols_cat)
    ],
    remainder='passthrough',  # Keep binary columns as they are
    # Always return a dense array. With the default threshold (0.3) the result
    # type depends on the overall density of the fitted output, while the code
    # below (pd.DataFrame(...), .flatten()) requires a dense ndarray.
    sparse_threshold=0
)

# Fit the preprocessor on the training data only, then apply the fitted
# transformation to both splits.
df_trn_transformed = preprocessor.fit_transform(X_trn)
df_tst_transformed = preprocessor.transform(X_tst)

In [24]:
# Get the column names after one-hot encoding for better readability
transformed_cols = preprocessor.get_feature_names_out()
transformed_cols

array(['age', 'balance', 'duration', 'job_admin.', 'job_blue-collar',
       'job_entrepreneur', 'job_housemaid', 'job_management',
       'job_retired', 'job_self-employed', 'job_services', 'job_student',
       'job_technician', 'job_unemployed', 'job_unknown',
       'marital_divorced', 'marital_married', 'marital_single',
       'education_primary', 'education_secondary', 'education_tertiary',
       'education_unknown', 'contact_cellular', 'contact_telephone',
       'contact_unknown', 'poutcome_failure', 'poutcome_other',
       'poutcome_success', 'poutcome_unknown', 'month_apr', 'month_aug',
       'month_dec', 'month_feb', 'month_jan', 'month_jul', 'month_jun',
       'month_mar', 'month_may', 'month_nov', 'month_oct', 'month_sep',
       'default', 'housing', 'loan', 'day', 'campaign', 'pdays',
       'previous'], dtype='<U19')

In [25]:
df_trn_clean = pd.DataFrame(df_trn_transformed, columns=transformed_cols)
df_tst_clean = pd.DataFrame(df_tst_transformed, columns=transformed_cols)

In [26]:
# This will be the raw input
','.join([str(x) for x in X_trn.iloc[0].values.tolist()])

'30,admin.,married,secondary,0.0,692,1.0,0.0,cellular,11,may,290,1,-1,0,unknown'

In [27]:
# This will be step1; preprocess the raw data and prepare for inference
# The order will be like `transformed_cols`
','.join([str(x) for x in df_trn_clean.iloc[0].values.tolist()])

'-1.0307586832326059,-0.2210044429032759,0.12417163887544379,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,11.0,1.0,-1.0,0.0'

In [28]:
df_trn_clean['y'] = y_trn
df_tst_clean['y'] = y_tst

In [29]:
# Reorder the columns so that the target `y` comes first, followed by the
# features in their existing order.
df_trn_clean = df_trn_clean[['y', *df_trn_clean.columns.drop('y')]]
df_tst_clean = df_tst_clean[['y', *df_tst_clean.columns.drop('y')]]

In [30]:
df_trn_clean.shape, df_tst_clean.shape

((36168, 49), (9043, 49))

In [31]:
# Persist the fitted preprocessor to a local pickle file.
# Pickle stores a custom class by reference (module + class name), so any
# process that later loads this file must be able to resolve
# `CustomColumnTransformer` under the same module path.
Path("preprocessor.pkl").write_bytes(pickle.dumps(preprocessor))


In [32]:
# Session plus the S3 location (bucket and key prefix) for the pickled preprocessor.
sess = sagemaker.Session()
bucket_name = 'churn-bank-marketing'
prefix = 'sagemaker/preprocessors'
s3_path = f's3://{bucket_name}/{prefix}/preprocessor.pkl'

# Make sure the target bucket exists BEFORE uploading. Previously the bucket
# was only created in a later cell, so a fresh top-to-bottom run failed here.
bucket_region = sess.boto_region_name
s3_client = boto3.client('s3', region_name=bucket_region)
try:
    s3_client.head_bucket(Bucket=bucket_name)
except s3_client.exceptions.ClientError as err:
    # Anything other than "bucket does not exist" (e.g. 403 access denied) is
    # a real problem and must not be hidden.
    if err.response['Error']['Code'] not in ('404', 'NoSuchBucket'):
        raise
    create_args = {'Bucket': bucket_name}
    # us-east-1 is the only region where LocationConstraint must be omitted.
    if bucket_region != 'us-east-1':
        create_args['CreateBucketConfiguration'] = {
            'LocationConstraint': bucket_region
        }
    s3_client.create_bucket(**create_args)

# Upload the file to S3
s3_uri = sess.upload_data("preprocessor.pkl", bucket=bucket_name, key_prefix=prefix)
print(f"Preprocessor uploaded to: {s3_uri}")


Preprocessor uploaded to: s3://churn-bank-marketing/sagemaker/preprocessors/preprocessor.pkl


In [33]:
# Round trip: fetch the uploaded pickle back from S3 into a second local file.
s3 = boto3.client("s3")
s3.download_file(bucket_name, f"{prefix}/preprocessor.pkl", "preprocessor2.pkl")

# NOTE: unpickling can execute arbitrary code. Only load pickles from a bucket
# whose contents you control.
preprocessor2 = pickle.loads(Path("preprocessor2.pkl").read_bytes())

print("Preprocessor loaded from S3.")

Preprocessor loaded from S3.


In [34]:
import joblib
import pandas as pd
import json

# Load the preprocessor once at module level so the handler below can reuse it
# on every call. This rebinds the notebook-level name `preprocessor` to the
# object restored from the downloaded pickle.
preprocessor = pickle.loads(Path("preprocessor2.pkl").read_bytes())

def preprocess_inputdata(event, context):
    """Turn one raw comma-separated record into model-ready features.

    Parameters
    ----------
    event : dict
        Must contain the key ``'input'``: a comma-separated string holding the
        16 raw fields in the order listed in ``column_names`` below.
    context : object
        Unused; kept for the ``(event, context)`` handler signature.

    Returns
    -------
    dict
        ``{'statusCode': 200, 'body': <JSON string>}`` where the JSON document
        has a single key ``'transformed_data'`` holding a flat list of floats
        produced by the module-level ``preprocessor``.

    Raises
    ------
    KeyError
        If ``event`` has no ``'input'`` key.
    ValueError
        If the record does not contain exactly 16 fields, or a numeric field
        cannot be parsed as a float.
    """
    # Column order of the raw record, as used when the preprocessor was fitted.
    column_names = [
        'age', 'job', 'marital', 'education', 'default', 'balance',
        'housing', 'loan', 'contact', 'day', 'month', 'duration',
        'campaign', 'pdays', 'previous', 'poutcome'
    ]
    # Columns that arrive as text but must be numeric for the preprocessor.
    numeric_columns = ['age', 'balance', 'duration', 'default',
                       'housing', 'loan', 'day', 'campaign', 'pdays', 'previous']

    # Extract the input data from the event (it's a comma-separated string).
    # Surrounding whitespace is stripped so that "30, admin." does not produce
    # the unknown category " admin.".
    input_values = [value.strip() for value in event['input'].split(',')]

    # Validate the field count up front so the caller gets an actionable
    # message instead of a pandas shape error.
    if len(input_values) != len(column_names):
        raise ValueError(
            f"expected {len(column_names)} comma-separated fields "
            f"({', '.join(column_names)}), got {len(input_values)}"
        )

    # Single-row DataFrame with the expected column names.
    input_df = pd.DataFrame([input_values], columns=column_names)
    input_df[numeric_columns] = input_df[numeric_columns].astype(float)

    # Transform the data using the pre-fitted preprocessor.
    transformed_data = preprocessor.transform(input_df)

    # A ColumnTransformer may return a scipy sparse matrix depending on its
    # configuration; densify it so that `.flatten()` yields the feature row.
    if hasattr(transformed_data, 'toarray'):
        transformed_data = transformed_data.toarray()
    transformed_list = transformed_data.flatten().tolist()

    return {
        'statusCode': 200,
        'body': json.dumps({'transformed_data': transformed_list})
    }


In [35]:
# One raw record as a comma-separated string, in the column order the handler
# expects.
sample_input = '30,admin.,married,secondary,0.0,692,1.0,0.0,cellular,11,may,290,1,-1,0,unknown'

# Minimal stand-in for the handler's event: only the 'input' key is populated.
event = dict(input=sample_input)

# No context object is needed for a local call.
step1_transformed_data = preprocess_inputdata(event, None)
step1_transformed_data

{'statusCode': 200,
 'body': '{"transformed_data": [-1.0307586832326059, -0.2210044429032759, 0.12417163887544379, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 11.0, 1.0, -1.0, 0.0]}'}

In [36]:
# Write both splits as CSV without a header row and without the index column
# (the header-less layout is the SageMaker convention this notebook follows).
for split_frame, csv_name in (
    (df_trn_clean, 'bank_trn.csv'),
    (df_tst_clean, 'bank_tst.csv'),
):
    split_frame.to_csv(csv_name, header=False, index=False)

In [37]:
bucket_name = 'churn-bank-marketing'
s3 = boto3.client('s3')  # S3 client used for the bucket call below

# The create call below passes no location configuration, which this notebook
# relies on being valid for `us-east-1` only; stop early in any other region.
assert sess.boto_region_name == 'us-east-1'
s3.create_bucket(Bucket=bucket_name)

{'ResponseMetadata': {'RequestId': 'EX4APKT99B1V9WS2',
  'HostId': 'JskeuaeKZbpI+mYWBa4SdBr9/3zlhBWmTuTwteHun842KOXJVhJRWo2xLr9AIFOiP4GmXG8SNckeiWVmEWzwOA==',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'JskeuaeKZbpI+mYWBa4SdBr9/3zlhBWmTuTwteHun842KOXJVhJRWo2xLr9AIFOiP4GmXG8SNckeiWVmEWzwOA==',
   'x-amz-request-id': 'EX4APKT99B1V9WS2',
   'date': 'Fri, 08 Nov 2024 04:34:44 GMT',
   'location': '/churn-bank-marketing',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'Location': '/churn-bank-marketing'}

In [38]:
# Key prefix under which the training artefacts for this model are stored.
prefix = 'sagemaker/logistic-regression'

# NOTE(review): the held-out test split (bank_tst.csv) is uploaded as the
# validation channel, so hyperparameter tuning selects on the test data.
# Confirm this is intended, or carve out a separate validation split.
train_path, valid_path = (
    sess.upload_data(csv_name, bucket=bucket_name, key_prefix=prefix)
    for csv_name in ("bank_trn.csv", "bank_tst.csv")
)

print(f'train path >>> {train_path}')
print(f'validation path >>> {valid_path}')

train path >>> s3://churn-bank-marketing/sagemaker/logistic-regression/bank_trn.csv
validation path >>> s3://churn-bank-marketing/sagemaker/logistic-regression/bank_tst.csv


In [39]:
train_input = sagemaker.inputs.TrainingInput(train_path, content_type='text/csv')
validation_input = sagemaker.inputs.TrainingInput(valid_path, content_type='text/csv')

In [40]:
container = sagemaker.image_uris.retrieve("linear-learner", sess.boto_region_name)
container

'382416733822.dkr.ecr.us-east-1.amazonaws.com/linear-learner:1'

In [41]:
# Where SageMaker writes the trained model artefacts.
model_output_path = f's3://{bucket_name}/{prefix}/output'

# Generic Estimator wrapping the built-in algorithm image in `container`.
linear_est = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.m5.large',
    output_path=model_output_path,
    sagemaker_session=sess,
)

# Hyperparameters that stay fixed for every training job (not tuned).
static_hyperparameters = {
    'predictor_type': 'binary_classifier',
    'binary_classifier_model_selection_criteria': 'cross_entropy_loss',
}
linear_est.set_hyperparameters(**static_hyperparameters)

In [42]:
# Search space explored by the tuning job.
hyperparameter_ranges = dict(
    learning_rate=ContinuousParameter(0.001, 1),
    mini_batch_size=IntegerParameter(32, 128),
    l1=ContinuousParameter(0.0, 2.0),
)

# Metric the tuner optimises, and the direction of optimisation.
objective_metric_name = "validation:binary_classification_accuracy"
objective_type = "Maximize"

# Tuner around the base estimator: 20 training jobs in total, at most 3
# running at the same time.
tuner = HyperparameterTuner(
    estimator=linear_est,
    objective_metric_name=objective_metric_name,
    objective_type=objective_type,
    hyperparameter_ranges=hyperparameter_ranges,
    max_jobs=20,
    max_parallel_jobs=3,
)

# Start the tuning job with the two data channels.
data_channels = {'train': train_input, 'validation': validation_input}
tuner.fit(data_channels)

No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config
No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


..........................................................................................................................................................................!


In [43]:
tuner.latest_tuning_job.name  # tuning-job-name
# also can be found in:
# Amazon SageMaker > Training > Hyperparameter tuning jobs

'linear-learner-241108-0434'

In [44]:
# Re-attach to the tuning job by name and rebind `tuner` to the attached object.
tuning_job_name = tuner.latest_tuning_job.name
tuner = HyperparameterTuner.attach(tuning_job_name)

# Estimator for the best training job found by the tuner.
best_estimator = tuner.best_estimator()
best_estimator


2024-11-08 04:47:20 Starting - Found matching resource for reuse
2024-11-08 04:47:20 Downloading - Downloading the training image
2024-11-08 04:47:20 Training - Training image download completed. Training in progress.
2024-11-08 04:47:20 Uploading - Uploading generated training model
2024-11-08 04:47:20 Completed - Resource reused by training job: linear-learner-241108-0434-019-d08a0ca2


<sagemaker.amazon.linear_learner.LinearLearner at 0x7fb31aad98d0>

In [45]:
best_estimator.model_data

's3://churn-bank-marketing/sagemaker/logistic-regression/output/linear-learner-241108-0434-016-e4ddd842/output/model.tar.gz'

In [46]:
# Best parameters picked:
best_estimator.hyperparameters()

{'predictor_type': 'binary_classifier',
 'binary_classifier_model_selection_criteria': 'cross_entropy_loss',
 'l1': '0.05993584894521531',
 'learning_rate': '0.5585981942204319'}

In [47]:
# Endpoint settings: a single ml.m5.large instance behind a fixed, custom
# endpoint name.
deploy_settings = dict(
    initial_instance_count=1,
    instance_type="ml.m5.large",
    endpoint_name='logistic-reg-clf-churn-bank-v1',
)

# Deploy the best model found by the tuner as a real-time endpoint.
linear_predictor = best_estimator.deploy(**deploy_settings)


--------!

In [48]:
linear_predictor.endpoint_name

'logistic-reg-clf-churn-bank-v1'

In [49]:
parsed_json = json.loads(step1_transformed_data['body'])
payload = parsed_json['transformed_data']
payload

[-1.0307586832326059,
 -0.2210044429032759,
 0.12417163887544379,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 11.0,
 1.0,
 -1.0,
 0.0]

In [50]:
endpoint_name = linear_predictor.endpoint_name

# Predictor bound to the deployed endpoint. Requests are serialised as CSV; no
# deserializer is configured, so `predict` returns the raw response bytes.
csv_serializer = sagemaker.serializers.CSVSerializer()
predictor = sagemaker.predictor.Predictor(
    endpoint_name=endpoint_name,
    sagemaker_session=sess,
    serializer=csv_serializer,
)

predictor.predict(payload)

b'{"predictions": [{"score": 0.07681151479482651, "predicted_label": 0}]}'

In [51]:
step1_transformed_data

{'statusCode': 200,
 'body': '{"transformed_data": [-1.0307586832326059, -0.2210044429032759, 0.12417163887544379, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 11.0, 1.0, -1.0, 0.0]}'}

In [52]:
# Same prediction as above, but through the low-level runtime client instead
# of the SageMaker SDK Predictor.
endpoint_name = 'logistic-reg-clf-churn-bank-v1'
runtime_client = boto3.client('sagemaker-runtime')

# Recover the feature vector from the preprocessing step's JSON body.
body = json.loads(step1_transformed_data['body'])
transformed_data = body['transformed_data']

# Serialise the feature vector to CSV text, matching the content type sent below.
serializer = sagemaker.serializers.CSVSerializer()
payload = serializer.serialize(transformed_data)

response = runtime_client.invoke_endpoint(
    Body=payload,
    ContentType='text/csv',
    EndpointName=endpoint_name,
)

# The response body is a stream: read it, decode to text, parse the JSON.
raw_response = response['Body'].read()
result = json.loads(raw_response.decode())
result

{'predictions': [{'score': 0.07681151479482651, 'predicted_label': 0}]}

In [53]:
linear_predictor.delete_endpoint()
