In [26]:
import os
from sagemaker import get_execution_role
from dotenv import load_dotenv
from load_data import load_data
from split_data import split_data
import importlib
from save_model_to_s3 import save_model_to_s3
from deploy_model_endpoint import deploy_model
from finalize_and_save_model import finalize_and_save_model
from delete_sagemaker_endpoint import delete_sagemaker_endpoint
from ydata_profiling import ProfileReport
import shap
import pandas as pd

In [27]:
# Variables Setup Stage
# Read the local .env file and resolve the SageMaker execution role for this session.
load_dotenv(".env")
role = get_execution_role()

# Env variables
# os.getenv returns None for any variable that is not set, so the optional
# settings below may legitimately be None (check the printed summary).
data_location_s3 = os.getenv("data_location_s3")
algorithm_choice = os.getenv("algorithm_choice")
target = os.getenv("target")
endpoint_name = os.getenv("endpoint_name")
model_name = os.getenv("model_name")
# The configured value is used as-is as the data location (no scheme prefix is added).
data_location = data_location_s3
instance_type = os.getenv("instance_type")
# Index os.environ directly: a missing variable then raises a KeyError that names
# "model_instance_count", instead of the opaque TypeError produced by int(None).
model_instance_count = int(os.environ["model_instance_count"])
image_uri = os.getenv("ecr_repo_uri")
tuning_metric = os.getenv("tuning_metric")

# Echo the resolved configuration so unset (None) values are visible immediately.
print(
    data_location_s3,
    algorithm_choice,
    target,
    endpoint_name,
    model_name,
    data_location,
    instance_type,
    image_uri,
    tuning_metric,
)

sagemaker.config INFO - Not applying SDK defaults from location: /Library/Application Support/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /Users/lanre.bakare/Library/Application Support/sagemaker/config.yaml
ethan_data.csv classification y classification-proba-endpoint banking-classification ethan_data.csv ml.m4.xlarge None None


In [28]:
# Load the raw dataset from the configured location.
# NOTE(review): load_data is a project helper; presumably it returns a pandas
# DataFrame, since .head() is called on the result — confirm.
df = load_data(data_location)
# Preview the first rows as a quick sanity check of the columns and values.
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [29]:
# Remove the 'poutcome' column from the working frame.
# errors="ignore" keeps this cell idempotent: because it rebinds `df`, running it a
# second time would otherwise raise KeyError once the column is already gone.
df = df.drop(columns=["poutcome"], errors="ignore")

In [30]:
# Collect the names of the columns stored as strings or booleans; these are the
# candidates for categorical encoding in the following cells.
categorical_frame = df.select_dtypes(include=["object", "bool"])
cat_features = categorical_frame.columns.values
print(cat_features)

['job' 'marital' 'education' 'default' 'housing' 'loan' 'contact' 'month'
 'y']


In [31]:
# Apply one-hot encoding to the nominal categorical columns.
# The dummy dtype is pinned to uint8: newer pandas releases default to bool, which
# would turn the 0/1 indicator columns shown in this notebook into True/False in
# the frame and in the exported CSV.
data_encoded = pd.get_dummies(
    df,
    columns=["job", "marital", "contact", "default", "housing", "loan"],
    dtype="uint8",
)

In [32]:
from sklearn.preprocessing import LabelEncoder

# One encoder instance is created here; the following cells refit this same object
# on other columns, so it only remembers the mapping of the last column it was fit on.
label_encoder = LabelEncoder()

# Replace the string labels of the target column 'y' with integer codes.
encoded_target = label_encoder.fit_transform(data_encoded["y"])
data_encoded["y"] = encoded_target

In [33]:
# Replace the month labels with integer codes, refitting the shared encoder on this column.
# NOTE(review): LabelEncoder numbers labels in sorted order, so the codes are
# alphabetical rather than calendar order (the preview rows show 'may' encoded as 8).
# Confirm this is acceptable for the downstream model.
month_codes = label_encoder.fit_transform(data_encoded["month"])
data_encoded["month"] = month_codes

In [34]:
# Replace the education labels with integer codes, refitting the shared encoder on this column.
# NOTE(review): codes follow sorted label order; the preview rows show 'secondary' as 1,
# 'tertiary' as 2 and 'unknown' as 3, i.e. 'unknown' ranks highest — confirm that is intended.
education_codes = label_encoder.fit_transform(data_encoded["education"])
data_encoded["education"] = education_codes

In [35]:
# Lift the column display limit so wide frames are rendered with every column visible.
pd.options.display.max_columns = None

In [36]:
# Preview the first rows of the encoded frame to check the dummy and label-encoded columns.
data_encoded.head()

Unnamed: 0,age,education,balance,day,month,duration,campaign,pdays,previous,y,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,job_unknown,marital_divorced,marital_married,marital_single,contact_cellular,contact_telephone,contact_unknown,default_no,default_yes,housing_no,housing_yes,loan_no,loan_yes
0,58,2,2143,5,8,261,1,-1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,1,1,0
1,44,1,29,5,8,151,1,-1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,1,0,0,1,1,0
2,33,1,2,5,8,76,1,-1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,1
3,47,3,1506,5,8,92,1,-1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,1,1,0
4,33,3,1,5,8,198,1,-1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,1,0,1,0,1,0


In [37]:
# Drop the dummy columns that only encode the 'unknown' category.
unknown_columns = ["job_unknown", "contact_unknown"]

# Rebind instead of mutating in place, and tolerate already-removed columns, so that
# re-running this cell does not raise KeyError.
data_encoded = data_encoded.drop(columns=unknown_columns, errors="ignore")

In [38]:
# Preview again to confirm the 'unknown' dummy columns are no longer present.
data_encoded.head()

Unnamed: 0,age,education,balance,day,month,duration,campaign,pdays,previous,y,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,marital_divorced,marital_married,marital_single,contact_cellular,contact_telephone,default_no,default_yes,housing_no,housing_yes,loan_no,loan_yes
0,58,2,2143,5,8,261,1,-1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0
1,44,1,29,5,8,151,1,-1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,1,1,0
2,33,1,2,5,8,76,1,-1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,1
3,47,3,1506,5,8,92,1,-1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1,0
4,33,3,1,5,8,198,1,-1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,1,0


In [39]:
# Summarise the encoded frame: column names, non-null counts and dtypes.
data_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 32 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   age                45211 non-null  int64
 1   education          45211 non-null  int64
 2   balance            45211 non-null  int64
 3   day                45211 non-null  int64
 4   month              45211 non-null  int64
 5   duration           45211 non-null  int64
 6   campaign           45211 non-null  int64
 7   pdays              45211 non-null  int64
 8   previous           45211 non-null  int64
 9   y                  45211 non-null  int64
 10  job_admin.         45211 non-null  uint8
 11  job_blue-collar    45211 non-null  uint8
 12  job_entrepreneur   45211 non-null  uint8
 13  job_housemaid      45211 non-null  uint8
 14  job_management     45211 non-null  uint8
 15  job_retired        45211 non-null  uint8
 16  job_self-employed  45211 non-null  uint8
 17  job_services

In [40]:
# Show (rows, columns) of the encoded frame before separating features and target.
data_encoded.shape

(45211, 32)

In [42]:
# Separate the encoded target from the feature columns.
target_column = "y"
y = data_encoded[target_column]
X = data_encoded.drop(columns=[target_column])

In [43]:
# Report the feature-matrix and target shapes; the row counts should agree.
print(f"{X.shape} {y.shape}")

(45211, 31) (45211,)


In [44]:
from imblearn.over_sampling import SMOTE

# Oversample with SMOTE so both classes of y end up equally represented; random_state
# is fixed so the synthetic samples are reproducible across runs.
# NOTE(review): resampling is applied to the full X/y and no train/test split is visible
# before this point. If evaluation data is later carved out of the resampled frame,
# synthetic points will leak into it — confirm where the split happens.
# NOTE(review): X contains one-hot and label-encoded categorical columns, and SMOTE
# interpolates between samples; consider whether SMOTENC is the better fit — TODO confirm.
smote = SMOTE(random_state=123)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [45]:
# Rebuild a single frame from the resampled features and target, with the target
# attached as the last column under the name 'y'.
resampled_target = pd.Series(y_resampled, name="y")
resampled_data = pd.concat([X_resampled, resampled_target], axis="columns")

In [46]:
# Class balance after resampling: the encoded target uses 1 for 'yes' and 0 for 'no'.
target_values = resampled_data["y"]

# Number of rows overall and per class
total_count = len(resampled_data)
yes_count = target_values.eq(1).sum()
no_count = target_values.eq(0).sum()

# Share of each class, expressed as a percentage of all rows
percentage_yes = (yes_count / total_count) * 100
percentage_no = (no_count / total_count) * 100

print("After sampling")
print("Percentage of 'yes' in 'y':", percentage_yes)
print("Percentage of 'no' in 'y':", percentage_no)

After sampling
Percentage of 'yes' in 'y': 50.0
Percentage of 'no' in 'y': 50.0


In [48]:
# Persist the balanced, encoded dataset next to the notebook; the index is left out of the file.
cleaned_data_path = "cleaned_ethan_data.csv"
resampled_data.to_csv(cleaned_data_path, index=False)