# Talent Retention MLOps Notebook

## General settings

### Required libraries

In [23]:
import os
import shutil

from datetime import datetime
from io import StringIO, BytesIO
import json
from pathlib import Path

import boto3
from botocore.client import Config
import great_expectations as ge
import kagglehub
import pandas as pd
from pandera import Column, DataFrameSchema, Check

### General variables

In [4]:
project_root = Path(os.getcwd()).parent

### Common functions

In [5]:
from infisical_sdk import InfisicalSDKClient

def get_secret(secret_name: str) -> str:
    with open("{}/.infisical_token".format(project_root)) as f:
        token = f.read().strip()

    client = InfisicalSDKClient(
        host="http://localhost",
        token=token,
        cache_ttl=300
    )

    secret = client.secrets.get_secret_by_name(
        project_id="talent-retention-mlops",
        environment_slug="dev",
        secret_path="/",
        secret_name=secret_name
    )

    return secret.secretValue

def s3_client() -> boto3.client:
    minio_endpoint = "http://localhost:9000"
    access_key = get_secret("MINIO_ACCESS_KEY")
    secret_key = get_secret("MINIO_SECRET_KEY")

    return boto3.client(
                "s3",
                endpoint_url=minio_endpoint,
                aws_access_key_id=access_key,
                aws_secret_access_key=secret_key,
                config=Config(signature_version="s3v4"),
                region_name="us-east-1",
            )

### Downloading the dataset

In [None]:
# https://www.kaggle.com/datasets/pavansubhasht/ibm-hr-analytics-attrition-dataset/data
# pip install kagglehub[pandas-datasets]

dataset_name = "pavansubhasht/ibm-hr-analytics-attrition-dataset"

path = kagglehub.dataset_download(dataset_name)
destination = "{}/data/bronze/".format(project_root)

os.makedirs(destination, exist_ok=True)

for file_name in os.listdir(path):
    shutil.move(os.path.join(path, file_name), os.path.join(destination, file_name))

print(f"Dataset stored in: {destination}")

Downloading from https://www.kaggle.com/api/v1/datasets/download/pavansubhasht/ibm-hr-analytics-attrition-dataset?dataset_version_number=1...


100%|██████████| 50.1k/50.1k [00:00<00:00, 620kB/s]

Extracting files...
Dataset stored in: /code/talent-retention-mlops/data/bronze/





### Uploading dataset to MinIO

In [None]:
bucket_name = "datasets"
file_path = "{}/data/bronze/WA_Fn-UseC_-HR-Employee-Attrition.csv".format(project_root)
object_name = "bronze/WA_Fn-UseC_-HR-Employee-Attrition.csv"

client = s3_client()

existing_buckets = [bucket["Name"] for bucket in client.list_buckets()["Buckets"]]
if bucket_name not in existing_buckets:
    client.create_bucket(Bucket=bucket_name)

client.upload_file(
    Filename = file_path, 
    Bucket = bucket_name,
    Key = object_name
    )

Archivo subido como 'bronze/WA_Fn-UseC_-HR-Employee-Attrition.csv' en el bucket 'datasets'


## Data validation

### Load dataset from MinIO into pandas dataframe

In [None]:

bucket_name = "datasets"
object_name = "bronze/WA_Fn-UseC_-HR-Employee-Attrition.csv"

client = s3_client()
response = client.get_object(
    Bucket=bucket_name,
    Key=object_name
    )

csv_content = response['Body'].read().decode('utf-8')
df = pd.read_csv(StringIO(csv_content))
df.head()


Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


### Running basic validations with great expectation library

In [None]:
gdf = ge.from_pandas(df)

gdf.expect_column_to_exist("Age")
gdf.expect_column_values_to_not_be_null("Age")
gdf.expect_column_values_to_be_between("Age", min_value=18, max_value=65)

gdf.expect_column_to_exist("Attrition")
gdf.expect_column_values_to_be_in_set("Attrition", ["Yes", "No"])

results = gdf.validate()

### Curating the dataframe using pandera library

In [None]:
if not results["success"]:
    schema = DataFrameSchema({
        "Age": Column(int, Check.in_range(18, 65), nullable=False),
        "Attrition": Column(str, Check.isin(["Yes", "No"]))
    })

    df = schema.validate(df)
    print("Data validation failed. Please check the data.")

### Saving curated dataframe

#### on local

In [None]:
df.to_csv("{}/data/silver/WA_Fn-UseC_-HR-Employee-Attrition_clean.csv".format(project_root), index=False)

#### on MinIO

In [None]:
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
bucket_name = "datasets"
object_name = f"silver/WA_Fn-UseC_-HR-Employee-Attrition_clean_{timestamp}.csv"
csv_buffer = BytesIO()
df.to_csv(csv_buffer, index=False)

client = s3_client()
client.put_object(
    Bucket = bucket_name,
    Key = object_name,
    Body = csv_buffer.getvalue(),
    ContentType = "text/csv"
)

{'ResponseMetadata': {'RequestId': '183E441B0E4D1319',
  'HostId': 'dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'accept-ranges': 'bytes',
   'content-length': '0',
   'etag': '"699b02f7f57c2d38871d6610b3716b7d"',
   'server': 'MinIO',
   'strict-transport-security': 'max-age=31536000; includeSubDomains',
   'vary': 'Origin, Accept-Encoding',
   'x-amz-checksum-crc32': 'g7bRtw==',
   'x-amz-id-2': 'dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8',
   'x-amz-request-id': '183E441B0E4D1319',
   'x-content-type-options': 'nosniff',
   'x-ratelimit-limit': '1148',
   'x-ratelimit-remaining': '1148',
   'x-xss-protection': '1; mode=block',
   'date': 'Sat, 10 May 2025 20:23:08 GMT'},
  'RetryAttempts': 0},
 'ETag': '"699b02f7f57c2d38871d6610b3716b7d"',
 'ChecksumCRC32': 'g7bRtw=='}

### Saving JSON file containing results report

#### on local

In [None]:
with open("validation_report.json", "w") as f:
    json.dump(results.to_json_dict(), f, indent=2)

#### on MinIO

In [None]:
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
bucket_name = "reports"
object_name = f"training/validation/report_{timestamp}.json"

client = s3_client()

json_data = json.dumps(results.to_json_dict(), indent=2)

client.put_object(
    Bucket = bucket_name,
    Key = object_name,
    Body = BytesIO(json_data.encode('utf-8')),
    ContentType = 'application/json'
)

{'ResponseMetadata': {'RequestId': '183E42CAA658BD37',
  'HostId': 'dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'accept-ranges': 'bytes',
   'content-length': '0',
   'etag': '"dc12ecbcbcffc97742658b938b2e6cab"',
   'server': 'MinIO',
   'strict-transport-security': 'max-age=31536000; includeSubDomains',
   'vary': 'Origin, Accept-Encoding',
   'x-amz-checksum-crc32': '7i5K3w==',
   'x-amz-id-2': 'dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8',
   'x-amz-request-id': '183E42CAA658BD37',
   'x-content-type-options': 'nosniff',
   'x-ratelimit-limit': '1148',
   'x-ratelimit-remaining': '1148',
   'x-xss-protection': '1; mode=block',
   'date': 'Sat, 10 May 2025 19:59:03 GMT'},
  'RetryAttempts': 0},
 'ETag': '"dc12ecbcbcffc97742658b938b2e6cab"',
 'ChecksumCRC32': '7i5K3w=='}

## Data preprocessing

### Logistic Regression preprocessing

#### Read curated dataframe !!

In [284]:
client = s3_client()
response = client.get_object(
    Bucket = "datasets",
    Key = "silver/WA_Fn-UseC_-HR-Employee-Attrition_clean_20250510-152308.csv"
)

df = pd.read_csv(BytesIO(response['Body'].read()))
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [285]:
target_col = "Attrition"
X = df.drop(columns=[target_col])
y = df[target_col].map({"No": 0, "Yes": 1})

In [286]:
# Ordinal encoder for Random Forest, XGBoost, LightGBM (fake order does not matter)
# One hot encoder for Regresión lineal, SVM, Random Forest, XGBoost, LightGBM

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

# Categorical features
categorical_features = X.select_dtypes(include="object").columns.tolist()

binary_features = [feature for feature in categorical_features if X[feature].nunique() == 2]
for feature in binary_features:
    le = LabelEncoder()
    X[feature] = le.fit_transform(X[feature])

multicategorical_features = [feature for feature in categorical_features if feature not in binary_features]
categorical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

# Numerical features
numerical_features = [
    feature for feature in X.select_dtypes(include=["int64", "float64"]).columns
    if feature not in binary_features
]
numerical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Preprocessor
preprocessor = ColumnTransformer([
    ("numeric", numerical_pipeline, numerical_features),
    ("category", categorical_pipeline, multicategorical_features)
])
X_processed = preprocessor.fit_transform(X)

In [287]:
# Interpretabilidad, Feature Importance, Postprocesamiento, Consistencia, Debugging
encoder = preprocessor.named_transformers_["category"]["encoder"]
onehot_feature_names = encoder.get_feature_names_out(multicategorical_features)

final_feature_names = list(binary_features) + list(numerical_features) + list(onehot_feature_names)


In [288]:
import numpy as np
X_total_transformed = np.concatenate([X[binary_features].values, X_processed], axis=1)
X = pd.DataFrame(X_total_transformed, columns=final_feature_names)
X.head()

Unnamed: 0,Gender,OverTime,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,...,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,Over18_Y
0,0.0,1.0,0.44635,0.742527,-1.010909,-0.891688,0.0,-1.701283,-0.660531,1.383138,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0
1,1.0,0.0,1.322365,-1.297775,-0.14715,-1.868426,0.0,-1.699621,0.254625,-0.240677,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
2,1.0,1.0,0.008343,1.414363,-0.887515,-0.891688,0.0,-1.696298,1.169781,1.284725,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
3,0.0,1.0,-0.429664,1.461466,-0.764121,1.061787,0.0,-1.694636,1.169781,-0.486709,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
4,1.0,0.0,-1.086676,-0.524295,-0.887515,-1.868426,0.0,-1.691313,-1.575686,-1.274014,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0


#### Saving feature names for Logistic Regression

In [289]:
import json

with open("preprocessor_feature_names_logreg.json", "w") as f:
    json.dump(final_feature_names, f, indent=2)


In [290]:
bucket_name = "artifacts"
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
client = s3_client()

In [291]:
object_name = "preprocessor/logisticregression/{}/feature_names.json".format(timestamp)
client.put_object(
    Bucket = bucket_name,
    Key = object_name,
    Body=BytesIO(json.dumps(final_feature_names, indent=2).encode("utf-8")),
    ContentType="application/json"
)

object_name = "preprocessor/logisticregression/latest/feature_names.json"
client.put_object(
    Bucket = bucket_name,
    Key = object_name,
    Body=BytesIO(json.dumps(final_feature_names, indent=2).encode("utf-8")),
    ContentType="application/json"
)

{'ResponseMetadata': {'RequestId': '183E55532F3A88F0',
  'HostId': 'dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'accept-ranges': 'bytes',
   'content-length': '0',
   'etag': '"8a85cc241d637f3c5344b9a35e0efd5d"',
   'server': 'MinIO',
   'strict-transport-security': 'max-age=31536000; includeSubDomains',
   'vary': 'Origin, Accept-Encoding',
   'x-amz-checksum-crc32': 'K6eTBg==',
   'x-amz-id-2': 'dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8',
   'x-amz-request-id': '183E55532F3A88F0',
   'x-content-type-options': 'nosniff',
   'x-ratelimit-limit': '1148',
   'x-ratelimit-remaining': '1148',
   'x-xss-protection': '1; mode=block',
   'date': 'Sun, 11 May 2025 01:38:41 GMT'},
  'RetryAttempts': 0},
 'ETag': '"8a85cc241d637f3c5344b9a35e0efd5d"',
 'ChecksumCRC32': 'K6eTBg=='}

#### Saving joblib for Logistic Regression

In [292]:

import joblib

buffer = BytesIO()
joblib.dump(preprocessor, buffer)

object_name = "preprocessor/logisticregression/{}/preprocessor.joblib".format(timestamp)
buffer.seek(0)
client.put_object(
    Bucket = bucket_name,
    Key = object_name,
    Body = buffer,
    ContentType = "application/octet-stream"
)

object_name = "preprocessor/logisticregression/latest/preprocessor.joblib"
buffer.seek(0)
client.put_object(
    Bucket = bucket_name,
    Key = object_name,
    Body = buffer,
    ContentType = "application/octet-stream"
)

{'ResponseMetadata': {'RequestId': '183E55533169C516',
  'HostId': 'dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'accept-ranges': 'bytes',
   'content-length': '0',
   'etag': '"6cf68ec23e9a82253cf556d5cdf1eaea"',
   'server': 'MinIO',
   'strict-transport-security': 'max-age=31536000; includeSubDomains',
   'vary': 'Origin, Accept-Encoding',
   'x-amz-checksum-crc32': 'dbnmvA==',
   'x-amz-id-2': 'dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8',
   'x-amz-request-id': '183E55533169C516',
   'x-content-type-options': 'nosniff',
   'x-ratelimit-limit': '1148',
   'x-ratelimit-remaining': '1148',
   'x-xss-protection': '1; mode=block',
   'date': 'Sun, 11 May 2025 01:38:41 GMT'},
  'RetryAttempts': 0},
 'ETag': '"6cf68ec23e9a82253cf556d5cdf1eaea"',
 'ChecksumCRC32': 'dbnmvA=='}

#### Saving data for Logistic Regression

In [293]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.2, random_state = 42
)

numpy_files = {
    "X_train.npy": X_train,
    "y_train.npy": y_train,
    "X_test.npy": X_test,
    "y_test.npy": y_test
}

for file_name, data in numpy_files.items():
    buffer = BytesIO()
    np.save(buffer, data)
    
    object_name = "preprocessor/logisticregression/{}/{}".format(timestamp, file_name)
    buffer.seek(0)
    client.put_object(
        Bucket = bucket_name,
        Key = object_name,
        Body = buffer,
        ContentType = "application/octet-stream"
    )

    object_name = "preprocessor/logisticregression/latest/{}".format(file_name)
    buffer.seek(0)
    client.put_object(
        Bucket = bucket_name,
        Key = object_name,
        Body = buffer,
        ContentType = "application/octet-stream"
    )

In [294]:
df_train = pd.DataFrame(X_train, columns=final_feature_names)
df_train["target"] = y_train
df_test = pd.DataFrame(X_test, columns=final_feature_names)
df_test["target"] = y_test

csv_files = {
    "data_train.csv": df_train,
    "data_test.csv": df_test
}

for file_name, df in csv_files.items():
    csv_buffer = BytesIO()
    df.to_csv(csv_buffer, index=False)

    object_name = "preprocessor/logisticregression/{}/{}".format(timestamp, file_name)
    csv_buffer.seek(0)
    client.put_object(
        Bucket = bucket_name,
        Key = object_name,
        Body = csv_buffer,
        ContentType = "text/csv"
    )

    object_name = "preprocessor/logisticregression/latest/{}".format(file_name)
    csv_buffer.seek(0)
    client.put_object(
        Bucket = bucket_name,
        Key = object_name,
        Body = csv_buffer,
        ContentType = "text/csv"
    )

### Random Forest & XGBoost preprocessing

#### Read curated dataframe !!

In [295]:
client = s3_client()
response = client.get_object(
    Bucket = "datasets",
    Key = "silver/WA_Fn-UseC_-HR-Employee-Attrition_clean_20250510-152308.csv"
)

df = pd.read_csv(BytesIO(response['Body'].read()))
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [296]:
target_col = "Attrition"
X = df.drop(columns=[target_col])
y = df[target_col].map({"No": 0, "Yes": 1})

In [297]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Categorical features
categorical_features = X.select_dtypes(include="object").columns.tolist()
categorical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1))
])

# Numerical features
numerical_features = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
numerical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median"))
])

# Preprocessor
preprocessor = ColumnTransformer([
    ("numeric", numerical_pipeline, numerical_features),
    ("category", categorical_pipeline, categorical_features)
])

X_processed = preprocessor.fit_transform(X)

In [298]:
final_feature_names = list(numerical_features) + list(categorical_features)

In [299]:
X = pd.DataFrame(X_processed, columns=final_feature_names)
X.head()

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,YearsSinceLastPromotion,YearsWithCurrManager,BusinessTravel,Department,EducationField,Gender,JobRole,MaritalStatus,Over18,OverTime
0,41.0,1102.0,1.0,2.0,1.0,1.0,2.0,94.0,3.0,2.0,...,0.0,5.0,2.0,2.0,1.0,0.0,7.0,2.0,0.0,1.0
1,49.0,279.0,8.0,1.0,1.0,2.0,3.0,61.0,2.0,2.0,...,1.0,7.0,1.0,1.0,1.0,1.0,6.0,1.0,0.0,0.0
2,37.0,1373.0,2.0,2.0,1.0,4.0,4.0,92.0,2.0,1.0,...,0.0,0.0,2.0,1.0,4.0,1.0,2.0,2.0,0.0,1.0
3,33.0,1392.0,3.0,4.0,1.0,5.0,4.0,56.0,3.0,1.0,...,3.0,0.0,1.0,1.0,1.0,0.0,6.0,1.0,0.0,1.0
4,27.0,591.0,2.0,1.0,1.0,7.0,1.0,40.0,3.0,1.0,...,2.0,2.0,2.0,1.0,3.0,1.0,2.0,1.0,0.0,0.0


#### Saving feature names for Random Forest

In [300]:
import json

with open("preprocessor_feature_names_randomforest.json", "w") as f:
    json.dump(final_feature_names, f, indent=2)


In [301]:
bucket_name = "artifacts"
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
client = s3_client()

In [302]:
object_name = "preprocessor/randomforest/{}/feature_names.json".format(timestamp)
client.put_object(
    Bucket = bucket_name,
    Key = object_name,
    Body = BytesIO(json.dumps(final_feature_names, indent=2).encode("utf-8")),
    ContentType="application/json"
)

object_name = "preprocessor/randomforest/latest/feature_names.json"
client.put_object(
    Bucket = bucket_name,
    Key = object_name,
    Body = BytesIO(json.dumps(final_feature_names, indent=2).encode("utf-8")),
    ContentType="application/json"
)

{'ResponseMetadata': {'RequestId': '183E555369F45D97',
  'HostId': 'dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'accept-ranges': 'bytes',
   'content-length': '0',
   'etag': '"a48efcf5af42ca90c2a2214748eba719"',
   'server': 'MinIO',
   'strict-transport-security': 'max-age=31536000; includeSubDomains',
   'vary': 'Origin, Accept-Encoding',
   'x-amz-checksum-crc32': 'rVMPFQ==',
   'x-amz-id-2': 'dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8',
   'x-amz-request-id': '183E555369F45D97',
   'x-content-type-options': 'nosniff',
   'x-ratelimit-limit': '1148',
   'x-ratelimit-remaining': '1148',
   'x-xss-protection': '1; mode=block',
   'date': 'Sun, 11 May 2025 01:38:42 GMT'},
  'RetryAttempts': 0},
 'ETag': '"a48efcf5af42ca90c2a2214748eba719"',
 'ChecksumCRC32': 'rVMPFQ=='}

#### Saving joblib for Random Forest

In [303]:
import joblib

buffer = BytesIO()
joblib.dump(preprocessor, buffer)

object_name = "preprocessor/randomforest/{}/preprocessor.joblib".format(timestamp)
buffer.seek(0)
client.put_object(
    Bucket = bucket_name,
    Key = object_name,
    Body = buffer,
    ContentType = "application/octet-stream"
)

object_name = "preprocessor/randomforest/latest/preprocessor.joblib"
buffer.seek(0)
client.put_object(
    Bucket = bucket_name,
    Key = object_name,
    Body = buffer,
    ContentType = "application/octet-stream"
)

{'ResponseMetadata': {'RequestId': '183E55536DF9676A',
  'HostId': 'dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'accept-ranges': 'bytes',
   'content-length': '0',
   'etag': '"e899f8a26af1f0df90baa074cd3d3b67"',
   'server': 'MinIO',
   'strict-transport-security': 'max-age=31536000; includeSubDomains',
   'vary': 'Origin, Accept-Encoding',
   'x-amz-checksum-crc32': 'rhSawQ==',
   'x-amz-id-2': 'dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8',
   'x-amz-request-id': '183E55536DF9676A',
   'x-content-type-options': 'nosniff',
   'x-ratelimit-limit': '1148',
   'x-ratelimit-remaining': '1148',
   'x-xss-protection': '1; mode=block',
   'date': 'Sun, 11 May 2025 01:38:42 GMT'},
  'RetryAttempts': 0},
 'ETag': '"e899f8a26af1f0df90baa074cd3d3b67"',
 'ChecksumCRC32': 'rhSawQ=='}

#### Saving data for Random Forest

In [304]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.2, random_state = 42
)

numpy_files = {
    "X_train.npy": X_train,
    "y_train.npy": y_train,
    "X_test.npy": X_test,
    "y_test.npy": y_test
}

for file_name, data in numpy_files.items():
    buffer = BytesIO()
    np.save(buffer, data)
    
    object_name = "preprocessor/randomforest/{}/{}".format(timestamp, file_name)
    buffer.seek(0)
    client.put_object(
        Bucket = bucket_name,
        Key = object_name,
        Body = buffer,
        ContentType = "application/octet-stream"
    )

    object_name = "preprocessor/randomforest/latest/{}".format(file_name)
    buffer.seek(0)
    client.put_object(
        Bucket = bucket_name,
        Key = object_name,
        Body = buffer,
        ContentType = "application/octet-stream"
    )

In [305]:
df_train = pd.DataFrame(X_train, columns=final_feature_names)
df_train["target"] = y_train
df_test = pd.DataFrame(X_test, columns=final_feature_names)
df_test["target"] = y_test

csv_files = {
    "data_train.csv": df_train,
    "data_test.csv": df_test
}

for file_name, df in csv_files.items():
    csv_buffer = BytesIO()
    df.to_csv(csv_buffer, index=False)
    
    object_name = "preprocessor/randomforest/{}/{}".format(timestamp, file_name)
    csv_buffer.seek(0)
    client.put_object(
        Bucket = bucket_name,
        Key = object_name,
        Body = csv_buffer,
        ContentType = "text/csv"
    )
    
    object_name = "preprocessor/randomforest/latest/{}".format(file_name)
    csv_buffer.seek(0)
    client.put_object(
        Bucket = bucket_name,
        Key = object_name,
        Body = csv_buffer,
        ContentType = "text/csv"
    )

#### Saving feature names for XGBoost

In [306]:
import json

with open("preprocessor_feature_names_xgboost.json", "w") as f:
    json.dump(final_feature_names, f, indent=2)

In [307]:
bucket_name = "artifacts"
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
client = s3_client()


In [308]:
object_name = "preprocessor/xgboost/{}/feature_names.json".format(timestamp)
client.put_object(
    Bucket = bucket_name,
    Key = object_name,
    Body = BytesIO(json.dumps(final_feature_names, indent=2).encode("utf-8")),
    ContentType="application/json"
)

object_name = "preprocessor/xgboost/latest/feature_names.json"
client.put_object(
    Bucket = bucket_name,
    Key = object_name,
    Body = BytesIO(json.dumps(final_feature_names, indent=2).encode("utf-8")),
    ContentType="application/json"
)

{'ResponseMetadata': {'RequestId': '183E55538C4D166E',
  'HostId': 'dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'accept-ranges': 'bytes',
   'content-length': '0',
   'etag': '"a48efcf5af42ca90c2a2214748eba719"',
   'server': 'MinIO',
   'strict-transport-security': 'max-age=31536000; includeSubDomains',
   'vary': 'Origin, Accept-Encoding',
   'x-amz-checksum-crc32': 'rVMPFQ==',
   'x-amz-id-2': 'dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8',
   'x-amz-request-id': '183E55538C4D166E',
   'x-content-type-options': 'nosniff',
   'x-ratelimit-limit': '1148',
   'x-ratelimit-remaining': '1148',
   'x-xss-protection': '1; mode=block',
   'date': 'Sun, 11 May 2025 01:38:42 GMT'},
  'RetryAttempts': 0},
 'ETag': '"a48efcf5af42ca90c2a2214748eba719"',
 'ChecksumCRC32': 'rVMPFQ=='}

#### Saving joblib for XGBoost

In [309]:
import joblib

buffer = BytesIO()
joblib.dump(preprocessor, buffer)

object_name = "preprocessor/xgboost/{}/preprocessor.joblib".format(timestamp)
buffer.seek(0)
client.put_object(
    Bucket = bucket_name,
    Key = object_name,
    Body = buffer,
    ContentType = "application/octet-stream"
)

object_name = "preprocessor/xgboost/latest/preprocessor.joblib"
buffer.seek(0)
client.put_object(
    Bucket = bucket_name,
    Key = object_name,
    Body = buffer,
    ContentType = "application/octet-stream"
)

{'ResponseMetadata': {'RequestId': '183E555390454813',
  'HostId': 'dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'accept-ranges': 'bytes',
   'content-length': '0',
   'etag': '"e899f8a26af1f0df90baa074cd3d3b67"',
   'server': 'MinIO',
   'strict-transport-security': 'max-age=31536000; includeSubDomains',
   'vary': 'Origin, Accept-Encoding',
   'x-amz-checksum-crc32': 'rhSawQ==',
   'x-amz-id-2': 'dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8',
   'x-amz-request-id': '183E555390454813',
   'x-content-type-options': 'nosniff',
   'x-ratelimit-limit': '1148',
   'x-ratelimit-remaining': '1148',
   'x-xss-protection': '1; mode=block',
   'date': 'Sun, 11 May 2025 01:38:42 GMT'},
  'RetryAttempts': 0},
 'ETag': '"e899f8a26af1f0df90baa074cd3d3b67"',
 'ChecksumCRC32': 'rhSawQ=='}

#### Saving data for XGBoost

In [310]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.2, random_state = 42
)

numpy_files = {
    "X_train.npy": X_train,
    "y_train.npy": y_train,
    "X_test.npy": X_test,
    "y_test.npy": y_test
}

for file_name, data in numpy_files.items():
    buffer = BytesIO()
    np.save(buffer, data)

    object_name = "preprocessor/xgboost/{}/{}".format(timestamp, file_name)
    buffer.seek(0)
    client.put_object(
        Bucket = bucket_name,
        Key = object_name,
        Body = buffer,
        ContentType = "application/octet-stream"
    )

    object_name = "preprocessor/xgboost/latest/{}".format(file_name)
    buffer.seek(0)
    client.put_object(
        Bucket = bucket_name,
        Key = object_name,
        Body = buffer,
        ContentType = "application/octet-stream"
    )

In [311]:
df_train = pd.DataFrame(X_train, columns=final_feature_names)
df_train["target"] = y_train
df_test = pd.DataFrame(X_test, columns=final_feature_names)
df_test["target"] = y_test

csv_files = {
    "data_train.csv": df_train,
    "data_test.csv": df_test
}

for file_name, df in csv_files.items():
    csv_buffer = BytesIO()
    df.to_csv(csv_buffer, index=False)
    
    object_name = "preprocessor/xgboost/{}/{}".format(timestamp, file_name)
    csv_buffer.seek(0)
    client.put_object(
        Bucket = bucket_name,
        Key = object_name,
        Body = csv_buffer,
        ContentType = "text/csv"
    )

    object_name = "preprocessor/xgboost/latest/{}".format(file_name)
    csv_buffer.seek(0)
    client.put_object(
        Bucket = bucket_name,
        Key = object_name,
        Body = csv_buffer,
        ContentType = "text/csv"
    )

## Evaluation

In [312]:
bucket_name = "artifacts"
object_name = "bronze/WA_Fn-UseC_-HR-Employee-Attrition.csv"

client = s3_client()
response = client.get_object(
    Bucket=bucket_name,
    Key=object_name
    )

csv_content = response['Body'].read().decode('utf-8')
df = pd.read_csv(StringIO(csv_content))

NoSuchKey: An error occurred (NoSuchKey) when calling the GetObject operation: The specified key does not exist.

# Improvements

Add model comparator to add interpretability, feature importance analysis, postprocesamiento, consistency, debugging
