In [31]:
from sagemaker.predictor import Predictor
import sagemaker
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer
import pandas as pd

In [32]:
def _report_missing(frame):
    """Print the count and percentage of missing values for every column that has any."""
    missing_values = frame.isnull().sum()

    # Keep only the columns that actually contain missing values.
    columns_with_missing = missing_values[missing_values > 0]

    print("Valores faltantes por columna:")
    print(columns_with_missing)

    # Percentages give more context than raw counts.
    percent_missing = (columns_with_missing / len(frame)) * 100
    print("\nPorcentaje de valores faltantes por columna:")
    print(percent_missing)


def clean_data(data):
    """Impute missing categorical values with each column's most frequent value.

    A missing-value report is printed before and after imputation.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least the columns 'Sex', 'Housing',
        'Saving accounts', 'Checking account' and 'Purpose'.

    Returns
    -------
    pandas.DataFrame
        A new frame with the categorical columns imputed. The caller's frame
        is left untouched, so re-running the pipeline on the same input
        always starts from the same data.
    """
    categorical_cols = ['Sex', 'Housing', 'Saving accounts', 'Checking account', 'Purpose']

    _report_missing(data)

    # Work on a copy: assigning into `data` directly would mutate the caller's frame.
    cleaned = data.copy()
    cat_imputer = SimpleImputer(strategy='most_frequent')
    cleaned[categorical_cols] = cat_imputer.fit_transform(cleaned[categorical_cols])

    # Second report shows what (if anything) is still missing after imputation.
    _report_missing(cleaned)

    return cleaned
    
def normalize_data_X(X, encoder=None, scaler=None):
    """One-hot encode the categorical columns and min-max scale the numeric ones.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing the categorical columns 'Sex', 'Housing',
        'Saving accounts', 'Checking account', 'Purpose' and the numeric
        columns 'Age', 'Job', 'Credit amount', 'Duration'.
    encoder : fitted encoder, optional
        Already-fitted encoder exposing ``transform`` and
        ``get_feature_names_out`` and producing dense output. When omitted, a
        new ``OneHotEncoder(sparse_output=False, drop="first")`` is fitted on
        ``X`` itself.
    scaler : fitted scaler, optional
        Already-fitted scaler exposing ``transform``. When omitted, a new
        ``MinMaxScaler`` is fitted on ``X`` itself.

    Returns
    -------
    pandas.DataFrame
        Scaled numeric columns followed by the encoded categorical columns,
        indexed like ``X``.

    Notes
    -----
    With the defaults, the encoded column set and the scaling range are
    derived from the batch passed in. When scoring against a model trained
    elsewhere, pass the transformers fitted on the training data so the
    feature layout and scale match what the model was trained on.
    """
    categorical_cols = ["Sex", "Housing", "Saving accounts", "Checking account", "Purpose"]
    numerical_cols = ["Age", "Job", "Credit amount", "Duration"]

    if encoder is None:
        encoder = OneHotEncoder(sparse_output=False, drop="first")
        encoded_values = encoder.fit_transform(X[categorical_cols])
    else:
        # Pre-fitted encoder: reuse its learned categories instead of re-fitting.
        encoded_values = encoder.transform(X[categorical_cols])
    encoded_cols = encoder.get_feature_names_out(categorical_cols)
    X_encoded = pd.DataFrame(encoded_values, columns=encoded_cols, index=X.index)

    if scaler is None:
        scaler = MinMaxScaler()
        scaled_values = scaler.fit_transform(X[numerical_cols])
    else:
        # Pre-fitted scaler: reuse its learned min/max instead of re-fitting.
        scaled_values = scaler.transform(X[numerical_cols])
    X_scaled = pd.DataFrame(scaled_values, columns=numerical_cols, index=X.index)

    X_preprocessed = pd.concat([X_scaled, X_encoded], axis=1)
    return X_preprocessed

# Wrap the two plain functions as scikit-learn transformers so they can be
# chained as pipeline steps.
clean_data_transformer = FunctionTransformer(func=clean_data, validate=False)
normalize_data_X_transformer = FunctionTransformer(func=normalize_data_X, validate=False)

# Preprocessing runs in two stages: impute missing categoricals, then encode/scale.
preprocessing_pipeline = Pipeline(
    steps=[
        ("clean_data", clean_data_transformer),
        ("normalization", normalize_data_X_transformer),
    ]
)

In [33]:
# Name of the SageMaker endpoint this notebook sends prediction requests to.
endpoint_name = "Credit-risk-model-2024-12-10-19-26-49"

# Client object used to call that endpoint.
predictor = Predictor(
    endpoint_name=endpoint_name,
)

In [34]:
# Load the raw credit-risk CSV and run it through the preprocessing pipeline
# (imputation followed by encoding/scaling).
data = pd.read_csv(filepath_or_buffer="data/raw/credit_risk_reto.csv")
data_processed = preprocessing_pipeline.fit_transform(X=data)

Valores faltantes por columna:
Saving accounts     183
Checking account    394
dtype: int64

Porcentaje de valores faltantes por columna:
Saving accounts     18.3
Checking account    39.4
dtype: float64
Valores faltantes por columna:
Series([], dtype: int64)

Porcentaje de valores faltantes por columna:
Series([], dtype: float64)


In [39]:
import ast
import numpy as np

# Send the feature rows to the endpoint as CSV text.
predictor.serializer = sagemaker.serializers.CSVSerializer()

# Score the first three preprocessed rows (explicit positional slice).
result = predictor.predict(data_processed.iloc[0:3].values.tolist())

# Mapping from the model's numeric class to a readable label.
labels = {0: 'good risk', 1: 'bad risk'}

# The response body is text produced by a remote service. Parse it with
# ast.literal_eval, which accepts only Python literals, instead of eval,
# which would execute any expression contained in the response.
predicted_classes = ast.literal_eval(result.decode("utf-8"))
result_decoded = [labels[int(value)] for value in predicted_classes]
print(result_decoded)

['good risk', 'good risk', 'bad risk']
