In [13]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.preprocessing import FunctionTransformer
from langchain_aws import ChatBedrock
from langchain_core.prompts import PromptTemplate
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
import time

def clean_data(data):
    # Calcular el número de valores faltantes por columna
    missing_values = data.isnull().sum()
    
    # Filtrar solo las columnas con valores faltantes
    columns_with_missing = missing_values[missing_values > 0]
    
    # Imprimir el resultado
    print("Valores faltantes por columna:")
    print(columns_with_missing)
    
    # Mostrar el porcentaje de valores faltantes para tener más contexto
    percent_missing = (columns_with_missing / len(data)) * 100
    print("\nPorcentaje de valores faltantes por columna:")
    print(percent_missing)
    
    categorical_cols = ['Sex', 'Housing', 'Saving accounts', 'Checking account', 'Purpose']
    cat_imputer = SimpleImputer(strategy='most_frequent')
    data[categorical_cols] = cat_imputer.fit_transform(data[categorical_cols])
    
    # Calcular el número de valores faltantes por columna
    missing_values = data.isnull().sum()
    
    # Filtrar solo las columnas con valores faltantes
    columns_with_missing = missing_values[missing_values > 0]
    
    # Imprimir el resultado
    print("Valores faltantes por columna:")
    print(columns_with_missing)
    
    # Mostrar el porcentaje de valores faltantes para tener más contexto
    percent_missing = (columns_with_missing / len(data)) * 100
    print("\nPorcentaje de valores faltantes por columna:")
    print(percent_missing)
    
    data.to_csv('data/processed/credit_risk_reto_preprocessed.csv', index=False)
    
    print("\nData guardada en credit_risk_reto_preprocessed.csv")
    
    return data
    

def add_description(data):
    llm = ChatBedrock(
        credentials_profile_name="bedrock-user-admin", model_id="amazon.titan-text-premier-v1:0"
    )
    
    text = """
    Eres un experto en riesgos crediticio bancario.
    Se te proveerá una serie de datos descritos a continuación:
    Edad: Edad de la persona
    Sexo: Sexo de la persona
    Trabajo: ( 0 - unskilled and non-resident, 1 - unskilled and resident, 2 - skilled, 3 - highly skilled)
    Alojamiento: Tipo de alojamiento
    Cuentas de ahorro: Tipo de cuenta de ahorro
    Cuenta corriente: Tipo de cuenta corriente
    Monto del crédito: Monto de crédito
    Duración (meses): Tiempo de préstamo
    Finalidad: Motivo del préstamo
    
    Tu tarea es describir los datos presentados en un máximo de 30 palabras con su relación con el riesgo crediticio.
    
    Estos son los datos:
    Edad: {age}
    Sexo: {sex}
    Trabajo: {job}
    Alojamiento: {Housing}
    Cuentas de ahorro: {Saving_accounts}
    Cuenta corriente: {Checking_account}
    Monto del crédito: {Credit_amount}
    Duración: {Duration}
    Finalidad: {Purpose}
    
    Escribe tu respuesta a continuación:
    """
    
    prompt_template = PromptTemplate.from_template(text)
    results = []
    for index, row in data.iterrows():
        age = row['Age']
        sex = row['Sex']
        job = row['Job']
        Housing = row['Housing']
        Saving_accounts = row['Saving accounts']
        Checking_account = row['Checking account']
        Credit_amount = row['Credit amount']
        Duration = row['Duration']
        Purpose = row['Purpose']
        result = prompt_template.invoke({"age": age,
                                "sex": sex,
                                "job": job,
                                "Housing": Housing,
                                "Saving_accounts": Saving_accounts,
                                "Checking_account": Checking_account,
                                "Credit_amount": Credit_amount,
                                "Duration": Duration,
                                "Purpose": Purpose
                                })
        classification = llm.invoke(input=result)
    
        results.append(classification.content)
    
        print(classification.content)
        time.sleep(5)
    
    data['description'] = results
    data.to_csv('data/processed/output_description.csv', index=False)
    
    print("DataFrame guardado en 'output_description.csv'")
    return data

def add_target(data):
    llm = ChatBedrock(
        credentials_profile_name="bedrock-user-admin", model_id="anthropic.claude-3-5-sonnet-20240620-v1:0"
    )
    
    text = """
    You are an expert in bank credit risk.
    Your task is to classify the credit risk as 'good risk' or 'bad risk'.
    
    Examples:
    Description: A 67 year old man requested a loan of 1169 for a TV, he is skilled, has little savings, his own home and has requested a loan for 6 months.
    Answer: bad risk
    
    Description: A 22 year old woman requested a loan of 5951 euros for 48 months to buy a radio or television. She is a skilled worker and owns her own home. She has a small savings account and a moderate current account.
    Answer: good risk
    
    The following is the description you must classify and is important your answer should be only 'good risk' or 'bad risk':
    
    Description: {description}
    Answer:
    """
    
    prompt_template = PromptTemplate.from_template(text)
    
    results = []
    for index, row in data.iterrows():
        description = row['description']
        result = prompt_template.invoke({"description": description})
        classification = llm.invoke(input=result)
    
        results.append(classification.content)
    
        print(classification.content)
        time.sleep(5)
    
    data['target'] = results
    data.to_csv('data/processed/output_target.csv', index=False)
    
    print("DataFrame guardado en 'output_target.csv'")
    return data

clean_data_transformer = FunctionTransformer(clean_data, validate=False)
add_description_transformer = FunctionTransformer(add_description, validate=False)
add_target_transformer = FunctionTransformer(add_target, validate=False)

preprocessing_pipeline = Pipeline(steps=[
    ("clean_data", clean_data_transformer),
    ("add_description", add_description_transformer),
    ("add_target", add_target_transformer)
])

In [14]:
def normalize_data_X(X):    
    categorical_cols = ["Sex", "Housing", "Saving accounts", "Checking account", "Purpose"]
    encoder = OneHotEncoder(sparse_output=False, drop="first")
    X_encoded = encoder.fit_transform(X[categorical_cols])
    encoded_cols = encoder.get_feature_names_out(categorical_cols)
    X_encoded = pd.DataFrame(X_encoded, columns=encoded_cols, index=X.index)
    
    numerical_cols = ["Age", "Job", "Credit amount", "Duration"]
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X[numerical_cols])
    X_scaled = pd.DataFrame(X_scaled, columns=numerical_cols, index=X.index)
    
    X_preprocessed = pd.concat([X_scaled, X_encoded], axis=1)    
    return X_preprocessed

normalize_data_X_transformer = FunctionTransformer(normalize_data_X, validate=False)

data_X_preparation_pipeline = Pipeline(steps=[
    ("normalization", normalize_data_X_transformer)
])

In [15]:
def normalize_data_y(y):
    y = y.map({'good risk': 0, 'bad risk': 1})
    return y

normalize_data_y_transformer = FunctionTransformer(normalize_data_y, validate=False)

data_y_preparation_pipeline = Pipeline(steps=[
    ("normalization", normalize_data_y_transformer)
])

In [16]:
import pandas as pd
data = pd.read_csv("data/raw/credit_risk_reto.csv")

data_processed = preprocessing_pipeline.fit_transform(data)

Valores faltantes por columna:
Saving accounts     183
Checking account    394
dtype: int64

Porcentaje de valores faltantes por columna:
Saving accounts     18.3
Checking account    39.4
dtype: float64
Valores faltantes por columna:
Series([], dtype: int64)

Porcentaje de valores faltantes por columna:
Series([], dtype: float64)

Data guardada en credit_risk_reto_preprocessed_2.csv
The credit applicant is a 67 year old man with a low savings and checking account balance, who works in a skilled job and owns his home. He is applying for a small loan for a radio/TV purchase.
A 22 year old female who is skilled, owns her home, has a moderate checking account and little savings has requested a loan for a radio/TV in the amount of $5951 for 48 months.
A 49 year old male with a skilled job, who owns his home and is applying for an education loan of 2096 over 12 months with little savings.
A 45-year-old male with a skilled job has requested a loan of 7882 for furniture/equipment. The loan dur

In [18]:
print(data_processed)

None


In [3]:
# Separar características y etiquetas
X = data_processed.drop(columns=["target", "description"])
y = data_processed["target"]

# Aplicar el pipeline de preprocesamiento
X_preprocessed = data_X_preparation_pipeline.fit_transform(X)
y_preprocessed = data_y_preparation_pipeline.fit_transform(y)

array([[0.85714286, 0.66666667, 0.05056674, ..., 1.        , 0.        ,
        0.        ],
       [0.05357143, 0.66666667, 0.31368989, ..., 1.        , 0.        ,
        0.        ],
       [0.53571429, 0.33333333, 0.10157368, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.33928571, 0.66666667, 0.03048311, ..., 1.        , 0.        ,
        0.        ],
       [0.07142857, 0.66666667, 0.08776274, ..., 1.        , 0.        ,
        0.        ],
       [0.14285714, 0.66666667, 0.23803235, ..., 0.        , 0.        ,
        0.        ]])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y_preprocessed, test_size=0.2, random_state=42, stratify=y_preprocessed)

trainX = pd.DataFrame(X_train)
trainX["target"] = y_train

testX = pd.DataFrame(X_test)
testX["target"] = y_test

In [None]:
trainX.to_csv("data/toTrain/train-V-1.csv", index=False)
testX.to_csv("data/toTrain/test-V-1.csv", index=False)