<a href="https://colab.research.google.com/github/joseop/ProyectoIA/blob/develop/HomeCreditDefaultRisk.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Importar las bibliotecas necesarias
from pathlib import Path

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.impute import SimpleImputer

# Load the datasets
# Folder that holds the CSV files. The default is the current working
# directory, which is where the bare file names were resolved before.
DATA_DIR = Path('.')


def _read_table(filename):
    """Read ``filename`` from ``DATA_DIR`` into a DataFrame.

    Raises:
        FileNotFoundError: if the file is absent. The message names the
            folder that was searched so a wrong path is easy to spot.
    """
    path = DATA_DIR / filename
    if not path.is_file():
        raise FileNotFoundError(
            f"'{filename}' was not found in '{DATA_DIR.resolve()}'. "
            "Point DATA_DIR at the folder that contains the CSV files."
        )
    return pd.read_csv(path)


application_train = _read_table('application_train.csv')
application_test = _read_table('application_test.csv')
bureau = _read_table('bureau.csv')
bureau_balance = _read_table('bureau_balance.csv')
credit_card_balance = _read_table('credit_card_balance.csv')
installments_payments = _read_table('installments_payments.csv')
POS_CASH_balance = _read_table('POS_CASH_balance.csv')
previous_application = _read_table('previous_application.csv')

# Data cleaning
# Drop columns in which more than 50% of the values are missing.
# Whether a column is kept depends on how many of its ROWS are filled in, so
# the cut-off is derived from the number of rows (not the number of columns).
MAX_MISSING_FRACTION = 0.5
min_non_missing = int(np.ceil((1 - MAX_MISSING_FRACTION) * len(application_train)))
sparse_columns = application_train.columns[
    application_train.notna().sum() < min_non_missing
]
application_train.drop(columns=sparse_columns, inplace=True)
# The decision is taken on the training data only and mirrored on the test
# data so both frames keep the same feature columns. errors='ignore' covers
# columns that exist only in the training frame.
application_test.drop(columns=sparse_columns, errors='ignore', inplace=True)

# Impute missing values with the most frequent value of each column
imputer = SimpleImputer(strategy='most_frequent')

# The imputer can only transform the columns it was fitted on, so it is fitted
# on the columns the two frames share. Columns found only in the training
# frame (presumably just the TARGET label — confirm) are carried through
# untouched.
shared_columns = [
    c for c in application_train.columns if c in application_test.columns
]


def _impute(frame, transform):
    """Return ``frame`` with ``shared_columns`` imputed by ``transform``.

    ``transform`` is ``imputer.fit_transform`` or ``imputer.transform``. It
    returns one array with a single dtype for all columns, so every imputed
    column is cast back to the dtype it had in ``frame``. Without this the
    numeric columns would stop being recognised as numeric downstream.
    Column order and index of ``frame`` are preserved.
    """
    imputed = pd.DataFrame(
        transform(frame[shared_columns]),
        columns=shared_columns,
        index=frame.index,
    )
    imputed = imputed.astype(frame[shared_columns].dtypes.to_dict())
    untouched = frame.drop(columns=shared_columns)
    return pd.concat([imputed, untouched], axis=1)[frame.columns]


application_train = _impute(application_train, imputer.fit_transform)
application_test = _impute(application_test, imputer.transform)

# Keep only the numeric columns of both application frames
application_train, application_test = (
    frame.select_dtypes(include=[np.number])
    for frame in (application_train, application_test)
)

# Feature scaling
scaler = StandardScaler()

# SK_ID_CURR is used as a join key and TARGET as the label later in this
# script, so both are carried through unscaled. Only columns present in both
# frames are scaled, because the scaler can only transform the columns it was
# fitted on.
UNSCALED_COLUMNS = ['SK_ID_CURR', 'TARGET']
scaled_columns = [
    c for c in application_train.columns
    if c in application_test.columns and c not in UNSCALED_COLUMNS
]


def _scale(frame, transform):
    """Return ``frame`` with ``scaled_columns`` replaced by ``transform`` output.

    ``transform`` is ``scaler.fit_transform`` or ``scaler.transform``. All
    other columns, the column order and the index are preserved.
    """
    scaled = pd.DataFrame(
        transform(frame[scaled_columns]),
        columns=scaled_columns,
        index=frame.index,
    )
    untouched = frame.drop(columns=scaled_columns)
    return pd.concat([scaled, untouched], axis=1)[frame.columns]


application_train_scaled = _scale(application_train, scaler.fit_transform)
application_test_scaled = _scale(application_test, scaler.transform)

# Encode categorical variables
# Every object-typed column of the tables below is replaced in place by
# integer codes. One encoder instance is refitted for each column, so after
# the loop it only holds the classes of the last column it processed.
# bureau_balance is not part of this pass.
le = LabelEncoder()
tables_to_encode = (
    bureau,
    credit_card_balance,
    installments_payments,
    POS_CASH_balance,
    previous_application,
)
for table in tables_to_encode:
    for col in table.select_dtypes(include=['object']).columns:
        table[col] = le.fit_transform(table[col])

# Feature selection
# Key columns that link the tables. They identify rows and are never treated
# as features.
ID_COLUMNS = ['SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV']
# Upper bound on the number of features kept by the selector.
N_FEATURES_TO_KEEP = 100


def _mean_per_key(frame, key, prefix):
    """Collapse ``frame`` to one row per ``key``.

    Every numeric column is replaced by its per-key mean and renamed with
    ``prefix`` so columns coming from different tables cannot collide when
    merged. Non-numeric columns and the other key columns are left out.
    Returns a DataFrame with ``key`` as a regular column.
    """
    other_ids = [c for c in ID_COLUMNS if c != key and c in frame.columns]
    summary = frame.drop(columns=other_ids).groupby(key).mean(numeric_only=True)
    return summary.add_prefix(prefix).reset_index()


# Join the related tables.
# A left merge repeats the left-hand row once for every matching right-hand
# row, so each child table is reduced to one row per key before it is merged.
# NOTE(review): assumes the child tables can hold several rows per key and
# that the balance/payment tables may repeat SK_ID_CURR — confirm against
# the data description.
bureau_joined = bureau.merge(
    _mean_per_key(bureau_balance, 'SK_ID_BUREAU', 'BB_'),
    on='SK_ID_BUREAU',
    how='left',
)
previous_application_joined = previous_application
for child_table, child_prefix in (
    (POS_CASH_balance, 'POS_'),
    (installments_payments, 'INST_'),
    (credit_card_balance, 'CC_'),
):
    previous_application_joined = previous_application_joined.merge(
        _mean_per_key(child_table, 'SK_ID_PREV', child_prefix),
        on='SK_ID_PREV',
        how='left',
    )

# Summarise both histories per applicant so the application frames keep
# exactly one row per SK_ID_CURR after the merge.
bureau_per_applicant = _mean_per_key(bureau_joined, 'SK_ID_CURR', 'BUREAU_')
previous_per_applicant = _mean_per_key(
    previous_application_joined, 'SK_ID_CURR', 'PREV_'
)


def _join_history(application):
    """Attach the per-applicant bureau and previous-application summaries."""
    joined = application.merge(bureau_per_applicant, on='SK_ID_CURR', how='left')
    return joined.merge(previous_per_applicant, on='SK_ID_CURR', how='left')


# Join the history onto the application data. The test frame goes through
# the same steps so the selected columns exist in it as well.
application_train_joined = _join_history(application_train)
application_test_joined = _join_history(application_test)

# Run the feature selection.
# Candidates are numeric, are not the label or a row identifier, and exist in
# the test frame too.
candidate_columns = [
    c for c in application_train_joined.select_dtypes(include=[np.number]).columns
    if c != 'TARGET'
    and c not in ID_COLUMNS
    and c in application_test_joined.columns
]
# Left merges leave NaN for applicants without history and the scoring
# function does not accept NaN, so gaps are filled for scoring only: training
# median first, then 0 for columns that are empty throughout.
train_features = application_train_joined[candidate_columns]
train_features = train_features.fillna(train_features.median()).fillna(0)

# k may not exceed the number of candidate features.
selector = SelectKBest(
    f_classif, k=min(N_FEATURES_TO_KEEP, len(candidate_columns))
)
selector.fit(train_features, application_train_joined['TARGET'])
selected_features = train_features.columns[selector.get_support()]
application_train_selected = application_train_joined[['TARGET'] + list(selected_features)]
application_test_selected = application_test_joined[list(selected_features)]

FileNotFoundError: ignored