In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## 1.- Análisis exploratorio de datos

In [None]:
import os

# Location of the loan dataset. Defaults to the original local path; set the
# LOAN_DATA_CSV environment variable to run the notebook on another machine
# without editing this cell.
LOAN_DATA_CSV = os.environ.get(
    'LOAN_DATA_CSV',
    '/home/davian/ml_proyects/ML-Practice/datasets/loan_data.csv',
)

df = pd.read_csv(LOAN_DATA_CSV)

In [None]:
# Preview the first five rows of the raw dataset.
df.head(n=5)

In [None]:
# Map the dataset's original English column names to the Spanish labels that
# every later cell uses to address the columns.
column_labels = {
    'person_age': 'edad',
    'person_gender': 'genero',
    'person_education': 'educacion',
    'person_income': 'ingreso anual',
    'person_emp_exp': 'experiencia laboral',
    'person_home_ownership': 'vivienda',
    'loan_amnt': 'monto prestamo',
    'loan_intent': 'proposito',
    'loan_int_rate': 'interes',
    'loan_percent_income': 'porc. prest. año',
    'cb_person_cred_hist_length': 'dur. hist. cred.',
    'credit_score': 'puntuacion cred.',
    'previous_loan_defaults_on_file': 'ind. impagos',
    'loan_status': 'estado',
}

df.rename(columns=column_labels, inplace=True)


In [None]:
# List the distinct loan-purpose values; the abbreviation mapping applied to
# this column afterwards is keyed on them.
df['proposito'].unique()

In [None]:
# Short labels for the loan purposes.
map_prop = {
    'PERSONAL': 'pers.',
    'EDUCATION': 'educ.',
    'MEDICAL': 'med.',
    'VENTURE': 'vent.',
    'HOMEIMPROVEMENT': 'hom.impr.',
    'DEBTCONSOLIDATION': 'debt.consol.',
}

# Fall back to the current value when it is not a key of map_prop. A plain
# .map(map_prop) turns every unmatched value into NaN, so running this cell a
# second time (when the column already holds the short labels) would wipe the
# whole column, and an unexpected category would silently become missing.
df['proposito'] = df['proposito'].map(lambda valor: map_prop.get(valor, valor))

In [None]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Number of infinite entries (+inf or -inf) in each column.
infinite_values = [np.inf, -np.inf]
df.isin(infinite_values).sum()

### A.- Análisis univariado

In [None]:
df_hist = ['edad','ingreso anual', 'experiencia laboral', 'monto prestamo', 'interes', 'porc. prest. año', 'dur. hist. cred.', 'puntuacion cred.']

# Distribution of every numeric column: 40-bin histogram with a KDE overlay,
# one panel per column on a 4x4 grid.
plt.figure(figsize=(20, 10))

for i, columna in enumerate(df_hist, start=1):
    ax = plt.subplot(4, 4, i)
    sns.histplot(data=df, x=columna, kde=True, bins=40, ax=ax)

# Adjust the spacing once, after all panels exist, instead of recomputing the
# layout on every loop iteration.
plt.tight_layout()

In [None]:
# Repeated dtype / non-null overview (same call as earlier in the notebook).
# NOTE(review): presumably used to pick out the categorical columns plotted
# next — confirm, or remove this duplicate cell.
df.info()

In [None]:
df_count = ['genero', 'educacion', 'vivienda', 'ind. impagos', 'proposito']

# Frequency of each category, one panel per categorical column on a 2x3 grid.
plt.figure(figsize=(17, 7))

for i, columna in enumerate(df_count, start=1):
    ax = plt.subplot(2, 3, i)
    sns.countplot(data=df, x=columna, ax=ax)

# Adjust the spacing once, after all panels exist, instead of recomputing the
# layout on every loop iteration.
plt.tight_layout()

In [None]:
df_box = ['edad','ingreso anual', 'experiencia laboral', 'monto prestamo', 'interes', 'porc. prest. año', 'dur. hist. cred.', 'puntuacion cred.']

# Box plot of every numeric column to expose extreme values, one panel per
# column on a 4x4 grid.
plt.figure(figsize=(20, 10))

for i, columna in enumerate(df_box, start=1):
    ax = plt.subplot(4, 4, i)
    sns.boxplot(data=df, x=columna, width=0.5, ax=ax)

# Adjust the spacing once, after all panels exist, instead of recomputing the
# layout on every loop iteration.
plt.tight_layout()

### B.- Análisis bivariado

In [None]:
df_scatter = ['edad','ingreso anual', 'experiencia laboral', 'monto prestamo', 'interes', 'porc. prest. año', 'dur. hist. cred.']

# Scatter plot of each numeric feature against the credit score, one panel per
# feature on a 4x4 grid.
plt.figure(figsize=(20, 10))

for i, columna in enumerate(df_scatter, start=1):
    ax = plt.subplot(4, 4, i)
    sns.scatterplot(data=df, x=columna, y='puntuacion cred.', ax=ax)

# Adjust the spacing once, after all panels exist, instead of recomputing the
# layout on every loop iteration.
plt.tight_layout()

In [None]:
df_bar = ['genero', 'educacion', 'vivienda', 'ind. impagos', 'proposito']

# Bar plot of the credit score aggregated per category, one panel per
# categorical column on a 2x3 grid.
plt.figure(figsize=(16, 6))

for i, columna in enumerate(df_bar, start=1):
    ax = plt.subplot(2, 3, i)
    sns.barplot(data=df, x=columna, y='puntuacion cred.', ax=ax)

# Adjust the spacing once, after all panels exist, instead of recomputing the
# layout on every loop iteration.
plt.tight_layout()

In [None]:
# Pairwise correlation of the numeric columns, drawn as an annotated heatmap.
numeric_columns_for_corr = [
    'edad',
    'ingreso anual',
    'experiencia laboral',
    'monto prestamo',
    'interes',
    'porc. prest. año',
    'dur. hist. cred.',
    'puntuacion cred.',
]
df_heat = df[numeric_columns_for_corr]
correlation_matrix = df_heat.corr()

plt.figure(figsize=(10, 5))
sns.heatmap(data=correlation_matrix, annot=True)

#### Conclusiones

* Tenemos valores extremos en todas las columnas numéricas.

* Los valores más determinantes para predecir la puntuación crediticia son edad, experiencia laboral e ingreso anual.

* (edad : exp. lab.), (edad : dur. hist. cred.) y (exp. lab. : dur. hist. cred.) están altamente relacionados.


#### Limpieza

In [None]:
# Drop the annual income column to keep the feature set simple.
# errors='ignore' keeps this cell re-runnable: the previous in-place drop
# raised KeyError on a second run because the column was already gone.
df = df.drop(columns=['ingreso anual'], errors='ignore')

In [None]:
# Cap extreme values at fixed per-column bounds, given as (lower, upper).
# None leaves that side of the range unbounded.
clip_bounds = {
    'edad': (None, 38.0),
    'monto prestamo': (0, 23000),
    'interes': (None, 19),
    'porc. prest. año': (None, 0.37),
    'dur. hist. cred.': (None, 15),
    'puntuacion cred.': (500, 770),
}

for column_name, (lower_bound, upper_bound) in clip_bounds.items():
    df[column_name] = df[column_name].clip(lower=lower_bound, upper=upper_bound)


In [None]:
df_box = ['edad', 'monto prestamo', 'interes', 'porc. prest. año', 'dur. hist. cred.', 'puntuacion cred.']

# Box plots of the clipped columns to check the effect of the capping, one
# panel per column on a 4x4 grid.
plt.figure(figsize=(20, 10))

for i, columna in enumerate(df_box, start=1):
    ax = plt.subplot(4, 4, i)
    sns.boxplot(data=df, x=columna, width=0.5, ax=ax)

# Adjust the spacing once, after all panels exist, instead of recomputing the
# layout on every loop iteration.
plt.tight_layout()

## 4.- Preprocesamiento

#### A.- Transformación de datos

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except the credit score, which is the target.
x = df.drop('puntuacion cred.', axis=1)
y = df['puntuacion cred.']

# Hold out 20% of the rows for testing. A fixed random_state makes the split
# identical on every Restart & Run All instead of changing on each execution.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [None]:
# Preview the first rows of df again before building the preprocessing step.
df.head()

In [None]:
# Disabled draft: builds the column transformer directly, without wrapping it
# in a Pipeline. The active definition of `pipeline` lives in a later cell.
#from sklearn.compose import make_column_selector, make_column_transformer
#from sklearn.preprocessing import RobustScaler, OneHotEncoder

# pipeline = make_column_transformer(
  #  (RobustScaler(), make_column_selector(dtype_include=['int64', 'float64'])),
   # (OneHotEncoder(handle_unknown='ignore', sparse_output=False), make_column_selector(dtype_include=['object']))
#)

In [None]:
# Disabled: the active code stores the transformed array in x_array and builds
# the labelled x_prep DataFrame from it in a later cell.
# x_prep = pipeline.fit_transform(x)

In [None]:
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, RobustScaler
import pandas as pd

# Column groups are chosen by dtype: int64/float64 columns are scaled with
# RobustScaler, object columns are one-hot encoded into a dense array.
numeric_selector = make_column_selector(dtype_include=['int64', 'float64'])
categorical_selector = make_column_selector(dtype_include=['object'])

preprocessor = make_column_transformer(
    (RobustScaler(), numeric_selector),
    (OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_selector),
)

# Single-step pipeline; later cells reach the fitted transformer through the
# step name 'preprocessor'.
pipeline = Pipeline([('preprocessor', preprocessor)])

In [None]:
# Fit the preprocessing pipeline on the full feature frame and transform it.
# NOTE(review): this fits on all of x rather than x_train, so the scaler and
# encoder also see the rows held out in x_test — confirm this is intended.
x_array = pipeline.fit_transform(x)

In [None]:
# Display the transformed features returned by the pipeline.
x_array

In [None]:
# Rebuild column labels for the transformed array. The numeric columns come
# first and the one-hot columns second, matching the order in which the two
# transformers were declared in the column transformer.
num_cols = x.select_dtypes(include=['int64', 'float64']).columns
cat_cols = x.select_dtypes(include=['object']).columns

# transformers_[1] is the (name, fitted transformer, columns) entry of the
# second declared transformer, i.e. the one-hot encoder.
onehot_encoder = pipeline.named_steps['preprocessor'].transformers_[1][1]
cat_col_names = onehot_encoder.get_feature_names_out(cat_cols)

# Combine all column names in output order.
final_column_names = list(num_cols) + list(cat_col_names)

# Build the transformed DataFrame. Reusing x's index keeps x_prep row-aligned
# with y even if rows are filtered out of df before the split.
x_prep = pd.DataFrame(x_array, columns=final_column_names, index=x.index)

In [None]:
# Preview the first rows of the preprocessed feature frame.
x_prep.head()

#### B.- Selección de características

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Small forest used to rank features by importance. random_state is fixed so
# the ranking (and any feature subset chosen from it) is the same on every run.
rf_model = RandomForestRegressor(n_estimators=10, random_state=42)

rf_model.fit(x_prep, y)

In [None]:
# Pair every preprocessed column with its importance in the fitted forest and
# order the table from most to least important.
importance_table = pd.DataFrame({
    'feature': x_prep.columns,
    'importances': rf_model.feature_importances_,
})
feature_importance_df = importance_table.sort_values(by='importances', ascending=False)

In [None]:
# Display the importance table, highest importance first.
feature_importance_df

In [None]:
# Bar chart of the importances with one bar per feature, in table order.
plt.figure(figsize=(15, 7))
sns.barplot(x='importances', y='feature', data=feature_importance_df)

In [None]:
# Reduced feature frame restricted to a fixed list of numeric columns.
# NOTE(review): presumably the top entries of the importance ranking — confirm
# the list still matches the ranking after any change to the model or data.
selected_features = [
    'interes',
    'monto prestamo',
    'porc. prest. año',
    'edad',
    'dur. hist. cred.',
    'experiencia laboral',
]
x_red = x_prep[selected_features]

## 5.- Entrenamiento