## Imports and functions

In [1]:
import os
import pandas as pd
import numpy as np
from plotly import express as px, graph_objects as go
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as lda ,QuadraticDiscriminantAnalysis as qda
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_curve, auc
from sklearn.preprocessing import StandardScaler
import plotly.figure_factory as ff
import wraper
from dotenv import load_dotenv

load_dotenv()

path = os.environ["file_path"]
data = pd.read_csv(path)

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\Dany\\OneDrive\\Documentos\\IDM\\credit risk analysis\\credit_risk_data-1.csv'

In [61]:
def dummy_vars_creation(var:pd.Series):
    new_vars = pd.unique(var)
    dummy_vars  = pd.DataFrame()
    
    for key in new_vars:
        dummy_vars[key] = var.isin([key])*1
    
    return dummy_vars

def standarizer(predic_vars:pd.DataFrame,scaler = None,not_transform:list[str]=[]):
    temp_headers = list(predic_vars.columns)
    for header in ["loan_status"]+not_transform:
        temp_headers.remove(header)

    if (scaler == None):
        scaler = StandardScaler()
    standar_vars = scaler.fit_transform(predic_vars[predic_vars.columns.difference(["loan_status"])])
    standar_vars = pd.DataFrame(standar_vars,columns=temp_headers)


    for header in temp_headers:
        predic_vars[header] = standar_vars[header]

def evaluate_and_plot(model_name, model, X_test_df, y_test):
    X_test_array = X_test_df.values 
    y_pred = model.predict(X_test_array)
    cm = confusion_matrix(y_test, y_pred)
    cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    
    z_text = [[f'{cm[i, j]}<br>({cm_normalized[i, j]:.1%})' for j in range(2)] for i in range(2)]
    
    fig = ff.create_annotated_heatmap(
        z=cm, 
        x=['Predicted 0 (No Default)', 'Predicted 1 (Default)'], 
        y=['Actual 0 (No Default)', 'Actual 1 (Default)'], 
        annotation_text=z_text, 
        colorscale='Blues',
        showscale=True
    )
    
    fig.update_layout(
        title=f'Matriz de Confusión: {model_name} (Test Set)',
        xaxis_title="Predicción",
        yaxis_title="Valor Real"
    )
    fig.show()

def calculate_roc_data(model, X_test_df, y_test):
    X_test_array = X_test_df.values
    y_proba = model.model.predict_proba(X_test_array)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    roc_auc = auc(fpr, tpr)
    return fpr, tpr, roc_auc

## 1. Data Loading

In [62]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 18 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   application_id         2500 non-null   object 
 1   application_date       2500 non-null   object 
 2   loan_amount            2500 non-null   float64
 3   annual_income          2500 non-null   float64
 4   employment_years       2500 non-null   float64
 5   job_stability_score    2500 non-null   float64
 6   credit_score           2500 non-null   int64  
 7   credit_utilization     2500 non-null   float64
 8   payment_history_score  2500 non-null   float64
 9   open_credit_lines      2500 non-null   int64  
 10  debt_to_income_ratio   2500 non-null   float64
 11  savings_ratio          2500 non-null   float64
 12  asset_value            2500 non-null   float64
 13  age                    2500 non-null   int64  
 14  education_level        2500 non-null   object 
 15  mari

In [63]:
data.head()

Unnamed: 0,application_id,application_date,loan_amount,annual_income,employment_years,job_stability_score,credit_score,credit_utilization,payment_history_score,open_credit_lines,debt_to_income_ratio,savings_ratio,asset_value,age,education_level,marital_status,residential_stability,loan_status
0,APP_2328,2022-01-01,132221.82,60451.82,6.6,0.898,679,0.106,0.876,1,0.451,0.5,352569.55,41,High School,Married,3.5,0
1,APP_558,2022-01-01,134906.42,114634.08,10.3,0.808,718,0.03,0.719,4,0.09,0.235,224364.21,46,Masters,Divorced,11.4,0
2,APP_2477,2022-01-01,30285.19,82772.53,12.1,0.964,768,0.174,0.775,6,0.201,0.172,514765.55,44,High School,Widowed,8.6,0
3,APP_741,2022-01-01,32516.09,94023.36,9.1,0.69,670,0.141,0.993,3,0.322,0.368,182541.72,26,Bachelors,Single,3.9,0
4,APP_145,2022-01-02,77900.99,53515.02,7.2,0.679,651,0.097,0.946,2,0.222,0.324,223691.29,50,Associates,Single,9.6,0


In [64]:
data.describe()

Unnamed: 0,loan_amount,annual_income,employment_years,job_stability_score,credit_score,credit_utilization,payment_history_score,open_credit_lines,debt_to_income_ratio,savings_ratio,asset_value,age,residential_stability,loan_status
count,2500.0,2500.0,2500.0,2500.0,2500.0,2500.0,2500.0,2500.0,2500.0,2500.0,2500.0,2500.0,2500.0,2500.0
mean,155716.305344,67707.807596,6.67564,0.634643,681.7284,0.358176,0.740733,3.4516,0.408094,0.320784,175666.741236,42.0456,6.0232,0.2656
std,149605.357952,27302.931731,3.488021,0.293276,88.683309,0.289995,0.285966,2.083793,0.224736,0.192079,182652.56893,12.092395,3.205397,0.441741
min,5000.0,15000.0,0.0,0.011,334.0,0.004,0.029,0.0,0.009,0.0,550.63,18.0,0.0,0.0
25%,42984.5175,47475.3175,4.0,0.3755,642.75,0.131,0.5175,2.0,0.228,0.161,49513.0825,34.0,3.6,0.0
50%,97054.315,66963.475,6.7,0.752,700.0,0.246,0.8805,3.0,0.359,0.327,121018.75,42.0,5.9,0.0
75%,213214.9925,87347.6425,9.3,0.866,743.0,0.59225,0.956,5.0,0.565,0.464,235513.9025,50.0,8.4,1.0
max,500000.0,149929.96,19.3,0.999,850.0,0.998,1.0,11.0,0.979,0.893,1000000.0,75.0,16.4,1.0


In [65]:
# 1. Calcular la Tasa de Default
# 'loan_status' es binaria (0 o 1). La media es la proporción de '1's (Defaults).
default_rate = data['loan_status'].mean()
default_rate_pct = default_rate * 100
print(f"La Tasa de Default (loan_status = 1) es: {default_rate_pct:.2f}%")

# 2. Preparar los datos y Plotear la distribución de la variable objetivo
data_plot = data['loan_status'].value_counts().reset_index()
data_plot.columns = ['Estado del Préstamo', 'Conteo']
data_plot['Estado del Préstamo'] = data_plot['Estado del Préstamo'].map({0: 'No Default (0)', 1: 'Default (1)'})
data_plot['Porcentaje'] = (data_plot['Conteo'] / data_plot['Conteo'].sum()) * 100

# Definición de las variables (Asegúrate que estas listas coincidan con tus columnas)
numerical_features = [
    'loan_amount', 'annual_income', 'employment_years', 'job_stability_score',
    'credit_score', 'credit_utilization', 'payment_history_score',
    'open_credit_lines', 'debt_to_income_ratio', 'savings_ratio',
    'asset_value', 'age'
]
categorical_features = ['marital_status', 'education_level', 'employment_type', 'home_ownership']

La Tasa de Default (loan_status = 1) es: 26.56%


## 2. Plots and EDA

In [66]:
fig = px.bar(
    data_plot,
    x='Estado del Préstamo',
    y='Conteo',
    color='Estado del Préstamo',
    title=f'Distribución de la Variable Objetivo (Tasa de Default: {default_rate_pct:.2f}%)',
    text='Porcentaje',
    labels={'Conteo': 'Frecuencia (Conteo)', 'Estado del Préstamo': 'Estado del Préstamo'},
    color_discrete_map={'No Default (0)': 'lightsteelblue', 'Default (1)': 'darkblue'}
)
fig.update_traces(texttemplate='%{text:.2f}%', textposition='inside')
fig.show()

all_numeric_cols = numerical_features + ['loan_status']
corr_matrix = data[all_numeric_cols].corr()

fig = go.Figure(data=go.Heatmap(
    z=corr_matrix.values,
    x=corr_matrix.columns,
    y=corr_matrix.index,
    colorscale='RdBu',
    zmin=-1,
    zmax=1,
    text=corr_matrix.round(2).values,
    hoverongaps = False,
))

fig.update_layout(
    title='Heatmap de Correlación de Variables Numéricas y la Variable Objetivo',
    xaxis_title="Variables",
    yaxis_title="Variables",
    height=700
)

fig.show()

highest_3 = ["job_stability_score", "payment_history_score", "credit_utilization"]

for col in highest_3:
    fig = px.box(
        data,
        x='loan_status',
        y=col,
        color='loan_status',
        title=f'Distribución de "{col}" por Estado del Préstamo',
        labels={'loan_status': 'Estado del Préstamo', col: col},
        color_discrete_map={0: 'green', 1: 'red'}
    )
    # Mejorar las etiquetas del eje X
    fig.update_xaxes(tickvals=[0, 1], ticktext=['No Default (0)', 'Default (1)'])
    fig.show()

## 3. Data Preprocessing

In [67]:
mod_vars = data.copy(True).drop("application_id",axis=1)
class_vars = ["marital_status","education_level"]

for var in class_vars:
    temp = dummy_vars_creation(mod_vars[var])
    mod_vars = pd.concat([mod_vars,temp],axis=1).drop(var,axis=1)

mod_vars["year"] = mod_vars["application_date"].str.split("-").str[0]
mod_vars["month"] = mod_vars["application_date"].str.split("-").str[1]
mod_vars["day"] = mod_vars["application_date"].str.split("-").str[-1]
mod_vars = mod_vars.drop("application_date",axis=1)

mod_vars


Unnamed: 0,loan_amount,annual_income,employment_years,job_stability_score,credit_score,credit_utilization,payment_history_score,open_credit_lines,debt_to_income_ratio,savings_ratio,...,Widowed,Single,High School,Masters,Bachelors,Associates,Doctorate,year,month,day
0,132221.82,60451.82,6.6,0.898,679,0.106,0.876,1,0.451,0.500,...,0,0,1,0,0,0,0,2022,01,01
1,134906.42,114634.08,10.3,0.808,718,0.030,0.719,4,0.090,0.235,...,0,0,0,1,0,0,0,2022,01,01
2,30285.19,82772.53,12.1,0.964,768,0.174,0.775,6,0.201,0.172,...,1,0,1,0,0,0,0,2022,01,01
3,32516.09,94023.36,9.1,0.690,670,0.141,0.993,3,0.322,0.368,...,0,1,0,0,1,0,0,2022,01,01
4,77900.99,53515.02,7.2,0.679,651,0.097,0.946,2,0.222,0.324,...,0,1,0,0,0,1,0,2022,01,02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2495,500000.00,149929.96,8.5,0.877,668,0.431,0.669,0,0.206,0.541,...,0,1,0,0,1,0,0,2024,12,28
2496,38284.25,63593.34,1.0,0.411,609,0.838,0.242,3,0.896,0.074,...,0,1,0,0,0,1,0,2024,12,29
2497,166329.17,75670.19,2.1,0.684,672,0.499,0.995,1,0.518,0.234,...,0,0,0,1,0,0,0,2024,12,29
2498,43779.56,34662.35,0.7,0.127,534,0.776,0.463,2,0.698,0.021,...,0,0,1,0,0,0,0,2024,12,29


In [68]:
train_data,test_data = train_test_split(mod_vars,test_size=0.2,random_state=42)
print("traing len:",train_data.shape[0])
print("test len:",test_data.shape[0])
test_data["loan_status"].value_counts()

traing len: 2000
test len: 500


loan_status
0    378
1    122
Name: count, dtype: int64

In [69]:
X_train = train_data.drop("loan_status", axis=1)
X_test = test_data.drop("loan_status", axis=1)
y_train = train_data["loan_status"]
y_test = test_data["loan_status"]

scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled_array = scaler.transform(X_train)
X_test_scaled_array = scaler.transform(X_test)

X_train_scaled = pd.DataFrame(
    X_train_scaled_array, 
    columns=X_train.columns, 
    index=X_train.index
)
X_test_scaled = pd.DataFrame(
    X_test_scaled_array, 
    columns=X_test.columns, 
    index=X_test.index
)

train_data = pd.concat([X_train_scaled, y_train], axis=1).reset_index(drop=True)
test_data = pd.concat([X_test_scaled, y_test], axis=1).reset_index(drop=True)

print(train_data.head())
print(test_data.head())

   loan_amount  annual_income  employment_years  job_stability_score  \
0     0.229381       1.034312          0.967720             1.038215   
1    -0.737516       1.379178          0.539374             0.447339   
2    -1.021620       0.397880         -0.145980             0.712214   
3    -1.008588       0.787704         -0.088868             0.280943   
4    -0.651089       1.642524         -0.545770             0.050026   

   credit_score  credit_utilization  payment_history_score  open_credit_lines  \
0      0.468943           -0.257491               0.665402          -0.697311   
1      0.313453           -0.274703               0.707353           2.138246   
2      1.224184           -0.687789               0.661906           0.720468   
3      1.113119           -0.742867               0.564020           1.665653   
4     -0.008635           -0.825485               0.766784           0.720468   

   debt_to_income_ratio  savings_ratio  ...    Single  High School  Masters  \
0

## 4. Statistical Assumption Testing

Tanto el **Análisis Discriminante Lineal (LDA)** como el **Análisis Discriminante Cuadrático (QDA)** asumen que las variables predictoras siguen una **Distribución Normal Multivariante** dentro de cada clase. El **EDA** (Sección 2) nos ayuda a evaluar visualmente si este supuesto se cumple razonablemente, observando el sesgo en los Boxplots y la presencia de *outliers*.

La diferencia fundamental reside en el supuesto de **Homogeneidad de las Matrices de Covarianza**.

1.  **LDA** asume que **todas** las clases comparten **una única** matriz de covarianza $(\Sigma)$. Esto implica que la **frontera de decisión** entre las clases es **LINEAL**.
2.  **QDA** asume que cada clase tiene su **propia** matriz de covarianza $(\Sigma_k)$, permitiéndole a la **frontera de decisión** ser **CUADRÁTICA** (curva).

**Hipótesis:** Si las matrices de covarianza son desiguales (es decir, la varianza es muy diferente entre los grupos de Default y No Default), **esperamos que el QDA supere al LDA** porque su flexibilidad le permitirá capturar mejor las diferencias de forma entre las clases.

## 5/6. LDA and QDA Models

In [70]:
model1 = wraper.Wraper(lda())
model2 = wraper.Wraper(qda())

model1.load_training(train_data[train_data.columns.difference(["loan_status"])],train_data["loan_status"])
model2.load_training(train_data[train_data.columns.difference(["loan_status"])],train_data["loan_status"])

model1.run_training()
model2.run_training()

model1.save_model(os.environ["model_lda_path"])
model2.save_model(os.environ["model_qda_path"])

model1 = wraper.Wraper(None)
model2 = wraper.Wraper(None)

model2.load_model(os.environ["model_qda_path"])
model1.load_model(os.environ["model_lda_path"])


Variables are collinear



In [71]:
try:
    lda_coefficients = model1.model.coef_[0] 
except AttributeError:
    print("Error:model1.model.coef_ no es accesible.")

feature_names = train_data.drop("loan_status", axis=1).columns

interpretation_df = pd.DataFrame({
    'Variable': feature_names,
    'Coeficiente LDA': lda_coefficients,
    'Importancia Absoluta': np.abs(lda_coefficients) 
}).sort_values(by='Importancia Absoluta', ascending=False)

fig = px.bar(
    interpretation_df.sort_values(by='Coeficiente LDA', ascending=True),
    x='Coeficiente LDA',
    y='Variable',
    color=np.where(interpretation_df.sort_values(by='Coeficiente LDA')['Coeficiente LDA'] > 0, 'Aumenta Default (+)', 'Reduce Default (-)'),
    title='Interpretación de Coeficientes de LDA (Impacto en la probabilidad de Default)',
    labels={'Coeficiente LDA': 'Impacto', 'color': 'Dirección'},
    color_discrete_map={'Aumenta Default (+)': 'seagreen', 'Reduce Default (-)': 'darkred'},
    orientation='h'
)
fig.update_layout(height=600)
fig.show()

interpretation_df.head(10)


Unnamed: 0,Variable,Coeficiente LDA,Importancia Absoluta
21,Doctorate,-15.4003,15.4003
17,High School,-13.132526,13.132526
13,Married,12.391914,12.391914
15,Widowed,4.980255,4.980255
12,residential_stability,-3.966654,3.966654
23,month,-2.838984,2.838984
16,Single,-2.730785,2.730785
10,asset_value,-1.854383,1.854383
11,age,-1.567202,1.567202
22,year,-1.540555,1.540555


### Análisis e Interpretación de Coeficientes LDA

Los coeficientes del modelo LDA (con datos estandarizados) indican el **impacto lineal** y la **dirección** de cada variable sobre el riesgo de **Default** (clase 1).

Los **tres impulsores de riesgo más importantes**, según la magnitud de su coeficiente, son: 
1.  **Doctorate** (15.40)
2.  **High School** (13.13)
3.  **Married** (12.39)

La influencia de estas variables de estado civil y educación es significativamente mayor que la de cualquier otra variable numérica.

El **signo** del coeficiente determina la dirección de la influencia sobre el riesgo de Default:
* **Signo Positivo (Riesgo Aumenta):** La categoría **Married (+12.39)** y **Widowed (+4.98)** están asociadas con un **AUMENTO** en la probabilidad de Default.
* **Signo Negativo (Riesgo Disminuye):** Las categorías educativas **Doctorate (-15.40)** y **High School (-13.13)** son los factores que más **REDUCEN** el riesgo. Variables como **residential_stability (-3.97)**, ser **Single (-2.73)**, y un mayor **asset_value (-1.85)** también reducen el riesgo.

El modelo se apoya fuertemente en las variables de estado civil y nivel educativo para clasificar el riesgo, siendo **Doctorate**, **High School** y **Married** los factores más determinantes.

## 7. Data Analysis and Results

In [72]:
feature_names = train_data.drop("loan_status", axis=1).columns
X_test = test_data.drop("loan_status", axis=1)
X_test_ordered = X_test[feature_names] 
y_test = test_data["loan_status"]

models = {
    "LDA (Model 1)": model1,
    "QDA (Model 2)": model2
}

for name, model in models.items():
    evaluate_and_plot(name, model, X_test_ordered, y_test)

fpr_lda, tpr_lda, auc_lda = calculate_roc_data(models["LDA (Model 1)"], X_test_ordered, y_test)
fpr_qda, tpr_qda, auc_qda = calculate_roc_data(models["QDA (Model 2)"], X_test_ordered, y_test)

fig = go.Figure()

fig.add_trace(go.Scatter(
    x=fpr_lda, y=tpr_lda,
    mode='lines',
    name=f'LDA (AUC = {auc_lda:.3f})',
    line_color='blue'
))

fig.add_trace(go.Scatter(
    x=fpr_qda, y=tpr_qda,
    mode='lines',
    name=f'QDA (AUC = {auc_qda:.3f})',
    line_color='red'
))

fig.add_trace(go.Scatter(
    x=[0, 1], y=[0, 1],
    mode='lines',
    line=dict(dash='dash', color='gray'),
    name='Aleatorio (AUC = 0.50)',
))

fig.update_layout(
    title='Curva ROC Comparativa (LDA vs QDA)',
    xaxis_title='Tasa de Falsos Positivos (FPR)',
    yaxis_title='Tasa de Verdaderos Positivos (TPR)',
    xaxis=dict(constrain='domain', range=[0, 1]),
    yaxis=dict(constrain='domain', range=[0, 1])
)
fig.show()


X does not have valid feature names, but LinearDiscriminantAnalysis was fitted with feature names




X does not have valid feature names, but QuadraticDiscriminantAnalysis was fitted with feature names




X does not have valid feature names, but LinearDiscriminantAnalysis was fitted with feature names


X does not have valid feature names, but QuadraticDiscriminantAnalysis was fitted with feature names



In [73]:
metrics_lda, counts_lda = model1.classification_test_accuracy(
    test_data[test_data.columns.difference(["loan_status"])],
    test_data["loan_status"]
)

metrics_qda, counts_qda = model2.classification_test_accuracy(
    test_data[test_data.columns.difference(["loan_status"])],
    test_data["loan_status"]
)

print("\n--- Model 1 (LDA) Counts ---")
print(counts_lda)
print("\n--- Model 1 (LDA) Metrics ---")
print(metrics_lda)

print("\n--- Model 2 (QDA) Counts ---")
print(counts_qda)
print("\n--- Model 2 (QDA) Metrics ---")
print(metrics_qda)


--- Model 1 (LDA) Counts ---
{'correc_pos': 122, 'correct_neg': 378, 'false_pos': 0, 'false_neg': 0}

--- Model 1 (LDA) Metrics ---
{'Acur': 1.0, 'Press': 1.0, 'TPR': 1.0, 'F1': 1.0}

--- Model 2 (QDA) Counts ---
{'correc_pos': 122, 'correct_neg': 378, 'false_pos': 0, 'false_neg': 0}

--- Model 2 (QDA) Metrics ---
{'Acur': 1.0, 'Press': 1.0, 'TPR': 1.0, 'F1': 1.0}


## 8.Model selection and conclussion

Se selecciona el **Análisis Discriminante Lineal (LDA)** como el mejor modelo técnico, aunque ambos modelos demuestran un rendimiento perfecto en el conjunto de prueba.

### Justificación Técnica y Evidencia

La evidencia de la Sección 7 (Evaluación y Comparación) mostró que ambos modelos alcanzaron un rendimiento idéntico y perfecto en el conjunto de prueba:

| Métrica | LDA (Model 1) | QDA (Model 2) |
| :--- | :--- | :--- |
| **Acur** (Accuracy) | 1.00 (100%) | 1.00 (100%) |
| **TPR** (Recall Clase Default) | 1.00 (100%) | 1.00 (100%) |
| **Press** (Precision Clase Default) | 1.00 (100%) | 1.00 (100%) |

Dado que ambos modelos alcanzan la máxima capacidad de predicción (100% de *Recall*, *Precision* y *Accuracy*), la decisión técnica recae en el principio de **parsimonia** (simplicidad).

**LDA es el modelo elegido porque es el más simple y robusto.** Al asumir que las clases son separables linealmente (lo que implica una frontera de decisión más sencilla) y aun así alcanzar el rendimiento perfecto, es preferible a **QDA**, que es un modelo más complejo que asume una frontera cuadrática. En ciencia de datos, si dos modelos rinden igual, se elige el modelo más simple para mejorar la interpretabilidad y reducir el riesgo de sobreajuste con nuevos datos.

La conclusión es que las variables predictoras, una vez estandarizadas, parecen permitir una **separación lineal perfecta** de las clases de riesgo de crédito en el conjunto de prueba.