Modelo base (regresión logística): tomarlo como referencia para comparar contra otros modelos.

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV
import os
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, 
    roc_auc_score, confusion_matrix, classification_report
)
from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt

In [5]:
# Load the pre-split training / validation features and labels from CSV.

# Directory that holds the dataset CSV files. It can be overridden by setting
# the BASE_PATH environment variable; when that variable is unset or empty the
# original local directory below is used as the fallback.
# (The previous code passed the directory itself as the environment-variable
# *name*, so in practice the lookup always returned the default and the
# location could not be configured.)
BASE_PATH = os.getenv("BASE_PATH") or (
    "/Users/carlos/Desktop/CURSOS/Anyone AI/Credit-Risk-App/dataset/"
)

train_file_path = os.path.join(BASE_PATH, "X_train_data.csv")
y_train_file_path = os.path.join(BASE_PATH, "y_train_data.csv")
val_file_path = os.path.join(BASE_PATH, "X_val_data.csv")
y_val_path = os.path.join(BASE_PATH, "y_val_data.csv")

# Training features stay a DataFrame so the column names remain available
# to the feature-importance tables built later.
train_df = pd.read_csv(train_file_path)

# Training labels: keep the raw frame and expose the target column as a
# NumPy array.
y_train_df = pd.read_csv(y_train_file_path)
y_train = y_train_df['TARGET_LABEL_BAD'].to_numpy()

# Validation features.
val_df = pd.read_csv(val_file_path)

# Validation labels, handled the same way as the training labels.
y_valid_df = pd.read_csv(y_val_path)
y_valid = y_valid_df['TARGET_LABEL_BAD'].to_numpy()


In [7]:

# Step 1: baseline logistic regression, fitted without any hyperparameter
# tuning, to serve as the reference point for the later experiments.
base_model = LogisticRegression(solver='liblinear', random_state=42)
base_model.fit(train_df, y_train)

# Positive-class probabilities (column 1) and hard labels on the validation set.
y_prob_base = base_model.predict_proba(val_df)[:, 1]
y_pred_base = base_model.predict(val_df)

# Label-based metrics share the same (y_true, y_pred) call shape, so they are
# reported from a small table; ROC-AUC is separate because it takes scores.
print("=== Base Model Performance ===")
for _caption, _scorer in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(_caption, _scorer(y_valid, y_pred_base))
print("ROC-AUC:", roc_auc_score(y_valid, y_prob_base))

# Confusion matrix for the baseline predictions.
print(
    "Confusion Matrix (Base Model):\n",
    confusion_matrix(y_valid, y_pred_base),
)

# Rank features by the magnitude of their fitted coefficient, largest first.
feature_importance_base = (
    pd.DataFrame({
        'Feature': train_df.columns,
        'Importance': np.absolute(base_model.coef_[0]),
    })
    .sort_values(by='Importance', ascending=False)
)

print(feature_importance_base.head(20))


=== Base Model Performance ===
Accuracy: 0.7329
Precision: 0.46956521739130436
Recall: 0.02027027027027027
F1 Score: 0.0388629003238575
ROC-AUC: 0.624247663011976
Confusion Matrix (Base Model):
 [[7275   61]
 [2610   54]]
                         Feature  Importance
4            RESIDENCIAL_BOROUGH    2.472860
3               RESIDENCIAL_CITY    2.214112
319                     AGE_95.0    1.132746
287                     AGE_63.0    0.953763
182     MONTHS_IN_RESIDENCE_65.0    0.942716
20             RESIDENCIAL_ZIP_3    0.913174
21            PROFESSIONAL_ZIP_3    0.913174
216          PROFESSION_CODE_5.0    0.893503
242                     AGE_18.0    0.884501
171     MONTHS_IN_RESIDENCE_54.0    0.880116
317                     AGE_93.0    0.861269
102        QUANT_DEPENDANTS_10.0    0.799360
6    RESIDENCIAL_PHONE_AREA_CODE    0.788374
308                     AGE_84.0    0.751453
179     MONTHS_IN_RESIDENCE_62.0    0.740512
225         PROFESSION_CODE_14.0    0.716047
292          

In [None]:
# Hyperparameter tuning: exhaustive grid search over the regularisation
# strength C and the penalty type.

# Solver used for every candidate in the grid.
# NOTE(review): liblinear is documented as supporting both 'l1' and 'l2' —
# confirm against the installed scikit-learn version.
solver = 'liblinear'

# Search space: 50 log-spaced values of C between 1e-4 and 1e4, crossed with
# both penalty types.
param_grid = dict(
    C=np.logspace(-4, 4, 50),
    penalty=['l1', 'l2'],
)

# Estimator whose hyperparameters are being searched.
log_reg = LogisticRegression(solver=solver, max_iter=1000, random_state=42)

# 5-fold cross-validated search scored by ROC-AUC, using all available cores.
grid_search = GridSearchCV(
    estimator=log_reg,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(train_df, y_train)

print("Best Parameters:", grid_search.best_params_)

# Estimator for the best parameter combination found by the search.
best_model = grid_search.best_estimator_

# Validation-set predictions of the tuned model.
# NOTE(review): the names y_pred / y_prob are assigned again by the
# class-weighted model cell further down, so their meaning depends on which
# cell ran last.
y_prob = best_model.predict_proba(val_df)[:, 1]
y_pred = best_model.predict(val_df)

# Same metric report as for the baseline model.
print("\n=== Model Performance ===")
for _caption, _scorer in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(_caption, _scorer(y_valid, y_pred))
print("ROC-AUC:", roc_auc_score(y_valid, y_prob))

print(
    "Confusion Matrix:\n",
    confusion_matrix(y_valid, y_pred),
)



Best Parameters: {'C': np.float64(5.428675439323859), 'penalty': 'l1'}

=== Model Performance ===
Accuracy: 0.7317
Precision: 0.43448275862068964
Recall: 0.02364864864864865
F1 Score: 0.04485582057671769
ROC-AUC: 0.6266952782935608
Confusion Matrix:
 [[7254   82]
 [2601   63]]


In [19]:
# Feature importance of the tuned model: absolute coefficient per input
# column, ordered from largest to smallest.
feature_importance = (
    pd.DataFrame({
        'Feature': train_df.columns,
        'Importance': np.absolute(best_model.coef_[0]),
    })
    .sort_values(by='Importance', ascending=False)
)

# Show the 20 strongest coefficients.
print("\n=== Feature Importance ===")
print(feature_importance.head(20))



=== Feature Importance ===
                       Feature  Importance
3             RESIDENCIAL_CITY   25.908720
21          PROFESSIONAL_ZIP_3   18.186803
20           RESIDENCIAL_ZIP_3   12.788677
179   MONTHS_IN_RESIDENCE_62.0    3.307447
317                   AGE_93.0    2.875880
240                   AGE_14.0    2.704479
4          RESIDENCIAL_BOROUGH    2.625630
171   MONTHS_IN_RESIDENCE_54.0    2.595274
199  MONTHS_IN_RESIDENCE_100.0    2.591211
216        PROFESSION_CODE_5.0    2.572411
198   MONTHS_IN_RESIDENCE_96.0    2.471088
193   MONTHS_IN_RESIDENCE_81.0    2.344181
191   MONTHS_IN_RESIDENCE_78.0    2.173772
239                    AGE_7.0    2.114371
320                   AGE_96.0    2.079770
319                   AGE_95.0    2.064375
325                  AGE_106.0    2.015768
175   MONTHS_IN_RESIDENCE_58.0    1.836031
324                  AGE_101.0    1.832175
201  MONTHS_IN_RESIDENCE_200.0    1.791683


# Training the model with a strategy to deal with the imbalanced dataset

In [20]:
# Imbalance strategy 1: logistic regression with class_weight='balanced',
# trained on the original (un-resampled) training data.
log_reg_balanced = LogisticRegression(
    solver='liblinear',
    class_weight='balanced',
    max_iter=500,
    random_state=42,
)
log_reg_balanced.fit(train_df, y_train)

# Validation-set probabilities (positive class) and hard labels.
# NOTE(review): y_pred / y_prob are also assigned by the grid-search cell
# above; the evaluation cell reads these names for the class-weighted model,
# so this cell must be the last one to have set them.
y_prob = log_reg_balanced.predict_proba(val_df)[:, 1]
y_pred = log_reg_balanced.predict(val_df)


In [21]:
from imblearn.over_sampling import SMOTE

# Imbalance strategy 2: SMOTE oversampling.
# Only the training data is resampled; the validation data is left untouched.
smote = SMOTE(random_state=42)
train_df_balanced, y_train_balanced = smote.fit_resample(train_df, y_train)

# Fit an (unweighted) logistic regression on the resampled training data.
log_reg_smote = LogisticRegression(
    solver='liblinear',
    max_iter=500,
    random_state=42,
)
log_reg_smote.fit(train_df_balanced, y_train_balanced)

# Validation-set probabilities (positive class) and hard labels.
y_prob_smote = log_reg_smote.predict_proba(val_df)[:, 1]
y_pred_smote = log_reg_smote.predict(val_df)


In [22]:
from imblearn.under_sampling import RandomUnderSampler

# Imbalance strategy 3: random undersampling.
# Only the training data is resampled; the validation data is left untouched.
rus = RandomUnderSampler(random_state=42)
train_df_under, y_train_under = rus.fit_resample(train_df, y_train)

# Fit an (unweighted) logistic regression on the undersampled training data.
log_reg_under = LogisticRegression(
    solver='liblinear',
    max_iter=500,
    random_state=42,
)
log_reg_under.fit(train_df_under, y_train_under)

# Validation-set probabilities (positive class) and hard labels.
y_prob_under = log_reg_under.predict_proba(val_df)[:, 1]
y_pred_under = log_reg_under.predict(val_df)


In [24]:
def evaluate_model(y_true, y_pred, y_prob, method):
    """Print a metric report for one model's predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted class labels; used for accuracy, precision,
            recall, F1 and the confusion matrix.
        y_prob: Scores passed to ``roc_auc_score`` (callers in this notebook
            pass the positive-class probability).
        method: Name of the approach, interpolated into the header line.

    Returns:
        None. Everything is written to standard output.
    """
    print(f"\n=== {method} Model Performance ===")

    # Metrics that all take (y_true, y_pred), reported in a fixed order.
    label_metrics = (
        ("Accuracy:", accuracy_score),
        ("Precision:", precision_score),
        ("Recall:", recall_score),
        ("F1 Score:", f1_score),
    )
    for caption, scorer in label_metrics:
        print(caption, scorer(y_true, y_pred))

    # ROC-AUC is computed from the scores rather than the hard labels.
    print("ROC-AUC:", roc_auc_score(y_true, y_prob))
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))

# Report the three imbalance-handling strategies against the same validation
# labels, in the order: class weighting, SMOTE, random undersampling.
evaluate_model(
    y_true=y_valid,
    y_pred=y_pred,
    y_prob=y_prob,
    method="Class Weight Balanced",
)
evaluate_model(
    y_true=y_valid,
    y_pred=y_pred_smote,
    y_prob=y_prob_smote,
    method="SMOTE Oversampling",
)
evaluate_model(
    y_true=y_valid,
    y_pred=y_pred_under,
    y_prob=y_prob_under,
    method="Undersampling",
)



=== Class Weight Balanced Model Performance ===
Accuracy: 0.5888
Precision: 0.34456848432803777
Recall: 0.6024774774774775
F1 Score: 0.43840480742966403
ROC-AUC: 0.6249235535972177
Confusion Matrix:
 [[4283 3053]
 [1059 1605]]

=== SMOTE Oversampling Model Performance ===
Accuracy: 0.5886
Precision: 0.34037868780273006
Recall: 0.5803303303303303
F1 Score: 0.42908687205106855
ROC-AUC: 0.6206308373531656
Confusion Matrix:
 [[4340 2996]
 [1118 1546]]

=== Undersampling Model Performance ===
Accuracy: 0.5839
Precision: 0.3383718419347873
Recall: 0.5882132132132132
F1 Score: 0.42960932145305003
ROC-AUC: 0.6210937116232917
Confusion Matrix:
 [[4272 3064]
 [1097 1567]]
