# CELL 1: DATA LOADING.

In [13]:
# Load creditdb_train / creditdb_test from CSV
from pathlib import Path
import pandas as pd

# Expected CSV locations: a "data" folder that is a sibling of the current
# working directory.
data_dir = Path.cwd().parent / "data"
p_csv_train = data_dir / "creditdb_train.csv"
p_csv_test = data_dir / "creditdb_test.csv"

# Guard clause: stop before reading anything unless both splits are on disk.
if not (p_csv_train.exists() and p_csv_test.exists()):
    raise FileNotFoundError("No encontré creditdb_train.csv o creditdb_test.csv en data/. Guarda los archivos desde el EDA primero.")

# Both files are present: read each split into its own DataFrame.
creditdb_train = pd.read_csv(p_csv_train)
creditdb_test = pd.read_csv(p_csv_test)
print("Loaded CSV: creditdb_train.csv, creditdb_test.csv")

# Report (rows, columns) of each split as a quick sanity check.
print("Shapes -> train:", creditdb_train.shape, " test:", creditdb_test.shape)

Loaded CSV: creditdb_train.csv, creditdb_test.csv
Shapes -> train: (94908, 11)  test: (23727, 11)


# CELL 2: QUICK CHECKS.

In [14]:
# Quick checks after loading the datasets
import pandas as pd
import numpy as np

# Column / dtype / non-null overview of both splits, separated by a blank line.
creditdb_train.info()
print('')
creditdb_test.info()

# Relative frequency of each target value, shown per split.
for heading, frame in (
    ("\nDistribución target (train):", creditdb_train),
    ("\nDistribución target (test):", creditdb_test),
):
    print(heading)
    target_share = frame['SeriousDlqin2yrs'].value_counts(normalize=True)
    display(target_share.rename("proportion"))

# Summary statistics of MonthlyIncome and DebtRatio on the train split
# (used to verify capping).
for heading, column in (
    ("\nResumen MonthlyIncome (train):", 'MonthlyIncome'),
    ("\nResumen DebtRatio (train):", 'DebtRatio'),
):
    print(heading)
    display(creditdb_train[column].describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94908 entries, 0 to 94907
Data columns (total 11 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   RevolvingUtilizationOfUnsecuredLines  94908 non-null  float64
 1   age                                   94908 non-null  int64  
 2   NumberOfTime30-59DaysPastDueNotWorse  94908 non-null  int64  
 3   DebtRatio                             94908 non-null  float64
 4   MonthlyIncome                         94908 non-null  float64
 5   NumberOfOpenCreditLinesAndLoans       94908 non-null  int64  
 6   NumberOfTimes90DaysLate               94908 non-null  int64  
 7   NumberRealEstateLoansOrLines          94908 non-null  int64  
 8   NumberOfTime60-89DaysPastDueNotWorse  94908 non-null  int64  
 9   NumberOfDependents                    94908 non-null  float64
 10  SeriousDlqin2yrs                      94908 non-null  bool   
dtypes: bool(1), float64(4), int64(6)

SeriousDlqin2yrs
False    0.930111
True     0.069889
Name: proportion, dtype: float64


Distribución target (test):


SeriousDlqin2yrs
False    0.930122
True     0.069878
Name: proportion, dtype: float64


Resumen MonthlyIncome (train):


count    94908.000000
mean      6458.043547
std       4338.584722
min          1.000000
25%       3500.000000
50%       5458.000000
75%       8333.000000
max      25000.000000
Name: MonthlyIncome, dtype: float64


Resumen DebtRatio (train):


count    94908.000000
mean         0.376319
std          0.415131
min          0.000000
25%          0.141308
50%          0.291724
75%          0.471787
max          2.999627
Name: DebtRatio, dtype: float64

# CELL 3: SPLITTING THE DATASETS FOR MODELING PURPOSES.

In [15]:
# Prepare X/y for RandomForest (unscaled) and LogisticRegression (MinMax-scaled, fit on train)
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import pandas as pd

# 1) Sanity check: both frames must already exist in the notebook namespace.
if not all(name in globals() for name in ('creditdb_train', 'creditdb_test')):
    raise RuntimeError("No encuentro creditdb_train/creditdb_test en memoria. Carga los CSV primero.")

target_col = 'SeriousDlqin2yrs'

# Make sure the target is boolean on both frames (covers the case where it was
# loaded from CSV as 0/1). The conversion is written back onto the frame itself.
for df in (creditdb_train, creditdb_test):
    if df[target_col].dtype == bool:
        continue
    # Cast through int first so 0/1 values map to False/True.
    df[target_col] = df[target_col].astype(int).astype(bool)

# 2) RandomForest inputs: every non-target column, used as-is (no scaling).
feature_cols_rf = [col for col in creditdb_train.columns if col != target_col]
X_train_rf, y_train_rf = creditdb_train[feature_cols_rf].copy(), creditdb_train[target_col].copy()
X_test_rf, y_test_rf = creditdb_test[feature_cols_rf].copy(), creditdb_test[target_col].copy()

# 3) LogisticRegression inputs: MinMaxScaler fitted on train only.
# Numeric columns are the ones that get scaled; the target is excluded.
num_cols = [
    col
    for col in creditdb_train.select_dtypes(include=[np.number]).columns
    if col != target_col
]

# Fit on the train split only, so test statistics never leak into the scaler.
scaler = MinMaxScaler().fit(creditdb_train[num_cols].values)

# Start from full copies (all original columns are retained) and overwrite only
# the numeric columns with their scaled values.
scaled_train = creditdb_train.copy()
scaled_train[num_cols] = scaler.transform(creditdb_train[num_cols].values)

scaled_test = creditdb_test.copy()
scaled_test[num_cols] = scaler.transform(creditdb_test[num_cols].values)

# Same feature set as the RF inputs, for comparability between the two models.
feature_cols_lr = [col for col in scaled_train.columns if col != target_col]
X_train_lr, y_train_lr = scaled_train[feature_cols_lr].copy(), scaled_train[target_col].copy()
X_test_lr, y_test_lr = scaled_test[feature_cols_lr].copy(), scaled_test[target_col].copy()

# 4) Summary of the prepared shapes.
print("Prepared datasets:")
print(" RF -> X_train:", X_train_rf.shape, " y_train:", y_train_rf.shape)
print(" RF -> X_test: ", X_test_rf.shape,  " y_test :", y_test_rf.shape)
print(" LR -> X_train:", X_train_lr.shape, " y_train:", y_train_lr.shape)
print(" LR -> X_test: ", X_test_lr.shape,  " y_test :", y_test_lr.shape)

Prepared datasets:
 RF -> X_train: (94908, 10)  y_train: (94908,)
 RF -> X_test:  (23727, 10)  y_test : (23727,)
 LR -> X_train: (94908, 10)  y_train: (94908,)
 LR -> X_test:  (23727, 10)  y_test : (23727,)


# -----------------------------------------------------------------------------------------
# MODEL TRAINING.

# CELL 4: LOGISTIC REGRESSION.

In [16]:
# TRAIN + EVALUATE: Logistic Regression baseline 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score, precision_score
import numpy as np

# Baseline configuration, kept in one dict so the settings are easy to scan.
lr_params = dict(
    solver='liblinear',
    penalty='l2',
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
)

# Build and fit in one step (fit returns the fitted estimator).
lr = LogisticRegression(**lr_params).fit(X_train_lr, y_train_lr)

# Hard class predictions on the test split (0.5 threshold on probabilities).
y_pred_lr = lr.predict(X_test_lr)

# Metrics, with True as the positive class.
recall_lr = recall_score(y_test_lr, y_pred_lr, pos_label=True)
precision_lr = precision_score(y_test_lr, y_pred_lr, pos_label=True)

print(
    "Logistic Regression (baseline) — Test results",
    f" Recall  (positive class): {recall_lr:.4f}",
    f" Precision (positive class): {precision_lr:.4f}",
    sep="\n",
)

Logistic Regression (baseline) — Test results
 Recall  (positive class): 0.6604
 Precision (positive class): 0.1624


# CELL 5: RANDOM FOREST.

In [22]:
# TRAIN + EVALUATE: Random Forest baseline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score, precision_score

# Baseline configuration, kept in one dict so the settings are easy to scan.
rf_params = dict(
    n_estimators=200,
    max_depth=None,
    n_jobs=-1,
    random_state=42,
    class_weight='balanced_subsample',
)

# Build and fit in one step (fit returns the fitted estimator).
rf = RandomForestClassifier(**rf_params).fit(X_train_rf, y_train_rf)

# Hard class predictions on the test split.
y_pred_rf = rf.predict(X_test_rf)

# Metrics, with True as the positive class.
recall_rf = recall_score(y_test_rf, y_pred_rf, pos_label=True)
precision_rf = precision_score(y_test_rf, y_pred_rf, pos_label=True)

print(
    "Random Forest (baseline) — Test results",
    f" Recall  (positive class): {recall_rf:.4f}",
    f" Precision (positive class): {precision_rf:.4f}",
    sep="\n",
)

Random Forest (baseline) — Test results
 Recall  (positive class): 0.1291
 Precision (positive class): 0.5912
