In [25]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
import sys

from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, precision_score, recall_score, roc_curve, f1_score, average_precision_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline


In [2]:
current_dir = os.getcwd() # Current working directory of the notebook
ROOT_PATH = os.path.dirname(current_dir) # Parent directory of the working directory
sys.path.insert(1, ROOT_PATH) # Add the parent directory to sys.path so the import below can resolve

import root # Local `root` module; must be imported only after sys.path has been extended

## Selección de características para info users (`train_infousers.csv`)

In [13]:
# Path of the info-users training CSV inside the staged-data directory.
# os.path.join adds a separator only when DIR_DATA_STAGE does not already end with one,
# whereas plain string concatenation is only correct if the trailing separator is present.
file_path = os.path.join(root.DIR_DATA_STAGE, 'train_infousers.csv')

In [14]:
# Load the info-users training data from the CSV at `file_path` and preview the first rows.
df_infousers = pd.read_csv(file_path)
df_infousers.head()

Unnamed: 0,PLAZO,CAPITAL,DÍAS MORA,INGRESOS MENSUALES,GASTOS MENSUALES,NUM.CREDITOS SOLICITADOS,latitud,longitud,meses_transcurridos,mes_de_pago,TIPO EMPLEO_independiente,TIPO DE VIVIENDA_propia,ESTADO CIVIL_soltero/a,ESTADO CIVIL_union libre,NIVEL EDUCATIVO_postgrado,NIVEL EDUCATIVO_profesional,NIVEL EDUCATIVO_secundaria,NIVEL EDUCATIVO_tecnico,Cuotas en mora
0,30.0,0.25,1445.0,3.0,1.0,1.0,6.17591,-75.59174,31.07753,4,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0
1,30.0,0.12,905.0,1.38,0.7,1.0,4.60971,-74.08175,35.87385,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
2,30.0,0.3,2284.0,1.649434,0.6,1.0,4.60971,-74.08175,12.0,2,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1
3,30.0,0.12,1675.0,1.0,0.6,1.0,6.25184,-75.56359,6.30749,10,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1
4,30.0,0.17,738.0,1.5,0.7,1.0,8.21639,-73.24139,17.805519,11,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0


In [33]:
# Estimator used for feature selection: standardize the features, then fit a logistic regression.
# Because the scaler is a pipeline step, it is re-fit together with the model every time the pipeline is fit.
classifier_pipeline = make_pipeline(StandardScaler(), LogisticRegression(max_iter=100))

In [34]:
# Target is the 'Cuotas en mora' column; the feature matrix is every other column.
# NOTE(review): 'DÍAS MORA' remains among the features here, while the credit-info
# section of this notebook drops that column before modelling — it may leak the
# target; confirm whether it should be excluded here as well.
y = df_infousers['Cuotas en mora']
X = df_infousers.drop(columns='Cuotas en mora')

In [35]:
# Names of the scoring metrics of interest (scikit-learn scorer strings).
# NOTE(review): this list is not referenced by the cells visible in this section,
# which pass a single scoring string directly — confirm it is used elsewhere.
metricas = [
    'accuracy',
    'precision',
    'recall',
    'f1',
    'roc_auc',
    'average_precision',
]

In [36]:
# 5-fold cross-validation with shuffling; the fixed random_state keeps the fold assignment reproducible.
# NOTE(review): KFold does not stratify by class — consider StratifiedKFold if the target is imbalanced.
cv = KFold(n_splits=5, random_state=42, shuffle=True)

In [37]:
# Show the feature matrix dimensions as (rows, columns).
X.shape

(3262, 18)

In [44]:
# Backward sequential feature selection: start from all columns of X and remove
# one feature per step until a single feature is left, scoring every candidate
# subset by cross-validated average precision.
sfs1 = SFS(
    estimator=classifier_pipeline,
    k_features=1,
    forward=False,
    scoring='average_precision',
    cv=cv,
).fit(X, y)

# Results per step, keyed by subset size; each entry holds the feature indices,
# the per-fold CV scores, their mean ('avg_score') and the feature names.
sfs1.subsets_

{18: {'feature_idx': (0,
   1,
   2,
   3,
   4,
   5,
   6,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17),
  'cv_scores': array([0.90316103, 0.87278109, 0.8694917 , 0.85308008, 0.89807966]),
  'avg_score': 0.879318710062227,
  'feature_names': ('PLAZO',
   'CAPITAL',
   'DÍAS MORA',
   'INGRESOS MENSUALES',
   'GASTOS MENSUALES',
   'NUM.CREDITOS SOLICITADOS',
   'latitud',
   'longitud',
   'meses_transcurridos',
   'mes_de_pago',
   'TIPO EMPLEO_independiente',
   'TIPO DE VIVIENDA_propia',
   'ESTADO CIVIL_soltero/a',
   'ESTADO CIVIL_union libre',
   'NIVEL EDUCATIVO_postgrado',
   'NIVEL EDUCATIVO_profesional',
   'NIVEL EDUCATIVO_secundaria',
   'NIVEL EDUCATIVO_tecnico')},
 17: {'feature_idx': (0,
   1,
   2,
   3,
   4,
   5,
   6,
   7,
   8,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17),
  'cv_scores': array([0.90258158, 0.87154347, 0.86711053, 0.86015398, 0.90104774]),
  'avg_score': 0.8804874608965804,
  'feature_names': ('PLAZO',
   '

In [45]:
# Reduce the selector output to (subset size, mean CV score) pairs,
# in the same order in which the subsets were recorded.
datos = sfs1.subsets_
keys = list(datos)
results = [(n_features, datos[n_features]['avg_score']) for n_features in keys]

In [46]:
# Display the (subset size, mean CV score) pairs.
results

[(18, 0.879318710062227),
 (17, 0.8804874608965804),
 (16, 0.8811155797494651),
 (15, 0.8814961848753571),
 (14, 0.8818140323848371),
 (13, 0.8819667132506265),
 (12, 0.8820168766972707),
 (11, 0.8821022139178375),
 (10, 0.8820098436835808),
 (9, 0.8818015391502421),
 (8, 0.8816935664526706),
 (7, 0.8812197821795482),
 (6, 0.8806848409754904),
 (5, 0.879733573415689),
 (4, 0.8789380363880935),
 (3, 0.8783355802554903),
 (2, 0.873095053830809),
 (1, 0.856329772726869)]

In [47]:
# Select the (subset size, mean CV score) pair with the highest score;
# on ties, max() keeps the first such pair in `results`.
max_value = max(results, key=lambda x: x[1])
print(max_value)

(11, 0.8821022139178375)


## Selección de características para credit info (`train_creditinfo.csv`)

In [48]:
# Path of the credit-info training CSV inside the staged-data directory.
# os.path.join adds a separator only when DIR_DATA_STAGE does not already end with one,
# whereas plain string concatenation is only correct if the trailing separator is present.
file_path = os.path.join(root.DIR_DATA_STAGE, 'train_creditinfo.csv')

In [49]:
# Load the credit-info training data and discard the two columns that are not
# used for modelling, building the result in one expression instead of mutating
# the loaded frame in place.
df_credit_info = (
    pd.read_csv(file_path)
    .drop(columns=['FECHA DESEMBOLSO', 'DÍAS MORA'])
)

df_credit_info.head()

Unnamed: 0,PLAZO,CAPITAL,INT CORRIENTE,Cuotas en mora,NUM.CREDITOS SOLICITADOS,USUARIO RECURRENTE,mes,mora
0,30,500000,9385.0,0,4,1,10,1
1,30,350000,6569.5,0,5,1,1,1
2,30,300000,7080.0,1,1,0,4,1
3,30,210000,3941.7,0,3,1,10,1
4,29,400000,7257.733333,0,2,1,10,1


In [50]:
# Target is the 'mora' column; the feature matrix is every other column.
# NOTE(review): 'Cuotas en mora' is kept as a predictor of 'mora' — confirm it is
# known at prediction time and does not leak the target.
y = df_credit_info['mora']
X = df_credit_info.drop(columns='mora')

In [55]:
# Backward sequential feature selection on the credit-info features: start from
# all columns of X and remove one feature per step until a single feature is
# left, scoring every candidate subset by cross-validated F1.
sfs1 = SFS(
    estimator=classifier_pipeline,
    k_features=1,
    forward=False,
    scoring='f1',
    cv=cv,
).fit(X, y)

# Results per step, keyed by subset size; each entry holds the feature indices,
# the per-fold CV scores, their mean ('avg_score') and the feature names.
sfs1.subsets_

{7: {'feature_idx': (0, 1, 2, 3, 4, 5, 6),
  'cv_scores': array([0.70306416, 0.70755348, 0.70519083, 0.69275632, 0.70086485]),
  'avg_score': 0.7018859257670288,
  'feature_names': ('PLAZO',
   'CAPITAL',
   'INT CORRIENTE',
   'Cuotas en mora',
   'NUM.CREDITOS SOLICITADOS',
   'USUARIO RECURRENTE',
   'mes')},
 6: {'feature_idx': (0, 1, 2, 3, 4, 6),
  'cv_scores': array([0.74135114, 0.75549296, 0.75165423, 0.75224135, 0.7489301 ]),
  'avg_score': 0.7499339566285513,
  'feature_names': ('PLAZO',
   'CAPITAL',
   'INT CORRIENTE',
   'Cuotas en mora',
   'NUM.CREDITOS SOLICITADOS',
   'mes')},
 5: {'feature_idx': (0, 1, 2, 4, 6),
  'cv_scores': array([0.74098273, 0.7558155 , 0.75379659, 0.75235567, 0.75266809]),
  'avg_score': 0.7511237178118433,
  'feature_names': ('PLAZO',
   'CAPITAL',
   'INT CORRIENTE',
   'NUM.CREDITOS SOLICITADOS',
   'mes')},
 4: {'feature_idx': (1, 2, 4, 6),
  'cv_scores': array([0.73978466, 0.75566784, 0.75554968, 0.75366178, 0.75229481]),
  'avg_score': 0.751

In [56]:
# Reduce the selector output to (subset size, mean CV score) pairs,
# in the same order in which the subsets were recorded.
datos = sfs1.subsets_
keys = list(datos)
results = [(n_features, datos[n_features]['avg_score']) for n_features in keys]

In [57]:
# Display the (subset size, mean CV score) pairs.
results

[(7, 0.7018859257670288),
 (6, 0.7499339566285513),
 (5, 0.7511237178118433),
 (4, 0.7513917548727924),
 (3, 0.7509194396314449),
 (2, 0.7522426470220928),
 (1, 0.7483784891868858)]

In [58]:
# Select the (subset size, mean CV score) pair with the highest score;
# on ties, max() keeps the first such pair in `results`.
max_value = max(results, key=lambda x: x[1])
print(max_value)

(2, 0.7522426470220928)
