In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
import sys

from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.model_selection import KFold , StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, precision_score, recall_score, roc_curve, f1_score, average_precision_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline


In [2]:
# Put the parent of the current working directory on sys.path, then import the
# project-level `root` module (presumably located in that parent directory —
# confirm against the repository layout).
current_dir = os.getcwd() # Current working directory of the running kernel
ROOT_PATH = os.path.dirname(current_dir) # Parent directory of the working directory
sys.path.insert(1, ROOT_PATH) # Add that directory at position 1 of the module search path

import root # Provides root.DIR_DATA_STAGE, used to locate the input CSV files

## Para info users

In [3]:
# Location of the user-info training set. os.path.join keeps the path valid
# whether or not root.DIR_DATA_STAGE ends with a path separator.
file_path = os.path.join(root.DIR_DATA_STAGE, 'train_infousers.csv')

In [4]:
# Load the user-info training set and preview its first five rows.
df_infousers = pd.read_csv(file_path)
df_infousers.head(n=5)

Unnamed: 0,PLAZO,CAPITAL,DÍAS MORA,INGRESOS MENSUALES,GASTOS MENSUALES,NUM.CREDITOS SOLICITADOS,latitud,longitud,meses_transcurridos,mes_de_pago,TIPO EMPLEO_independiente,TIPO DE VIVIENDA_propia,ESTADO CIVIL_soltero/a,ESTADO CIVIL_union libre,NIVEL EDUCATIVO_postgrado,NIVEL EDUCATIVO_profesional,NIVEL EDUCATIVO_secundaria,NIVEL EDUCATIVO_tecnico,Cuotas en mora
0,30.0,0.25,1445.0,3.0,1.0,1.0,6.17591,-75.59174,31.07753,4,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0
1,30.0,0.12,905.0,1.38,0.7,1.0,4.60971,-74.08175,35.87385,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
2,30.0,0.3,2284.0,1.649434,0.6,1.0,4.60971,-74.08175,12.0,2,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1
3,30.0,0.12,1675.0,1.0,0.6,1.0,6.25184,-75.56359,6.30749,10,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1
4,30.0,0.17,738.0,1.5,0.7,1.0,8.21639,-73.24139,17.805519,11,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0


In [5]:
# Estimator handed to the feature selector: standardize the features, then fit
# a logistic regression capped at 100 solver iterations.
classifier_pipeline = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=100),
)

In [6]:
# Separate the label column from the predictors.
# NOTE(review): 'DÍAS MORA' stays among the predictors while the label is
# 'Cuotas en mora' — confirm it is known at prediction time (possible leakage).
target_col = 'Cuotas en mora'
y = df_infousers[target_col]
X = df_infousers.drop(columns=[target_col])

In [7]:

# Stratified 5-fold splitter (chosen over a plain KFold with the same settings),
# shuffled with a fixed seed so the folds are reproducible.
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [8]:
# Dimensions of the feature matrix as (rows, columns).
X.shape

(3262, 18)

In [9]:
# Backward selection: start from every column of X and shrink the subset down
# to a single feature, scoring candidates with cross-validated average precision.
sfs1 = SFS(
    estimator=classifier_pipeline,
    k_features=1,
    forward=False,
    scoring='average_precision',
    cv=cv,
)
sfs1.fit(X, y)

# One record per subset size: feature indices/names, per-fold scores, mean score.
sfs1.subsets_

{18: {'feature_idx': (0,
   1,
   2,
   3,
   4,
   5,
   6,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17),
  'cv_scores': array([0.88094562, 0.86584049, 0.89584422, 0.8844583 , 0.87800883]),
  'avg_score': 0.8810194931459779,
  'feature_names': ('PLAZO',
   'CAPITAL',
   'DÍAS MORA',
   'INGRESOS MENSUALES',
   'GASTOS MENSUALES',
   'NUM.CREDITOS SOLICITADOS',
   'latitud',
   'longitud',
   'meses_transcurridos',
   'mes_de_pago',
   'TIPO EMPLEO_independiente',
   'TIPO DE VIVIENDA_propia',
   'ESTADO CIVIL_soltero/a',
   'ESTADO CIVIL_union libre',
   'NIVEL EDUCATIVO_postgrado',
   'NIVEL EDUCATIVO_profesional',
   'NIVEL EDUCATIVO_secundaria',
   'NIVEL EDUCATIVO_tecnico')},
 17: {'feature_idx': (0,
   1,
   2,
   3,
   4,
   5,
   6,
   7,
   8,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17),
  'cv_scores': array([0.88153574, 0.86868196, 0.89671905, 0.88486007, 0.88253295]),
  'avg_score': 0.8828659526381282,
  'feature_names': ('PLAZO',
   

In [10]:
# Pair every subset size with its mean cross-validation score, keeping the
# order in which the selector recorded them.
datos = sfs1.subsets_
keys = list(datos)
results = [(size, datos[size]['avg_score']) for size in keys]

In [11]:
# (number of features, mean CV score) pairs for the user-info selection run.
results

[(18, 0.8810194931459779),
 (17, 0.8828659526381282),
 (16, 0.8832065745238238),
 (15, 0.8835113889007532),
 (14, 0.8835683660776766),
 (13, 0.8835206377710089),
 (12, 0.8836880956940169),
 (11, 0.8836497831395741),
 (10, 0.8836953130751176),
 (9, 0.8832720650686573),
 (8, 0.8827901622072446),
 (7, 0.882053594926238),
 (6, 0.8807391189685468),
 (5, 0.8797970831387012),
 (4, 0.8793677303978713),
 (3, 0.8786921057179351),
 (2, 0.8733779073844563),
 (1, 0.8569655776692819)]

In [12]:
# Subset size with the highest mean score; if several tie, max() keeps the
# first one encountered in `results`.
max_value = max(results, key=lambda pair: pair[1])
print(max_value)

(10, 0.8836953130751176)


## Para credit info 

In [13]:
# Location of the credit-info training set. os.path.join keeps the path valid
# whether or not root.DIR_DATA_STAGE ends with a path separator.
file_path = os.path.join(root.DIR_DATA_STAGE, 'train_creditinfo.csv')

In [14]:
# Load the credit-info training set and preview its first five rows.
df_credit_info = pd.read_csv(file_path)
df_credit_info.head(n=5)

Unnamed: 0,PLAZO,CAPITAL,INT CORRIENTE,NUM.CREDITOS SOLICITADOS,USUARIO RECURRENTE,mes,mora
0,30,500000,9385.0,4,1,10,1
1,30,350000,6569.5,5,1,1,1
2,30,300000,7080.0,1,0,4,1
3,30,210000,3941.7,3,1,10,1
4,29,400000,7257.733333,2,1,10,1


In [15]:
# Separate the label column from the predictors. This rebinds X and y, so the
# earlier user-info matrices are no longer reachable under these names.
target_col = 'mora'
y = df_credit_info[target_col]
X = df_credit_info.drop(columns=[target_col])

In [16]:
# Backward selection on the credit-info features, shrinking down to a single
# feature and scoring candidates with cross-validated F1.
# NOTE(review): the user-info run scores with 'average_precision' while this
# one uses 'f1' — confirm the difference in metric is intentional.
sfs1 = SFS(
    estimator=classifier_pipeline,
    k_features=1,
    forward=False,
    scoring='f1',
    cv=cv,
)
sfs1.fit(X, y)

# One record per subset size: feature indices/names, per-fold scores, mean score.
sfs1.subsets_

{6: {'feature_idx': (0, 1, 2, 3, 4, 5),
  'cv_scores': array([0.70213476, 0.70029228, 0.71297748, 0.70290876, 0.70631169]),
  'avg_score': 0.7049249909536505,
  'feature_names': ('PLAZO',
   'CAPITAL',
   'INT CORRIENTE',
   'NUM.CREDITOS SOLICITADOS',
   'USUARIO RECURRENTE',
   'mes')},
 5: {'feature_idx': (0, 1, 2, 3, 5),
  'cv_scores': array([0.7516431 , 0.74293333, 0.75609917, 0.75201693, 0.75442095]),
  'avg_score': 0.7514226979018502,
  'feature_names': ('PLAZO',
   'CAPITAL',
   'INT CORRIENTE',
   'NUM.CREDITOS SOLICITADOS',
   'mes')},
 4: {'feature_idx': (0, 1, 2, 3),
  'cv_scores': array([0.75196379, 0.74258949, 0.75594807, 0.75202957, 0.75549048]),
  'avg_score': 0.7516042798143294,
  'feature_names': ('PLAZO',
   'CAPITAL',
   'INT CORRIENTE',
   'NUM.CREDITOS SOLICITADOS')},
 3: {'feature_idx': (0, 1, 3),
  'cv_scores': array([0.75168412, 0.74660603, 0.75807629, 0.75422039, 0.75721042]),
  'avg_score': 0.7535594490340081,
  'feature_names': ('PLAZO', 'CAPITAL', 'NUM.CRED

In [17]:
# Pair every subset size with its mean cross-validation score, keeping the
# order in which the selector recorded them.
datos = sfs1.subsets_
keys = list(datos)
results = [(size, datos[size]['avg_score']) for size in keys]

In [18]:
# (number of features, mean CV score) pairs for the credit-info selection run.
results

[(6, 0.7049249909536505),
 (5, 0.7514226979018502),
 (4, 0.7516042798143294),
 (3, 0.7535594490340081),
 (2, 0.7524794639068515),
 (1, 0.7483994778300698)]

In [19]:
# Subset size with the highest mean score; if several tie, max() keeps the
# first one encountered in `results`.
max_value = max(results, key=lambda pair: pair[1])
print(max_value)

(3, 0.7535594490340081)
