In [9]:
%load_ext autoreload
%autoreload 2

# Tratamiento de datos
# -----------------------------------------------------------------------
import pandas as pd
import numpy as np
from category_encoders import TargetEncoder

from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split,GridSearchCV

# Visualizaciones
# -----------------------------------------------------------------------
import seaborn as sns
import matplotlib.pyplot as plt

# Vigilar progreso bucles
# -----------------------------------------------------------------------
from tqdm import tqdm

# Gestionar los warnings
# -----------------------------------------------------------------------
import warnings


# importar funciones de soporte
# -----------------------------------------------------------------------
from src import soporte_preprocesamiento as sup_prep
from src import soporte_nulos as sup_nul
import src.logistica.soporte_ajuste_logistica as sal

## Apply notebook-wide settings
#------------------------------------------------------------------------
# pandas display options: cap `DataFrame.info()` column listing at 50 columns,
# remove the row/column display limits, and print floats with two fixed
# decimals so scientific notation is not used.
for _opcion, _valor in (
    ('display.max_info_columns', 50),
    ('display.max_rows', None),
    ('display.max_columns', None),
    ('display.float_format', '{:.2f}'.format),
):
    pd.set_option(_opcion, _valor)

# Silence every warning for the rest of the session (no category or module
# filter is given, so this also hides library deprecation notices).
warnings.filterwarnings('ignore')

# Register tqdm with pandas; per the tqdm docs this enables the `progress_*`
# variants (e.g. `progress_apply`) on pandas objects.
tqdm.pandas()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
# Load the already-encoded dataset and replace the stored index with a fresh
# 0..n-1 range.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files
# that come from a trusted source.
df = pd.read_pickle('datos/encoded_financial_data.pkl').reset_index(drop=True)

# Preview two random rows; the fixed seed keeps the displayed rows identical
# across "Restart & Run All" executions.
df.sample(2, random_state=42)

Unnamed: 0,is_fraudulent,card_type_American Express,card_type_Discover,card_type_MasterCard,card_type_Visa,location_City-1,location_City-10,location_City-11,location_City-12,location_City-13,location_City-14,location_City-15,location_City-16,location_City-17,location_City-18,location_City-19,location_City-2,location_City-20,location_City-21,location_City-22,location_City-23,location_City-24,location_City-25,location_City-26,location_City-27,location_City-28,location_City-29,location_City-3,location_City-30,location_City-31,location_City-32,location_City-33,location_City-34,location_City-35,location_City-36,location_City-37,location_City-38,location_City-39,location_City-4,location_City-40,location_City-41,location_City-42,location_City-43,location_City-44,location_City-45,location_City-46,location_City-47,location_City-48,location_City-49,location_City-5,location_City-50,location_City-6,location_City-7,location_City-8,location_City-9,purchase_category,customer_age
904,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.14,0.35
9898,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.14,0.17


## Separamos X e y

In [11]:
# Features are every column except the target; the target is the fraud flag.
X = df.drop(columns="is_fraudulent")
y = df["is_fraudulent"]

# 80/20 hold-out split with a fixed seed. `stratify=y` keeps the proportion of
# fraudulent / non-fraudulent rows the same in both partitions, so the test
# metrics are not skewed by an unlucky class mix.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=42,
    shuffle=True,
    stratify=y,
)

In [12]:
# L2-penalised logistic regression with C=1.0 and at most 100 solver iterations.
# `multi_class` is intentionally not passed: 'auto' was already the default, and
# recent scikit-learn releases deprecate the parameter (the FutureWarning would
# be hidden by the global warning filter and the call would break on removal).
lr = LogisticRegression(penalty='l2', C=1.0, max_iter=100, n_jobs=-1)

In [13]:
# Fit the classifier on the training partition
lr.fit(X_train, y_train)

# Predicted labels for the training and test partitions (evaluated left to
# right, so the call order matches one-statement-per-call)
y_predict_train, y_predict_test = lr.predict(X_train), lr.predict(X_test)

# Predicted probabilities for the same two partitions
y_prob_train, y_prob_test = lr.predict_proba(X_train), lr.predict_proba(X_test)

In [14]:
# Instantiate the project's classification helper on the full dataframe,
# naming the target column.
# NOTE(review): the helper receives `df`, not X_train/X_test, so it presumably
# performs its own train/test split — confirm in soporte_ajuste_logistica.
logistica = sal.AnalisisModelosClasificacion(
    dataframe=df,
    variable_dependiente="is_fraudulent",
)

# Fit the logistic-regression model through the helper
logistica.ajustar_modelo("logistic_regression")

# Collect the evaluation metrics for that model (displayed below with one row
# per metric and train/test columns)
df_resultados = logistica.calcular_metricas(modelo_nombre="logistic_regression")
df_resultados.head()

Unnamed: 0,train,test
accuracy,0.52,0.54
precision,0.52,0.54
recall,0.52,0.54
f1,0.51,0.52
kappa,0.04,0.06


| **Métrica**    | **Train** | **Test** | **Descripción**                                                                                     |
|-----------------|-----------|----------|-----------------------------------------------------------------------------------------------------|
| **Accuracy**    | 0.52      | 0.54     | Proporción de predicciones correctas sobre el total. En este caso, el modelo tiene un rendimiento apenas mejor que el azar. |
| **Precision**   | 0.52      | 0.54     | Proporción de verdaderos positivos sobre los positivos predichos. Evalúa qué tan preciso es el modelo al identificar la clase positiva. |
| **Recall**      | 0.52      | 0.54     | Proporción de verdaderos positivos sobre los positivos reales. Indica cuántos casos positivos el modelo logra capturar. |
| **F1 Score**    | 0.51      | 0.52     | Media armónica de la precisión y el recall. Es útil cuando hay un desequilibrio entre clases.       |
| **Kappa**       | 0.04      | 0.06     | Métrica de concordancia que ajusta por el azar. Un valor bajo indica que el modelo apenas supera el desempeño aleatorio. |

---
