In [1]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
%matplotlib inline
from sklearn.datasets import load_iris, load_breast_cancer
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import scorecardpy as sc
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

## WOE Nítidas

In [2]:
class Node:
    def __init__(self, is_leaf=True, split_value=None, label=None, cardinality = None):
        self.is_leaf = is_leaf
        self.split_value = split_value
        self.label = label
        self.cardinality = cardinality
        self.left = None
        self.right = None
        
    def node_def(self) -> str:
        if self.is_leaf:
            return f"LEAF | Label = {round(self.label, 5)}"
        else:
            return (f"NODE | Cardinality = {round(self.cardinality, 5)} | "
                    f"Split Value: {round(self.split_value, 5)} | "
                    f"Label: {round(self.label, 5)} | ")

In [3]:
class DecisionTree:
    def __init__(self, criterion='gini', max_depth=None):
        self.criterion = criterion
        self.max_depth = max_depth
        self.tree = None
    
    def _entropy(self, y):
        if y.size == 0: return 0
        p = np.unique(y, return_counts=True)[1].astype(float)/len(y)
        return -1 * np.sum(p * np.log2(p + 1e-9))

    def _gini_impurity(self, y):
        if y.size == 0: return 0
        p = np.unique(y, return_counts=True)[1].astype(float)/len(y)
        return 1 - np.sum(p**2)

    def _variance(self, y):
        if y.size == 0: return 0
        return np.var(y)
    
    def _information_gain(self, y, mask, func):
        s1 = np.sum(mask)
        s2 = mask.size - s1
        if s1 == 0 or s2 == 0: return 0
        return func(y) - (s1 / float(s1 + s2)) * func(y[mask]) - (s2 / float(s1 + s2)) * func(y[np.logical_not(mask)])

    def _max_information_gain_split(self, y, x, func):
        best_change = None
        split_value = None
        
        for val in np.unique(np.sort(x)):
            mask = x < val
            change = self._information_gain(y, mask, func)
            if best_change is None or change > best_change:
                best_change = change
                split_value = val
                
        return {"split_value": split_value, "gain": best_change}

    def fit(self, X, y, depth=0):
        if self.max_depth is not None and depth >= self.max_depth:
            return Node(is_leaf=True, label=np.mean(y), cardinality=len(y))

        if self.criterion == 'entropy':
            func = self._entropy
        elif self.criterion == 'gini':
            func = self._gini_impurity
        elif self.criterion == 'variance':
            func = self._variance
        else:
            raise ValueError("Invalid criterion specified.")

        split = self._max_information_gain_split(y, X, func)
        split_value = split['split_value']

        if split_value is None:
            return Node(is_leaf=True, label=np.mean(y), cardinality=len(y))

        left_mask = X < split_value
        right_mask = ~left_mask

        if np.all(left_mask) or np.all(right_mask):
            return Node(is_leaf=True, label=np.mean(y), cardinality=len(y))

        left_subtree = self.fit(X[left_mask], y[left_mask], depth + 1)
        right_subtree = self.fit(X[right_mask], y[right_mask], depth + 1)

        node = Node(
            is_leaf=False,
            split_value=split_value,
            label=np.mean(y),
            cardinality=len(y)
        )
        node.left = left_subtree
        node.right = right_subtree
        self.tree = node

        return node

    def predict(self, X):
        if self.tree is None:
            raise ValueError("The tree has not been trained yet!")

        def _traverse(node, x):
            if node.is_leaf:
                return node.label
            if x < node.split_value:
                return _traverse(node.left, x)
            else:
                return _traverse(node.right, x)

        return np.array([_traverse(self.tree, x) for x in X])
    
    def print_tree(self, node=None, depth=0):
        if node is None:
            node = self.tree
        if node is not None:
            print("  " * depth + node.node_def())
            if not node.is_leaf:
                self.print_tree(node.left, depth + 1)
                self.print_tree(node.right, depth + 1)            
    
    def calculate_woe(self, X, y):
        if self.tree is None:
            raise ValueError("The tree has not been trained yet!")

        splits = []
        def _collect_splits(node):
            if not node.is_leaf:
                splits.append(node.split_value)
                _collect_splits(node.left)
                _collect_splits(node.right)

        _collect_splits(self.tree)
        splits = sorted(splits)

        intervals = [(float('-inf'), splits[0])]
        for i in range(len(splits) - 1):
            intervals.append((splits[i], splits[i + 1]))
        intervals.append((splits[-1], float('inf')))

        woe_values = []
        for interval in intervals:
            mask = (X > interval[0]) & (X <= interval[1])
            good = np.sum(y[mask] == 1)
            bad = np.sum(y[mask] == 0)
            woe = np.log( (good+0.5) / (bad+0.5) )
            woe_values.append((interval, woe))

        return woe_values

In [4]:
def assign_woe(X, woe_values):
    X_woe = np.zeros_like(X, dtype=float)
    for value in woe_values:
        interval, woe = value
        mask = (X > interval[0]) & (X <= interval[1])
        X_woe[mask] = woe
    return X_woe

In [5]:
def woe_df(X_train, y_train, X_test):
    X_train_woe = pd.DataFrame()
    X_test_woe = pd.DataFrame()
    y_train = np.array(y_train)
    for col in X_train.columns:
        print(col)
        tree = DecisionTree(criterion='gini', max_depth=2)
        tree.fit(X_train[col].values, y_train)
        woe_values = tree.calculate_woe(X_train[col].values, y_train)
        
        X_train_woe[f'{col}_woe'] = assign_woe(X_train[col].values, woe_values)
        X_test_woe[f'{col}_woe'] = assign_woe(X_test[col].values, woe_values)
        print('ok')
    return X_train_woe, X_test_woe

In [6]:
def woe_df_na(X_train, y_train, X_test):
    X_train_woe = pd.DataFrame()
    X_test_woe = pd.DataFrame()
    y_train = np.array(y_train)
    for col in X_train.columns:
        print(col)
        
        # Separa los datos no nulos
        non_null_train = X_train[col].notnull()
        non_null_test = X_test[col].notnull()

        # Cálculo de los valores woe para los datos no nulos
        tree = DecisionTree(criterion='gini', max_depth=2)
        tree.fit(X_train.loc[non_null_train, col].values, y_train[non_null_train])
        woe_values = tree.calculate_woe(X_train.loc[non_null_train, col].values, y_train[non_null_train])

        array_woe_values_train = assign_woe(X_train[col][non_null_train], woe_values)
        array_woe_values_test = assign_woe(X_test[col][non_null_test], woe_values)

        # Cálculo de los valores woe para los datos nulos
        good = np.sum(y_train[~non_null_train] == 1)
        bad = np.sum(y_train[~non_null_train] == 0)
        woe_value_missing = np.log( (good+0.5) / (bad+0.5) )

        # Crear el array resultado de train
        result_train = np.zeros_like(non_null_train, dtype=np.float64)

        # Indice para recorrer
        index = 0

        # Llenar el array resultado
        for i in range(len(non_null_train)):
            if non_null_train[i]:
                result_train[i] = array_woe_values_train[index]
                index += 1
            else:
                result_train[i] = woe_value_missing

        # Crear el array resultado de test
        result_test = np.zeros_like(non_null_test, dtype=np.float64)

        # Indice para recorrer
        index = 0

        # Llenar el array resultado
        for i in range(len(non_null_test)):
            if non_null_test[i]:
                result_test[i] = array_woe_values_test[index]
                index += 1
            else:
                result_test[i] = woe_value_missing

        # Creacion de las nuevas variables woe
        X_train_woe[f'{col}_woe'] = result_train
        X_test_woe[f'{col}_woe'] = result_test
        
        print('ok')
        
    return X_train_woe, X_test_woe

## WOE AND

In [7]:
def v(x, alfa, beta):
    if beta != 0:
        if x < alfa - beta / 2:
            return 1
        elif x > alfa + beta / 2:
            return 0
        else:
            return (alfa + beta / 2 - x) / beta
    else:
        if x < alfa:
            return 1
        else:
            return 0

In [8]:
def error_func(y, x, grado_pertenencia_padre_func, alfa, beta, label_left, label_right):
    error = 0
    for i in range(len(x)):
        mu_prima = v(x[i], alfa, beta) * label_left + (1 - v(x[i], alfa, beta)) * label_right
        error_observacion = grado_pertenencia_padre_func(x[i]) * (y[i] - mu_prima) ** 2
        error += error_observacion
    return error

In [9]:
def fibonacci_search_fixed(f, y_fixed, x_fixed, grado_pertenencia_padre_func_fixed, alfa_fixed, label_left_fixed, label_right_fixed, a, b, epsilon=0.001):
    # Generar la secuencia de Fibonacci hasta que F_n >= (b - a) / epsilon
    fib = [0, 1]
    while fib[-1] < (b - a) / epsilon:
        fib.append(fib[-1] + fib[-2])
    
    n = len(fib) - 1
    
    # Inicializar los puntos
    x1 = a + (fib[n-2] / fib[n]) * (b - a)
    x2 = a + (fib[n-1] / fib[n]) * (b - a)
    
    f1 = f(y_fixed, x_fixed, grado_pertenencia_padre_func_fixed, alfa_fixed, x1, label_left_fixed, label_right_fixed)
    f2 = f(y_fixed, x_fixed, grado_pertenencia_padre_func_fixed, alfa_fixed, x2, label_left_fixed, label_right_fixed)
    
    for k in range(n-1, 1, -1):
        if f1 > f2:
            a = x1
            x1 = x2
            f1 = f2
            x2 = a + (fib[k-1] / fib[k]) * (b - a)
            f2 = f(y_fixed, x_fixed, grado_pertenencia_padre_func_fixed, alfa_fixed, x2, label_left_fixed, label_right_fixed)
        else:
            b = x2
            x2 = x1
            f2 = f1
            x1 = a + (fib[k-2] / fib[k]) * (b - a)
            f1 = f(y_fixed, x_fixed, grado_pertenencia_padre_func_fixed, alfa_fixed, x1, label_left_fixed, label_right_fixed)
    
    # Ajustar el punto final si es necesario
    if f1 < f2:
        return (x1, f1)
    else:
        return (x2, f2)

In [10]:
def grado_pertenencia_padre_func(x):
    return 1

In [11]:
def min_error_split_alfa(y, x, alfa, grado_pertenencia_padre_func):
    min_error = None
    beta = 0
    
    mask = x < alfa
    
    label_left = sum(grado_pertenencia_padre_func(xi) * yi for xi, yi, bandera in zip(x, y, mask) if bandera) / sum(grado_pertenencia_padre_func(xi) for xi, bandera in zip(x, mask) if bandera)
    label_right = sum(grado_pertenencia_padre_func(xi) * yi for xi, yi, bandera in zip(x, y, mask) if not bandera) / sum(grado_pertenencia_padre_func(xi) for xi, bandera in zip(x, mask) if not bandera)

    min_beta = 0
    max_beta = 2 * min(max(x) - alfa, alfa - min(x)) 
        
    beta, min_error = fibonacci_search_fixed(
        f=error_func,
        y_fixed=y,
        x_fixed=x,
        grado_pertenencia_padre_func_fixed=grado_pertenencia_padre_func,
        alfa_fixed=alfa,
        label_left_fixed=label_left,
        label_right_fixed=label_right,
        a=min_beta,
        b=max_beta,
        epsilon=0.001
    )
    
    def grado_pertenencia_izquierda_func(xi):
        return v(xi, alfa, beta) * grado_pertenencia_padre_func(xi)

    def grado_pertenencia_derecha_func(xi):
        return (1 - v(xi, alfa, beta)) * grado_pertenencia_padre_func(xi)
    
    return {
        "alfa": alfa,
        "beta": beta,
        "grado_pertenencia_izquierda_func": grado_pertenencia_izquierda_func,
        "grado_pertenencia_derecha_func": grado_pertenencia_derecha_func
    }

In [12]:
def fuzzy_fixed_alfas(x_prueba, y_prueba, grado_pertenencia_padre_func):
    y_prueba = np.array(y_prueba)
    
    tree1 = DecisionTree(criterion='gini', max_depth=1)
    tree1.fit(x_prueba, y_prueba)
    first_split = min_error_split_alfa(y_prueba, x_prueba, tree1.tree.split_value, grado_pertenencia_padre_func)
    
    left_mask = x_prueba < first_split['alfa']
    right_mask = ~left_mask
    
    tree2l = DecisionTree(criterion='gini', max_depth=1)
    tree2l.fit(x_prueba[left_mask], y_prueba[left_mask])
    second_split_left = min_error_split_alfa(y_prueba[left_mask], x_prueba[left_mask], tree2l.tree.split_value, first_split['grado_pertenencia_izquierda_func'])
    
    tree2r = DecisionTree(criterion='gini', max_depth=1)
    tree2r.fit(x_prueba[right_mask], y_prueba[right_mask])
    second_split_right = min_error_split_alfa(y_prueba[right_mask], x_prueba[right_mask], tree2r.tree.split_value, first_split['grado_pertenencia_derecha_func'])
    
    # Con los tres cortes, calculo las WOE normales en esos intervalos y luego pondero con el grado de pertenencia:
    splits = [second_split_left['alfa'], first_split['alfa'], second_split_right['alfa']]
    
    intervals = [(float('-inf'), splits[0]), (splits[0], splits[1]), (splits[1], splits[2]), (splits[2], float('inf'))]

    woe_values = []
    for interval in intervals:
        mask = (x_prueba > interval[0]) & (x_prueba <= interval[1])
        good = np.sum(y_prueba[mask] == 1)
        bad = np.sum(y_prueba[mask] == 0)
        woe = np.log((good+0.5) / (bad+0.5))
        woe_values.append((interval, woe))
    
    def woe_function(x):
        woe_pond = second_split_left['grado_pertenencia_izquierda_func'](x) * woe_values[0][1] + \
                   second_split_left['grado_pertenencia_derecha_func'](x) * woe_values[1][1] + \
                   second_split_right['grado_pertenencia_izquierda_func'](x) * woe_values[2][1] + \
                   second_split_right['grado_pertenencia_derecha_func'](x) * woe_values[3][1]
        return woe_pond
    
    return {'woe_values': woe_values, 
            'funciones_grado_pertenencia': [second_split_left['grado_pertenencia_izquierda_func'], second_split_left['grado_pertenencia_derecha_func'],
                                            second_split_right['grado_pertenencia_izquierda_func'], second_split_right['grado_pertenencia_derecha_func']],
            'woe_function': woe_function
           }

In [13]:
def plot_grados_pertenencia_fuzzy_woe_fixed_alfas(fuzzy_woe_fixed_alfas, x_range):
    sns.set(style="whitegrid")
    
    plt.figure(figsize=(10, 6))

    i = 0
    for funcion in fuzzy_woe_fixed_alfas['funciones_grado_pertenencia']:
        membership_values = [funcion(xi) for xi in x_range]
        woe_value = fuzzy_woe_fixed_alfas['woe_values'][i][1]
        plt.plot(x_range, membership_values, label=f'WOE = {woe_value:.2f}')
        i += 1
        
    plt.xlabel('X', fontsize=15)
    plt.ylabel('Grado de pertenencia', fontsize=15)
    plt.legend()
    plt.grid(True)
    plt.show()

In [14]:
def woe_and_df(X_train, y_train, X_test):
    X_train_woe_and = pd.DataFrame()
    X_test_woe_and = pd.DataFrame()
    y_train = np.array(y_train)
    for col in X_train.columns:
        print(col)
        fuzzy_woe_fa = fuzzy_fixed_alfas(X_train[col].values, y_train, grado_pertenencia_padre_func)
        
        # Calculate WOE for X_train
        X_train_woe_and[f'{col}_woe_and'] = X_train[col].apply(fuzzy_woe_fa['woe_function'])

        # Calculate WOE for X_test
        X_test_woe_and[f'{col}_woe_and'] = X_test[col].apply(fuzzy_woe_fa['woe_function'])
        print('ok')
    return X_train_woe_and, X_test_woe_and

In [15]:
def woe_and_df_na(X_train, y_train, X_test):
    X_train_woe_and = pd.DataFrame()
    X_test_woe_and = pd.DataFrame()
    y_train = np.array(y_train) 
    for col in X_train.columns:
        print(col)
        
        # Separa los datos no nulos
        non_null_train = X_train[col].notnull()
        non_null_test = X_test[col].notnull()
        
        fuzzy_woe_fa = fuzzy_fixed_alfas(X_train[non_null_train][col].values, y_train[non_null_train], grado_pertenencia_padre_func)
        
        # Calculate WOE for X_train
        array_woe_values_train = X_train[non_null_train][col].apply(fuzzy_woe_fa['woe_function']).values

        # Calculate WOE for X_test
        array_woe_values_test = X_test[non_null_test][col].apply(fuzzy_woe_fa['woe_function']).values
        
        # Cálculo de los valores woe para los datos nulos
        good = np.sum(y_train[~non_null_train] == 1)
        bad = np.sum(y_train[~non_null_train] == 0)
        woe_value_missing = np.log( (good+0.5) / (bad+0.5) )

        # Crear el array resultado de train
        result_train = np.zeros_like(non_null_train, dtype=np.float64)

        # Indice para recorrer
        index = 0

        # Llenar el array resultado
        for i in range(len(non_null_train)):
            if non_null_train[i]:
                result_train[i] = array_woe_values_train[index]
                index += 1
            else:
                result_train[i] = woe_value_missing

        # Crear el array resultado de test
        result_test = np.zeros_like(non_null_test, dtype=np.float64)

        # Indice para recorrer
        index = 0

        # Llenar el array resultado
        for i in range(len(non_null_test)):
            if non_null_test[i]:
                result_test[i] = array_woe_values_test[index]
                index += 1
            else:
                result_test[i] = woe_value_missing

        # Creacion de las nuevas variables woe
        X_train_woe_and[f'{col}_woe_and'] = result_train
        X_test_woe_and[f'{col}_woe_and'] = result_test
        
        print('ok')
    return X_train_woe_and, X_test_woe_and

## Fuzzy WOE AD

In [16]:
def min_error_split(y, x, grado_pertenencia_padre_func, fuzzy=True):
    min_error = None
    alfa = None
    beta = 0
    
    #Elijo como posibles alfas los x's que tengan un grado de pertenencia positivo al nodo
    lista_posibles_alfas = np.unique(np.sort(x[np.array([grado_pertenencia_padre_func(xi) for xi in x]) > 0]))
        
    for posible_alfa in lista_posibles_alfas[1:]:
        
        mask = x < posible_alfa
        
        label_left = sum(grado_pertenencia_padre_func(xi) * yi for xi, yi, bandera in zip(x, y, mask) if bandera) / sum(grado_pertenencia_padre_func(xi) for xi, bandera in zip(x, mask) if bandera)
        label_right = sum(grado_pertenencia_padre_func(xi) * yi for xi, yi, bandera in zip(x, y, mask) if not bandera) / sum(grado_pertenencia_padre_func(xi) for xi, bandera in zip(x, mask) if not bandera)
        
        error = error_func(y, x, grado_pertenencia_padre_func, posible_alfa, beta, label_left, label_right)

        if min_error is None or error < min_error:
            min_error = error
            alfa = posible_alfa
            label_left_ok = label_left
            label_right_ok = label_right

    if fuzzy:
        min_beta = 0
        max_beta = 2 * min(max(x) - alfa, alfa - min(x)) 
        
        beta, min_error = fibonacci_search_fixed(
            f=error_func,
            y_fixed=y,
            x_fixed=x,
            grado_pertenencia_padre_func_fixed=grado_pertenencia_padre_func,
            alfa_fixed=alfa,
            label_left_fixed=label_left_ok,
            label_right_fixed=label_right_ok,
            a=min_beta,
            b=max_beta,
            epsilon=0.001
        )
    
    def grado_pertenencia_izquierda_func(xi):
        return v(xi, alfa, beta) * grado_pertenencia_padre_func(xi)

    def grado_pertenencia_derecha_func(xi):
        return (1 - v(xi, alfa, beta)) * grado_pertenencia_padre_func(xi)
    
    return {
        "alfa": alfa,
        "beta": beta,
        "grado_pertenencia_izquierda_func": grado_pertenencia_izquierda_func,
        "grado_pertenencia_derecha_func": grado_pertenencia_derecha_func
    }

In [17]:
class FuzzyNode:
    def __init__(self, is_leaf=True, alfa=None, beta=None, label=None, fuzzy_woe=None, grado_pertenencia_func=None, cardinality = None, node_error=None):
        self.is_leaf = is_leaf
        self.alfa = alfa
        self.beta = beta
        self.label = label
        self.fuzzy_woe = fuzzy_woe
        self.grado_pertenencia_func = grado_pertenencia_func
        self.cardinality = cardinality
        self.node_error = node_error 
        self.left = None
        self.right = None
        
    def node_def(self) -> str:
        if self.is_leaf:
            return f"LEAF | Label = {round(self.label, 5)} | Fuzzy WOE = {round(self.fuzzy_woe, 5)}"
        else:
            return (f"NODE | Cardinality = {round(self.cardinality, 5)} | "
                    f"Node Error = {round(self.node_error, 5)} | "
                    f"Alfa: {round(self.alfa, 5)} | "
                    f"Beta: {round(self.beta, 5)} | "
                    f"Label: {round(self.label, 5)} | "
                    f"Fuzzy WOE = {round(self.fuzzy_woe, 5)}")

In [18]:
class FuzzyDecisionTree:
    def __init__(self, max_depth): 
        self.root = None
        self.max_depth = max_depth

    def fit(self, y, x, grado_pertenencia_padre_func, fuzzy):
        self.root = self._grow_tree(y, x, grado_pertenencia_padre_func, fuzzy, depth=0)

    def _grow_tree(self, y, x, grado_pertenencia_padre_func, fuzzy, depth):
        node = FuzzyNode()
        
        node.grado_pertenencia_func = grado_pertenencia_padre_func
        node.label = sum(grado_pertenencia_padre_func(xi) * yi for xi, yi in zip(x, y)) / sum(grado_pertenencia_padre_func(xi) for xi in x)
        
        node.fuzzy_woe = np.log(((sum(grado_pertenencia_padre_func(xi)*yi for xi, yi in zip(x, y)))+0.5) /
        (((sum(grado_pertenencia_padre_func(xi) for xi in x) - sum(grado_pertenencia_padre_func(xi)*yi for xi, yi in zip(x, y)))+0.5)))
        
        node.cardinality = sum(grado_pertenencia_padre_func(xi) for xi in x)
        node.node_error = sum(grado_pertenencia_padre_func(xi) * (yi - node.label)**2 for xi, yi in zip(x, y)) 
        
        # Si solo hay un elemento con grado de pertenencia positivo al nodo, no tiene sentido seguir expandiendo el nodo (y ademas min_error_split da error)
        # De hecho, puede que haya mas de un elemento con grado de pertenencia positivo al nodo, pero que valgan todos lo mismo, entonces tampoco tiene sentido seguir por aqui, porque no vas a poder dividir los datos
        unique_positive_memberships = np.unique(np.sort(x[np.array([grado_pertenencia_padre_func(xi) for xi in x]) > 0]))
        if len(unique_positive_memberships) <= 1:
            node.is_leaf = True
            return node
        
        # Perform the split using min_error_split
        split_result = min_error_split(y, x, grado_pertenencia_padre_func, fuzzy)
        
        node.alfa = split_result['alfa']
        node.beta = split_result['beta']

        # Determine if the node is a leaf or should be split further
        if depth < self.max_depth and self._should_split(node, x):
            node.is_leaf = False

            # Recursive splitting for left and right child nodes
            node.left = self._grow_tree(y, x, split_result['grado_pertenencia_izquierda_func'], fuzzy, depth+1)
            node.right = self._grow_tree(y, x, split_result['grado_pertenencia_derecha_func'], fuzzy, depth+1)
        
        return node

    def _should_split(self, node, x):
        return node.cardinality > 0.01*len(x) and node.node_error > 1
    
    def collect_leaf_nodes(self):
        leaves = []
        self._collect_leaf_nodes(self.root, leaves)
        return leaves

    def _collect_leaf_nodes(self, node, leaves):
        if node.is_leaf:
            leaves.append(node)
        else:
            if node.left is not None:
                self._collect_leaf_nodes(node.left, leaves)
            if node.right is not None:
                self._collect_leaf_nodes(node.right, leaves)
                
    def collect_no_leaf_nodes(self):
        no_leaves = []
        self._collect_no_leaf_nodes(self.root, no_leaves)
        return no_leaves
                
    def _collect_no_leaf_nodes(self, node, no_leaves):
        if not node.is_leaf:
            no_leaves.append(node)
            if node.left is not None:
                self._collect_no_leaf_nodes(node.left, no_leaves)
            if node.right is not None:
                self._collect_no_leaf_nodes(node.right, no_leaves)
                
    def _predict_single(self, x):
        leaves = self.collect_leaf_nodes()
        prediction = sum(node.label*node.grado_pertenencia_func(x) for node in leaves)
        return prediction
    
    def predict(self, x):
        return [self._predict_single(xi) for xi in x]
    
    def _predict_single_fuzzy_woe(self, x):
        leaves = self.collect_leaf_nodes()
        prediction = sum(node.fuzzy_woe*node.grado_pertenencia_func(x) for node in leaves)
        return prediction
    
    def predict_fuzzy_woe(self, x):
        return [self._predict_single_fuzzy_woe(xi) for xi in x]

In [19]:
def print_tree(node, margen=0):
    if node is not None:
        print("-" * margen + node.node_def())
        if not node.is_leaf:
            print_tree(node.left, margen + 1)
            print_tree(node.right, margen + 1)

In [20]:
def plot_membership_functions(tree, x_range):
    sns.set(style="whitegrid")
    
    plt.figure(figsize=(10, 6))
    
    leaf_nodes = tree.collect_leaf_nodes()
        
    for node in leaf_nodes:
        membership_values = [node.grado_pertenencia_func(xi) for xi in x_range]
        plt.plot(x_range, membership_values, label=f'WOE AD = {node.fuzzy_woe:.2f}')
    
    plt.xlabel('X', fontsize=15)
    plt.ylabel('Grado de pertenencia', fontsize=15)
    plt.legend()
    plt.grid(True)
    plt.show()

In [21]:
def woe_ad_df(X_train, y_train, X_test): #X_train y X_test tienen que tener las mismas columnas
    y_train = np.array(y_train)
    X_train_woe_ad = pd.DataFrame()
    X_test_woe_ad = pd.DataFrame()
    for column in X_train.columns:
        print(column)
        column_data_train = X_train[[column]].values.flatten()
        column_data_test = X_test[[column]].values.flatten()
        fdt = FuzzyDecisionTree(max_depth=2)
        fdt.fit(y_train, column_data_train, grado_pertenencia_padre_func, fuzzy=True)
        X_train_woe_ad[column + '_woe_ad'] = fdt.predict(column_data_train)
        X_test_woe_ad[column + '_woe_ad'] = fdt.predict(column_data_test)
        print('ok')
    return X_train_woe_ad, X_test_woe_ad

In [22]:
def woe_ad_df_na(X_train, y_train, X_test): #X_train y X_test tienen que tener las mismas columnas
    y_train = np.array(y_train)
    X_train_woe_ad = pd.DataFrame()
    X_test_woe_ad = pd.DataFrame()
    for column in X_train.columns:
        print(column)
        
        # Separa los datos no nulos
        non_null_train = X_train[column].notnull()
        non_null_test = X_test[column].notnull()
        
        column_data_train = X_train[[column]][non_null_train].values.flatten()
        column_data_test = X_test[[column]][non_null_test].values.flatten()
        fdt = FuzzyDecisionTree(max_depth=2)
        fdt.fit(y_train[non_null_train], column_data_train, grado_pertenencia_padre_func, fuzzy=True)
        
        array_woe_values_train = fdt.predict(column_data_train)
        array_woe_values_test = fdt.predict(column_data_test)
        
        # Cálculo de los valores woe para los datos nulos
        good = np.sum(y_train[~non_null_train] == 1)
        bad = np.sum(y_train[~non_null_train] == 0)
        woe_value_missing = np.log( (good+0.5) / (bad+0.5) )
        
        # Crear el array resultado de train
        result_train = np.zeros_like(non_null_train, dtype=np.float64)

        # Indice para recorrer
        index = 0

        # Llenar el array resultado
        for i in range(len(non_null_train)):
            if non_null_train[i]:
                result_train[i] = array_woe_values_train[index]
                index += 1
            else:
                result_train[i] = woe_value_missing

        # Crear el array resultado de test
        result_test = np.zeros_like(non_null_test, dtype=np.float64)

        # Indice para recorrer
        index = 0

        # Llenar el array resultado
        for i in range(len(non_null_test)):
            if non_null_test[i]:
                result_test[i] = array_woe_values_test[index]
                index += 1
            else:
                result_test[i] = woe_value_missing
        
        X_train_woe_ad[column + '_woe_ad'] = result_train
        X_test_woe_ad[column + '_woe_ad'] = result_test
        print('ok')
    return X_train_woe_ad, X_test_woe_ad

## Carga de datos

In [23]:
country = 'Alemania'

if country == 'Alemania':
    df_data = pd.read_csv('german_TFM.csv')
elif country == 'Australia':
    df_data = pd.read_csv('australian_TFM.csv')
    df_data = df_data.rename(columns={'TARGET_N': 'target'})
elif country == 'Taiwan':
    df_data = pd.read_csv('taiwan_TFM.csv')
    df_data = df_data.rename(columns={'TARGET_N': 'target'})
    df_data = df_data.drop(columns='ID')
elif country == 'Japon':
    df_data = pd.read_csv('japan_TFM.csv')
    df_data = df_data.rename(columns={'TARGET_C': 'target'})
    df_data['target'] = df_data['target'].map({'+': 1, '-': 0})
elif country == 'Polonia':
    df_data = pd.read_csv('polish_TFM.csv')
    df_data = df_data.rename(columns={'target_n': 'target'})
df_data

Unnamed: 0,balance,duration,previous,purpose,amount,savings,length_employment,instalment,sex_marital,guarantors,...,most_valuable,age,concurrent_credits,apartment,num_credits,occupation,dependents,telephone,foreign,target
0,1,18,4,2,1049,1,2,4,2,1,...,2,21,3,1,1,3,1,1,1,0
1,1,9,4,0,2799,1,3,2,3,1,...,1,36,3,1,2,3,2,1,1,0
2,2,12,2,9,841,2,4,2,2,1,...,1,23,3,1,1,2,1,1,1,0
3,1,12,4,0,2122,1,3,3,3,1,...,1,39,3,1,2,2,2,1,2,0
4,1,12,4,0,2171,1,3,4,3,1,...,2,38,1,2,2,2,1,1,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1,24,2,3,1987,1,3,2,3,1,...,1,21,3,1,1,2,2,1,1,1
996,1,24,2,0,2303,1,5,4,3,2,...,1,45,3,2,1,3,1,1,1,1
997,4,21,4,0,12680,5,5,4,3,1,...,4,30,3,3,1,4,1,2,1,1
998,2,12,2,3,6468,5,1,2,3,1,...,4,52,3,2,1,4,1,2,1,1


In [24]:
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
 #   Column              Non-Null Count  Dtype
---  ------              --------------  -----
 0   balance             1000 non-null   int64
 1   duration            1000 non-null   int64
 2   previous            1000 non-null   int64
 3   purpose             1000 non-null   int64
 4   amount              1000 non-null   int64
 5   savings             1000 non-null   int64
 6   length_employment   1000 non-null   int64
 7   instalment          1000 non-null   int64
 8   sex_marital         1000 non-null   int64
 9   guarantors          1000 non-null   int64
 10  duration_address    1000 non-null   int64
 11  most_valuable       1000 non-null   int64
 12  age                 1000 non-null   int64
 13  concurrent_credits  1000 non-null   int64
 14  apartment           1000 non-null   int64
 15  num_credits         1000 non-null   int64
 16  occupation          1000 non-null   int64
 

In [25]:
df_data.columns

Index(['balance', 'duration', 'previous', 'purpose', 'amount', 'savings',
       'length_employment', 'instalment', 'sex_marital', 'guarantors',
       'duration_address', 'most_valuable', 'age', 'concurrent_credits',
       'apartment', 'num_credits', 'occupation', 'dependents', 'telephone',
       'foreign', 'target'],
      dtype='object')

In [26]:
if country == 'Alemania':
    variables_numericas = ['duration', 'amount', 'age'] # 3
    variables_categoricas = ['balance', 'previous', 'purpose', 'savings',
           'length_employment', 'instalment', 'sex_marital', 'guarantors',
           'duration_address', 'most_valuable', 'concurrent_credits',
           'apartment', 'num_credits', 'occupation', 'dependents', 'telephone',
           'foreign'] # 17
elif country == 'Australia':
    variables_numericas = ['A2', 'A3', 'A7', 'A10', 'A13', 'A14'] # 6
    variables_categoricas = ['A1', 'A4', 'A5', 'A6', 'A8', 'A9', 'A11', 'A12'] # 8
elif country == 'Taiwan':
    variables_numericas = ['LIMIT_BAL', 'AGE', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6'] # 14
    variables_categoricas = ['SEX', 'EDUCATION', 'MARRIAGE', 'PAY_0', 'PAY_2',
       'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6'] # 9
elif country == 'Japon':
    variables_numericas = ['A2', 'A3', 'A8', 'A11', 'A14', 'A15'] # 6
    variables_categoricas = ['A1', 'A4', 'A5', 'A6', 'A7', 'A9', 'A10', 'A12', 'A13'] # 9
elif country == 'Polonia':
    variables_numericas = ['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9', 'X10', 'X11',
       'X12', 'X13', 'X14', 'X15', 'X16', 'X17', 'X18', 'X19', 'X20', 'X21',
       'X22', 'X23', 'X24', 'X25', 'X26', 'X27', 'X28', 'X29', 'X30', 'X31',
       'X32', 'X33', 'X34', 'X35', 'X36', 'X37', 'X38', 'X39', 'X40', 'X41',
       'X42', 'X43', 'X44', 'X45', 'X46', 'X47', 'X48', 'X49', 'X50', 'X51',
       'X52', 'X53', 'X54', 'X55', 'X56', 'X57', 'X58', 'X59', 'X60', 'X61',
       'X62', 'X63', 'X64'] # 64
    variables_categoricas = [] # 0

In [27]:
X = df_data.drop(columns='target')
X

Unnamed: 0,balance,duration,previous,purpose,amount,savings,length_employment,instalment,sex_marital,guarantors,duration_address,most_valuable,age,concurrent_credits,apartment,num_credits,occupation,dependents,telephone,foreign
0,1,18,4,2,1049,1,2,4,2,1,4,2,21,3,1,1,3,1,1,1
1,1,9,4,0,2799,1,3,2,3,1,2,1,36,3,1,2,3,2,1,1
2,2,12,2,9,841,2,4,2,2,1,4,1,23,3,1,1,2,1,1,1
3,1,12,4,0,2122,1,3,3,3,1,2,1,39,3,1,2,2,2,1,2
4,1,12,4,0,2171,1,3,4,3,1,4,2,38,1,2,2,2,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1,24,2,3,1987,1,3,2,3,1,4,1,21,3,1,1,2,2,1,1
996,1,24,2,0,2303,1,5,4,3,2,1,1,45,3,2,1,3,1,1,1
997,4,21,4,0,12680,5,5,4,3,1,4,4,30,3,3,1,4,1,2,1
998,2,12,2,3,6468,5,1,2,3,1,1,4,52,3,2,1,4,1,2,1


In [28]:
y = df_data['target']
y

0      0
1      0
2      0
3      0
4      0
      ..
995    1
996    1
997    1
998    1
999    1
Name: target, Length: 1000, dtype: int64

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Reiniciar los índices de los DataFrames y Series resultantes
X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

In [30]:
X_train_num = X_train[variables_numericas]
X_test_num = X_test[variables_numericas]

In [31]:
X_train_woe_num, X_test_woe_num = woe_df_na(X_train_num, y_train, X_test_num)

duration
ok
amount
ok
age
ok


In [32]:
X_train_woe_and_num, X_test_woe_and_num = woe_and_df_na(X_train_num, y_train, X_test_num)

duration
ok
amount
ok
age
ok


In [33]:
X_train_woe_ad_num, X_test_woe_ad_num = woe_ad_df_na(X_train_num, y_train, X_test_num)

duration
ok
amount
ok
age
ok


## Aplicación de modelos

### Regresión logística a las variables numéricas

Regresión logística normal (no poner esta en la memoria)

In [34]:
# Crear el modelo de regresión logística
modelo_reg_log = LogisticRegression(penalty=None)

# Ajustar el modelo con los datos de entrenamiento
modelo_reg_log.fit(X_train_num, y_train)

# Predecir en los datos de prueba
predicciones_reg_log = modelo_reg_log.predict(X_test_num)

# Evaluar el modelo
accuracy_reg_log = modelo_reg_log.score(X_test_num, y_test)
print("Exactitud del modelo:", accuracy_reg_log)

# Predecir probabilidades en lugar de clases
predicciones_prob_reg_log = modelo_reg_log.predict_proba(X_test_num)[:, 1]

# Calcular el AUC
auc_reg_log = roc_auc_score(y_test, predicciones_prob_reg_log)
print("Área bajo la curva ROC (AUC):", auc_reg_log)

Exactitud del modelo: 0.7066666666666667
Área bajo la curva ROC (AUC): 0.6485714285714286


In [35]:
coefficients_reg_log = modelo_reg_log.coef_

intercept_reg_log = modelo_reg_log.intercept_

print("Coeficientes:", coefficients_reg_log)
print("Intercept:", intercept_reg_log)

Coeficientes: [[ 2.71598422e-02  5.83883141e-05 -1.48438130e-02]]
Intercept: [-1.10537587]


WOE nítidas

In [36]:
# Crear el modelo de regresión logística
modelo_woe = LogisticRegression(penalty=None)

# Ajustar el modelo con los datos de entrenamiento
modelo_woe.fit(X_train_woe_num, y_train)

# Predecir en los datos de prueba
predicciones_woe = modelo_woe.predict(X_test_woe_num)

# Evaluar el modelo
accuracy_woe = modelo_woe.score(X_test_woe_num, y_test)
print("Exactitud del modelo:", accuracy_woe)

# Predecir probabilidades en lugar de clases
predicciones_prob_woe = modelo_woe.predict_proba(X_test_woe_num)[:, 1]

# Calcular el AUC
auc_woe = roc_auc_score(y_test, predicciones_prob_woe)
print("Área bajo la curva ROC (AUC):", auc_woe)

Exactitud del modelo: 0.7
Área bajo la curva ROC (AUC): 0.631031746031746


In [37]:
coefficients_woe = modelo_woe.coef_

intercept_woe = modelo_woe.intercept_

print("Coeficientes:", coefficients_woe)
print("Intercept:", intercept_woe)

Coeficientes: [[0.67480739 0.87425313 1.10249635]]
Intercept: [1.39206719]


WOE AND

In [38]:
# Crear el modelo de regresión logística
modelo_woe_and = LogisticRegression(penalty=None)

# Ajustar el modelo con los datos de entrenamiento
modelo_woe_and.fit(X_train_woe_and_num, y_train)

# Predecir en los datos de prueba
predicciones_woe_and = modelo_woe_and.predict(X_test_woe_and_num)

# Evaluar el modelo
accuracy_woe_and = modelo_woe_and.score(X_test_woe_and_num, y_test)
print("Exactitud del modelo:", accuracy_woe_and)

# Predecir probabilidades en lugar de clases
predicciones_prob_woe_and = modelo_woe_and.predict_proba(X_test_woe_and_num)[:, 1]

# Calcular el AUC
auc_woe_and = roc_auc_score(y_test, predicciones_prob_woe_and)
print("Área bajo la curva ROC (AUC):", auc_woe_and)

Exactitud del modelo: 0.7266666666666667
Área bajo la curva ROC (AUC): 0.6861111111111113


In [39]:
coefficients_woe_and = modelo_woe_and.coef_

intercept_woe_and = modelo_woe_and.intercept_

print("Coeficientes:", coefficients_woe_and)
print("Intercept:", intercept_woe_and)

Coeficientes: [[0.94834397 0.75156039 0.91486218]]
Intercept: [1.34273384]


WOE AD

In [40]:
# Crear el modelo de regresión logística
modelo_woe_ad = LogisticRegression(penalty=None)

# Ajustar el modelo con los datos de entrenamiento
modelo_woe_ad.fit(X_train_woe_ad_num, y_train)

# Predecir en los datos de prueba
predicciones_woe_ad = modelo_woe_ad.predict(X_test_woe_ad_num)

# Evaluar el modelo
accuracy_woe_ad = modelo_woe_ad.score(X_test_woe_ad_num, y_test)
print("Exactitud del modelo:", accuracy_woe_ad)

# Predecir probabilidades en lugar de clases
predicciones_prob_woe_ad = modelo_woe_ad.predict_proba(X_test_woe_ad_num)[:, 1]

# Calcular el AUC
auc_woe_ad = roc_auc_score(y_test, predicciones_prob_woe_ad)
print("Área bajo la curva ROC (AUC):", auc_woe_ad)

Exactitud del modelo: 0.73
Área bajo la curva ROC (AUC): 0.6608201058201058


In [41]:
coefficients_woe_ad = modelo_woe_ad.coef_

intercept_woe_ad = modelo_woe_ad.intercept_

print("Coeficientes:", coefficients_woe_ad)
print("Intercept:", intercept_woe_ad)

Coeficientes: [[4.31060424 3.29682579 5.4614018 ]]
Intercept: [-4.82045266]


### Regresión logística a todas las variables

Transformación WOE a las variables categóricas

In [42]:
X_train_cat = X_train[variables_categoricas]
X_test_cat = X_test[variables_categoricas]

# Convertir las variables categóricas a tipo 'str'
for col in variables_categoricas:
    X_train_cat.loc[:, col] = X_train_cat[col].astype('str')
    X_test_cat.loc[:, col] = X_test_cat[col].astype('str')

# Combinar X_cat y y en un solo DataFrame
df_train_cat = pd.concat([X_train_cat[variables_categoricas], y_train], axis=1)

df_train_cat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   balance             700 non-null    object
 1   previous            700 non-null    object
 2   purpose             700 non-null    object
 3   savings             700 non-null    object
 4   length_employment   700 non-null    object
 5   instalment          700 non-null    object
 6   sex_marital         700 non-null    object
 7   guarantors          700 non-null    object
 8   duration_address    700 non-null    object
 9   most_valuable       700 non-null    object
 10  concurrent_credits  700 non-null    object
 11  apartment           700 non-null    object
 12  num_credits         700 non-null    object
 13  occupation          700 non-null    object
 14  dependents          700 non-null    object
 15  telephone           700 non-null    object
 16  foreign             700 no

In [43]:
# Discretización de la característica y cálculo de WOE
bins = sc.woebin(df_train_cat, y='target')

[INFO] creating woe binning ...


In [44]:
bins

{'occupation':      variable    bin  count  count_distr  good  bad   badprob       woe  \
 0  occupation  1%,%2    153     0.218571   112   41  0.267974 -0.157629   
 1  occupation      3    440     0.628571   309  131  0.297727 -0.010846   
 2  occupation      4    107     0.152857    69   38  0.355140  0.250778   
 
      bin_iv  total_iv breaks  is_special_values  
 0  0.005254  0.015393  1%,%2              False  
 1  0.000074  0.015393      3              False  
 2  0.010065  0.015393      4              False  ,
 'most_valuable':         variable bin  count  count_distr  good  bad   badprob       woe  \
 0  most_valuable   1    204     0.291429   162   42  0.205882 -0.502629   
 1  most_valuable   3    231     0.330000   163   68  0.294372 -0.026945   
 2  most_valuable   2    154     0.220000   103   51  0.331169  0.144395   
 3  most_valuable   4    111     0.158571    62   49  0.441441  0.611984   
 
      bin_iv  total_iv breaks  is_special_values  
 0  0.065649  0.135964   

In [45]:
# Aplicar la transformación WOE al DataFrame
df_train_woe_cat = sc.woebin_ply(df_train_cat, bins)
df_train_woe_cat

[INFO] converting into woe values ...


Unnamed: 0,target,previous_woe,occupation_woe,most_valuable_woe,length_employment_woe,foreign_woe,purpose_woe,apartment_woe,num_credits_woe,dependents_woe,savings_woe,sex_marital_woe,concurrent_credits_woe,telephone_woe,guarantors_woe,balance_woe,duration_address_woe,instalment_woe
0,1,-0.008368,0.250778,-0.502629,0.133811,0.0,0.112651,-0.172534,-0.124563,-0.008940,0.265209,0.419854,0.419854,-0.120792,0.0,0.401559,0.043336,-0.111959
1,0,-0.008368,-0.157629,-0.502629,0.133811,0.0,0.470612,-0.172534,-0.124563,0.045937,0.265209,-0.221372,-0.136773,0.080588,0.0,0.816207,0.043336,0.127243
2,1,1.215023,0.250778,-0.502629,0.431470,0.0,-0.930475,-0.172534,0.068629,-0.008940,-0.734489,0.326763,0.553537,-0.120792,0.0,-1.042654,-0.005479,0.127243
3,0,0.166032,0.250778,0.144395,0.431470,0.0,-0.414944,0.306491,0.068629,-0.008940,0.265209,0.326763,-0.136773,-0.120792,0.0,0.816207,-0.021202,0.127243
4,0,-1.004360,-0.010846,0.611984,-0.321273,0.0,-0.414944,-0.172534,-0.124563,-0.008940,0.265209,-0.221372,-0.136773,0.080588,0.0,-1.042654,0.043336,-0.111959
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,1,1.215023,0.250778,-0.026945,-0.321273,0.0,0.470612,-0.172534,0.068629,-0.008940,-0.734489,-0.221372,0.419854,-0.120792,0.0,0.816207,-0.021202,0.127243
696,1,0.166032,-0.010846,0.144395,-0.421213,0.0,-0.930475,0.306491,0.068629,-0.008940,0.265209,-0.221372,-0.136773,0.080588,0.0,0.816207,-0.021202,0.127243
697,0,-1.004360,-0.157629,-0.502629,0.133811,0.0,-0.414944,-0.172534,-0.124563,-0.008940,-0.887303,0.326763,-0.136773,0.080588,0.0,-1.042654,-0.005479,0.127243
698,0,-1.004360,-0.157629,-0.502629,0.133811,0.0,0.470612,-0.172534,-0.124563,-0.008940,-0.887303,0.326763,0.553537,0.080588,0.0,-1.042654,0.043336,-0.111959


In [46]:
X_train_woe_cat = df_train_woe_cat.drop(columns='target')

In [47]:
X_test_woe_cat = sc.woebin_ply(X_test_cat, bins)
X_test_woe_cat

[INFO] converting into woe values ...


Unnamed: 0,previous_woe,occupation_woe,most_valuable_woe,length_employment_woe,foreign_woe,purpose_woe,apartment_woe,num_credits_woe,dependents_woe,savings_woe,sex_marital_woe,concurrent_credits_woe,telephone_woe,guarantors_woe,balance_woe,duration_address_woe,instalment_woe
0,-1.004360,-0.010846,-0.026945,0.431470,0.0,0.470612,-0.172534,0.068629,-0.008940,0.265209,-0.221372,-0.136773,0.080588,0.0,0.401559,-0.005479,0.127243
1,0.166032,-0.157629,-0.502629,-0.421213,0.0,0.112651,-0.172534,0.068629,-0.008940,0.265209,-0.221372,-0.136773,0.080588,0.0,-1.042654,-0.021202,-0.225339
2,-0.008368,-0.010846,0.144395,0.133811,0.0,0.112651,-0.172534,-0.124563,-0.008940,0.265209,-0.221372,-0.136773,0.080588,0.0,-1.042654,-0.021202,-0.111959
3,1.215023,0.250778,0.611984,0.431470,0.0,-0.930475,0.506371,0.068629,-0.008940,0.265209,-0.221372,0.553537,-0.120792,0.0,0.401559,0.043336,-0.225339
4,-1.004360,-0.010846,-0.026945,0.133811,0.0,0.112651,-0.172534,0.068629,-0.008940,-0.887303,-0.221372,0.553537,0.080588,0.0,0.401559,-0.021202,-0.111959
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,-0.008368,-0.010846,-0.026945,0.431470,0.0,0.112651,0.306491,-0.124563,-0.008940,-0.734489,-0.221372,-0.136773,-0.120792,0.0,-1.042654,-0.021202,0.127243
296,-0.008368,-0.157629,-0.502629,0.431470,0.0,0.112651,-0.172534,0.068629,0.045937,-0.887303,-0.221372,0.553537,0.080588,0.0,-1.042654,0.043336,0.127243
297,1.215023,-0.010846,0.144395,0.431470,0.0,0.112651,-0.172534,-0.124563,-0.008940,0.265209,0.419854,-0.136773,0.080588,0.0,-1.042654,-0.005479,-0.111959
298,0.166032,-0.157629,-0.502629,0.133811,0.0,0.470612,-0.172534,0.068629,0.045937,0.265209,-0.221372,-0.136773,0.080588,0.0,-1.042654,0.043336,-0.050644


Unión de las woe categoricas con los tres dataframes con las distintas woe numericas

In [48]:
X_train_woe = pd.concat([X_train_woe_num, X_train_woe_cat], axis=1)
X_test_woe = pd.concat([X_test_woe_num, X_test_woe_cat], axis=1)

X_train_woe_and = pd.concat([X_train_woe_and_num, X_train_woe_cat], axis=1)
X_test_woe_and = pd.concat([X_test_woe_and_num, X_test_woe_cat], axis=1)

X_train_woe_ad = pd.concat([X_train_woe_ad_num, X_train_woe_cat], axis=1)
X_test_woe_ad = pd.concat([X_test_woe_ad_num, X_test_woe_cat], axis=1)


if country in ['Japon', 'Alemania', 'Australia']:
    X_train_all = pd.concat([X_train_woe_num, X_train_woe_and_num, X_train_woe_ad_num, X_train_woe_cat], axis=1)
    X_test_all = pd.concat([X_test_woe_num, X_test_woe_and_num, X_test_woe_ad_num, X_test_woe_cat], axis=1)
elif country in ['Taiwan']:
    X_train_all = pd.concat([X_train_woe_num, X_train_woe_and_num, X_train_woe_cat], axis=1)
    X_test_all = pd.concat([X_test_woe_num, X_test_woe_and_num, X_test_woe_cat], axis=1)
elif country in ['Polonia']:
    X_train_all = pd.concat([X_train_woe_num, X_train_woe_and_num], axis=1)
    X_test_all = pd.concat([X_test_woe_num, X_test_woe_and_num], axis=1)

Aplicación de una regresión logística a los tres dataframes:

WOE nítidas

In [49]:
# Crear el modelo de regresión logística
modelo_woe = LogisticRegression(penalty=None)

# Ajustar el modelo con los datos de entrenamiento
modelo_woe.fit(X_train_woe, y_train)

# Predecir en los datos de prueba
predicciones_woe = modelo_woe.predict(X_test_woe)

# Evaluar el modelo
accuracy_woe = modelo_woe.score(X_test_woe, y_test)
print("Exactitud del modelo:", accuracy_woe)

# Predecir probabilidades en lugar de clases
predicciones_prob_woe = modelo_woe.predict_proba(X_test_woe)[:, 1]

# Calcular el AUC
auc_woe = roc_auc_score(y_test, predicciones_prob_woe)
print("Área bajo la curva ROC (AUC):", auc_woe)

Exactitud del modelo: 0.7633333333333333
Área bajo la curva ROC (AUC): 0.7627513227513228


In [50]:
coefficients_woe = modelo_woe.coef_

intercept_woe = modelo_woe.intercept_

print("Coeficientes:", coefficients_woe)
print("Intercept:", intercept_woe)

Coeficientes: [[ 0.54848847  1.11462637  0.52935439  0.85131756  0.62852619  0.58053067
   0.60368649  0.          1.09183308  0.51705093 -1.87113929  2.90470203
   0.74637697  1.37168859  0.39155399  1.41393299  0.          0.79920725
   4.73443848  2.71708802]]
Intercept: [1.0137024]


WOE AND

In [51]:
# Crear el modelo de regresión logística
modelo_woe_and = LogisticRegression(penalty=None)

# Ajustar el modelo con los datos de entrenamiento
modelo_woe_and.fit(X_train_woe_and, y_train)

# Predecir en los datos de prueba
predicciones_woe_and = modelo_woe_and.predict(X_test_woe_and)

# Evaluar el modelo
accuracy_woe_and = modelo_woe_and.score(X_test_woe_and, y_test)
print("Exactitud del modelo:", accuracy_woe_and)

# Predecir probabilidades en lugar de clases
predicciones_prob_woe_and = modelo_woe_and.predict_proba(X_test_woe_and)[:, 1]

# Calcular el AUC
auc_woe_and = roc_auc_score(y_test, predicciones_prob_woe_and)
print("Área bajo la curva ROC (AUC):", auc_woe_and)

Exactitud del modelo: 0.7533333333333333
Área bajo la curva ROC (AUC): 0.7670370370370371


In [52]:
coefficients_woe_and = modelo_woe_and.coef_

intercept_woe_and = modelo_woe_and.intercept_

print("Coeficientes:", coefficients_woe_and)
print("Intercept:", intercept_woe_and)

Coeficientes: [[ 0.85234018  1.02429112  0.101426    0.83856098  0.49777121  0.50294874
   0.68498098  0.          1.08929816  0.5428119  -1.59925025  1.82099566
   0.78346841  1.42354949  0.41714521  1.48970399  0.          0.80203673
   4.74976435  2.58922986]]
Intercept: [0.7555507]


WOE AD

In [53]:
# Crear el modelo de regresión logística
modelo_woe_ad = LogisticRegression(penalty=None)

# Ajustar el modelo con los datos de entrenamiento
modelo_woe_ad.fit(X_train_woe_ad, y_train)

# Predecir en los datos de prueba
predicciones_woe_ad = modelo_woe_ad.predict(X_test_woe_ad)

# Evaluar el modelo
accuracy_woe_ad = modelo_woe_ad.score(X_test_woe_ad, y_test)
print("Exactitud del modelo:", accuracy_woe_ad)

# Predecir probabilidades en lugar de clases
predicciones_prob_woe_ad = modelo_woe_ad.predict_proba(X_test_woe_ad)[:, 1]

# Calcular el AUC
auc_woe_ad = roc_auc_score(y_test, predicciones_prob_woe_ad)
print("Área bajo la curva ROC (AUC):", auc_woe_ad)

Exactitud del modelo: 0.7533333333333333
Área bajo la curva ROC (AUC): 0.7683597883597884


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [54]:
coefficients_woe_ad = modelo_woe_ad.coef_

intercept_woe_ad = modelo_woe_ad.intercept_

print("Coeficientes:", coefficients_woe_ad)
print("Intercept:", intercept_woe_ad)

Coeficientes: [[ 3.7077018   4.70963731  1.2836809   0.83402088  0.41717226  0.50933143
   0.66003717  0.          1.08913747  0.53972904 -1.60216176  2.3237918
   0.77694402  1.41496968  0.42621807  1.50966448  0.          0.79910011
   4.39580546  2.69611655]]
Intercept: [-3.80119672]


Las 3 tipos de WOE numericas y las categoricas

In [55]:
# Crear el modelo de regresión logística
modelo_all = LogisticRegression(penalty=None)

# Ajustar el modelo con los datos de entrenamiento
modelo_all.fit(X_train_all, y_train)

# Predecir en los datos de prueba
predicciones_all = modelo_all.predict(X_test_all)

# Evaluar el modelo
accuracy_all = modelo_all.score(X_test_all, y_test)
print("Exactitud del modelo:", accuracy_all)

# Predecir probabilidades en lugar de clases
predicciones_prob_all = modelo_all.predict_proba(X_test_all)[:, 1]

# Calcular el AUC
auc_all = roc_auc_score(y_test, predicciones_prob_all)
print("Área bajo la curva ROC (AUC):", auc_all)

Exactitud del modelo: 0.7566666666666667
Área bajo la curva ROC (AUC): 0.7570370370370371


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [56]:
coefficients_all = modelo_all.coef_

intercept_all = modelo_all.intercept_

print("Coeficientes:", coefficients_all)
print("Intercept:", intercept_all)

Coeficientes: [[-0.74550388  1.93861073  1.51955753  1.06600465 -1.10623785 -1.13201785
   2.47423994  0.67691675 -1.99446614  0.85356206  0.63029655  0.52920673
   0.66941909  0.          1.10499786  0.55468044 -1.83960589  3.35834424
   0.74084794  1.41508076  0.40885466  1.26135891  0.          0.80589201
   4.43823475  2.54908979]]
Intercept: [-0.0839005]


### Lo mismo pero con seleccion de variables. Regresión hacia adelante

**Variables numéricas**

WOE nítidas

In [57]:
# Crear el modelo de regresión logística
modelo_woe = LogisticRegression(penalty=None)

# Configurar y ajustar el selector de características hacia adelante
sfs = SFS(modelo_woe,
          k_features='best',
          forward=True,
          floating=False,
          scoring='roc_auc',
          cv=5)

sfs = sfs.fit(X_train_woe_num, y_train)

# Ver las características seleccionadas
selected_features = list(sfs.k_feature_names_)
print(f"Selected features: {selected_features}")

X_train_woe_num_sel = X_train_woe_num[selected_features]
X_test_woe_num_sel = X_test_woe_num[selected_features]


# Ajustar el modelo con los datos de entrenamiento
modelo_woe.fit(X_train_woe_num_sel, y_train)

# Predecir en los datos de prueba
predicciones_woe = modelo_woe.predict(X_train_woe_num_sel)

# Evaluar el modelo
accuracy_woe = modelo_woe.score(X_test_woe_num_sel, y_test)
print("Exactitud del modelo:", accuracy_woe)

# Predecir probabilidades en lugar de clases
predicciones_prob_woe = modelo_woe.predict_proba(X_test_woe_num_sel)[:, 1]

# Calcular el AUC
auc_woe = roc_auc_score(y_test, predicciones_prob_woe)
print("Área bajo la curva ROC (AUC):", auc_woe)

Selected features: ['duration_woe', 'amount_woe', 'age_woe']
Exactitud del modelo: 0.7
Área bajo la curva ROC (AUC): 0.631031746031746


In [58]:
coefficients_woe = modelo_woe.coef_

intercept_woe = modelo_woe.intercept_

print("Coeficientes:", coefficients_woe)
print("Intercept:", intercept_woe)

Coeficientes: [[0.67480739 0.87425313 1.10249635]]
Intercept: [1.39206719]


WOE AND

In [59]:
# Crear el modelo de regresión logística
modelo_woe_and = LogisticRegression(penalty=None)


# Configurar y ajustar el selector de características hacia adelante
sfs = SFS(modelo_woe_and,
          k_features='best',
          forward=True,
          floating=False,
          scoring='roc_auc',
          cv=5)

sfs = sfs.fit(X_train_woe_and_num, y_train)

# Ver las características seleccionadas
selected_features = list(sfs.k_feature_names_)
print(f"Selected features: {selected_features}")

X_train_woe_and_num_sel = X_train_woe_and_num[selected_features]
X_test_woe_and_num_sel = X_test_woe_and_num[selected_features]


# Ajustar el modelo con los datos de entrenamiento
modelo_woe_and.fit(X_train_woe_and_num_sel, y_train)

# Predecir en los datos de prueba
predicciones_woe_and = modelo_woe_and.predict(X_test_woe_and_num_sel)

# Evaluar el modelo
accuracy_woe_and = modelo_woe_and.score(X_test_woe_and_num_sel, y_test)
print("Exactitud del modelo:", accuracy_woe_and)

# Predecir probabilidades en lugar de clases
predicciones_prob_woe_and = modelo_woe_and.predict_proba(X_test_woe_and_num_sel)[:, 1]

# Calcular el AUC
auc_woe_and = roc_auc_score(y_test, predicciones_prob_woe_and)
print("Área bajo la curva ROC (AUC):", auc_woe_and)

Selected features: ['duration_woe_and', 'amount_woe_and', 'age_woe_and']
Exactitud del modelo: 0.7266666666666667
Área bajo la curva ROC (AUC): 0.6861111111111113


In [60]:
coefficients_woe_and = modelo_woe_and.coef_

intercept_woe_and = modelo_woe_and.intercept_

print("Coeficientes:", coefficients_woe_and)
print("Intercept:", intercept_woe_and)

Coeficientes: [[0.94834397 0.75156039 0.91486218]]
Intercept: [1.34273384]


WOE AD

In [61]:
# Crear el modelo de regresión logística
modelo_woe_ad = LogisticRegression(penalty=None)


# Configurar y ajustar el selector de características hacia adelante
sfs = SFS(modelo_woe_ad,
          k_features='best',
          forward=True,
          floating=False,
          scoring='roc_auc',
          cv=5)

sfs = sfs.fit(X_train_woe_ad_num, y_train)

# Ver las características seleccionadas
selected_features = list(sfs.k_feature_names_)
print(f"Selected features: {selected_features}")

X_train_woe_ad_num_sel = X_train_woe_ad_num[selected_features]
X_test_woe_ad_num_sel = X_test_woe_ad_num[selected_features]


# Ajustar el modelo con los datos de entrenamiento
modelo_woe_ad.fit(X_train_woe_ad_num_sel, y_train)

# Predecir en los datos de prueba
predicciones_woe_ad = modelo_woe_ad.predict(X_test_woe_ad_num_sel)

# Evaluar el modelo
accuracy_woe_ad = modelo_woe_ad.score(X_test_woe_ad_num_sel, y_test)
print("Exactitud del modelo:", accuracy_woe_ad)

# Predecir probabilidades en lugar de clases
predicciones_prob_woe_ad = modelo_woe_ad.predict_proba(X_test_woe_ad_num_sel)[:, 1]

# Calcular el AUC
auc_woe_ad = roc_auc_score(y_test, predicciones_prob_woe_ad)
print("Área bajo la curva ROC (AUC):", auc_woe_ad)

Selected features: ['duration_woe_ad', 'amount_woe_ad', 'age_woe_ad']
Exactitud del modelo: 0.73
Área bajo la curva ROC (AUC): 0.6608201058201058


In [62]:
coefficients_woe_ad = modelo_woe_ad.coef_

intercept_woe_ad = modelo_woe_ad.intercept_

print("Coeficientes:", coefficients_woe_ad)
print("Intercept:", intercept_woe_ad)

Coeficientes: [[4.31060424 3.29682579 5.4614018 ]]
Intercept: [-4.82045266]


**Variables numéricas y categóricas**

WOE nítidas

In [63]:
# Crear el modelo de regresión logística
modelo_woe = LogisticRegression(penalty=None)

# Configurar y ajustar el selector de características hacia adelante
sfs = SFS(modelo_woe,
          k_features='best',
          forward=True,
          floating=False,
          scoring='roc_auc',
          cv=5)

sfs = sfs.fit(X_train_woe, y_train)

# Ver las características seleccionadas
selected_features = list(sfs.k_feature_names_)
print(f"Selected features: {selected_features}")

X_train_woe_sel = X_train_woe[selected_features]
X_test_woe_sel = X_test_woe[selected_features]


# Ajustar el modelo con los datos de entrenamiento
modelo_woe.fit(X_train_woe_sel, y_train)

# Predecir en los datos de prueba
predicciones_woe = modelo_woe.predict(X_train_woe_sel)

# Evaluar el modelo
accuracy_woe = modelo_woe.score(X_test_woe_sel, y_test)
print("Exactitud del modelo:", accuracy_woe)

# Predecir probabilidades en lugar de clases
predicciones_prob_woe = modelo_woe.predict_proba(X_test_woe_sel)[:, 1]

# Calcular el AUC
auc_woe = roc_auc_score(y_test, predicciones_prob_woe)
print("Área bajo la curva ROC (AUC):", auc_woe)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Selected features: ['duration_woe', 'amount_woe', 'previous_woe', 'most_valuable_woe', 'length_employment_woe', 'purpose_woe', 'savings_woe', 'sex_marital_woe', 'telephone_woe', 'balance_woe', 'instalment_woe']
Exactitud del modelo: 0.7433333333333333
Área bajo la curva ROC (AUC): 0.7482804232804232


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [64]:
coefficients_woe = modelo_woe.coef_

intercept_woe = modelo_woe.intercept_

print("Coeficientes:", coefficients_woe)
print("Intercept:", intercept_woe)

Coeficientes: [[0.60411986 1.12578553 0.81327233 0.71089108 0.65538965 1.0808579
  0.80802727 1.37216378 1.21696925 0.78798085 2.57310977]]
Intercept: [0.62796905]


WOE AND

In [65]:
# Crear el modelo de regresión logística
modelo_woe_and = LogisticRegression(penalty=None)


# Configurar y ajustar el selector de características hacia adelante
sfs = SFS(modelo_woe_and,
          k_features='best',
          forward=True,
          floating=False,
          scoring='roc_auc',
          cv=5)

sfs = sfs.fit(X_train_woe_and, y_train)

# Ver las características seleccionadas
selected_features = list(sfs.k_feature_names_)
print(f"Selected features: {selected_features}")

X_train_woe_and_sel = X_train_woe_and[selected_features]
X_test_woe_and_sel = X_test_woe_and[selected_features]


# Ajustar el modelo con los datos de entrenamiento
modelo_woe_and.fit(X_train_woe_and_sel, y_train)

# Predecir en los datos de prueba
predicciones_woe_and = modelo_woe_and.predict(X_test_woe_and_sel)

# Evaluar el modelo
accuracy_woe_and = modelo_woe_and.score(X_test_woe_and_sel, y_test)
print("Exactitud del modelo:", accuracy_woe_and)

# Predecir probabilidades en lugar de clases
predicciones_prob_woe_and = modelo_woe_and.predict_proba(X_test_woe_and_sel)[:, 1]

# Calcular el AUC
auc_woe_and = roc_auc_score(y_test, predicciones_prob_woe_and)
print("Área bajo la curva ROC (AUC):", auc_woe_and)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Selected features: ['duration_woe_and', 'amount_woe_and', 'previous_woe', 'most_valuable_woe', 'length_employment_woe', 'purpose_woe', 'savings_woe', 'sex_marital_woe', 'telephone_woe', 'balance_woe', 'duration_address_woe', 'instalment_woe']
Exactitud del modelo: 0.73
Área bajo la curva ROC (AUC): 0.7591534391534391


In [66]:
coefficients_woe_and = modelo_woe_and.coef_

intercept_woe_and = modelo_woe_and.intercept_

print("Coeficientes:", coefficients_woe_and)
print("Intercept:", intercept_woe_and)

Coeficientes: [[0.86094241 1.03952788 0.80243257 0.72603778 0.6225375  1.09323742
  0.83004597 1.38119543 1.16672424 0.79632099 4.1352143  2.49350185]]
Intercept: [0.68251586]


WOE AD

In [67]:
# Crear el modelo de regresión logística
modelo_woe_ad = LogisticRegression(penalty=None)


# Configurar y ajustar el selector de características hacia adelante
sfs = SFS(modelo_woe_ad,
          k_features='best',
          forward=True,
          floating=False,
          scoring='roc_auc',
          cv=5)

sfs = sfs.fit(X_train_woe_ad, y_train)

# Ver las características seleccionadas
selected_features = list(sfs.k_feature_names_)
print(f"Selected features: {selected_features}")

X_train_woe_ad_sel = X_train_woe_ad[selected_features]
X_test_woe_ad_sel = X_test_woe_ad[selected_features]


# Ajustar el modelo con los datos de entrenamiento
modelo_woe_ad.fit(X_train_woe_ad_sel, y_train)

# Predecir en los datos de prueba
predicciones_woe_ad = modelo_woe_ad.predict(X_test_woe_ad_sel)

# Evaluar el modelo
accuracy_woe_ad = modelo_woe_ad.score(X_test_woe_ad_sel, y_test)
print("Exactitud del modelo:", accuracy_woe_ad)

# Predecir probabilidades en lugar de clases
predicciones_prob_woe_ad = modelo_woe_ad.predict_proba(X_test_woe_ad_sel)[:, 1]

# Calcular el AUC
auc_woe_ad = roc_auc_score(y_test, predicciones_prob_woe_ad)
print("Área bajo la curva ROC (AUC):", auc_woe_ad)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Selected features: ['duration_woe_ad', 'amount_woe_ad', 'previous_woe', 'most_valuable_woe', 'length_employment_woe', 'purpose_woe', 'savings_woe', 'sex_marital_woe', 'telephone_woe', 'balance_woe', 'instalment_woe']
Exactitud del modelo: 0.7566666666666667
Área bajo la curva ROC (AUC): 0.7578306878306877


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [68]:
coefficients_woe_ad = modelo_woe_ad.coef_

intercept_woe_ad = modelo_woe_ad.intercept_

print("Coeficientes:", coefficients_woe_ad)
print("Intercept:", intercept_woe_ad)

Coeficientes: [[3.90570028 4.79694009 0.81066576 0.66281708 0.68191176 1.08924381
  0.8343929  1.35294834 1.30782541 0.77863296 2.55697617]]
Intercept: [-3.49558211]


**Todas las transformaciones woe a la  vez**

In [69]:
# Crear el modelo de regresión logística
modelo_all = LogisticRegression(penalty=None)


# Configurar y ajustar el selector de características hacia adelante
sfs = SFS(modelo_all,
          k_features='best',
          forward=True,
          floating=False,
          scoring='roc_auc',
          cv=5)

sfs = sfs.fit(X_train_all, y_train)

# Ver las características seleccionadas
selected_features = list(sfs.k_feature_names_)
print(f"Selected features: {selected_features}")

X_train_all_sel = X_train_all[selected_features]
X_test_all_sel = X_test_all[selected_features]


# Ajustar el modelo con los datos de entrenamiento
modelo_all.fit(X_train_all_sel, y_train)

# Predecir en los datos de prueba
predicciones_all = modelo_all.predict(X_test_all_sel)

# Evaluar el modelo
accuracy_all = modelo_all.score(X_test_all_sel, y_test)
print("Exactitud del modelo:", accuracy_all)

# Predecir probabilidades en lugar de clases
predicciones_prob_all = modelo_all.predict_proba(X_test_all_sel)[:, 1]

# Calcular el AUC
auc_all = roc_auc_score(y_test, predicciones_prob_all)
print("Área bajo la curva ROC (AUC):", auc_all)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Selected features: ['duration_woe', 'amount_woe', 'duration_woe_and', 'duration_woe_ad', 'previous_woe', 'most_valuable_woe', 'length_employment_woe', 'purpose_woe', 'num_credits_woe', 'savings_woe', 'sex_marital_woe', 'telephone_woe', 'balance_woe', 'duration_address_woe', 'instalment_woe']
Exactitud del modelo: 0.7466666666666667
Área bajo la curva ROC (AUC): 0.7586772486772487


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [70]:
coefficients_all = modelo_all.coef_

intercept_all = modelo_all.intercept_

print("Coeficientes:", coefficients_all)
print("Intercept:", intercept_all)

Coeficientes: [[-1.04624739  1.05945779 -3.12250599 23.35580726  0.89144642  0.73200671
   0.69053378  1.0720351  -1.89413933  0.77859582  1.43085547  1.24067644
   0.82691208  3.57312596  2.5195821 ]]
Intercept: [-10.30243056]
