<a href="https://colab.research.google.com/github/jeguns/EP7173/blob/main/Unidad%2009/09_Discretizacion_Python.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

In [None]:
# Load the working dataset from the Excel workbook (path relative to the
# current working directory).
datos1 = pd.read_excel('datos_intro_balanceo.xlsx')

In [None]:
# Relative frequency of each DESAPRUEBA class, returned as a table whose
# frequency column is named 'prop'.
tabla = (
    datos1['DESAPRUEBA']
    .value_counts(normalize=True)
    .reset_index(name='prop')
)
tabla

In [None]:
# Imbalance ratio: proportion in the first row of `tabla` divided by the
# proportion in the second row.
tabla['prop'].iloc[0]/tabla['prop'].iloc[1]

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def plot_dataframe(dataframe, x_col='EP', y_col='EF', hue_col='DESAPRUEBA'):
    """
    Draw a scatter plot of two columns, coloured by a third, with guide lines.

    Dashed gray reference lines are added at x = 10.5 and y = 10.5, and the
    figure is shown immediately via ``plt.show()``. Nothing is returned.

    Parameters:
    - dataframe: pd.DataFrame, data to plot.
    - x_col: str, name of the column for the x axis.
    - y_col: str, name of the column for the y axis.
    - hue_col: str, name of the column used to colour the points.
    """
    plt.figure(figsize=(6, 6))
    sns.scatterplot(data=dataframe, x=x_col, y=y_col, hue=hue_col, s=25)

    # Horizontal and vertical reference lines at 10.5 on each axis.
    plt.axhline(y=10.5, color='gray', linestyle='--')
    plt.axvline(x=10.5, color='gray', linestyle='--')

    # Build the title from the arguments so it stays accurate when the caller
    # plots columns other than the defaults (identical text for the defaults).
    plt.title(f"Gráfico de {x_col} vs {y_col} por {hue_col}")
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.legend(title=hue_col)
    plt.grid(True)
    plt.tight_layout()

    plt.show()

# Scatter plot of the original data, before any resampling.
plot_dataframe(datos1)

# Undersampling

## Random Undersampling

In [None]:
import numpy as np
from imblearn.under_sampling import RandomUnderSampler, CondensedNearestNeighbour, TomekLinks, EditedNearestNeighbours, NeighbourhoodCleaningRule

# Separate the target column from the predictors (all remaining columns).
y = datos1['DESAPRUEBA']
X = datos1.drop(columns=['DESAPRUEBA'])

In [None]:
# Random undersampling. The sampler receives its own random_state, which is
# what makes the draw reproducible, so no global NumPy seed is set here.
rus = RandomUnderSampler(random_state=9)
X_rus, y_rus = rus.fit_resample(X, y)

# Put the resampled predictors and target back into a single DataFrame.
datos1_rus = pd.concat(
    [
        pd.DataFrame(X_rus, columns=X.columns),
        pd.DataFrame(y_rus, columns=['DESAPRUEBA']),
    ],
    axis=1,
)

# Class proportions after resampling, then the ratio of the first row to the
# second row (displayed as the cell output).
tabla1_rus = (
    datos1_rus['DESAPRUEBA']
    .value_counts(normalize=True)
    .reset_index(name='prop')
)
tabla1_rus['prop'].iloc[0] / tabla1_rus['prop'].iloc[1]

In [None]:
# Scatter plot of the data after random undersampling.
plot_dataframe(datos1_rus)

## Condensed Nearest Neighbor

In [None]:
# Condensed Nearest Neighbour applied to the majority class. The seed is
# passed to the sampler itself so the result does not depend on the state of
# the global NumPy RNG (e.g. when cells are run out of order).
# Tip: run `?CondensedNearestNeighbour` in IPython to read its documentation.
cnn = CondensedNearestNeighbour(sampling_strategy='majority', random_state=44)
X_resampled, y_resampled = cnn.fit_resample(X, y)

# Put the resampled predictors and target back into a single DataFrame.
datos1_cnn = pd.concat(
    [
        pd.DataFrame(X_resampled, columns=X.columns),
        pd.DataFrame(y_resampled, columns=['DESAPRUEBA']),
    ],
    axis=1,
)

# Class proportions after resampling, then the first-row / second-row ratio.
tabla_cnn = (
    datos1_cnn['DESAPRUEBA']
    .value_counts(normalize=True)
    .reset_index(name='prop')
)
tabla_cnn['prop'].iloc[0] / tabla_cnn['prop'].iloc[1]

In [None]:
# Scatter plot of the data after Condensed Nearest Neighbour undersampling.
plot_dataframe(datos1_cnn)

## Tomek Links

In [None]:
# Tomek links undersampling applied to the majority class.
# NOTE(review): TomekLinks is not given a random_state here; the global seed
# below is probably unnecessary for this method - confirm before removing.
np.random.seed(44)
tomek = TomekLinks(sampling_strategy='majority')
X_resampled, y_resampled = tomek.fit_resample(X, y)

# Put the resampled predictors and target back into a single DataFrame.
datos1_tomek = pd.concat(
    [
        pd.DataFrame(X_resampled, columns=X.columns),
        pd.DataFrame(y_resampled, columns=['DESAPRUEBA']),
    ],
    axis=1,
)

# Class proportions after resampling, then the first-row / second-row ratio.
tabla_tomek = (
    datos1_tomek['DESAPRUEBA']
    .value_counts(normalize=True)
    .reset_index(name='prop')
)
tabla_tomek['prop'].iloc[0] / tabla_tomek['prop'].iloc[1]

In [None]:
# Scatter plot of the data after Tomek links undersampling.
plot_dataframe(datos1_tomek)

## One-Sided Selection

In [None]:
# Two-step undersampling built by hand: first Condensed Nearest Neighbour on
# the majority class, then Tomek links on the condensed result.
# The seed is passed to the CNN sampler directly so the outcome does not depend
# on the state of the global NumPy RNG.
cnn = CondensedNearestNeighbour(sampling_strategy='majority', random_state=44)
X_cnn, y_cnn = cnn.fit_resample(X, y)
tomek = TomekLinks(sampling_strategy='majority')
X_oss, y_oss = tomek.fit_resample(X_cnn, y_cnn)

# Put the resampled predictors and target back into a single DataFrame.
datos1_oss = pd.concat(
    [
        pd.DataFrame(X_oss, columns=X.columns),
        pd.DataFrame(y_oss, columns=['DESAPRUEBA']),
    ],
    axis=1,
)

# Class proportions after resampling, then the first-row / second-row ratio.
tabla_oss = (
    datos1_oss['DESAPRUEBA']
    .value_counts(normalize=True)
    .reset_index(name='prop')
)
tabla_oss['prop'].iloc[0] / tabla_oss['prop'].iloc[1]

In [None]:
# Scatter plot of the data after the CNN + Tomek links (one-sided selection) step.
plot_dataframe(datos1_oss)

## Edited Nearest Neighbors

In [None]:
# Edited Nearest Neighbours undersampling applied to the majority class.
# NOTE(review): no random_state is passed to the sampler; the global seed
# below is probably unnecessary for this method - confirm before removing.
np.random.seed(44)
enn = EditedNearestNeighbours(sampling_strategy='majority')
X_enn, y_enn = enn.fit_resample(X, y)

# Put the resampled predictors and target back into a single DataFrame.
datos1_enn = pd.concat(
    [
        pd.DataFrame(X_enn, columns=X.columns),
        pd.DataFrame(y_enn, columns=['DESAPRUEBA']),
    ],
    axis=1,
)

# Class proportions after resampling, then the first-row / second-row ratio.
tabla_enn = (
    datos1_enn['DESAPRUEBA']
    .value_counts(normalize=True)
    .reset_index(name='prop')
)
tabla_enn['prop'].iloc[0] / tabla_enn['prop'].iloc[1]

In [None]:
# Scatter plot of the data after Edited Nearest Neighbours undersampling.
plot_dataframe(datos1_enn)

## Neighborhood Cleaning Rule

In [None]:
# Neighbourhood Cleaning Rule undersampling applied to the majority class.
# NOTE(review): no random_state is passed to the sampler; the global seed
# below is probably unnecessary for this method - confirm before removing.
np.random.seed(44)
ncl = NeighbourhoodCleaningRule(sampling_strategy='majority')
X_ncl, y_ncl = ncl.fit_resample(X, y)

# Put the resampled predictors and target back into a single DataFrame.
datos1_ncl = pd.concat(
    [
        pd.DataFrame(X_ncl, columns=X.columns),
        pd.DataFrame(y_ncl, columns=['DESAPRUEBA']),
    ],
    axis=1,
)

# Class proportions after resampling, then the first-row / second-row ratio.
tabla_ncl = (
    datos1_ncl['DESAPRUEBA']
    .value_counts(normalize=True)
    .reset_index(name='prop')
)
tabla_ncl['prop'].iloc[0] / tabla_ncl['prop'].iloc[1]

In [None]:
# Scatter plot of the data after Neighbourhood Cleaning Rule undersampling.
plot_dataframe(datos1_ncl)

# Oversampling

In [None]:
import numpy as np
from imblearn.over_sampling import RandomOverSampler, SMOTE

# Rebuild target and predictors from the original data so the oversampling
# section starts from the unresampled dataset.
y = datos1['DESAPRUEBA']
X = datos1.drop(columns=['DESAPRUEBA'])

## Random Oversampling


In [None]:
# Random oversampling. The seed is given to the sampler itself so the draw is
# reproducible regardless of the state of the global NumPy RNG.
ros = RandomOverSampler(random_state=119)
X_ros, y_ros = ros.fit_resample(X, y)

# Put the resampled predictors and target back into a single DataFrame.
datos1_ros = pd.concat(
    [
        pd.DataFrame(X_ros, columns=X.columns),
        pd.DataFrame(y_ros, columns=['DESAPRUEBA']),
    ],
    axis=1,
)

# Class proportions after resampling, then the first-row / second-row ratio.
tabla_ros = (
    datos1_ros['DESAPRUEBA']
    .value_counts(normalize=True)
    .reset_index(name='prop')
)
tabla_ros['prop'].iloc[0] / tabla_ros['prop'].iloc[1]

In [None]:
# Scatter plot of the data after random oversampling.
plot_dataframe(datos1_ros)

## SMOTE

In [None]:
# SMOTE oversampling of the minority class. The seed is given to the sampler
# itself so the synthetic points are reproducible regardless of the state of
# the global NumPy RNG.
smote = SMOTE(sampling_strategy='minority', random_state=129)
X_smote, y_smote = smote.fit_resample(X, y)

# Put the resampled predictors and target back into a single DataFrame.
datos1_smote1 = pd.concat(
    [
        pd.DataFrame(X_smote, columns=X.columns),
        pd.DataFrame(y_smote, columns=['DESAPRUEBA']),
    ],
    axis=1,
)

# Class proportions after resampling, then the first-row / second-row ratio.
tabla_smote = (
    datos1_smote1['DESAPRUEBA']
    .value_counts(normalize=True)
    .reset_index(name='prop')
)
tabla_smote['prop'].iloc[0] / tabla_smote['prop'].iloc[1]

In [None]:
# Scatter plot of the data after SMOTE oversampling.
plot_dataframe(datos1_smote1)

# Oversampling + Undersampling

## Random Oversampling + Undersampling

In [None]:
# One generator shared by both samplers: the run is reproducible without
# seeding the global NumPy RNG, and the undersampler continues the random
# stream where the oversampler left off.
rng = np.random.RandomState(9)

# Common target size: the mean of the class counts, truncated to an integer.
count_classes = y.value_counts()
target_size = int(count_classes.mean())

# Step 1: oversample the least frequent class up to target_size.
ros = RandomOverSampler(
    sampling_strategy={count_classes.idxmin(): target_size},
    random_state=rng,
)
X_oversampled, y_oversampled = ros.fit_resample(X, y)

# Step 2: undersample the most frequent class down to target_size.
rus = RandomUnderSampler(
    sampling_strategy={count_classes.idxmax(): target_size},
    random_state=rng,
)
X_balanced, y_balanced = rus.fit_resample(X_oversampled, y_oversampled)

# Put the resampled predictors and target back into a single DataFrame.
datos1_balanced = pd.concat(
    [
        pd.DataFrame(X_balanced, columns=X.columns),
        pd.DataFrame(y_balanced, columns=['DESAPRUEBA']),
    ],
    axis=1,
)

# Class proportions after resampling, then the first-row / second-row ratio.
tabla_balanced = (
    datos1_balanced['DESAPRUEBA']
    .value_counts(normalize=True)
    .reset_index(name='prop')
)
tabla_balanced['prop'].iloc[0] / tabla_balanced['prop'].iloc[1]

In [None]:
# Absolute class counts after the combined random over/undersampling.
datos1_balanced['DESAPRUEBA'].value_counts()

In [None]:
# Scatter plot of the data after the combined random over/undersampling.
plot_dataframe(datos1_balanced)

## SMOTE + Tomek Links

In [None]:
from imblearn.combine import SMOTETomek

# SMOTE oversampling of the minority class combined with Tomek links cleaning.
# The seed is given to the sampler itself so the result is reproducible
# regardless of the state of the global NumPy RNG.
smote_tomek = SMOTETomek(sampling_strategy="minority", random_state=9)
X_both, y_both = smote_tomek.fit_resample(X, y)

# Put the resampled predictors and target back into a single DataFrame.
datos1_both1 = pd.concat(
    [
        pd.DataFrame(X_both, columns=X.columns),
        pd.DataFrame(y_both, columns=['DESAPRUEBA']),
    ],
    axis=1,
)

# Class proportions after resampling, then the first-row / second-row ratio.
tabla_both = (
    datos1_both1['DESAPRUEBA']
    .value_counts(normalize=True)
    .reset_index(name='prop')
)
tabla_both['prop'].iloc[0] / tabla_both['prop'].iloc[1]

In [None]:
# Absolute class counts after the SMOTETomek resampling.
datos1_both1['DESAPRUEBA'].value_counts()

In [None]:
# Scatter plot of the data after the SMOTETomek resampling.
plot_dataframe(datos1_both1)