# 07 - BALANCEO DE DATOS

Vamos a realizar los siguientes métodos:

- Sin balanceo
- Undersampling
- Oversampling
- Smote-tomek

## IMPORTAR PAQUETES

In [1]:
import numpy as np
import pandas as pd

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression

from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

## IMPORTAR LOS DATOS

In [2]:
df = pd.read_pickle('../../02_Datos/03_Trabajo/df_tablon.pickle')
df.head()

Unnamed: 0,estacion,edad,e_infantil,acc_grave,int_quirurgica,fiebre_ult_any,frec_alcohol,fumar,hr_sentado,produccion
0,-0.33,0.69,0,1,1,0,0.8,0,0.88,0
1,-0.33,0.94,1,0,1,0,0.8,1,0.31,1
2,-0.33,0.5,1,0,0,0,1.0,-1,0.5,0
4,-0.33,0.67,1,1,0,0,0.8,-1,0.5,1
5,-0.33,0.67,1,0,1,0,0.8,0,0.5,0


### Cargar los datos

In [3]:
x = df.drop(columns= 'produccion').copy()
y = df.produccion.copy()

In [4]:
x.head()

Unnamed: 0,estacion,edad,e_infantil,acc_grave,int_quirurgica,fiebre_ult_any,frec_alcohol,fumar,hr_sentado
0,-0.33,0.69,0,1,1,0,0.8,0,0.88
1,-0.33,0.94,1,0,1,0,0.8,1,0.31
2,-0.33,0.5,1,0,0,0,1.0,-1,0.5
4,-0.33,0.67,1,1,0,0,0.8,-1,0.5
5,-0.33,0.67,1,0,1,0,0.8,0,0.5


In [5]:
y.head()

0    0
1    1
2    0
4    1
5    0
Name: produccion, dtype: int64

## BALANCEO

### SIN BALANCEO

#### Crear train y test

In [6]:
train_x, val_x, train_y, val_y = train_test_split(x, y, test_size= 0.3, random_state= 42)

#### Instanciar modelo

In [7]:
rl_sin_balanceo = LogisticRegression(n_jobs= -1)

#### Entrenar

In [8]:
rl_sin_balanceo.fit(train_x, train_y)

#### Aplicar

In [9]:
pred_rl_sin_balanceo = rl_sin_balanceo.predict_proba(val_x)[:,1]
pred = rl_sin_balanceo.predict(val_x)

#### Evaluar

In [10]:
roc1_rl_sin_balanceo = roc_auc_score(val_y, pred_rl_sin_balanceo)
roc_rl_sin_balanceo = roc_auc_score(val_y, pred)
roc_rl_sin_balanceo_CR = (classification_report(val_y, pred))
roc_rl_sin_balanceo_CF = (confusion_matrix(val_y, pred))

print(f'AUC [1]: {roc1_rl_sin_balanceo}')
print(f'AUC: {roc_rl_sin_balanceo}')
print(roc_rl_sin_balanceo_CR)
print(roc_rl_sin_balanceo_CF)

AUC [1]: 0.48148148148148145
AUC: 0.5
              precision    recall  f1-score   support

           0       0.86      1.00      0.92        18
           1       0.00      0.00      0.00         3

    accuracy                           0.86        21
   macro avg       0.43      0.50      0.46        21
weighted avg       0.73      0.86      0.79        21

[[18  0]
 [ 3  0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### UNDERSAMPLING

#### Instanciar el undersampler

In [11]:
rus = RandomUnderSampler(sampling_strategy= 1)

#### Entrenar y aplicar el undersampler

In [12]:
x_rus, y_rus = rus.fit_resample(x,y)

#### Crear train y test

In [13]:
train_x_rus, val_x_rus, train_y_rus, val_y_rus = train_test_split(x_rus, y_rus, test_size= 0.3, random_state= 42)

#### Instanciar el modelo

In [14]:
rl_rus = LogisticRegression(n_jobs=-1)

#### Entrenar

In [15]:
rl_rus.fit(train_x_rus,train_y_rus)

#### Aplicar

In [16]:
pred_rl_rus = rl_rus.predict_proba(val_x_rus)[:,1]
pred_rl_rusCR = rl_rus.predict(val_x_rus)

#### Evaluar

In [17]:
roc1_rl_rus = roc_auc_score(val_y_rus,pred_rl_rus)
roc_rl_rus = roc_auc_score(val_y_rus, pred_rl_rusCR)
roc_rl_rus_CR = classification_report(val_y_rus, pred_rl_rusCR)
roc_rl_rus_CF = confusion_matrix(val_y_rus, pred_rl_rusCR)

print(f'AUC [1]: {roc1_rl_rus}')
print(f'AUC: {roc_rl_rus}')
print(roc_rl_rus_CR)
print(roc_rl_rus_CF)

AUC [1]: 0.8
AUC: 0.7
              precision    recall  f1-score   support

           0       1.00      0.40      0.57         5
           1       0.25      1.00      0.40         1

    accuracy                           0.50         6
   macro avg       0.62      0.70      0.49         6
weighted avg       0.88      0.50      0.54         6

[[2 3]
 [0 1]]


### OVERSAMPLING

#### Instanciar el Oversampler

In [18]:
ros = RandomOverSampler(sampling_strategy=1)

#### Entrenar y aplicar el Oversampler

In [19]:
x_ros, y_ros = ros.fit_resample(x,y)

#### Crear train y test

In [20]:
train_x_ros, val_x_ros, train_y_ros, val_y_ros = train_test_split(x_ros, y_ros, test_size= 0.3, random_state= 42)

#### Instanciar el modelo

In [21]:
rl_ros = LogisticRegression(n_jobs= -1)

#### Entrenar

In [22]:
rl_ros.fit(train_x_ros, train_y_ros)

#### Aplicar

In [23]:
pred_rl_ros = rl_ros.predict_proba(val_x_ros)[:,1]
pred_rl_rosCR = rl_ros.predict(val_x_ros)

#### Evaluar

In [24]:
roc1_rl_ros = roc_auc_score(val_y_ros,pred_rl_ros)
roc_rl_ros = roc_auc_score(val_y_ros, pred_rl_rosCR)
roc_rl_ros_CR = classification_report(val_y_ros, pred_rl_rosCR)
roc_rl_ros_CF = confusion_matrix(val_y_ros, pred_rl_rosCR)

print(f'AUC [1]: {roc1_rl_ros}')
print(f'AUC: {roc_rl_ros}')
print(roc_rl_ros_CR)
print(roc_rl_ros_CF)

AUC [1]: 0.7690058479532164
AUC: 0.6198830409356725
              precision    recall  f1-score   support

           0       0.62      0.56      0.59        18
           1       0.62      0.68      0.65        19

    accuracy                           0.62        37
   macro avg       0.62      0.62      0.62        37
weighted avg       0.62      0.62      0.62        37

[[10  8]
 [ 6 13]]


### SMOTE-TOMEK

#### Instanciar un Tomek y un SMOTE

In [25]:
tom = TomekLinks(n_jobs= -1)
smo = SMOTE(sampling_strategy= 1, n_jobs= -1)

#### Instanciar el SMOTE-Tomek

In [26]:
sto = SMOTETomek(sampling_strategy= 1,
                 smote= smo,
                 tomek= tom,
                 n_jobs= -1)

#### Entrenar y aplicar el SMOTE-Tomek

In [27]:
x_sto, y_sto = sto.fit_resample(x, y)



#### Crear train y test el SMOTE-Tomek

In [28]:
train_x_sto, val_x_sto, train_y_sto, val_y_sto = train_test_split(x_sto, y_sto, test_size= 0.3, random_state= 42)

#### Instanciar el modelo

In [29]:
rl_sto = LogisticRegression(n_jobs=-1)

#### Entrenar

In [30]:
rl_sto.fit(train_x_sto, train_y_sto)

#### Aplicar

In [31]:
pred_rl_sto = rl_sto.predict_proba(val_x_sto)[:,1]
pred_rl_stoCR = rl_sto.predict(val_x_sto)

#### Evaluar

In [32]:

roc1_rl_sto = roc_auc_score(val_y_sto,pred_rl_sto)
roc_rl_sto = roc_auc_score(val_y_sto, pred_rl_stoCR)
roc_rl_sto_CR = classification_report(val_y_sto, pred_rl_stoCR)
roc_rl_sto_CF = confusion_matrix(val_y_sto, pred_rl_stoCR)

print(f'AUC [1]: {roc1_rl_sto}')
print(f'AUC: {roc_rl_sto}')
print(roc_rl_sto_CR)
print(roc_rl_sto_CF)

AUC [1]: 0.8450292397660819
AUC: 0.8377192982456141
              precision    recall  f1-score   support

           0       0.83      0.83      0.83        18
           1       0.84      0.84      0.84        19

    accuracy                           0.84        37
   macro avg       0.84      0.84      0.84        37
weighted avg       0.84      0.84      0.84        37

[[15  3]
 [ 3 16]]


### EVALUAR RESULTADOS

In [33]:
print('***Sin Balanceo***\n')
print(f'Sin Balanceo - AUC[1]: {roc1_rl_sin_balanceo}')
print(f'Sin Balanceo - AUC: {roc_rl_sin_balanceo}')
print(roc_rl_sin_balanceo_CR)
print('Matrix Confusión:\n' ,roc_rl_sin_balanceo_CF)

print('\n***Undersampling***\n')
print(f'Undersampling - AUC[1]: {roc1_rl_rus}')
print(f'Undersampling - AUC: {roc_rl_rus}')
print(roc_rl_rus_CR)
print('Matrix Confusión:\n' ,roc_rl_rus_CF)

print('\n***Oversampling***\n')
print(f'Oversampling - AUC[1]: {roc1_rl_ros}')
print(f'Oversampling - AUC: {roc_rl_ros}')
print(roc_rl_ros_CR)
print('Matrix Confusión:\n' ,roc_rl_ros_CF)

print('\n***Smote-Tomek***\n')
print(f'Smote-Tomek - AUC[1]: {roc1_rl_sto}')
print(f'Smote-Tomek - AUC: {roc_rl_sto}')
print(roc_rl_sto_CR)
print('Matrix Confusión:\n' ,roc_rl_sto_CF)

***Sin Balanceo***

Sin Balanceo - AUC[1]: 0.48148148148148145
Sin Balanceo - AUC: 0.5
              precision    recall  f1-score   support

           0       0.86      1.00      0.92        18
           1       0.00      0.00      0.00         3

    accuracy                           0.86        21
   macro avg       0.43      0.50      0.46        21
weighted avg       0.73      0.86      0.79        21

Matrix Confusión:
 [[18  0]
 [ 3  0]]

***Undersampling***

Undersampling - AUC[1]: 0.8
Undersampling - AUC: 0.7
              precision    recall  f1-score   support

           0       1.00      0.40      0.57         5
           1       0.25      1.00      0.40         1

    accuracy                           0.50         6
   macro avg       0.62      0.70      0.49         6
weighted avg       0.88      0.50      0.54         6

Matrix Confusión:
 [[2 3]
 [0 1]]

***Oversampling***

Oversampling - AUC[1]: 0.7690058479532164
Oversampling - AUC: 0.6198830409356725
          

**CONCLUSIÓN:** Podemos establecer que el modelo trabaja mejor aplicando un balanceo. El que mejor resultados a obtenido ha sido el Smote-Tomek. Por lo que aplicaremos el dataset con el Smote-Tomek al XGBClassifier

## GUARDAR DATASET TRAS PRESELECCIÓN

In [34]:
# Balanceo seleccionado

x_final = x_sto
y_final = y_sto
# Definir los nombres de los archivos

x_final.to_pickle('../../02_Datos/03_Trabajo/x_balanceo.pickle')
y_final.to_pickle('../../02_Datos/03_Trabajo/y_balanceo.pickle')