<a href="https://colab.research.google.com/github/jcmachicao/modpred__evaluacion/blob/main/modpred_eval__soluciones.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### **Curso Modelamiento Predictivo**
---
# <font color='blue'>**Evaluación de Modelos**</font>
* Autor: José Carlos Machicao
* Licencia: [GestioDinámica](http://www.gestiodinamica.com) 2021

### Temas
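* Data desbalanceada
* SMOTE (Synthetic Minority Oversampling Technique); en este cuaderno el remuestreo se hace con `RandomOverSampler`
* Eliminación de variables por varianza (`VarianceThreshold`)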


In [None]:
!pip install imbalanced-learn

In [2]:
import os
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from sklearn.datasets import make_classification
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import MinMaxScaler



In [17]:
# Disable NumPy's line wrapping so each array row prints on a single line.
np.set_printoptions(linewidth=np.inf)

In [3]:
# Folder that holds the course workbooks; list it to check the files are reachable.
# NOTE(review): this relative path looks like a mounted Google Drive, but no
# mount step is visible in this notebook — confirm the mount happens elsewhere.
ruta = 'drive/My Drive/2020 Cursos/2020 Modelamiento Predictivo/2021 ModPred EVAL/'
os.listdir(ruta)

['ActosViolentos.xlsx',
 'Actos_v2.xlsx',
 'bd_manzanas.xlsx',
 'bd_resultado.xlsx',
 'ModPred_Eval__Muestras.ipynb',
 'modpred_eval__soluciones.ipynb']

In [4]:
# Load the apples workbook from the course folder into a DataFrame.
data = pd.read_excel(os.path.join(ruta, 'bd_manzanas.xlsx'))

In [5]:
# Summarise column names, non-null counts and dtypes of the loaded table.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91 entries, 0 to 90
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   peso      91 non-null     int64  
 1   aroma     91 non-null     object 
 2   precio    91 non-null     int64  
 3   diametro  91 non-null     float64
 4   color     91 non-null     object 
 5   estado    91 non-null     object 
dtypes: float64(1), int64(2), object(3)
memory usage: 4.4+ KB


In [6]:
# Name of the target column; every other column is treated as a predictor.
label = 'estado'

In [7]:
# Split the predictor columns (every column except the target `label`) into
# categorical and numeric groups. Testing for a numeric dtype, instead of
# comparing against the string 'object', also sends pandas string/category
# columns to the categorical group rather than misfiling them as numeric.
var_cat, var_num = [], []
for col, dtype in data.dtypes.items():
  if col == label:
    continue  # the target is handled separately
  if pd.api.types.is_numeric_dtype(dtype):
    var_num.append(col)
  else:
    var_cat.append(col)
print(var_cat)
print(var_num)

['aroma', 'color']
['peso', 'precio', 'diametro']


In [8]:
# Frequency of every level of each categorical column (target included),
# stacked into a single Series indexed by (column, level).
data[[*var_cat, label]].apply(pd.Series.value_counts).T.stack()

aroma   agradable       61.0
        desagradable    30.0
color   marron          36.0
        rojo            39.0
        verde           16.0
estado  comestible      61.0
        malogrado       30.0
dtype: float64

In [9]:
# Build the design matrix: numeric columns as they are, categorical columns
# one-hot encoded with the first level dropped, then everything rescaled
# with MinMaxScaler. The target stays a pandas Series.
data_num = data[var_num]
data_cat = pd.get_dummies(data[var_cat], drop_first=True)
X_df = pd.concat([data_cat, data_num], axis=1)
scl = MinMaxScaler()
X = scl.fit_transform(X_df)  # fit and transform in one step on the same frame
y = data[label]
X.shape, type(X), y.shape, type(y)

((91, 6), numpy.ndarray, (91,), pandas.core.series.Series)

## <font color='red'>**Resampling**</font>

In [10]:
# Balance the classes by random oversampling; the fixed seed keeps the
# resampled set reproducible across runs.
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(X, y)



In [11]:
# Class counts after resampling, sorted by class name.
print(sorted(Counter(y_resampled).items()))

[('comestible', 61), ('malogrado', 61)]


## <font color='red'>**Eliminación por Varianza**</font>

In [18]:
# Peek at the first ten rows of the scaled feature matrix.
X[0:10]

array([[0.        , 0.        , 0.        , 0.84      , 0.33333333, 0.82758621],
       [0.        , 1.        , 0.        , 0.84      , 0.5       , 0.93103448],
       [0.        , 1.        , 0.        , 0.6       , 0.66666667, 0.31034483],
       [1.        , 0.        , 0.        , 0.32      , 0.5       , 0.27586207],
       [0.        , 0.        , 0.        , 0.4       , 0.16666667, 0.20689655],
       [1.        , 0.        , 1.        , 0.08      , 0.        , 0.31034483],
       [1.        , 0.        , 0.        , 0.6       , 0.        , 0.51724138],
       [0.        , 1.        , 0.        , 0.8       , 0.83333333, 0.96551724],
       [0.        , 1.        , 0.        , 0.72      , 0.33333333, 0.86206897],
       [0.        , 1.        , 0.        , 0.96      , 0.33333333, 0.93103448]])

In [20]:
# Drop low-variance features. p * (1 - p) is the variance of a 0/1 (Bernoulli)
# feature, so this cutoff targets dummy columns that take the same value in
# more than 90% of the rows.
# NOTE(review): the same cutoff is also applied to the MinMax-scaled numeric
# columns; the output below keeps only 4 of the 6 columns, and the two dropped
# ones (positions 3 and 4) are numeric — confirm this is intended.
P_DOMINANT = 0.9
sel = VarianceThreshold(threshold=P_DOMINANT * (1 - P_DOMINANT))
X_alt = sel.fit_transform(X)
X_alt[0:10]

array([[0.        , 0.        , 0.        , 0.82758621],
       [0.        , 1.        , 0.        , 0.93103448],
       [0.        , 1.        , 0.        , 0.31034483],
       [1.        , 0.        , 0.        , 0.27586207],
       [0.        , 0.        , 0.        , 0.20689655],
       [1.        , 0.        , 1.        , 0.31034483],
       [1.        , 0.        , 0.        , 0.51724138],
       [0.        , 1.        , 0.        , 0.96551724],
       [0.        , 1.        , 0.        , 0.86206897],
       [0.        , 1.        , 0.        , 0.93103448]])

# Referencias

https://pypi.org/project/imbalanced-learn/

https://www.youtube.com/watch?v=dkXB8HH_4-k

