# **Formas prácticas de manejar los valores ausentes**





## Imputation of missing values

In [None]:
import numpy as np
from sklearn.impute import SimpleImputer

import pandas as pd


Univariate feature imputation - SimpleImputer

In [None]:
# Toy 3x3 input with a single missing entry (middle row, middle column).
x = [
    [7, 2, 3],
    [4, np.nan, 6],
    [10, 5, 9],
]
# Imputer configured to replace NaN using the 'mean' strategy.
imp_mean = SimpleImputer(strategy='mean', missing_values=np.nan)
print(x)

[[7, 2, 3], [4, nan, 6], [10, 5, 9]]


In [None]:
# Fit the imputer on x, keep the value returned by fit() in `si`, and display it.
si=imp_mean.fit(x)
si

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='mean', verbose=0)

In [None]:
# Fit on the same data once more; the returned object is shown as the cell output.
si.fit(x)

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='mean', verbose=0)

In [None]:
# Apply the fitted imputer to x and keep the result in y.
y=imp_mean.transform(x)

In [None]:
# Transform x again and print the result directly.
print(imp_mean.transform(x))

[[ 7.   2.   3. ]
 [ 4.   3.5  6. ]
 [10.   5.   9. ]]


In [None]:
# Print the result stored earlier in y.
print(y)

[[ 7.   2.   3. ]
 [ 4.   3.5  6. ]
 [10.   5.   9. ]]


Lo vemos más claro con pandas,

porque se ven mejor las filas y las columnas.

In [None]:
# Small demo frame built row by row: columns x, y and z each contain one NaN.
df = pd.DataFrame(
    [
        [1, np.nan, 0],
        [np.nan, 2, 2],
        [4, 5, np.nan],
        [2, 2, 4],
    ],
    columns=["x", "y", "z"],
)
df

Unnamed: 0,x,y,z
0,1.0,,0.0
1,,2.0,2.0
2,4.0,5.0,
3,2.0,2.0,4.0


In [None]:
# Define the type of imputation: replace NaN using the 'mean' strategy.
si = SimpleImputer(missing_values=np.nan, strategy='mean')

In [None]:
# Display the frame before imputation.
df

Unnamed: 0,x,y,z
0,1.0,,0.0
1,,2.0,2.0
2,4.0,5.0,
3,2.0,2.0,4.0


In [None]:
# ...and apply the imputation: fit on df and transform it in a single call.
df_si=si.fit_transform(df)

array([[1.        , 3.        , 0.        ],
       [2.33333333, 2.        , 2.        ],
       [4.        , 5.        , 2.        ],
       [2.        , 2.        , 4.        ]])

In [None]:
# Important: wrap the result in a pandas DataFrame; otherwise it remains a NumPy array.
# No column labels are passed here, so the new frame uses default labels.
df_si=pd.DataFrame(df_si)
df_si

Unnamed: 0,0,1,2
0,1.0,3.0,0.0
1,2.333333,2.0,2.0
2,4.0,5.0,2.0
3,2.0,2.0,4.0


**Multivariate feature imputation - IterativeImputer**

In [None]:
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [None]:
# Display the frame with missing values that will be imputed next.
df

Unnamed: 0,x,y,z
0,1.0,,0.0
1,,2.0,2.0
2,4.0,5.0,
3,2.0,2.0,4.0


In [None]:
# Define the type of imputation: iterative imputer, at most 10 iterations, fixed random seed.
ii = IterativeImputer(max_iter=10, random_state=0)

In [None]:
# ...and apply the imputation to df, then display the result.
df_ii=ii.fit_transform(df)
df_ii

**Nearest neighbors imputation**


In [None]:
# Import the nearest-neighbours imputer from sklearn.impute.
from sklearn.impute import KNNImputer

In [None]:
from sklearn.impute import KNNImputer

# Define the type of imputation: KNN imputer with 2 neighbours and uniform weights.
imputerKNN = KNNImputer(n_neighbors=2, weights="uniform")


In [None]:
# ...and apply the imputation to df, then display the result.
df_knn = imputerKNN.fit_transform(df)
df_knn

array([[1. , 2. , 0. ],
       [1.5, 2. , 2. ],
       [4. , 5. , 3. ],
       [2. , 2. , 4. ]])

Nuevo ejemplo, nuevos datos

In [None]:
# Student records as [marks, gender, result]; missing marks are NaN and
# missing genders are None.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
students = [[85, 'M', 'verygood'],
           [95, 'F', 'excellent'],
           [75, None, 'good'],
           [np.nan, 'M', 'average'],
           [70, 'M', 'good'],
           [np.nan, None, 'verygood'],
           [92, 'F', 'verygood'],
           [98, 'M', 'excellent']]

# Build the frame with its column names in a single step.
dfstd = pd.DataFrame(students, columns=['marks', 'gender', 'result'])

In [None]:
# Display the raw nested list.
students

[[85, 'M', 'verygood'],
 [95, 'F', 'excellent'],
 [75, None, 'good'],
 [nan, 'M', 'average'],
 [70, 'M', 'good'],
 [nan, None, 'verygood'],
 [92, 'F', 'verygood'],
 [98, 'M', 'excellent']]

In [None]:
# Keep an independent snapshot of the raw data. A plain assignment would only
# alias dfstd, so the in-place imputations applied to dfstd in later cells
# would change this "original" as well.
dfstdOriginal = dfstd.copy()
dfstdOriginal


Unnamed: 0,marks,gender,result
0,85.0,M,verygood
1,95.0,F,excellent
2,75.0,,good
3,,M,average
4,70.0,M,good
5,,,verygood
6,92.0,F,verygood
7,98.0,M,excellent


Vamos a imputar los valores

In [None]:
# Imputing all columns at once fails: a mean cannot be computed for
# categorical columns such as gender.
# The imputer is defined here so the cell does not depend on a later one.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
try:
    df3 = imputer.fit_transform(dfstdOriginal)
except ValueError as err:
    # Expected failure, reported so the rest of the notebook can keep running.
    print('Imputation failed:', err)
else:
    print(df3)

Así que lo hacemos sólo de la primera columna

In [None]:

# Define the type of imputation: replace NaN with the column mean.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# Apply it to the numeric 'marks' column only. The imputer works on 2-D input,
# so pass a one-column frame and flatten the result back to 1-D.
dfstd['marks'] = imputer.fit_transform(dfstd[['marks']]).ravel()
dfstd

Unnamed: 0,marks,gender,result
0,85.0,M,verygood
1,95.0,F,excellent
2,75.0,,good
3,85.833333,M,average
4,70.0,M,good
5,85.833333,,verygood
6,92.0,F,verygood
7,98.0,M,excellent


y para la segunda columna cogemos la categoría más frecuente

In [None]:
# Use a different type of imputation: entries equal to None are replaced
# with the most frequent value of the column.
imputer2 = SimpleImputer(strategy='most_frequent', missing_values=None)

# Reshape the gender column to (n, 1) for the imputer, then flatten it back.
dfstd.gender = imputer2.fit_transform(
    dfstd['gender'].values.reshape(-1, 1)
).ravel()
dfstd

In [None]:
# Display dfstdOriginal to check its current contents.
dfstdOriginal

In [None]:
# Work on an independent copy so that filling dfstd2 does not modify
# dfstdOriginal (a plain assignment would make both names share one frame).
dfstd2 = dfstdOriginal.copy()
dfstd2


Unnamed: 0,marks,gender,result
0,85.0,M,verygood
1,95.0,F,excellent
2,75.0,M,good
3,85.833333,M,average
4,70.0,M,good
5,85.833333,M,verygood
6,92.0,F,verygood
7,98.0,M,excellent


In [None]:
# Try again starting from the original values, this time filling the missing
# entries (None) with a constant of our choosing.
imputer = SimpleImputer(missing_values=None, strategy='constant', fill_value='F')

# Read the gender column from dfstdOriginal: dfstd['gender'] was already
# imputed in an earlier cell, so it would have nothing left to fill.
dfstd2['gender'] = imputer.fit_transform(dfstdOriginal['gender'].values.reshape(-1, 1))[:, 0]
dfstd2

Unnamed: 0,marks,gender,result
0,85.0,M,verygood
1,95.0,F,excellent
2,75.0,F,good
3,85.833333,M,average
4,70.0,M,good
5,85.833333,F,verygood
6,92.0,F,verygood
7,98.0,M,excellent


Y ahora con datos más grandes

In [None]:
import pandas as pd

# Load the horse-colic CSV from the URL: the file is read without a header row
# (header=None) and '?' entries are treated as missing values (na_values='?').
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/horse-colic.csv'
dataframe = pd.read_csv(url, header=None, na_values='?')

In [None]:
# Preview the first rows of the loaded data.
dataframe.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27
0,2.0,1,530101,38.5,66.0,28.0,3.0,3.0,,2.0,5.0,4.0,4.0,,,,3.0,5.0,45.0,8.4,,,2.0,2,11300,0,0,2
1,1.0,1,534817,39.2,88.0,20.0,,,4.0,1.0,3.0,4.0,2.0,,,,4.0,2.0,50.0,85.0,2.0,2.0,3.0,2,2208,0,0,2
2,2.0,1,530334,38.3,40.0,24.0,1.0,1.0,3.0,1.0,3.0,3.0,1.0,,,,1.0,1.0,33.0,6.7,,,1.0,2,0,0,0,1
3,1.0,9,5290409,39.1,164.0,84.0,4.0,1.0,6.0,2.0,2.0,4.0,4.0,1.0,2.0,5.0,3.0,,48.0,7.2,3.0,5.3,2.0,1,2208,0,0,1
4,2.0,1,530255,37.3,104.0,35.0,,,6.0,2.0,,,,,,,,,74.0,7.4,,,2.0,2,4300,0,0,2


Podemos enumerar cada columna e informar del número de filas con valores perdidos en esa columna.

In [None]:
# Define the type of imputation: iterative imputer, at most 10 iterations, fixed random seed.
ii = IterativeImputer(max_iter=10, random_state=0)

In [None]:
# Display the data before imputation.
dataframe

In [None]:
import pandas as pd

# ...and apply the imputation
df_ii2=ii.fit_transform(dataframe)
df3 = pd.DataFrame(df_ii2)  # important: wrap the result in a DataFrame to view it as pandas
df3.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27
0,2.0,1.0,530101.0,38.5,66.0,28.0,3.0,3.0,2.901983,2.0,5.0,4.0,4.0,1.733339,1.602366,5.335189,3.0,5.0,45.0,8.4,2.098757,3.521581,2.0,2.0,11300.0,0.0,0.0,2.0
1,1.0,1.0,534817.0,39.2,88.0,20.0,2.702491,2.34265,4.0,1.0,3.0,4.0,2.0,1.748298,1.654848,1.275257,4.0,2.0,50.0,85.0,2.0,2.0,3.0,2.0,2208.0,0.0,0.0,2.0
2,2.0,1.0,530334.0,38.3,40.0,24.0,1.0,1.0,3.0,1.0,3.0,3.0,1.0,1.777677,1.353778,5.531709,1.0,1.0,33.0,6.7,1.716613,3.633892,1.0,2.0,0.0,0.0,0.0,1.0
3,1.0,9.0,5290409.0,39.1,164.0,84.0,4.0,1.0,6.0,2.0,2.0,4.0,4.0,1.0,2.0,5.0,3.0,4.183747,48.0,7.2,3.0,5.3,2.0,1.0,2208.0,0.0,0.0,1.0
4,2.0,1.0,530255.0,37.3,104.0,35.0,3.150878,3.351311,6.0,2.0,3.534412,3.645108,2.79456,1.764873,1.963865,5.426239,3.56997,3.661319,74.0,7.4,2.585734,3.990319,2.0,2.0,4300.0,0.0,0.0,2.0


In [None]:
# Show the (rows, columns) dimensions of the loaded data.
dataframe.shape

(300, 28)

In [None]:
# Summarize the number of rows with missing values for each column.
# The null counts are computed once for the whole frame; iterating over the
# resulting Series yields plain scalars, so '%d' never receives a Series.
missing_per_column = dataframe.isnull().sum()
n_rows = dataframe.shape[0]
for i, n_miss in enumerate(missing_per_column):
    # Percentage of rows in which this column is missing.
    perc = n_miss / n_rows * 100
    print('> %d, Missing: %d (%.1f%%)' % (i, n_miss, perc))

Probamos KNNImputer como método de imputación

In [None]:
from sklearn.impute import KNNImputer
# Define the type of imputation: KNN imputer with 5 neighbours, uniform weights
# and the 'nan_euclidean' metric.
imputer = KNNImputer(n_neighbors=5, weights='uniform', metric='nan_euclidean')

In [None]:
# Display the data before KNN imputation.
dataframe

In [None]:
# Fit the KNN imputer on the data and transform it in one call, then display the result.
df_imputer_knn = imputer.fit_transform(dataframe)
df_imputer_knn

In [None]:
# Inspect the dtype of the imputed result.
df_imputer_knn.dtype

In [None]:
# KNN imputation transform for the horse colic dataset
from numpy import isnan
from pandas import read_csv
from sklearn.impute import KNNImputer
# Load the dataset (no header row; '?' entries are read as missing values)
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/horse-colic.csv'
dataframe = read_csv(url, header=None, na_values='?')
# Take the underlying values of the frame; the input/output split is done in a later cell
data = dataframe.values
data


In [None]:
# Dimensions of the reloaded DataFrame.
dataframe.shape

(300, 28)

In [None]:
# Dimensions of the values extracted from the DataFrame.
data.shape

(300, 28)

In [None]:
# NOTE: `data` comes from `dataframe.values`, i.e. it is a NumPy array with 28
# columns and has no `columns` attribute. The three student column names
# ('marks', 'gender', 'result') belong to the previous example and do not
# apply here, so no labels are assigned.

In [None]:
# Column 23 is separated out as y; every other column goes into X.
ix = [col for col in range(data.shape[1]) if col != 23]
X = data[:, ix]
y = data[:, 23]
# Count the missing entries in X before imputation.
print('Missing: %d' % isnan(X).sum())
# KNN imputer with default settings: fit on X, then transform the same X.
imputer = KNNImputer()
Xtrans = imputer.fit(X).transform(X)
# Count the missing entries that remain after imputation.
print('Missing: %d' % isnan(Xtrans).sum())

Missing: 1605
Missing: 0
