# Imputación de Datos

In [1]:
import numpy as np
import pandas as pd

In [3]:
# Load the sample dataset (contains missing values) used by every cell below
df1 = pd.read_csv(filepath_or_buffer='dataset/dataimputar.csv')

In [4]:
# Display the loaded frame to see where values are missing
df1

Unnamed: 0,A,B,C,D
0,1.0,,2.0,Curicó
1,0.0,3.0,5.0,Pichilemu
2,1.0,3.5,,Santiago
3,,4.0,6.0,Santiago
4,0.0,2.8,2.0,Chiloé
5,0.0,3.1,3.0,Curicó
6,1.0,5.2,4.0,Santiago
7,0.0,0.0,2.0,Pichilemu
8,1.0,,,Santiago
9,1.0,,2.1,Santiago


## NaN: dropna()

In [5]:
# Drop every row that contains at least one NaN value
df_clean1 = df1.dropna(axis=0, how="any")
df_clean1

Unnamed: 0,A,B,C,D
1,0.0,3.0,5.0,Pichilemu
4,0.0,2.8,2.0,Chiloé
5,0.0,3.1,3.0,Curicó
6,1.0,5.2,4.0,Santiago
7,0.0,0.0,2.0,Pichilemu


In [6]:
# Drop every column that contains at least one NaN value
df_clean2 = df1.dropna(axis=1, how="any")
df_clean2

Unnamed: 0,D
0,Curicó
1,Pichilemu
2,Santiago
3,Santiago
4,Chiloé
5,Curicó
6,Santiago
7,Pichilemu
8,Santiago
9,Santiago


In [8]:
# Keep only the columns that have at least 8 non-NaN values;
# columns with fewer valid entries are dropped
df_clean3 = df1.dropna(axis=1, thresh=8)
df_clean3

Unnamed: 0,A,C,D
0,1.0,2.0,Curicó
1,0.0,5.0,Pichilemu
2,1.0,,Santiago
3,,6.0,Santiago
4,0.0,2.0,Chiloé
5,0.0,3.0,Curicó
6,1.0,4.0,Santiago
7,0.0,2.0,Pichilemu
8,1.0,,Santiago
9,1.0,2.1,Santiago


In [10]:
# Keep only the rows that have at least 3 non-NaN values;
# rows with fewer valid entries are dropped
df_clean4 = df1.dropna(axis=0, thresh=3)
df_clean4

Unnamed: 0,A,B,C,D
0,1.0,,2.0,Curicó
1,0.0,3.0,5.0,Pichilemu
2,1.0,3.5,,Santiago
3,,4.0,6.0,Santiago
4,0.0,2.8,2.0,Chiloé
5,0.0,3.1,3.0,Curicó
6,1.0,5.2,4.0,Santiago
7,0.0,0.0,2.0,Pichilemu
9,1.0,,2.1,Santiago


In [11]:
# Drop the rows that have a NaN in any of the listed columns;
# NaN values in the other columns are ignored
df_clean5 = df1.dropna(axis=0, how="any", subset=["A", "B"])
df_clean5

Unnamed: 0,A,B,C,D
1,0.0,3.0,5.0,Pichilemu
2,1.0,3.5,,Santiago
4,0.0,2.8,2.0,Chiloé
5,0.0,3.1,3.0,Curicó
6,1.0,5.2,4.0,Santiago
7,0.0,0.0,2.0,Pichilemu


## isnull()

In [14]:
# Count the missing values in column C
df1.loc[:, "C"].isnull().sum()

2

In [16]:
# Print the number of missing values of each column as "name: count"
for column, missing in df1.isnull().sum().items():
    print(column + ': ' + str(missing))

A: 1
B: 3
C: 2
D: 0


## fillna()

In [17]:
# Fill each NaN with the next valid value found below it in the same column
# (backward fill). NaNs with no valid value after them remain NaN.
# DataFrame.bfill() is used instead of fillna(method='backfill') because the
# `method` argument of fillna is deprecated since pandas 2.1.
df1.bfill()

Unnamed: 0,A,B,C,D
0,1.0,3.0,2.0,Curicó
1,0.0,3.0,5.0,Pichilemu
2,1.0,3.5,6.0,Santiago
3,0.0,4.0,6.0,Santiago
4,0.0,2.8,2.0,Chiloé
5,0.0,3.1,3.0,Curicó
6,1.0,5.2,4.0,Santiago
7,0.0,0.0,2.0,Pichilemu
8,1.0,,2.1,Santiago
9,1.0,,2.1,Santiago


In [18]:
# Fill each NaN with the last valid value found above it in the same column
# (forward fill). NaNs with no valid value before them remain NaN.
# DataFrame.ffill() is used instead of fillna(method='ffill') because the
# `method` argument of fillna is deprecated since pandas 2.1.
df1.ffill()

Unnamed: 0,A,B,C,D
0,1.0,,2.0,Curicó
1,0.0,3.0,5.0,Pichilemu
2,1.0,3.5,5.0,Santiago
3,1.0,4.0,6.0,Santiago
4,0.0,2.8,2.0,Chiloé
5,0.0,3.1,3.0,Curicó
6,1.0,5.2,4.0,Santiago
7,0.0,0.0,2.0,Pichilemu
8,1.0,0.0,2.0,Santiago
9,1.0,0.0,2.1,Santiago


In [19]:
# Combine both methods: forward fill first, then backward fill whatever is
# still NaN (i.e. leading NaNs that had no previous valid value).
# ffill()/bfill() replace fillna(method=...), whose `method` argument is
# deprecated since pandas 2.1.
df1.ffill().bfill()

Unnamed: 0,A,B,C,D
0,1.0,3.0,2.0,Curicó
1,0.0,3.0,5.0,Pichilemu
2,1.0,3.5,5.0,Santiago
3,1.0,4.0,6.0,Santiago
4,0.0,2.8,2.0,Chiloé
5,0.0,3.1,3.0,Curicó
6,1.0,5.2,4.0,Santiago
7,0.0,0.0,2.0,Pichilemu
8,1.0,0.0,2.0,Santiago
9,1.0,0.0,2.1,Santiago


In [20]:
# Fill each NaN with the mean of its column.
# numeric_only=True restricts the mean to numeric columns: since pandas 2.0
# DataFrame.mean() no longer silently skips non-numeric columns and would
# raise a TypeError on the text column D.
df1.fillna(value=df1.mean(numeric_only=True))

Unnamed: 0,A,B,C,D
0,1.0,3.085714,2.0,Curicó
1,0.0,3.0,5.0,Pichilemu
2,1.0,3.5,3.2625,Santiago
3,0.555556,4.0,6.0,Santiago
4,0.0,2.8,2.0,Chiloé
5,0.0,3.1,3.0,Curicó
6,1.0,5.2,4.0,Santiago
7,0.0,0.0,2.0,Pichilemu
8,1.0,3.085714,3.2625,Santiago
9,1.0,3.085714,2.1,Santiago
