# Limpieza de Datos

In [0]:
import pandas as pd 
import numpy as np 

### Valores perdidos

In [0]:
# Sample data with two real missing values (np.nan): Dave's height and
# Mike's test score. Column order in the frame follows the dict key order.
dic = {
    'Name': ['Alex', 'Mike', 'John', 'Dave', 'Joey'],
    'Height(m)': [1.75, 1.55, 1.73, np.nan, 1.82],
    'Weight(Kg)': [80.5, 70.2, 100.3, 67.3, 120.2],
    'Test Score': [70, np.nan, 84, 62, 73],
}
df = pd.DataFrame(data=dic)
df

Unnamed: 0,Name,Height(m),Weight(Kg),Test Score
0,Alex,1.75,80.5,70.0
1,Mike,1.55,70.2,
2,John,1.73,100.3,84.0
3,Dave,,67.3,62.0
4,Joey,1.82,120.2,73.0


Utilizaremos el método `.isnull()` para comprobar cuántos valores nulos hay en cada columna.

In [0]:
# Count the null entries in each column (isna is the alias of isnull).
df.isna().sum()

Name          0
Height(m)     1
Weight(Kg)    0
Test Score    1
dtype: int64

Ahora que sabemos qué columnas tienen valores nulos, podemos eliminar o completar esos datos de distintas maneras.

In [0]:
# Drop every row that contains at least one null value.
# The result goes into a new variable instead of overwriting `df`, so the
# cells that follow still see the full five-row frame when the notebook is
# run top to bottom.
df_sin_nulos = df.dropna()
df_sin_nulos

Unnamed: 0,Name,Height(m),Weight(Kg),Test Score
0,Alex,1.75,80.5,70.0
2,John,1.73,100.3,84.0
4,Joey,1.82,120.2,73.0


In [0]:
# Rebuild the raw frame from `dic` so this cell always starts from the
# original data; this replaces the manual "re-run the DataFrame cell" step.
df = pd.DataFrame(dic)

# .fillna(): replace every null value with 0. The filled frame is kept under
# its own name so `df` still holds the nulls for the cells that follow.
df_ceros = df.fillna(0)
df_ceros

Unnamed: 0,Name,Height(m),Weight(Kg),Test Score
0,Alex,1.75,80.5,70.0
1,Mike,1.55,70.2,0.0
2,John,1.73,100.3,84.0
3,Dave,0.0,67.3,62.0
4,Joey,1.82,120.2,73.0


In [0]:
# Mean of the 'Height(m)' column, rounded to two decimals.
# NOTE: despite its name ('peso' means weight), this variable holds a height;
# the name is kept because the next cell reads it.
altura_media = df['Height(m)'].mean()
peso = round(altura_media, 2)
peso

1.71

In [0]:
# Fill the nulls of 'Height(m)' only, using the value stored in `peso`;
# the other columns are left as they are.
df = df.fillna({'Height(m)': peso})
df

Unnamed: 0,Name,Height(m),Weight(Kg),Test Score
0,Alex,1.75,80.5,70.0
1,Mike,1.55,70.2,
2,John,1.73,100.3,84.0
3,Dave,1.71,67.3,62.0
4,Joey,1.82,120.2,73.0


### Valores perdidos no estándar

In [0]:
# Same sample data, but the missing entries are written as placeholder
# strings ('-' and 'Nada') instead of np.nan.
dic = {
    'Name': ['Alex', 'Mike', 'John', 'Dave', 'Joey'],
    'Height(m)': [1.75, 1.55, 1.73, '-', 1.82],
    'Weight(Kg)': [80.5, 70.2, 100.3, 67.3, 120.2],
    'Test Score': [70, 'Nada', 84, 62, 73],
}
df_2 = pd.DataFrame(data=dic)
df_2

Unnamed: 0,Name,Height(m),Weight(Kg),Test Score
0,Alex,1.75,80.5,70
1,Mike,1.55,70.2,Nada
2,John,1.73,100.3,84
3,Dave,-,67.3,62
4,Joey,1.82,120.2,73


In [0]:
# Count the null entries in each column (isna is the alias of isnull).
df_2.isna().sum()

Name          0
Height(m)     0
Weight(Kg)    0
Test Score    0
dtype: int64

Los valores no estándar (como '-' o 'Nada') no se detectan como valores nulos, por lo que se tienen que manejar de otra manera.

In [0]:
# Replace the placeholder strings with real null values.
# The result is stored under a new name so `df_2` keeps the raw placeholders
# and the next cell can still find them when the notebook is run in order.
df_2_nulos = df_2.replace('-', np.nan)  # applied to the whole dataframe
df_2_nulos['Test Score'] = df_2_nulos['Test Score'].replace('Nada', np.nan)  # applied to one column
df_2_nulos

Unnamed: 0,Name,Height(m),Weight(Kg),Test Score
0,Alex,1.75,80.5,70.0
1,Mike,1.55,70.2,
2,John,1.73,100.3,84.0
3,Dave,,67.3,62.0
4,Joey,1.82,120.2,73.0


In [0]:
# Rebuild the raw frame from `dic` so the placeholders are present no matter
# what earlier cells did to `df_2`; no manual re-run is needed.
df_2 = pd.DataFrame(dic)

# .replace(): swap the placeholder strings for 0, keeping the result under
# its own name.
df_2_ceros = df_2.replace('-', 0)  # applied to the whole dataframe
df_2_ceros['Test Score'] = df_2_ceros['Test Score'].replace('Nada', 0)  # applied to one column
df_2_ceros

Unnamed: 0,Name,Height(m),Weight(Kg),Test Score
0,Alex,1.75,80.5,70
1,Mike,1.55,70.2,0
2,John,1.73,100.3,84
3,Dave,0.0,67.3,62
4,Joey,1.82,120.2,73


### Crear columnas a partir de datos

Añadir la columna de IMC (Índice de Masa Corporal) con la fórmula: $\mathrm{IMC} = \dfrac{\text{peso (kg)}}{\text{altura (m)}^2}$

In [0]:
# Body-mass index: the 'Weight(Kg)' column divided by the square of the
# 'Height(m)' column, computed row by row.
altura_cuadrado = df['Height(m)'].pow(2)
df['IMC'] = df['Weight(Kg)'].div(altura_cuadrado)
df

Unnamed: 0,Name,Height(m),Weight(Kg),Test Score,IMC
0,Alex,1.75,80.5,70.0,26.285714
1,Mike,1.55,70.2,,29.219563
2,John,1.73,100.3,84.0,33.512647
3,Dave,1.71,67.3,62.0,23.015629
4,Joey,1.82,120.2,73.0,36.287888


### Segmentar data en intervalos

Se utilizará la función `pd.cut()`.

In [0]:
# Edges of the BMI classes. The last edge is open-ended (np.inf) so that every
# IMC of 40 or more lands in 'Obesidad III'; with a fixed upper edge of 45,
# larger values fell outside all bins and were labelled NaN.
bins = (18.5, 25, 30, 35, 40, np.inf)
group_names = ['Normal', 'Sobrepeso', 'Obesidad I', 'Obesidad II', 'Obesidad III']

# right=False makes each bin closed on the left, [a, b): an IMC of exactly
# 25, 30, 35 or 40 goes to the higher class and 18.5 counts as 'Normal'.
# Values below 18.5 match no bin and stay NaN.
categories = pd.cut(df['IMC'], bins, right=False, labels=group_names)
df['Clasificacion'] = categories
df

Unnamed: 0,Name,Height(m),Weight(Kg),Test Score,IMC,Clasificacion
0,Alex,1.75,80.5,70.0,26.285714,Sobrepeso
1,Mike,1.55,70.2,,29.219563,Sobrepeso
2,John,1.73,100.3,84.0,33.512647,Obesidad I
3,Dave,1.71,67.3,62.0,23.015629,Normal
4,Joey,1.82,120.2,73.0,36.287888,Obesidad II


#### Ordenar Data

In [0]:
# Order the rows from highest to lowest IMC; the original index labels
# travel with their rows.
df = df.sort_values(by='IMC', ascending=False)
df


Unnamed: 0,Name,Height(m),Weight(Kg),Test Score,IMC,Clasificacion
4,Joey,1.82,120.2,73.0,36.287888,Obesidad II
3,John,1.73,100.3,84.0,33.512647,Obesidad I
2,Mike,1.55,70.2,,29.219563,Sobrepeso
1,Alex,1.75,80.5,70.0,26.285714,Sobrepeso
0,Dave,1.71,67.3,62.0,23.015629,Normal


In [0]:
# Renumber the rows 0..n-1 in their current order; drop=True discards the old
# index instead of keeping it as a new column.
df = df.reset_index(drop=True)
df

Unnamed: 0,Name,Height(m),Weight(Kg),Test Score,IMC,Clasificacion
0,Dave,1.71,67.3,62.0,23.015629,Normal
1,Alex,1.75,80.5,70.0,26.285714,Sobrepeso
2,Mike,1.55,70.2,,29.219563,Sobrepeso
3,John,1.73,100.3,84.0,33.512647,Obesidad I
4,Joey,1.82,120.2,73.0,36.287888,Obesidad II


In [0]:
# Percentage of rows that fall in each 'Clasificacion' category.
conteo = df['Clasificacion'].value_counts()
suma = conteo.sum()
conteo * 100 / suma

Sobrepeso       40.0
Obesidad II     20.0
Obesidad I      20.0
Normal          20.0
Obesidad III     0.0
Name: Clasificacion, dtype: float64