In [6]:
import numpy as np
import pandas as pd

In [19]:
babies = pd.read_csv('babies.data', delimiter='\s+')

In [20]:
babies.head(10)

Unnamed: 0,bwt,gestation,parity,age,height,weight,smoke
0,120,284,0,27,62,100,0
1,113,282,0,33,64,135,0
2,128,279,0,28,64,115,1
3,123,999,0,36,69,190,0
4,108,282,0,23,67,125,1
5,136,286,0,25,62,93,0
6,138,244,0,33,62,178,0
7,132,245,0,23,65,140,0
8,120,289,0,25,62,125,0
9,143,299,0,30,66,136,1


In [21]:
columns = babies.columns

In [22]:
indices = babies.index

In [23]:
fraction_missing = babies.size // 10
fraction_missing

865

In [24]:
babies_na = babies.copy()

In [25]:
iteration = 0
while (iteration < fraction_missing):
    rand_col = np.random.choice(columns, 1)
    rand_row = np.random.choice(indices, 1)    
    babies_na.loc[rand_row, rand_col] = np.NaN    
    
    iteration += 1

In [26]:
babies_na.head(10)

Unnamed: 0,bwt,gestation,parity,age,height,weight,smoke
0,120.0,284.0,0.0,27.0,62.0,100.0,0.0
1,113.0,282.0,0.0,33.0,64.0,135.0,0.0
2,128.0,,0.0,28.0,,115.0,1.0
3,123.0,999.0,0.0,,69.0,190.0,0.0
4,108.0,282.0,0.0,23.0,67.0,125.0,1.0
5,136.0,286.0,0.0,25.0,62.0,93.0,0.0
6,138.0,244.0,0.0,33.0,62.0,178.0,0.0
7,132.0,245.0,0.0,23.0,65.0,140.0,0.0
8,120.0,289.0,0.0,25.0,62.0,125.0,0.0
9,143.0,,0.0,30.0,66.0,136.0,1.0


### Missing Values

In [28]:
np.NaN + 0

nan

In [29]:
np.NaN * 10

nan

In [27]:
pd.Series([1, np.nan, 2, None])

0    1.0
1    NaN
2    2.0
3    NaN
dtype: float64


Pandas treats None and NaN as essentially interchangeable for indicating missing or null values. There are several useful methods for detecting, removing, and replacing null values in Pandas 

 * isnull(): Generate a boolean mask indicating missing values
 * notnull(): Opposite of isnull()
 * dropna(): Return a filtered version of the data
 * fillna(): Return a copy of the data with missing values filled or imputed
    

### Deleting Missing Values

In [43]:
age = babies_na[['age']]

In [44]:
age.isnull().head(10)

Unnamed: 0,age
0,False
1,False
2,False
3,True
4,False
5,False
6,False
7,False
8,False
9,False


In [48]:
age.dropna().head()

Unnamed: 0,age
0,27.0
1,33.0
2,28.0
4,23.0
5,25.0


In [47]:
age[~age.isnull()['age']].head()

Unnamed: 0,age
0,27.0
1,33.0
2,28.0
4,23.0
5,25.0


### Filling Missing Values

In [51]:
age.fillna(value=999, method=None).head()

Unnamed: 0,age
0,27.0
1,33.0
2,28.0
3,999.0
4,23.0


In [54]:
age.fillna(method='ffill').head()

Unnamed: 0,age
0,27.0
1,33.0
2,28.0
3,28.0
4,23.0


### Imputing Missing Values

In [55]:
avg_value = age['age'].fillna(value=0, method=None).mean()

In [58]:
age.fillna(value = avg_value, method=None).head()

Unnamed: 0,age
0,27.0
1,33.0
2,28.0
3,24.832524
4,23.0


### Question: How fill with the most common value

In [64]:
most_common = age['age'].fillna(value = 0, method = None).value_counts().index[1]

In [65]:
age.fillna(value = most_common, method=None).head()

Unnamed: 0,age
0,27.0
1,33.0
2,28.0
3,23.0
4,23.0
