# Handling Missing Data

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Series of words in which np.nan marks the entry at position 2 as missing.
string_data = pd.Series(
    data=['aardvark', 'artichoke', np.nan, 'avocado'],
)
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [3]:
# Boolean mask: True wherever the entry is NA (isna is the same check as isnull).
string_data.isna()

0    False
1    False
2     True
3    False
dtype: bool

In [4]:
# Python's None is counted as NA as well, so overwriting label 0 with None
# brings the number of missing entries to two.
string_data[0] = None
string_data.isna().sum()

2

In [5]:
# Inverse mask: True wherever a value is present (notna is the same check as notnull).
string_data.notna()

0    False
1     True
2    False
3     True
dtype: bool

## 1. Filtering Out Missing Data (dropna)

### 1.1 Series

In [6]:
# Numeric Series with NaN at positions 1 and 3, used to demonstrate dropna.
data = pd.Series(data=[1, np.nan, 3.5, np.nan, 7])

In [7]:
# On a Series, dropna() removes the NA entries; the remaining values keep
# their original index labels (0, 2 and 4 here).
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [8]:
# Same result through boolean indexing: select only the entries that are not NA.
data.loc[data.notna()]

0    1.0
2    3.5
4    7.0
dtype: float64

### 1.2 DataFrame

In [9]:
# 4x3 frame covering every case: a complete row, a partly missing row,
# a fully missing row, and a row missing only its first value.
# NOTE: this rebinds `data`, which held a Series in the previous section.
data = pd.DataFrame(
    data=[
        [1, 6.5, 3],
        [1, np.nan, np.nan],
        [np.nan, np.nan, np.nan],
        [np.nan, 6.5, 3],
    ]
)

In [10]:
# The defaults, spelled out: work along rows (axis=0) and drop a row as soon
# as it contains any NA value (how='any'). Only the complete row 0 survives.
cleaned = data.dropna(axis=0, how='any')
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [11]:
# how='all' drops a row only when every value in it is NA (row 2 here).
data.dropna(axis=0, how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [12]:
# Add a column labelled 99 and fill it entirely with NaN, so the frame
# contains a column in which every value is missing. This modifies `data`.
data[99] = np.nan
data

Unnamed: 0,0,1,2,99
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [13]:
# Same idea along the other axis: drop the columns whose values are all NA.
# Only column 99 qualifies, so columns 0, 1 and 2 remain.
data.dropna(axis='columns', how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [14]:
# Build a 7x3 frame of standard-normal draws, then blank out part of it:
# column 1 becomes NA in the first four rows, column 2 in the first two.
# A private, seeded RandomState makes Restart & Run All produce the same
# numbers every time without touching NumPy's global random state.
# (Outputs saved from an earlier unseeded run will differ until re-executed.)
dataframe = pd.DataFrame(np.random.RandomState(14).randn(7, 3))
dataframe.iloc[:4, 1] = np.nan
dataframe.iloc[:2, 2] = np.nan
dataframe

Unnamed: 0,0,1,2
0,1.4529,,
1,0.149654,,
2,-1.215131,,1.219978
3,-0.304143,,-0.364897
4,-0.963782,-0.919786,0.337835
5,0.800893,1.021696,-0.762072
6,0.477276,-1.021469,0.959458


In [15]:
# Default dropna(): keep only the rows that contain no NA value (rows 4-6 here).
dataframe.dropna()

Unnamed: 0,0,1,2
4,-0.963782,-0.919786,0.337835
5,0.800893,1.021696,-0.762072
6,0.477276,-1.021469,0.959458


In [16]:
# thresh=2 keeps only the rows that hold at least 2 non-NA values.
# In this 3-column frame that drops rows 0 and 1, which each have a single
# non-NA value.
dataframe.dropna(thresh=2)

Unnamed: 0,0,1,2
2,-1.215131,,1.219978
3,-0.304143,,-0.364897
4,-0.963782,-0.919786,0.337835
5,0.800893,1.021696,-0.762072
6,0.477276,-1.021469,0.959458


## 2. Filling In Missing Data (fillna)

In [17]:
# Show the frame again as the starting point for the fillna examples.
dataframe

Unnamed: 0,0,1,2
0,1.4529,,
1,0.149654,,
2,-1.215131,,1.219978
3,-0.304143,,-0.364897
4,-0.963782,-0.919786,0.337835
5,0.800893,1.021696,-0.762072
6,0.477276,-1.021469,0.959458


In [18]:
# A scalar fill value replaces every NA in the frame with that value (0 here).
dataframe.fillna(value=0)

Unnamed: 0,0,1,2
0,1.4529,0.0,0.0
1,0.149654,0.0,0.0
2,-1.215131,0.0,1.219978
3,-0.304143,0.0,-0.364897
4,-0.963782,-0.919786,0.337835
5,0.800893,1.021696,-0.762072
6,0.477276,-1.021469,0.959458


In [19]:
# A dict gives a separate fill value per column: each key is a column label
# and its value replaces the NA entries of that column only.
dataframe.fillna(value={1: 44, 2: 88})

Unnamed: 0,0,1,2
0,1.4529,44.0,88.0
1,0.149654,44.0,88.0
2,-1.215131,44.0,1.219978
3,-0.304143,44.0,-0.364897
4,-0.963782,-0.919786,0.337835
5,0.800893,1.021696,-0.762072
6,0.477276,-1.021469,0.959458


In [20]:
# inplace=True writes the fill values into `dataframe` itself instead of
# producing a separate filled frame, which is why this cell displays nothing.
# From here on `dataframe` no longer contains any NA.
dataframe.fillna({1: 44, 2: 88}, inplace=True)

In [21]:
# Confirm that the in-place fill changed the stored frame.
dataframe

Unnamed: 0,0,1,2
0,1.4529,44.0,88.0
1,0.149654,44.0,88.0
2,-1.215131,44.0,1.219978
3,-0.304143,44.0,-0.364897
4,-0.963782,-0.919786,0.337835
5,0.800893,1.021696,-0.762072
6,0.477276,-1.021469,0.959458


### 2.1 Interpolation by Forward Fill (ffill)

In [22]:
# Build a 6x3 frame of standard-normal draws with trailing gaps:
# column 1 is NA from row 2 onwards and column 2 from row 4 onwards.
# A private, seeded RandomState makes Restart & Run All produce the same
# numbers every time without touching NumPy's global random state.
# (Outputs saved from an earlier unseeded run will differ until re-executed.)
# NOTE: this rebinds `data`, which was used for the dropna examples.
data = pd.DataFrame(np.random.RandomState(22).randn(6, 3))
data.iloc[2:, 1] = np.nan
data.iloc[4:, 2] = np.nan
data

Unnamed: 0,0,1,2
0,0.165187,-1.745257,1.023171
1,-1.709811,0.587133,-0.68705
2,-0.050582,,-1.232353
3,-0.604346,,1.533342
4,1.32134,,
5,-0.133144,,


In [23]:
# Forward fill: each NA takes the last valid value above it in the same column.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` argument
# is deprecated in recent pandas releases.
data.ffill()

Unnamed: 0,0,1,2
0,0.165187,-1.745257,1.023171
1,-1.709811,0.587133,-0.68705
2,-0.050582,0.587133,-1.232353
3,-0.604346,0.587133,1.533342
4,1.32134,0.587133,1.533342
5,-0.133144,0.587133,1.533342


In [24]:
# limit=2 caps the forward fill at two consecutive NA values per column;
# any NA further down the same gap is left as NaN (rows 4 and 5 of column 1).
# DataFrame.ffill() is used because fillna's `method` argument is deprecated.
data.ffill(limit=2)

Unnamed: 0,0,1,2
0,0.165187,-1.745257,1.023171
1,-1.709811,0.587133,-0.68705
2,-0.050582,0.587133,-1.232353
3,-0.604346,0.587133,1.533342
4,1.32134,,1.533342
5,-0.133144,,1.533342


### 2.2 Filling with mean/median values

In [25]:
# Numeric Series with NaN at positions 1 and 3, to be filled with a statistic.
series = pd.Series(data=[1, np.nan, 3.5, np.nan, 7])

In [26]:
# Replace each NA with the mean of the values that are present:
# (1 + 3.5 + 7) / 3, i.e. about 3.833333.
series.fillna(value=series.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64