In [3]:
import pandas as pd
import numpy as np

# Chapter 7

## 7.1 Handling Missing Data

For numeric data, pandas uses the floating-point value NaN (Not a Number) to represent missing data. We call this a _sentinel value_.

In [4]:
# Series of strings in which position 2 is missing (np.nan).
string_data = pd.Series(
    ['aardvark', 'artichoke', np.nan, 'avocado'],
)
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [5]:
string_data.isnull()  # boolean mask: True where the entry is missing

0    False
1    False
2     True
3    False
dtype: bool

In [6]:
string_data[0] = None  # None is also treated as NA in object arrays

In [7]:
string_data.isnull()  # position 0 (set to None) is now reported as missing too

0     True
1    False
2     True
3    False
dtype: bool

### Filtering Out Missing Data

There are a few ways to filter out missing data. You can always do it by hand using `.isnull` and boolean indexing, but `dropna` can be useful. On a Series, it returns the Series with only the non-null data and index values:

In [8]:
from numpy import nan as NA

In [9]:
# Float Series whose entries at positions 1 and 3 are missing.
data = pd.Series(
    [1, NA, 3.5, NA, 7],
)

In [10]:
data.dropna()  # keeps only the non-null entries; their original index labels are preserved

0    1.0
2    3.5
4    7.0
dtype: float64

**_equivalent to_**

In [11]:
data[data.notnull()]  # boolean mask selects the same non-null entries as dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

DataFrames are more complex. By default, `dropna` drops any row containing a missing value:

In [12]:
# 4x3 frame: row 0 is complete, row 1 is partly missing,
# row 2 is entirely missing, row 3 lacks only its first value.
data = pd.DataFrame([
    [1., 6.5, 3.],
    [1., NA, NA],
    [NA, NA, NA],
    [NA, 6.5, 3.],
])

In [13]:
cleaned = data.dropna()  # new frame without any row that has a missing value; data itself is left as is

In [14]:
data  # the original frame still contains its NA rows

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [15]:
cleaned  # only row 0 had no missing values

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [16]:
data.dropna(how='all')  # how='all' drops only the rows in which every value is NA

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


To drop columns in the same way, pass `axis=1`:

In [17]:
data[4] = NA  # add a column labelled 4 in which every value is missing

In [18]:
data  # column 4 is entirely NA

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [19]:
data.dropna(axis=1, how='all')  # axis=1 applies the test to columns: only the all-NA column 4 is removed

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


Suppose you want to keep only rows containing a certain number of observations. You can indicate this with the `thresh` argument:

In [20]:
# 7x3 frame of standard-normal draws. No seed is set in the visible cells,
# so the values are presumably different on every run.
df = pd.DataFrame(np.random.standard_normal(size=(7, 3)))

In [21]:
df.iloc[:4, 1] = NA  # blank out the first four rows of column 1

In [23]:
df.iloc[:2, 2] = NA  # blank out the first two rows of column 2

In [24]:
df  # rows 0-1 have two NAs, rows 2-3 have one, rows 4-6 are complete

Unnamed: 0,0,1,2
0,1.530258,,
1,-0.990702,,
2,1.615311,,0.141599
3,-0.070499,,4.344063
4,0.762107,-1.347476,0.403436
5,0.738771,0.796897,-0.511515
6,-0.790316,-0.590541,-0.566759


In [25]:
df.dropna()  # with no arguments, every row containing any NA is dropped (rows 0-3 here)

Unnamed: 0,0,1,2
4,0.762107,-1.347476,0.403436
5,0.738771,0.796897,-0.511515
6,-0.790316,-0.590541,-0.566759


In [26]:
df.dropna(thresh=2)  # keeps rows with at least 2 non-NA values; rows 0 and 1 have only one, so they are dropped

Unnamed: 0,0,1,2
2,1.615311,,0.141599
3,-0.070499,,4.344063
4,0.762107,-1.347476,0.403436
5,0.738771,0.796897,-0.511515
6,-0.790316,-0.590541,-0.566759


### Filling In Missing Data

You may want to fill "holes" in the data rather than discard them. Calling `fillna` with a constant replaces the missing values with that value:

In [27]:
df.fillna(0)  # every NA is replaced by the constant 0 in the returned frame

Unnamed: 0,0,1,2
0,1.530258,0.0,0.0
1,-0.990702,0.0,0.0
2,1.615311,0.0,0.141599
3,-0.070499,0.0,4.344063
4,0.762107,-1.347476,0.403436
5,0.738771,0.796897,-0.511515
6,-0.790316,-0.590541,-0.566759


In [28]:
df.fillna({1: 0.5, 2: 0})  # dict keys are column labels: column 1 is filled with 0.5, column 2 with 0

Unnamed: 0,0,1,2
0,1.530258,0.5,0.0
1,-0.990702,0.5,0.0
2,1.615311,0.5,0.141599
3,-0.070499,0.5,4.344063
4,0.762107,-1.347476,0.403436
5,0.738771,0.796897,-0.511515
6,-0.790316,-0.590541,-0.566759


`fillna` returns a new object, but you can modify the existing object in-place:

In [29]:
_ = df.fillna(0, inplace=True)  # inplace=True fills df itself; the call's return value is discarded via _

In [30]:
df  # df itself now holds the filled values

Unnamed: 0,0,1,2
0,1.530258,0.0,0.0
1,-0.990702,0.0,0.0
2,1.615311,0.0,0.141599
3,-0.070499,0.0,4.344063
4,0.762107,-1.347476,0.403436
5,0.738771,0.796897,-0.511515
6,-0.790316,-0.590541,-0.566759


The same interpolation methods available for reindexing can be used with `fillna`:


In [31]:
# 6x3 frame of standard-normal draws. No seed is set in the visible cells,
# so the values are presumably different on every run.
df = pd.DataFrame(np.random.standard_normal(size=(6, 3)))

In [33]:
df.iloc[2:, 1] = NA  # from row 2 onward, column 1 is missing

In [34]:
df.iloc[4:, 2] = NA  # from row 4 onward, column 2 is missing

In [35]:
df  # column 1 has a run of four trailing NAs, column 2 a run of two

Unnamed: 0,0,1,2
0,-0.645449,-0.244689,-2.237666
1,0.067085,0.674679,2.095682
2,0.547922,,0.657636
3,-2.437439,,0.856235
4,-0.925253,,
5,0.518515,,


In [36]:
# Forward-fill: each NA takes the last valid value above it in its column.
# df.ffill() is the supported spelling of fillna(method='ffill'); the
# `method` keyword of fillna is deprecated in recent pandas releases.
df.ffill()

Unnamed: 0,0,1,2
0,-0.645449,-0.244689,-2.237666
1,0.067085,0.674679,2.095682
2,0.547922,0.674679,0.657636
3,-2.437439,0.674679,0.856235
4,-0.925253,0.674679,0.856235
5,0.518515,0.674679,0.856235


In [37]:
# Forward-fill at most 2 consecutive NAs per gap; anything further down a
# gap stays NA (rows 4-5 of column 1 in the output below).
# df.ffill(limit=2) is the supported spelling of
# fillna(method='ffill', limit=2); the `method` keyword of fillna is
# deprecated in recent pandas releases.
df.ffill(limit=2)

Unnamed: 0,0,1,2
0,-0.645449,-0.244689,-2.237666
1,0.067085,0.674679,2.095682
2,0.547922,0.674679,0.657636
3,-2.437439,0.674679,0.856235
4,-0.925253,,0.856235
5,0.518515,,0.856235


With `fillna` you can do lots of other things, like pass the mean or median value of a Series: