# Working with missing values in Pandas

Credit to: http://pandas.pydata.org/pandas-docs/stable/missing_data.html

### Imports

In [1]:
% matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Base frame: 5 rows x 3 columns of standard-normal draws.
# No seed is set in the visible cells, so the numbers differ on every run.
df = pd.DataFrame(
    np.random.randn(5, 3),
    index=list("acefh"),
    columns=["one", "two", "three"],
)

# Add one constant string column and one boolean column derived from "one".
df["four"] = "bar"
df["five"] = df["one"].gt(0)

# Reindexing onto labels that df lacks ('b', 'd', 'g') creates all-NaN rows.
df2 = df.reindex(list("abcdefgh"))
df2

Unnamed: 0,one,two,three,four,five
a,1.629263,1.429735,0.540874,bar,True
b,,,,,
c,-0.321342,1.183519,-0.133209,bar,False
d,,,,,
e,1.20749,-1.358041,-0.734994,bar,True
f,-0.424987,0.735821,0.945278,bar,False
g,,,,,
h,0.921358,-0.154623,-0.476764,bar,True


### Finding missing data

In [3]:
# Boolean mask with the same shape as df2: True wherever a value is missing.
pd.isnull(df2)

Unnamed: 0,one,two,three,four,five
a,False,False,False,False,False
b,True,True,True,True,True
c,False,False,False,False,False
d,True,True,True,True,True
e,False,False,False,False,False
f,False,False,False,False,False
g,True,True,True,True,True
h,False,False,False,False,False


In [4]:
# Per-row mask for column "one": True where a value is present.
df2["one"].notnull()

a     True
b    False
c     True
d    False
e     True
f     True
g    False
h     True
Name: one, dtype: bool

### Inserting missing data

In [5]:
# `.ix` was deprecated in pandas 0.20 and removed in 1.0, so use label-based `.loc`.
# df2.index[0] is the label of the first row, which is the row the original
# positional 0 addressed. Assigning None into the float column "one" is stored
# as NaN, as the output below shows.
df2.loc[df2.index[0], "one"] = None
df2

Unnamed: 0,one,two,three,four,five
a,,1.429735,0.540874,bar,True
b,,,,,
c,-0.321342,1.183519,-0.133209,bar,False
d,,,,,
e,1.20749,-1.358041,-0.734994,bar,True
f,-0.424987,0.735821,0.945278,bar,False
g,,,,,
h,0.921358,-0.154623,-0.476764,bar,True


In [6]:
# Add a column "six" that is missing (NaN) in every row.
df2["six"] = np.full(len(df2), np.nan)
df2

Unnamed: 0,one,two,three,four,five,six
a,,1.429735,0.540874,bar,True,
b,,,,,,
c,-0.321342,1.183519,-0.133209,bar,False,
d,,,,,,
e,1.20749,-1.358041,-0.734994,bar,True,
f,-0.424987,0.735821,0.945278,bar,False,
g,,,,,,
h,0.921358,-0.154623,-0.476764,bar,True,


### Deleting missing data

In [7]:
# axis=0 (rows) and how="any" are the defaults: drop every row holding at least one NaN.
# Column "six" is NaN in every row at this point, so no rows survive.
df2.dropna(axis=0, how="any")

Unnamed: 0,one,two,three,four,five,six


In [8]:
# Overwrite "six" with 0..n-1 so that it is the one column without missing values.
df2["six"] = np.arange(df2.shape[0])
df2

Unnamed: 0,one,two,three,four,five,six
a,,1.429735,0.540874,bar,True,0
b,,,,,,1
c,-0.321342,1.183519,-0.133209,bar,False,2
d,,,,,,3
e,1.20749,-1.358041,-0.734994,bar,True,4
f,-0.424987,0.735821,0.945278,bar,False,5
g,,,,,,6
h,0.921358,-0.154623,-0.476764,bar,True,7


In [9]:
# Drop every column that contains at least one NaN; only "six" has none.
df2.dropna(axis="columns")

Unnamed: 0,six
a,0
b,1
c,2
d,3
e,4
f,5
g,6
h,7


### Filling missing data

In [10]:
# Show df2 as it stands before filling: the earlier dropna calls returned new
# frames and did not modify df2, so the missing values are still present.
df2

Unnamed: 0,one,two,three,four,five,six
a,,1.429735,0.540874,bar,True,0
b,,,,,,1
c,-0.321342,1.183519,-0.133209,bar,False,2
d,,,,,,3
e,1.20749,-1.358041,-0.734994,bar,True,4
f,-0.424987,0.735821,0.945278,bar,False,5
g,,,,,,6
h,0.921358,-0.154623,-0.476764,bar,True,7


In [11]:
# Replace every missing value with a placeholder string.
# The result is a new frame; df2 itself is not modified.
df2.fillna(value="Missing data")

Unnamed: 0,one,two,three,four,five,six
a,Missing data,1.42974,0.540874,bar,True,0
b,Missing data,Missing data,Missing data,Missing data,Missing data,1
c,-0.321342,1.18352,-0.133209,bar,False,2
d,Missing data,Missing data,Missing data,Missing data,Missing data,3
e,1.20749,-1.35804,-0.734994,bar,True,4
f,-0.424987,0.735821,0.945278,bar,False,5
g,Missing data,Missing data,Missing data,Missing data,Missing data,6
h,0.921358,-0.154623,-0.476764,bar,True,7


In [12]:
# With the default axis (0, rows), keep only the rows that have no missing value.
df2.dropna(how="any")

Unnamed: 0,one,two,three,four,five,six
c,-0.321342,1.183519,-0.133209,bar,False,2
e,1.20749,-1.358041,-0.734994,bar,True,4
f,-0.424987,0.735821,0.945278,bar,False,5
h,0.921358,-0.154623,-0.476764,bar,True,7


In [13]:
# Column-wise variant: discard each column that has at least one NaN.
df2.dropna(axis=1, how="any")

Unnamed: 0,six
a,0
b,1
c,2
d,3
e,4
f,5
g,6
h,7
