# Missing Data

Let's look at a few convenient methods for dealing with missing data in pandas:

In [5]:
import numpy as np
import pandas as pd

In [6]:
# Toy frame with deliberate gaps: A has one NaN, B has two, C is complete.
df = pd.DataFrame(dict(
    A=[1, 2, np.nan],
    B=[5, np.nan, np.nan],
    C=[1, 2, 3],
))

In [7]:
# Boolean mask of the frame: True where a value is present, False where it is NaN.
df.notna()

Unnamed: 0,A,B,C
0,True,True,True
1,True,False,True
2,False,False,True


In [8]:
# Drop every row that contains at least one NaN (the defaults, spelled out).
df.dropna(axis=0, how='any')

Unnamed: 0,A,B,C
0,1.0,5.0,1


In [9]:
# Drop every column that contains at least one NaN.
df.dropna(axis='columns')

Unnamed: 0,C
0,1
1,2
2,3


In [10]:
# Keep only the rows that have at least two non-NaN values.
df.dropna(axis=0, thresh=2)

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2


In [11]:
# Replace every NaN with the numeric sentinel 101. A numeric fill value keeps
# columns A and B numeric; the string '101' would turn them into mixed
# str/float object columns that no longer support arithmetic or aggregation.
df.fillna(value=101)

Unnamed: 0,A,B,C
0,1,5,1
1,2,101,2
2,101,101,3


In [12]:
# Impute the gap in column A with the mean of its observed values.
df['A'].fillna(df['A'].mean())

0    1.0
1    2.0
2    1.5
Name: A, dtype: float64

# Series Missing Data

----
- Create some missing data
- Create a dictionary of average salaries for cities in the UK:  
Darlington	19133  
Easington	18300  
Gateshead	19126  
Hartlepool	20038  

- Create a label index for the cities below:  
Gateshead  
Hartlepool  
Hexham  
Jarrow  

In [13]:
# Average salary per city, stored column-wise so it maps straight onto a frame.
x = dict(
    City=['Darlington', 'Easington', 'Gateshead', 'Hartlepool'],
    Salaries=[19133, 18300, 19126, 20038],
)

df = pd.DataFrame(data=x)

df

Unnamed: 0,City,Salaries
0,Darlington,19133
1,Easington,18300
2,Gateshead,19126
3,Hartlepool,20038


In [14]:
# City labels of interest, wrapped in a single-column frame
# (the column is auto-named 0 at this point).
y = ['Gateshead', 'Hartlepool', 'Hexham', 'Jarrow']

y_df = pd.DataFrame(data=y)

y_df

Unnamed: 0,0
0,Gateshead
1,Hartlepool
2,Hexham
3,Jarrow


In [15]:
# Give the single auto-numbered column a readable name.
y_df.columns = pd.Index(['city'])

y_df

Unnamed: 0,city
0,Gateshead
1,Hartlepool
2,Hexham
3,Jarrow


- Create two Series with differently labelled data and add them

In [16]:
# Integer Series labelled A and B.
s1 = pd.Series(
    data=[1, 2],
    index=['A', 'B'],
    name='s1',
)
s1

A    1
B    2
Name: s1, dtype: int64

In [17]:
# Second integer Series, also labelled A and B.
s2 = pd.Series(
    data=[3, 4],
    index=['A', 'B'],
    name='s2',
)
s2

A    3
B    4
Name: s2, dtype: int64

In [18]:
# Place the two Series side by side as columns, aligned on their index labels.
pd.concat([s1, s2], axis='columns')

Unnamed: 0,s1,s2
A,1,3
B,2,4


In [19]:
# Integer Series labelled C and D, sharing no labels with s1.
s3 = pd.Series(
    data=[4, 5],
    index=['C', 'D'],
    name='s3',
)
s3

C    4
D    5
Name: s3, dtype: int64

In [20]:
# Stack s3 underneath s1 into one longer Series.
pd.concat([s1, s3], axis=0)

A    1
B    2
C    4
D    5
dtype: int64

In [21]:
# Side-by-side concat of Series with disjoint labels: each column is NaN
# wherever its source Series has no entry for that label.
pd.concat([s1, s3], axis='columns')

Unnamed: 0,s1,s3
A,1.0,
B,2.0,
C,,4.0
D,,5.0


In [29]:
# Two Series keyed by state. Utah appears only in a, California only in b,
# and b's California entry is itself NaN.
a = pd.Series({'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000})
b = pd.Series({'California': np.nan, 'Texas': 71000, 'Oregon': 16000, 'Ohio': 35000})


In [30]:
# Label-aligned addition: a label missing from either operand yields NaN.
a + b

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [33]:
# Replace the NaN stored in b with 0 and display the result.
b = b.fillna(value=0)
b


In [35]:
# Apply the same NaN -> 0 replacement to a and display the result.
a = a.fillna(value=0)
a

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [36]:
# Filling stored NaNs does not help here: labels present in only one Series
# are still unmatched, so the plain + operator still gives NaN for them.
a + b

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [37]:
# Add with fill_value=0 so a label found in only one Series is treated as 0
# on the other side instead of producing NaN.
a.add(other=b, fill_value=0)

California         0.0
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah            5000.0
dtype: float64