In [1]:
import pandas as pd
import numpy as np

In [5]:
# Five rows of random normals labelled a/c/e/f/h, in three named columns.
df = pd.DataFrame(
    np.random.randn(5, 3),
    index=list('acefh'),
    columns=['one', 'two', 'three'],
)

# Reindexing onto a..h adds rows for the labels df lacks (b, d, g); they are all NaN.
df2 = df.reindex(list('abcdefgh'))

In [6]:
# Display the reindexed frame; rows b, d and g hold only missing values.
df2

Unnamed: 0,one,two,three
a,0.307464,-1.281647,-0.063149
b,,,
c,-0.762342,0.522382,-1.694392
d,,,
e,0.834146,0.19471,-0.277881
f,-1.205281,0.224719,0.450238
g,,,
h,-0.18086,-0.864781,-1.010544


In [4]:
# Top-level function form: boolean Series that is True where 'one' is missing.
pd.isna(df2['one'])

a    False
b     True
c    False
d     True
e    False
f    False
g     True
h    False
Name: one, dtype: bool

In [8]:
# Method form of the inverse test: True where 'two' holds a value.
df2['two'].notna()

a     True
b    False
c     True
d    False
e     True
f     True
g    False
h     True
Name: two, dtype: bool

In [9]:
# isna() on the whole frame returns a same-shaped boolean mask.
df2.isna()

Unnamed: 0,one,two,three
a,False,False,False
b,True,True,True
c,False,False,False
d,True,True,True
e,False,False,False
f,False,False,False
g,True,True,True
h,False,False,False


In [11]:
# Method form on a single column; gives the same mask as pd.isna(df2['one']).
df2['one'].isna()

a    False
b     True
c    False
d     True
e    False
f    False
g     True
h    False
Name: one, dtype: bool

In [12]:
# np.nan does not compare equal to anything -- not even to itself.
np.nan == np.nan

False

In [13]:
# Anti-pattern, shown on purpose: because NaN never compares equal, testing
# against np.nan is False for every row (including the missing ones b, d, g).
# Use isna() / notna() to detect missing values instead.
df2['one'] == np.nan

a    False
b    False
c    False
d    False
e    False
f    False
g    False
h    False
Name: one, dtype: bool

In [15]:
# NaN is a float, so its presence causes a column of ints to be cast to float.
# Request the nullable extension dtype 'Int64' (capital I) to keep the
# values as integers alongside the missing entry.
pd.Series([1,2,np.nan,4], dtype='Int64')

0      1
1      2
2    NaN
3      4
dtype: Int64

In [16]:
# Datetime columns represent missing values as NaT; assigning np.nan to a
# datetime cell stores NaT, so one assignment can blank float and datetime
# columns together.
df3 = df.assign(timestamp=pd.Timestamp('20120101'))  # assign() works on a copy of df
rows_to_blank = ['a', 'c', 'h']
df3.loc[rows_to_blank, ['one', 'timestamp']] = np.nan
df3

Unnamed: 0,one,two,three,timestamp
a,,-1.281647,-0.063149,NaT
c,,0.522382,-1.694392,NaT
e,0.834146,0.19471,-0.277881,2012-01-01
f,-1.205281,0.224719,0.450238,2012-01-01
h,,-0.864781,-1.010544,NaT


In [19]:
# Count columns per dtype: the timestamp column is still datetime64 after
# receiving missing values, and the numeric columns are still float64.
df3.dtypes.value_counts()

float64           3
datetime64[ns]    1
dtype: int64

In [20]:
# When assigning a missing value, the value used will depend on the data type.
# For a numeric Series, None is stored as NaN. The Series is created as float64
# up front: writing a missing value into an int64 Series relies on silent
# upcasting to float, which recent pandas versions warn about as an
# incompatible-dtype assignment.
s = pd.Series([1, 2, 3], dtype='float64')
s.loc[0] = None
s

0    NaN
1    2.0
2    3.0
dtype: float64

In [22]:
# For object dtype, pandas uses the given value as-is: None stays None and
# np.nan stays NaN. dtype=object is spelled out so the example does not depend
# on which dtype pandas infers for a list of strings.
s = pd.Series(["a", "b", "c"], dtype=object)
s.loc[0] = None
s.loc[1] = np.nan

s

0    None
1     NaN
2       c
dtype: object

In [25]:
# Calculations when missing data is present.
# Starting point: the frame whose rows b, d and g are entirely missing.
df2

Unnamed: 0,one,two,three
a,0.307464,-1.281647,-0.063149
b,,,
c,-0.762342,0.522382,-1.694392
d,,,
e,0.834146,0.19471,-0.277881
f,-1.205281,0.224719,0.450238
g,,,
h,-0.18086,-0.864781,-1.010544


In [26]:
# sum() leaves missing values out of the total, which for a sum is the same
# as treating NaN as 0.
df2['one'].sum()

-1.0068729143562816

In [30]:
# Row-wise mean: NaN entries are ignored unless told otherwise, so a row is
# NaN only when every value in it is missing.
# Note: this blanks one more cell of df2 *in place*; later cells see that NaN.
df2.loc['a', 'three'] = np.nan
df2.mean(axis=1)

a   -0.487091
b         NaN
c   -0.644784
d         NaN
e    0.250325
f   -0.176775
g         NaN
h   -0.685395
dtype: float64

In [32]:
# Cumulative sum down each column with the default NaN handling (compare with
# the skipna=False call in the next cell).
# NOTE(review): the recorded output lists only rows a, c, e, f, h and a
# non-missing a/three, which does not match df2 at this point -- it looks
# stale; re-run to confirm.
df2.cumsum()

Unnamed: 0,one,two,three
a,0.307464,-1.281647,-0.063149
c,-0.454878,-0.759264,-1.757541
e,0.379268,-0.564555,-2.035422
f,-0.826013,-0.339836,-1.585184
h,-1.006873,-1.204617,-2.595727


In [34]:
# With skipna=False the first NaN in a column makes every later cumulative
# value in that column NaN as well.
df2.cumsum(skipna=False)

Unnamed: 0,one,two,three
a,0.307464,-1.281647,
b,,,
c,,,
d,,,
e,,,
f,,,
g,,,
h,,,


In [37]:
# NaN groups are excluded in groupby: rows whose key ('one') is missing do not
# appear in the result. NaN in the aggregated columns is kept (see 'three' for
# the group that came from row a).
df2.groupby('one').mean()

Unnamed: 0_level_0,two,three
one,Unnamed: 1_level_1,Unnamed: 2_level_1
-1.205281,0.224719,0.450238
-0.762342,0.522382,-1.694392
-0.18086,-0.864781,-1.010544
0.307464,-1.281647,
0.834146,0.19471,-0.277881


In [39]:
# Use fillna to replace NaN values with a scalar (returns a new frame; df2 is
# not modified).
# NOTE(review): the recorded output still shows empty cells where 0 is
# expected -- it looks stale; re-run to confirm.
df2.fillna(0)

Unnamed: 0,one,two,three
a,0.307464,-1.281647,
b,,,
c,-0.762342,0.522382,-1.694392
d,,,
e,0.834146,0.19471,-0.277881
f,-1.205281,0.224719,0.450238
g,,,
h,-0.18086,-0.864781,-1.010544


In [41]:
# The fill value may also be a string; the columns then mix floats and text
# (presumably object dtype -- note the changed float formatting in the output).
df2.fillna('missing')

Unnamed: 0,one,two,three
a,0.307464,-1.28165,missing
b,missing,missing,missing
c,-0.762342,0.522382,-1.69439
d,missing,missing,missing
e,0.834146,0.19471,-0.277881
f,-1.20528,0.224719,0.450238
g,missing,missing,missing
h,-0.18086,-0.864781,-1.01054


In [43]:
# Propagates the last valid observation forward.
# With time series data, forward filling is extremely common so
# that the "last known value" is available at every time point.
# A leading NaN (a/'three' here) has nothing before it and stays NaN.
# ffill() is used because fillna(method='pad') relies on the deprecated
# `method` argument; the result is the same.
df2.ffill()

Unnamed: 0,one,two,three
a,0.307464,-1.281647,
b,0.307464,-1.281647,
c,-0.762342,0.522382,-1.694392
d,-0.762342,0.522382,-1.694392
e,0.834146,0.19471,-0.277881
f,-1.205281,0.224719,0.450238
g,-1.205281,0.224719,0.450238
h,-0.18086,-0.864781,-1.010544


In [45]:
# Propagates the next valid observation backwards.
# bfill() is used because fillna(method='bfill') relies on the deprecated
# `method` argument; the result is the same.
df2.bfill()

Unnamed: 0,one,two,three
a,0.307464,-1.281647,-1.694392
b,-0.762342,0.522382,-1.694392
c,-0.762342,0.522382,-1.694392
d,0.834146,0.19471,-0.277881
e,0.834146,0.19471,-0.277881
f,-1.205281,0.224719,0.450238
g,-0.18086,-0.864781,-1.010544
h,-0.18086,-0.864781,-1.010544


In [46]:
# Filling in missing data using the column mean.
# Build a 10x3 random frame, then punch an overlapping block of NaN into each
# column. The index is the default 0..9, so label slices (end-inclusive) pick
# the same rows as the positional ones.
dff = pd.DataFrame(np.random.randn(10, 3), columns=['A', 'B', 'C'])
dff.loc[3:4, 'A'] = np.nan
dff.loc[4:5, 'B'] = np.nan
dff.loc[5:7, 'C'] = np.nan

dff

Unnamed: 0,A,B,C
0,-0.45103,0.217589,-2.213704
1,-0.298408,1.176851,1.451832
2,-0.43603,-1.684948,-0.941387
3,,0.403264,-1.143029
4,,,-0.690396
5,-0.988921,,
6,0.076982,0.351583,
7,0.397182,1.383345,
8,0.529952,0.352874,0.280308
9,0.182603,0.796153,-0.410317


In [47]:
# dff.mean() is a Series of per-column means; fillna matches it to the columns
# by label, so each gap receives its own column's mean.
dff.fillna(dff.mean())

Unnamed: 0,A,B,C
0,-0.45103,0.217589,-2.213704
1,-0.298408,1.176851,1.451832
2,-0.43603,-1.684948,-0.941387
3,-0.123459,0.403264,-1.143029
4,-0.123459,0.374589,-0.690396
5,-0.988921,0.374589,-0.523813
6,0.076982,0.351583,-0.523813
7,0.397182,1.383345,-0.523813
8,0.529952,0.352874,0.280308
9,0.182603,0.796153,-0.410317


In [48]:
# For only certain columns: slice the means by label ('B' through 'C',
# inclusive) so column A has no fill value and keeps its NaN.
dff.fillna(dff.mean()['B':'C'])

Unnamed: 0,A,B,C
0,-0.45103,0.217589,-2.213704
1,-0.298408,1.176851,1.451832
2,-0.43603,-1.684948,-0.941387
3,,0.403264,-1.143029
4,,0.374589,-0.690396
5,-0.988921,0.374589,-0.523813
6,0.076982,0.351583,-0.523813
7,0.397182,1.383345,-0.523813
8,0.529952,0.352874,0.280308
9,0.182603,0.796153,-0.410317


In [49]:
# Same result as dff.fillna(dff.mean()): keep each value where it is present,
# otherwise take the column mean (the means are aligned along the columns).
dff.where(pd.notna(dff), dff.mean(), axis='columns')

Unnamed: 0,A,B,C
0,-0.45103,0.217589,-2.213704
1,-0.298408,1.176851,1.451832
2,-0.43603,-1.684948,-0.941387
3,-0.123459,0.403264,-1.143029
4,-0.123459,0.374589,-0.690396
5,-0.988921,0.374589,-0.523813
6,0.076982,0.351583,-0.523813
7,0.397182,1.383345,-0.523813
8,0.529952,0.352874,0.280308
9,0.182603,0.796153,-0.410317


In [53]:
# Dropping an axis label with missing data.
# Requesting a column ('four') that df does not have yields an all-NaN column.
df4 = pd.DataFrame(df, columns=['one','two','three','four'])
df4

Unnamed: 0,one,two,three,four
a,0.307464,-1.281647,-0.063149,
c,-0.762342,0.522382,-1.694392,
e,0.834146,0.19471,-0.277881,
f,-1.205281,0.224719,0.450238,
h,-0.18086,-0.864781,-1.010544,


In [54]:
# axis=0 drops every row containing a NaN; since 'four' is missing in all
# rows, nothing is left.
df4.dropna(axis=0)

Unnamed: 0,one,two,three,four


In [55]:
# axis=1 drops every column containing a NaN; only 'four' is removed.
df4.dropna(axis=1)

Unnamed: 0,one,two,three
a,0.307464,-1.281647,-0.063149
c,-0.762342,0.522382,-1.694392
e,0.834146,0.19471,-0.277881
f,-1.205281,0.224719,0.450238
h,-0.18086,-0.864781,-1.010544


In [56]:
# On a Series dropna() removes the missing entries; here that is all of them.
df4['four'].dropna()

Series([], Name: four, dtype: float64)

In [59]:
# Using interpolation for missing data.
# First compare the total number of rows with the number of non-missing
# values in 'one' (count() excludes NaN).
print(len(df2))
df2['one'].count()

8


5

In [68]:
# With default settings each missing value is filled midway between its two
# neighbouring valid values (e.g. b is the average of a and c).
df2['one'].interpolate() # There are many methods that are index type specific

a    0.307464
b   -0.227439
c   -0.762342
d    0.035902
e    0.834146
f   -1.205281
g   -0.693071
h   -0.180860
Name: one, dtype: float64

In [69]:
# Replacing generic values: swap one scalar for another.
# ser holds the floats 0.0 through 4.0.
ser = pd.Series(range(5), dtype='float64')
ser.replace(to_replace=0, value=5)

0    5.0
1    1.0
2    2.0
3    3.0
4    4.0
dtype: float64

In [70]:
# Replace a list of values: the i-th entry of the first list is replaced by
# the i-th entry of the second.
ser.replace([0, 1, 2, 3, 4], [4, 3, 2, 1, 0])

0    4.0
1    3.0
2    2.0
3    1.0
4    0.0
dtype: float64

In [72]:
# A list of values can also be mapped to one single replacement value.
ser.replace([0, 1, 2, 3, 4], 4)

0    4.0
1    4.0
2    4.0
3    4.0
4    4.0
dtype: float64

In [73]:
# Specifying a mapping dict: keys are the values to find, dict values are
# their replacements; anything not listed is left unchanged.
ser.replace({0:10, 1:100})

0     10.0
1    100.0
2      2.0
3      3.0
4      4.0
dtype: float64

In [74]:
# For a data frame you can specify individual values by column.
# Note: this rebinds `df`, which earlier cells used for the random frame.
df = pd.DataFrame({'a': range(5), 'b': range(5, 10)})
df

Unnamed: 0,a,b
0,0,5
1,1,6
2,2,7
3,3,8
4,4,9


In [79]:
# Dict keys name the columns and dict values the entry to look for in each;
# every match is replaced by the single value 100.
df.replace({'a':1,'b':5}, 100)

Unnamed: 0,a,b
0,0,100
1,100,6
2,2,7
3,3,8
4,4,9


In [80]:
# With a second dict, each column gets its own replacement value.
df.replace({'a':1,'b':5}, {'a':100, 'b':500})

Unnamed: 0,a,b
0,0,500
1,100,6
2,2,7
3,3,8
4,4,9


In [78]:
# Using a fill method: treat the listed values as missing, then carry the
# preceding value forward over them. mask() + ffill() is used because the
# `method` argument of Series.replace is deprecated; for this Series
# (which has no NaN of its own) the result is the same as
# ser.replace([2, 3], method='pad').
ser.mask(ser.isin([2, 3])).ffill()

0    0.0
1    1.0
2    1.0
3    1.0
4    4.0
dtype: float64