In [12]:
import pandas as pd
import numpy as np

# Build a small demo frame from a seeded generator so that a fresh
# "Restart & Run All" reproduces the same numbers on every run.
rng = np.random.default_rng(42)

df = pd.DataFrame(
    rng.standard_normal((5, 3)),
    index=["a", "c", "e", "f", "h"],
    columns=["one", "two", "three"],
)

df["four"] = "bar"  # constant string column
df["five"] = df["one"] > 0  # boolean column derived from "one"

# Reindexing onto labels that do not exist yet ("b", "d", "g") inserts
# all-missing rows; these are the missing values used in the cells below.
df = df.reindex(["a", "b", "c", "d", "e", "f", "g", "h"])
df

Unnamed: 0,one,two,three,four,five
a,-0.394865,-1.794126,0.223603,bar,False
b,,,,,
c,-1.350225,-0.54858,0.34497,bar,False
d,,,,,
e,-1.450936,1.949594,2.117628,bar,False
f,0.020119,-1.56443,-0.664295,bar,True
g,,,,,
h,0.372196,1.153066,-2.001996,bar,True


In [6]:
# Element-wise missing-value mask for column "one" of df: True marks a
# missing entry (the rows b, d and g that were added by the reindex).
# df is used here because df2 is not created until a later cell.
print(pd.isna(df["one"]))

a    False
b     True
c    False
d     True
e    False
f    False
g     True
h    False
Name: one, dtype: bool


In [7]:
# Inverse mask for column "one" of df: True marks a value that is present.
# df is used here because df2 is not created until a later cell.
print(pd.notna(df["one"]))

a     True
b    False
c     True
d    False
e     True
f     True
g    False
h     True
Name: one, dtype: bool


In [24]:
# Missing-value mask for the whole frame: one boolean per cell,
# True where the cell is missing.
print(pd.isna(df))

     one    two  three   four   five
a  False  False  False  False  False
b   True   True   True   True   True
c  False  False  False  False  False
d   True   True   True   True   True
e  False  False  False  False  False
f  False  False  False  False  False
g   True   True   True   True   True
h  False  False  False  False  False


In a datetime column, a missing value is represented by NaT instead of NaN.

In [78]:
# Derive df2 from df with an extra constant "timestamp" column; assign()
# returns a new frame, so df itself is left untouched.
df2 = df.assign(timestamp=pd.Timestamp("20120101"))

# Blank out "one" and "timestamp" on rows a, c and h. In the datetime
# column the missing marker is displayed as NaT rather than NaN.
df2.loc[["a", "c", "h"], ["one", "timestamp"]] = np.nan
df2

Unnamed: 0,one,two,three,four,five,timestamp
a,,-1.794126,0.223603,bar,False,NaT
b,,,,,,2012-01-01
c,,-0.54858,0.34497,bar,False,NaT
d,,,,,,2012-01-01
e,-1.450936,1.949594,2.117628,bar,False,2012-01-01
f,0.020119,-1.56443,-0.664295,bar,True,2012-01-01
g,,,,,,2012-01-01
h,,1.153066,-2.001996,bar,True,NaT


Calculations with missing values

When summing data, NA (missing) values will be treated as zero.
If the data are all NA, the result will be 0.

In [79]:
# Show the frame, then total column "one". skipna=True is the default,
# spelled out here: the missing rows contribute nothing to the sum.
print(df)
df["one"].sum(skipna=True)

        one       two     three four   five
a -0.394865 -1.794126  0.223603  bar  False
b       NaN       NaN       NaN  NaN    NaN
c -1.350225 -0.548580  0.344970  bar  False
d       NaN       NaN       NaN  NaN    NaN
e -1.450936  1.949594  2.117628  bar  False
f  0.020119 -1.564430 -0.664295  bar   True
g       NaN       NaN       NaN  NaN    NaN
h  0.372196  1.153066 -2.001996  bar   True


-2.8037111351457256

In [80]:
# Cumulative operations skip missing values by default (skipna=True).
# Here skipna=False is passed instead, so the first NaN (row b) propagates
# and every later entry of the running sum is NaN as well.
df.loc[:, "one"].cumsum(skipna=False)

a   -0.394865
b         NaN
c         NaN
d         NaN
e         NaN
f         NaN
g         NaN
h         NaN
Name: one, dtype: float64

In [81]:
# Rows whose group key ("one") is NaN are dropped by groupby by default,
# so the result has no group for rows b, d and g.
# numeric_only=True restricts the mean to the numeric columns ("two",
# "three"); without it, recent pandas versions raise a TypeError when
# trying to average the non-numeric "four" and "five" columns.
df.groupby("one").mean(numeric_only=True)

Unnamed: 0_level_0,two,three
one,Unnamed: 1_level_1,Unnamed: 2_level_1
-1.450936,1.949594,2.117628
-1.350225,-0.54858,0.34497
-0.394865,-1.794126,0.223603
0.020119,-1.56443,-0.664295
0.372196,1.153066,-2.001996


Filling missing values: fillna
fillna() can “fill in” NA values with non-NA data in a couple of ways, which we illustrate below.

In [82]:
# Print df2 as-is, followed by a copy in which every missing cell
# (including NaT in "timestamp") has been replaced by the scalar 0.
print(df2, df2.fillna(0), sep="\n")

# The fill value need not be numeric: a string works too, and the
# resulting Series is displayed with dtype object.
df2["one"].fillna(value="anything")

        one       two     three four   five  timestamp
a       NaN -1.794126  0.223603  bar  False        NaT
b       NaN       NaN       NaN  NaN    NaN 2012-01-01
c       NaN -0.548580  0.344970  bar  False        NaT
d       NaN       NaN       NaN  NaN    NaN 2012-01-01
e -1.450936  1.949594  2.117628  bar  False 2012-01-01
f  0.020119 -1.564430 -0.664295  bar   True 2012-01-01
g       NaN       NaN       NaN  NaN    NaN 2012-01-01
h       NaN  1.153066 -2.001996  bar   True        NaT
        one       two     three four   five            timestamp
a  0.000000 -1.794126  0.223603  bar  False                    0
b  0.000000  0.000000  0.000000    0      0  2012-01-01 00:00:00
c  0.000000 -0.548580  0.344970  bar  False                    0
d  0.000000  0.000000  0.000000    0      0  2012-01-01 00:00:00
e -1.450936  1.949594  2.117628  bar  False  2012-01-01 00:00:00
f  0.020119 -1.564430 -0.664295  bar   True  2012-01-01 00:00:00
g  0.000000  0.000000  0.000000    0      0  2012-

a    anything
b    anything
c    anything
d    anything
e   -1.450936
f    0.020119
g    anything
h    anything
Name: one, dtype: object

In [40]:
# Forward fill: each missing cell takes the last valid value above it in
# the same column; leading gaps (nothing above them) stay missing.
# DataFrame.ffill() replaces the deprecated fillna(method="ffill") spelling.
df2.ffill()

Unnamed: 0,one,two,three,four,five,timestamp
a,,-1.794126,0.223603,bar,False,NaT
b,,-1.794126,0.223603,bar,False,2012-01-01
c,,-0.54858,0.34497,bar,False,2012-01-01
d,,-0.54858,0.34497,bar,False,2012-01-01
e,-1.450936,1.949594,2.117628,bar,False,2012-01-01
f,0.020119,-1.56443,-0.664295,bar,True,2012-01-01
g,0.020119,-1.56443,-0.664295,bar,True,2012-01-01
h,0.020119,1.153066,-2.001996,bar,True,2012-01-01


In [49]:
# Forward fill, but carry a value across at most one consecutive missing
# cell per gap (limit=1); longer gaps are only partially filled.
# DataFrame.ffill() replaces the deprecated fillna(method="ffill") spelling.
df2.ffill(limit=1)

Unnamed: 0,one,two,three,four,five,timestamp
a,,-1.794126,0.223603,bar,False,NaT
b,,-1.794126,0.223603,bar,False,2012-01-01
c,,-0.54858,0.34497,bar,False,2012-01-01
d,,-0.54858,0.34497,bar,False,2012-01-01
e,-1.450936,1.949594,2.117628,bar,False,2012-01-01
f,0.020119,-1.56443,-0.664295,bar,True,2012-01-01
g,0.020119,-1.56443,-0.664295,bar,True,2012-01-01
h,,1.153066,-2.001996,bar,True,2012-01-01


In [50]:
# Fill every missing cell with the mean of column "one".
# NOTE(review): the scalar is applied to all columns, so the string column
# "four" and the "timestamp" column receive the float as well -- confirm
# this is intended rather than a per-column fill.
df2.fillna(value=df2["one"].mean())

Unnamed: 0,one,two,three,four,five,timestamp
a,-0.715409,-1.794126,0.223603,bar,False,-0.715409
b,-0.715409,-0.715409,-0.715409,-0.715409,-0.715409,2012-01-01 00:00:00
c,-0.715409,-0.54858,0.34497,bar,False,-0.715409
d,-0.715409,-0.715409,-0.715409,-0.715409,-0.715409,2012-01-01 00:00:00
e,-1.450936,1.949594,2.117628,bar,False,2012-01-01 00:00:00
f,0.020119,-1.56443,-0.664295,bar,True,2012-01-01 00:00:00
g,-0.715409,-0.715409,-0.715409,-0.715409,-0.715409,2012-01-01 00:00:00
h,-0.715409,1.153066,-2.001996,bar,True,-0.715409


In [94]:
# Replace the gaps in these columns with a placeholder so that only "one"
# and "five" still contain missing values. This updates df2 in place.
fill_cols = ["two", "three", "four", "timestamp"]
df2[fill_cols] = df2[fill_cols].fillna("anything")
print(df2)

# axis=0 drops every ROW that still holds a missing value. The result is
# printed explicitly: only the last expression of a cell is displayed, so
# a bare call here would be computed and silently thrown away.
print(df2.dropna(axis=0))

# axis=1 drops every COLUMN that still holds a missing value.
df2.dropna(axis=1)


        one       two     three      four   five            timestamp
a       NaN -1.794126  0.223603       bar  False             anything
b       NaN  anything  anything  anything    NaN  2012-01-01 00:00:00
c       NaN  -0.54858   0.34497       bar  False             anything
d       NaN  anything  anything  anything    NaN  2012-01-01 00:00:00
e -1.450936  1.949594  2.117628       bar  False  2012-01-01 00:00:00
f  0.020119  -1.56443 -0.664295       bar   True  2012-01-01 00:00:00
g       NaN  anything  anything  anything    NaN  2012-01-01 00:00:00
h       NaN  1.153066 -2.001996       bar   True             anything


Unnamed: 0,two,three,four,timestamp
a,-1.794126,0.223603,bar,anything
b,anything,anything,anything,2012-01-01 00:00:00
c,-0.54858,0.34497,bar,anything
d,anything,anything,anything,2012-01-01 00:00:00
e,1.949594,2.117628,bar,2012-01-01 00:00:00
f,-1.56443,-0.664295,bar,2012-01-01 00:00:00
g,anything,anything,anything,2012-01-01 00:00:00
h,1.153066,-2.001996,bar,anything


In [95]:
# Linear interpolation (the default method, spelled out here) over column
# "one". Gaps before the first valid value stay NaN; the trailing gaps
# after the last valid value are filled forward with that value.
df2["one"].interpolate(method="linear")

a         NaN
b         NaN
c         NaN
d         NaN
e   -1.450936
f    0.020119
g    0.020119
h    0.020119
Name: one, dtype: float64

In [102]:
# replace() can also substitute missing values: every NaN in column "one"
# becomes 0 in the returned Series.
df2["one"].replace(to_replace=np.nan, value=0)

a    0.000000
b    0.000000
c    0.000000
d    0.000000
e   -1.450936
f    0.020119
g    0.000000
h    0.000000
Name: one, dtype: float64