In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# NaN is the floating-point missing-value marker, so it is reported as null.
# (pd.isna is the same function as pd.isnull.)
pd.isna(np.nan)

True

In [4]:
# Python's None is treated as a missing value as well.
pd.isna(None)

True

In [5]:
# A falsy value is still a real value: False is not considered missing.
pd.isna(False)

False

In [7]:
# Applied to a Series, the check is element-wise and returns a boolean mask.
pd.isna(
    pd.Series([1, 3, np.nan])
)

0    False
1    False
2     True
dtype: bool

In [9]:
# Applied to a DataFrame, the check returns a boolean frame of the same shape.
pd.isna(
    pd.DataFrame(
        {
            'Column A': [1, np.nan, 7],
            'Column B': [np.nan, 2, 3],
            'Column C': [np.nan, 2, np.nan],
        }
    )
)

Unnamed: 0,Column A,Column B,Column C
0,False,True,True
1,True,False,False
2,False,False,True


Pandas operations with missing values

In [10]:
# Aggregations skip NaN by default (skipna=True), so the total is 1 + 2 + 3.
pd.Series([1, 2, 3, np.nan]).sum(skipna=True)

6.0

In [11]:
# count() tallies only the non-missing entries, so the NaN is not counted.
pd.Series(
    [1, 2, 3, np.nan]
).count()

3

In [12]:
# Working example: six values, two of them missing (positions 3 and 4).
s = pd.Series([1, 2, 3, np.nan, np.nan, 4])

In [13]:
# Boolean mask that is True wherever a value is present.
# (pd.notna is the same function as pd.notnull.)
pd.notna(s)

0     True
1     True
2     True
3    False
4    False
5     True
dtype: bool

In [14]:
# The complementary mask: True wherever a value is missing.
pd.isna(s)

0    False
1    False
2    False
3     True
4     True
5    False
dtype: bool

In [15]:
# True counts as 1 when summed, so this is the number of present values.
pd.notna(s).sum()

4

In [16]:
# Summing the null mask gives the number of missing values.
pd.isna(s).sum()

2

In [17]:
# Boolean indexing with the not-null mask keeps only the present values;
# the original index labels (0, 1, 2, 5) are preserved.
s.loc[pd.notna(s)]

0    1.0
1    2.0
2    3.0
5    4.0
dtype: float64

In [18]:
# Same selection using the Series method instead of the top-level function
# (the previous cell already showed the pd.notnull(s) spelling).
s[s.notnull()]

0    1.0
1    2.0
2    3.0
5    4.0
dtype: float64

In [19]:
# The filtered views above did not modify `s`: both NaN entries are still there.
s

0    1.0
1    2.0
2    3.0
3    NaN
4    NaN
5    4.0
dtype: float64

In [20]:
# dropna() returns a new Series without the NaN entries (index labels kept).
s.dropna()

0    1.0
1    2.0
2    3.0
5    4.0
dtype: float64

In [22]:
# Example frame: Column A has two NaNs, Columns B and C one each, and
# Column D is complete.
df = pd.DataFrame(
    {
        'Column A': [1, np.nan, 30, np.nan],
        'Column B': [2, 8, 31, np.nan],
        'Column C': [np.nan, 9, 32, 100],
        'Column D': [5, 8, 34, 110],
    }
)

In [23]:
# Display the frame to see where the missing values sit.
df

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,31.0,32.0,34
3,,,100.0,110


In [24]:
# (rows, columns) — cells holding NaN are part of the shape.
df.shape

(4, 4)

In [25]:
# info() lists the non-null count and dtype of every column, which makes
# columns with missing data easy to spot.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Column A  2 non-null      float64
 1   Column B  3 non-null      float64
 2   Column C  3 non-null      float64
 3   Column D  4 non-null      int64  
dtypes: float64(3), int64(1)
memory usage: 260.0 bytes


In [26]:
# Cell-by-cell mask: True marks a missing value.
# (DataFrame.isna is the same method as DataFrame.isnull.)
df.isna()

Unnamed: 0,Column A,Column B,Column C,Column D
0,False,False,True,False
1,True,False,False,False
2,False,False,False,False
3,True,True,False,False


In [28]:
# Number of missing values in each column (the mask is summed column-wise).
df.isna().sum()

Column A    2
Column B    1
Column C    1
Column D    0
dtype: int64

In [31]:
# Drop every column that contains at least one NaN; only Column D survives.
df.dropna(axis='columns')

Unnamed: 0,Column D
0,5
1,8
2,34
3,110


In [32]:
# Second example frame whose middle row (index 1) is missing in every column.
df2 = pd.DataFrame(
    {
        'Column A': [1, np.nan, 30],
        'Column B': [2, np.nan, 31],
        'Column C': [np.nan, np.nan, 100],
    }
)

In [33]:
# Row 1 is NaN in all three columns; row 0 is missing only Column C.
df2

Unnamed: 0,Column A,Column B,Column C
0,1.0,2.0,
1,,,
2,30.0,31.0,100.0


In [36]:
df.dropna(how = 'all')
# df.dropna(how = 'any') - default behavior

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,31.0,32.0,34
3,,,100.0,110


In [38]:
# Keep only the columns holding at least 3 non-NaN values; Column A has
# just 2, so it is the one dropped.
df.dropna(axis='columns', thresh=3)

Unnamed: 0,Column B,Column C,Column D
0,2.0,,5
1,8.0,9.0,8
2,31.0,32.0,34
3,,100.0,110


In [39]:
# Recall the series (NaN at positions 3 and 4) before filling the gaps.
s

0    1.0
1    2.0
2    3.0
3    NaN
4    NaN
5    4.0
dtype: float64

In [40]:
# Replace every NaN with a constant.
s.fillna(value=0)

0    1.0
1    2.0
2    3.0
3    0.0
4    0.0
5    4.0
dtype: float64

In [41]:
# Replace every NaN with the mean of the present values (mean() skips NaN).
s.fillna(value=s.mean())

0    1.0
1    2.0
2    3.0
3    2.5
4    2.5
5    4.0
dtype: float64

In [42]:
# Forward fill: each NaN takes the last valid value that precedes it.
# Series.ffill() replaces the deprecated fillna(method='ffill') spelling.
s.ffill()


0    1.0
1    2.0
2    3.0
3    3.0
4    3.0
5    4.0
dtype: float64

In [43]:
# Backward fill: each NaN takes the next valid value that follows it.
# Series.bfill() replaces the deprecated fillna(method='bfill') spelling.
s.bfill()


0    1.0
1    2.0
2    3.0
3    4.0
4    4.0
5    4.0
dtype: float64

In [44]:
# Recall the frame before filling its missing values.
df

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,31.0,32.0,34
3,,,100.0,110


In [46]:
# Forward fill along each row: a NaN takes the value from the column to its
# left. A NaN in the first column has nothing to copy from and stays NaN.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill') spelling.
df.ffill(axis='columns')


Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,2.0,5.0
1,,8.0,9.0,8.0
2,30.0,31.0,32.0,34.0
3,,,100.0,110.0


In [47]:
# A dict gives each column its own replacement value: a constant for
# Column A, the column mean for Column B, the column max for Column C.
# Column D has no NaN, so its entry changes nothing.
df.fillna(
    value={
        'Column A': 0,
        'Column B': df['Column B'].mean(),
        'Column C': df['Column C'].max(),
        'Column D': 5,
    }
)

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,100.0,5
1,0.0,8.0,9.0,8
2,30.0,31.0,32.0,34
3,0.0,13.666667,100.0,110


Checking whether there are any NAs

In [48]:
# Dropping NaNs shortens the series only if it contained some, so a size
# mismatch means at least one value is missing.
missing_values = s.size != s.dropna().size

In [49]:
# True: the series is longer than its NaN-free version.
missing_values

True

In [50]:
# count() reports only the non-missing entries, so it is smaller than len(s)
# whenever NaNs are present.
s.count()

4

In [51]:
# Show the series once more for comparison with its null mask.
s

0    1.0
1    2.0
2    3.0
3    NaN
4    NaN
5    4.0
dtype: float64

In [52]:
# Method form of the null check: True marks a missing entry.
# (Series.isna is the same method as Series.isnull.)
s.isna()

0    False
1    False
2    False
3     True
4     True
5    False
dtype: bool

In [53]:
# any() collapses the mask to one flag: True if at least one value is missing.
s.isna().any()

True

In [56]:
# Same check performed on the underlying NumPy boolean array.
s.isna().to_numpy().any()

True