In [1]:
import numpy as np
import pandas as pd

**Finding Missing Data**

In [2]:
# Demo frame with a different number of missing values per column:
# A has 1 NaN, B has none, C has 2 and D has 3.
data = dict(
    A=[1, 2, np.nan, 4, 5],
    B=[1, 2, 3, 4, 5],
    C=[1, 2, 3, np.nan, np.nan],
    D=[1, np.nan, np.nan, np.nan, 5],
)
df = pd.DataFrame(data)
df

Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0
1,2.0,2,2.0,
2,,3,3.0,
3,4.0,4,,
4,5.0,5,,5.0


In [4]:
df.isna() # boolean mask with the same shape as df: True where the cell is NaN, False otherwise

Unnamed: 0,A,B,C,D
0,False,False,False,False
1,False,False,False,True
2,True,False,False,True
3,False,False,True,True
4,False,False,True,False


In [5]:
# count the NULL (NaN) values in each column
df.isna().sum()

A    1
B    0
C    2
D    3
dtype: int64

In [6]:
df.isna().any() # per column: True if the column contains at least one NULL (NaN) value

A     True
B    False
C     True
D     True
dtype: bool

**Removing Missing Data**

In [7]:
# the original frame, shown for reference before any rows are dropped
df

Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0
1,2.0,2,2.0,
2,,3,3.0,
3,4.0,4,,
4,5.0,5,,5.0


In [8]:
# drop every row that contains at least one NaN; this returns a new frame and leaves df itself unchanged
df.dropna()

Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0


In [9]:
# df still has all five rows: dropna() returned a new frame instead of modifying df in place
df

Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0
1,2.0,2,2.0,
2,,3,3.0,
3,4.0,4,,
4,5.0,5,,5.0


In [12]:
# thresh=1 keeps every row that has at least one non-NaN value; each row of df
# has one (column B is never NaN), so nothing is dropped here.
df.dropna(thresh=1)

Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0
1,2.0,2,2.0,
2,,3,3.0,
3,4.0,4,,
4,5.0,5,,5.0


In [13]:
# Twelve-row frame in which every row holds exactly one non-NaN value
# (the 1-based row number); used to show how `thresh` behaves.
df_2 = pd.DataFrame(
    dict(
        A=[1, np.nan, np.nan, 4, np.nan, np.nan, 7, np.nan, np.nan, 10, np.nan, np.nan],
        B=[np.nan, 2, np.nan, np.nan, np.nan, 6, np.nan, 8, np.nan, np.nan, np.nan, 12],
        C=[np.nan, np.nan, 3, np.nan, 5, np.nan, np.nan, np.nan, 9, np.nan, 11, np.nan],
    )
)
df_2

Unnamed: 0,A,B,C
0,1.0,,
1,,2.0,
2,,,3.0
3,4.0,,
4,,,5.0
5,,6.0,
6,7.0,,
7,,8.0,
8,,,9.0
9,10.0,,


In [15]:
# Every row of df_2 has exactly one non-NaN value, so thresh=1 keeps them all.
df_2.dropna(thresh=1)

Unnamed: 0,A,B,C
0,1.0,,
1,,2.0,
2,,,3.0
3,4.0,,
4,,,5.0
5,,6.0,
6,7.0,,
7,,8.0,
8,,,9.0
9,10.0,,


In [17]:
# thresh=2 -> keep only rows that have at least 2 non-NaN values.
# No row of df_2 has more than one, so the result is empty.
df_2.dropna(thresh=2)

Unnamed: 0,A,B,C


In [18]:
# Twelve-row frame whose rows hold zero, one or two non-NaN values, so that
# a thresh=2 filter keeps some rows and drops others.
df_3 = pd.DataFrame(
    dict(
        A=[1, np.nan, np.nan, 4, np.nan, 6, np.nan, 8, np.nan, 10, np.nan, np.nan],
        B=[2, 2, np.nan, np.nan, 5, np.nan, np.nan, 8, np.nan, np.nan, 11, np.nan],
        C=[np.nan, np.nan, 3, np.nan, 5, 6, np.nan, np.nan, 9, np.nan, 11, np.nan],
    )
)
df_3

Unnamed: 0,A,B,C
0,1.0,2.0,
1,,2.0,
2,,,3.0
3,4.0,,
4,,5.0,5.0
5,6.0,,6.0
6,,,
7,8.0,8.0,
8,,,9.0
9,10.0,,


In [20]:
# Keep only the rows of df_3 that have at least 2 non-NaN values.
df_3.dropna(thresh=2)

Unnamed: 0,A,B,C
0,1.0,2.0,
4,,5.0,5.0
5,6.0,,6.0
7,8.0,8.0,
10,,11.0,11.0


**Filling Missing Data**

In [21]:
# the original frame again, as the starting point for the fill examples
df

Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0
1,2.0,2,2.0,
2,,3,3.0,
3,4.0,4,,
4,5.0,5,,5.0


In [22]:
df.fillna(0) # replace every NaN with 0 (displayed as 0.0 in the float columns); returns a new frame

Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0
1,2.0,2,2.0,0.0
2,0.0,3,3.0,0.0
3,4.0,4,0.0,0.0
4,5.0,5,0.0,5.0


In [23]:
# Per-column fill values: each key names a column and its value replaces the
# NaNs in that column only. Column B has no NaNs, so its entry has no effect.
df.fillna(
    value={
        "A": 0,
        "B": 100,
        "C": 300,
        "D": 400,
    }
)

Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0
1,2.0,2,2.0,400.0
2,0.0,3,3.0,400.0
3,4.0,4,300.0,400.0
4,5.0,5,300.0,5.0


In [24]:
# df still contains its NaNs: the fillna() calls above returned new frames instead of modifying df
df

Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0
1,2.0,2,2.0,
2,,3,3.0,
3,4.0,4,,
4,5.0,5,,5.0


In [26]:
# fill the NaNs in each column with that column's mean (df.mean() skips NaN values by default)
df.fillna(df.mean())

Unnamed: 0,A,B,C,D
0,1.0,1,1.0,1.0
1,2.0,2,2.0,3.0
2,3.0,3,3.0,3.0
3,4.0,4,2.0,3.0
4,5.0,5,2.0,5.0
