# Handling Missing Data

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Toy 4x4 frame for the missing-data examples, written column by column.
# Column C is entirely NaN and the last row (index 3) is entirely NaN.
df = pd.DataFrame(
    {
        "A": [np.nan, 3, np.nan, np.nan],
        "B": [2, 4, np.nan, np.nan],
        "C": [np.nan, np.nan, np.nan, np.nan],
        "D": [0, 1, 5, np.nan],
    }
)

In [3]:
# Display the frame; NaN marks a missing entry. Column C and row 3 are entirely NaN.
df

Unnamed: 0,A,B,C,D
0,,2.0,,0.0
1,3.0,4.0,,1.0
2,,,,5.0
3,,,,


## dropna - Discard missing values

In [4]:
# axis=1 operates on columns, axis=0 on rows.
# With the default how='any', every column that contains at least one NaN is
# discarded. Every column of df has a NaN, so only the index is left.
df.dropna(axis = 1)

0
1
2
3


In [6]:
# how='all' discards a column only when every value in it is NaN (here: column C).
df.dropna(axis="columns", how="all")

Unnamed: 0,A,B,D
0,,2.0,0.0
1,3.0,4.0,1.0
2,,,5.0
3,,,


In [7]:
# how='any' is the default: a row is discarded if it has at least one NaN.
# Every row of df has a NaN (column C), so the result is empty.
df.dropna(axis="index")

Unnamed: 0,A,B,C,D


In [8]:
# how='all' discards a row only when every value in it is NaN (here: row 3).
df.dropna(axis="index", how="all")

Unnamed: 0,A,B,C,D
0,,2.0,,0.0
1,3.0,4.0,,1.0
2,,,,5.0


## fillna and replace - Fill in or substitute values

In [9]:
# Substitute the constant -10 for every NaN in the frame.
df.fillna(value=-10)

Unnamed: 0,A,B,C,D
0,-10.0,2.0,-10.0,0.0
1,3.0,4.0,-10.0,1.0
2,-10.0,-10.0,-10.0,5.0
3,-10.0,-10.0,-10.0,-10.0


In [11]:
# Impute the missing entries of column B with the mean of its observed values.
df["B"].fillna(value=df["B"].mean())

0    2.0
1    4.0
2    3.0
3    3.0
Name: B, dtype: float64

In [12]:
# Mean of column B over its non-missing values (2 and 4); NaN entries are skipped.
df['B'].mean()

3.0

In [13]:
# Swap every occurrence of 0.0 for a marker string:
#   to_replace -> the original value to look for
#   value      -> the new value to put in its place
df.replace(to_replace=0.0, value="There was 0 here")

Unnamed: 0,A,B,C,D
0,,2.0,,There was 0 here
1,3.0,4.0,,1
2,,,,5
3,,,,


In [14]:
# Swap every missing value for a marker string. The lowercase np.nan is used
# because the np.NaN alias was removed in NumPy 2.0 and raises AttributeError there.
df.replace(np.nan, 'There was missing value here')

Unnamed: 0,A,B,C,D
0,There was missing value here,2,There was missing value here,0
1,3,4,There was missing value here,1
2,There was missing value here,There was missing value here,There was missing value here,5
3,There was missing value here,There was missing value here,There was missing value here,There was missing value here
