# Dealing with missing data

In [12]:
import pandas as pd
import numpy as np

In [13]:
# Toy dataset: four float columns of three rows each.
# Columns 'C' and 'D' each contain exactly one missing value (NaN).
data = {
    'A': [1.0, 5.0, 10.0],
    'B': [2.0, 6.0, 11.0],
    'C': [3.0, np.nan, 12.0],  # missing in row 1
    'D': [4.0, 8.0, np.nan],   # missing in row 2
}

In [14]:
# Build the working frame: dict keys become column names, list items become rows.
df = pd.DataFrame(data=data)
display(df)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [15]:
# Count the missing values in each column (isna is the same check as isnull).
df.isna().sum()

A    0
B    0
C    1
D    1
dtype: int64

## Eliminating samples or features with missing values

In [16]:
# Drop every row that contains at least one NaN (the defaults, spelled out).
# Returns a new frame; df itself is left untouched.
df.dropna(axis=0, how='any')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [17]:
# Drop every column that contains at least one NaN.
# Returns a new frame; df itself is left untouched.
df.dropna(axis='columns')

Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,10.0,11.0


In [18]:
# Add a new row (index label 3) in which every column is NaN.
# Note: this modifies df in place.
df.loc[3] = np.nan
display(df)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,
3,,,,


In [19]:
# Drop only the rows in which *all* columns are NaN;
# rows that still hold at least one value are kept.
df = df.dropna(axis=0, how='all')
display(df)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [20]:
# Keep only the rows that have at least 4 non-NaN values;
# rows with fewer than 4 are dropped.
df.dropna(axis=0, thresh=4)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [21]:
# Drop a row only when NaN appears in one of the listed columns (here: 'C').
# NaNs in other columns do not cause a row to be dropped.
df.dropna(axis=0, subset=['C'])

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
2,10.0,11.0,12.0,


## Filling in missing data

In [22]:
# Show the current frame, which still has NaNs in columns 'C' and 'D'.
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [23]:
# Replace every NaN with the constant 0; the result is a new frame and df is unchanged.
dft = df.fillna(value=0)
display(dft)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,0.0,8.0
2,10.0,11.0,12.0,0.0


In [31]:
# Replace every NaN with the sentinel value -1; the result is a new frame and df is unchanged.
dft = df.fillna(value=-1)
display(dft)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,-1.0,8.0
2,10.0,11.0,12.0,-1.0


In [30]:
# Per-column means. NaN entries are skipped, so 'C' averages 3.0 and 12.0 only.
df.mean(axis=0)

A    5.333333
B    6.333333
C    7.500000
D    6.000000
dtype: float64

In [29]:
# Fill each NaN with the mean of its own column:
# fillna matches the Series of column means to the frame's columns by label.
dft = df.fillna(value=df.mean())
display(dft)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.5,8.0
2,10.0,11.0,12.0,6.0
