# Dealing with missing data

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Toy dataset: four float columns of three values each.
# Columns 'C' and 'D' each hold one np.nan to act as a missing entry.
data = dict(
    A=[1.0, 5.0, 10.0],
    B=[2.0, 6.0, 11.0],
    C=[3.0, np.nan, 12.0],
    D=[4.0, 8.0, np.nan],
)

In [None]:
# Build the frame from the column dict; each key becomes a column label.
df = pd.DataFrame(data=data)
display(df)

In [None]:
# Count the missing entries per column (column-wise sum of the boolean NaN mask).
df.isnull().sum(axis=0)

## Eliminating samples or features with missing values

In [None]:
# Drop every row that contains at least one NaN (defaults spelled out).
# The result is only displayed, not assigned, so df itself keeps all its rows.
df.dropna(axis=0, how='any')

In [None]:
# Drop every column that contains at least one NaN.
df.dropna(axis='columns', how='any')

In [None]:
# Create a new row with index label 3 whose entries are all NaN,
# so that the how='all' example in the next cell has a row to drop.
df.loc[3]=np.nan
display(df)

In [None]:
# Remove only the rows in which every column is NaN; partially filled rows stay.
df = df.dropna(axis=0, how='all')
display(df)

In [None]:
# Keep only rows with at least 4 non-NaN values; rows with fewer are dropped.
df.dropna(axis=0, thresh=4)

In [None]:
# Drop a row only when NaN appears in one of the listed columns (here: 'C');
# NaN values in any other column are ignored.
df.dropna(axis=0, how='any', subset=['C'])

## Imputing missing values

In [None]:
from sklearn.impute import SimpleImputer

# Mean imputation: each NaN is replaced by the mean of its column.
imr = SimpleImputer(strategy='mean', missing_values=np.nan)
# fit_transform learns the column means from df and fills the gaps in one step;
# imr is left fitted, exactly as after a separate fit() followed by transform().
imputed_data = imr.fit_transform(df)
# Calling imr.transform(df.values) also works, but will emit a warning.
display(imputed_data)

In [None]:
# Wrap the imputed array in a new frame that reuses df's row and column labels.
# copy=True keeps df2's values independent of the imputed_data array.
df2 = pd.DataFrame(imputed_data, index=df.index, columns=df.columns, copy=True)
display(df2)

In [None]:
# Show df again: it still contains its NaN entries, because the imputed
# values were written to the separate copy df2 rather than back into df.
display(df)