## Prepare: Handling Missing Values
### ChemML implements 4 strategies to handle missing values: ignoring rows, replacing with zeros, interpolating, or ignoring columns.

In [1]:
import pandas as pd
import numpy as np
from chemml.preprocessing import MissingValues

In [2]:
# Sample frame mixing several "missing" markers: np.nan, np.inf and the
# plain strings 'nan' / 'missing'. col3 is fully populated.
df = pd.DataFrame(
    {
        'col1': [1, 2, 3, 'nan', 'missing', 4, 5, np.nan],
        'col2': [1, 'nan', np.inf, 2, 3, 4, 5, 6],
        'col3': [1, 2, 3, 4, 5, 6, 7, 8],
    }
)
df

Unnamed: 0,col1,col2,col3
0,1,1.0,1
1,2,,2
2,3,inf,3
3,,2.0,4
4,missing,3.0,5
5,4,4.0,6
6,5,5.0,7
7,,6.0,8


### Strategy 1: Ignoring Rows

In [3]:
# 'ignore_row' strategy: judging by the displayed result, every row that
# holds a missing entry is dropped. The string_as_null / inf_as_null flags
# appear to make strings and infinities count as missing too.
df2 = MissingValues(
    df,
    strategy='ignore_row',
    string_as_null=True,
    inf_as_null=True,
    missing_values=None,
)
df2

Unnamed: 0,col1,col2,col3
0,1.0,1.0,1
5,4.0,4.0,6
6,5.0,5.0,7


In [4]:
# Rebuild the three-column sample frame (np.nan, np.inf and string
# placeholders in col1/col2, clean integers in col3) for the next strategy.
df = pd.DataFrame(
    {
        'col1': [1, 2, 3, 'nan', 'missing', 4, 5, np.nan],
        'col2': [1, 'nan', np.inf, 2, 3, 4, 5, 6],
        'col3': [1, 2, 3, 4, 5, 6, 7, 8],
    }
)
df

Unnamed: 0,col1,col2,col3
0,1,1.0,1
1,2,,2
2,3,inf,3
3,,2.0,4
4,missing,3.0,5
5,4,4.0,6
6,5,5.0,7
7,,6.0,8


### Strategy 2: Replacing With Zeros

In [5]:
# 'zero' strategy: judging by the displayed result, each missing entry
# (NaN, and with these flags also strings and inf) is replaced by 0.
df2 = MissingValues(
    df,
    strategy='zero',
    string_as_null=True,
    inf_as_null=True,
    missing_values=None,
)
df2

Unnamed: 0,col1,col2,col3
0,1.0,1.0,1
1,2.0,0.0,2
2,3.0,0.0,3
3,0.0,2.0,4
4,0.0,3.0,5
5,4.0,4.0,6
6,5.0,5.0,7
7,0.0,6.0,8


In [6]:
# Two-column variant of the sample frame: only the columns that contain
# missing markers (np.nan, np.inf, 'nan', 'missing') are kept here.
df = pd.DataFrame(
    {
        'col1': [1, 2, 3, 'nan', 'missing', 4, 5, np.nan],
        'col2': [1, 'nan', np.inf, 2, 3, 4, 5, 6],
    }
)
df

Unnamed: 0,col1,col2
0,1,1.0
1,2,
2,3,inf
3,,2.0
4,missing,3.0
5,4,4.0
6,5,5.0
7,,6.0


### Strategy 3: Interpolate

In [7]:
# 'interpolate' strategy: judging by the displayed result, each missing
# entry is filled from the neighbouring valid values in the same column
# (strings and inf count as missing because of the two *_as_null flags).
df2 = MissingValues(
    df,
    strategy='interpolate',
    string_as_null=True,
    inf_as_null=True,
    missing_values=None,
)
df2

Unnamed: 0,col1,col2
0,1.0,1.0
1,2.0,1.333333
2,3.0,1.666667
3,3.333333,2.0
4,3.666667,3.0
5,4.0,4.0
6,5.0,5.0
7,5.0,6.0


In [8]:
# Three-column sample frame again: col1 and col2 carry the missing markers
# (np.nan, np.inf, 'nan', 'missing'); col3 has no gaps.
df = pd.DataFrame(
    {
        'col1': [1, 2, 3, 'nan', 'missing', 4, 5, np.nan],
        'col2': [1, 'nan', np.inf, 2, 3, 4, 5, 6],
        'col3': [1, 2, 3, 4, 5, 6, 7, 8],
    }
)
df

Unnamed: 0,col1,col2,col3
0,1,1.0,1
1,2,,2
2,3,inf,3
3,,2.0,4
4,missing,3.0,5
5,4,4.0,6
6,5,5.0,7
7,,6.0,8


### Strategy 4: Ignore Columns

In [9]:
# 'ignore_column' strategy: judging by the displayed result, every column
# that holds a missing entry is dropped, so only the fully populated
# column survives.
df2 = MissingValues(
    df,
    strategy='ignore_column',
    string_as_null=True,
    inf_as_null=True,
    missing_values=None,
)
df2

Unnamed: 0,col3
0,1
1,2
2,3
3,4
4,5
5,6
6,7
7,8
