## SIMPLE IMPUTER

In [1]:
# Load libraries
import numpy as np
from sklearn.impute import SimpleImputer

In [2]:
# Feature matrix assembled one feature (column) at a time.
# Column 0 is the categorical (0/1) feature and has two missing values;
# columns 1 and 2 are continuous and complete.
X = np.column_stack([
    [0, 1, 0, 0, np.nan, np.nan],
    [2.10, 1.18, 1.22, -0.21, 0.87, -0.67],
    [1.45, 1.33, 1.27, -1.19, 1.31, -0.22],
])

In [3]:
# Show the raw matrix; the two NaN entries in column 0 are what gets imputed below
print(X)

[[ 0.    2.1   1.45]
 [ 1.    1.18  1.33]
 [ 0.    1.22  1.27]
 [ 0.   -0.21 -1.19]
 [  nan  0.87  1.31]
 [  nan -0.67 -0.22]]


In [4]:
# Create an imputer that replaces each missing value with the most frequent
# value of its column (strategy='most_frequent')
imputer = SimpleImputer(strategy='most_frequent')

In [5]:
# NOTE: imputation runs per column of the array passed in. With X.T every
# column is one sample, so the NaNs of the categorical feature are filled
# with values from the same sample's other features (0.87 and -0.67 in the
# output below) rather than with the feature's mode. Compare with the
# imputer.fit_transform(X) cell that follows.
imputer.fit_transform(X.T)

array([[ 0.  ,  1.  ,  0.  ,  0.  ,  0.87, -0.67],
       [ 2.1 ,  1.18,  1.22, -0.21,  0.87, -0.67],
       [ 1.45,  1.33,  1.27, -1.19,  1.31, -0.22]])

In [6]:
# Fit on X as-is (rows = samples, columns = features): the most frequent
# value in column 0 is 0, so both NaNs are replaced by 0.
imputer.fit_transform(X)

array([[ 0.  ,  2.1 ,  1.45],
       [ 1.  ,  1.18,  1.33],
       [ 0.  ,  1.22,  1.27],
       [ 0.  , -0.21, -1.19],
       [ 0.  ,  0.87,  1.31],
       [ 0.  , -0.67, -0.22]])

## PIPELINE

In [7]:
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [8]:
# Build a small example DataFrame: 'obs' and 'treatment' each contain one
# missing value; 'patient', 'sex' and 'score' are fully populated.
raw_data = dict(
    patient=[1, 1, 1, 2, 2],
    obs=[1, np.nan, 3, 1, 2],
    treatment=[0, 1, 0, 1, np.nan],
    sex=['F', 'M', 'F', 'F', 'M'],
    score=['strong', 'weak', 'normal', 'weak', 'strong'],
)

# Column order follows the insertion order of the dict keys above.
df = pd.DataFrame(raw_data, columns=list(raw_data))
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   patient    5 non-null      int64  
 1   obs        4 non-null      float64
 2   treatment  4 non-null      float64
 3   sex        5 non-null      object 
 4   score      5 non-null      object 
dtypes: float64(2), int64(1), object(2)
memory usage: 328.0+ bytes


In [9]:
# Per-column-group preprocessing: each group of columns gets its own small pipeline.

# Numeric columns: fill missing entries with the column's most frequent value.
numeric_features = ['obs', 'treatment']
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
])

# Categorical columns: replace missing entries with the literal 'missing',
# then one-hot encode; categories not seen during fit are ignored rather
# than raising an error.
categorical_features = ['score', 'sex']
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(fill_value='missing', strategy='constant')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])


In [10]:
# Route each column group to its own transformer: numeric_features go through
# numeric_transformer, categorical_features through categorical_transformer.
# Columns in neither list (here 'patient') are not passed to either transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [11]:
# Wrap the preprocessor in a Pipeline. No classifier/estimator step is
# appended here, so despite its name `clf` only preprocesses the data;
# it is not yet a prediction pipeline.
clf = Pipeline(steps=[('preprocessor', preprocessor)])

# Fit the preprocessing steps on df and transform it. Columns of the result,
# in order: imputed 'obs', imputed 'treatment', one-hot 'score'
# (normal, strong, weak), one-hot 'sex' (F, M).
DATA_PIPELINE=clf.fit_transform(df)
DATA_PIPELINE

array([[1., 0., 0., 1., 0., 1., 0.],
       [1., 1., 0., 0., 1., 0., 1.],
       [3., 0., 1., 0., 0., 1., 0.],
       [1., 1., 0., 0., 1., 1., 0.],
       [2., 0., 0., 1., 0., 0., 1.]])