1. The first step before training a model is to impute missing values so that the data is suitable for training (`sklearn.impute`).

In [24]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

# Toy dataset with deliberate gaps: A and B are numeric columns, C is a
# categorical column, and D is a fully populated target column.
purchase_labels = ['Purchase', 'Not Purchase'] * 5

arr = pd.DataFrame(
    data={
        'A': [1, None, None, 4, 5, 6, 7, 8, None, 10],
        'B': [6, 7, 8, None, 10, 11, None, 13, None, 15],
        'C': [np.nan, np.nan, 'A', 'B', np.nan, 'B', 'B', 'A', 'B', 'A'],
        'D': purchase_labels,
    }
)

In [25]:
# Numeric columns: replace each missing value with its column mean
# ('mean' is SimpleImputer's default strategy, spelled out here for clarity).
numeric_cols = ['A', 'B']
si = SimpleImputer(strategy='mean')
arr[numeric_cols] = si.fit_transform(arr[numeric_cols])
arr

Unnamed: 0,A,B,C,D
0,1.0,6.0,,Purchase
1,5.857143,7.0,,Not Purchase
2,5.857143,8.0,A,Purchase
3,4.0,10.0,B,Not Purchase
4,5.0,10.0,,Purchase
5,6.0,11.0,B,Not Purchase
6,7.0,10.0,B,Purchase
7,8.0,13.0,A,Not Purchase
8,5.857143,10.0,B,Purchase
9,10.0,15.0,A,Not Purchase


In [26]:
# Categorical column: a mean is undefined for strings, so fill the gaps with
# the most frequent value instead.
categorical_cols = ['C']
si_new = SimpleImputer(strategy='most_frequent')
si_new.fit(arr[categorical_cols])
arr[categorical_cols] = si_new.transform(arr[categorical_cols])
arr

Unnamed: 0,A,B,C,D
0,1.0,6.0,B,Purchase
1,5.857143,7.0,B,Not Purchase
2,5.857143,8.0,A,Purchase
3,4.0,10.0,B,Not Purchase
4,5.0,10.0,B,Purchase
5,6.0,11.0,B,Not Purchase
6,7.0,10.0,B,Purchase
7,8.0,13.0,A,Not Purchase
8,5.857143,10.0,B,Purchase
9,10.0,15.0,A,Not Purchase


2. The second step is to encode categorical variables (`sklearn.preprocessing`). Three commonly used encoders are:
- Ordinal Encoder
- Label Encoder (for target variables only).
- One-Hot Encoder

In [27]:
from sklearn.preprocessing import OrdinalEncoder

# Replace each category in C with a numeric code (for this data: A -> 0.0, B -> 1.0).
# The encoder expects 2-D input, hence the one-column frame arr[['C']].
oe = OrdinalEncoder()
encoded_c = oe.fit_transform(arr[['C']])
arr[['C']] = encoded_c
arr

Unnamed: 0,A,B,C,D
0,1.0,6.0,1.0,Purchase
1,5.857143,7.0,1.0,Not Purchase
2,5.857143,8.0,0.0,Purchase
3,4.0,10.0,1.0,Not Purchase
4,5.0,10.0,1.0,Purchase
5,6.0,11.0,1.0,Not Purchase
6,7.0,10.0,1.0,Purchase
7,8.0,13.0,0.0,Not Purchase
8,5.857143,10.0,1.0,Purchase
9,10.0,15.0,0.0,Not Purchase


We can also use `LabelEncoder` for target variables.

In [28]:
from sklearn.preprocessing import LabelEncoder

# LabelEncoder works on a 1-D target, so pass the Series arr['D'] rather than
# a one-column frame (for this data: 'Not Purchase' -> 0, 'Purchase' -> 1).
le = LabelEncoder()
target_codes = le.fit_transform(arr['D'])
arr['D'] = target_codes
arr

Unnamed: 0,A,B,C,D
0,1.0,6.0,1.0,1
1,5.857143,7.0,1.0,0
2,5.857143,8.0,0.0,1
3,4.0,10.0,1.0,0
4,5.0,10.0,1.0,1
5,6.0,11.0,1.0,0
6,7.0,10.0,1.0,1
7,8.0,13.0,0.0,0
8,5.857143,10.0,1.0,1
9,10.0,15.0,0.0,0


We can also use `OneHotEncoder` to encode the categorical variables.

In [29]:
from sklearn.preprocessing import OneHotEncoder

# drop='first' omits the first category's column (it is implied when all other
# columns are 0); sparse_output=False returns a dense array instead of a sparse matrix.
ohe = OneHotEncoder(drop='first', sparse_output=False)

df = pd.DataFrame({
    'City': ['Delhi', 'Mumbai', 'Hyderabad', 'Kolkata', 'Mumbai', 'Kolkata', 'Delhi', 'Mumbai', 'Kolkata', 'Delhi'],
    'Population': [100000, 200000, 300000, 400000, 500000, 600000, 700000, 800000, 900000, 1000000]
})

enc_city = ohe.fit_transform(df[['City']])

# Label the encoded columns with the encoder's own feature names (e.g. City_Mumbai)
# rather than the default 0, 1, 2, and reuse df's index so the concat below
# aligns row-for-row even if df's index is not a plain RangeIndex.
enc_city_df = pd.DataFrame(
    enc_city,
    columns=ohe.get_feature_names_out(),
    index=df.index,
)

# Swap the raw City column for its one-hot columns without mutating in place.
df = pd.concat([df.drop(columns='City'), enc_city_df], axis=1)

If we want to reverse the encoding, we can use the `inverse_transform` method.

In [30]:
# Each row is a one-hot vector over the kept categories (Hyderabad, Kolkata, Mumbai).
# The all-zeros row decodes to Delhi, the category removed by drop='first'.
encoded_rows = [
    [0, 0, 0],
    [0, 0, 1],
    [0, 1, 0],
    [1, 0, 0],
]
ohe.inverse_transform(encoded_rows)

array([['Delhi'],
       ['Mumbai'],
       ['Kolkata'],
       ['Hyderabad']], dtype=object)