In [1]:
import sklearn
sklearn.__version__

'1.4.2'

# Feature preprocessing with scikit-learn

# Preprocessing

## Ordinal Encoding

In [2]:
from sklearn.preprocessing import OrdinalEncoder

In [3]:
encoder = OrdinalEncoder()

In [10]:
# One categorical feature, one row per sample (deliberately not in sorted order).
X = [
    ['B'],
    ['A'],
    ['C'],
    ['D'],
]

In [11]:
encoder.fit(X)

In [12]:
X_ = encoder.transform(X)

In [13]:
X_

array([[1.],
       [0.],
       [2.],
       [3.]])

In [14]:
encoder.categories_

[array(['A', 'B', 'C', 'D'], dtype=object)]

In [15]:
# Two features per sample: a letter and a number (ints mixed with one float).
X = [
    ['A', 10],
    ['B', 9.5],
    ['C', 8],
    ['D', 16],
]

In [16]:
encoder.fit(X)

In [17]:
X_ = encoder.transform(X)

In [18]:
X_

array([[0., 2.],
       [1., 1.],
       [2., 0.],
       [3., 3.]])

In [19]:
encoder.categories_

[array(['A', 'B', 'C', 'D'], dtype=object),
 array([8, 9.5, 10, 16], dtype=object)]

In [20]:
# Map the integer codes back to the original categories, column by column.
encoder.inverse_transform(X_)

array([['A', 10],
       ['B', 9.5],
       ['C', 8],
       ['D', 16]], dtype=object)

## Label Encoder

In [22]:
from sklearn.preprocessing import LabelEncoder

In [23]:
encoder = LabelEncoder()

In [36]:
# Target labels; 'ant' appears twice on purpose.
Y = [
    'bat',
    'ant',
    'cat',
    'dog',
    'ant',
]

In [37]:
encoder.fit(Y)

In [38]:
Y_ = encoder.transform(Y)

In [39]:
Y_

array([1, 0, 2, 3, 0])

In [40]:
encoder.classes_

array(['ant', 'bat', 'cat', 'dog'], dtype='<U3')

In [41]:
encoder.inverse_transform(Y_)

array(['bat', 'ant', 'cat', 'dog', 'ant'], dtype='<U3')

## One-hot Encoding

In [42]:
from sklearn.preprocessing import OneHotEncoder

In [43]:
encoder = OneHotEncoder()

In [44]:
# One categorical feature per sample, wrapped into single-element rows.
X = [[animal] for animal in ('ant', 'bat', 'cat', 'bat', 'ant', 'cat')]

In [45]:
encoder.fit(X)

In [46]:
X_ = encoder.transform(X)

In [47]:
X_

<6x3 sparse matrix of type '<class 'numpy.float64'>'
	with 6 stored elements in Compressed Sparse Row format>

In [48]:
# Densify the sparse one-hot result for display. toarray() returns a regular
# ndarray, whereas todense() returns the legacy np.matrix type that NumPy
# no longer recommends using.
X_.toarray()

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.]])

In [49]:
encoder.categories_

[array(['ant', 'bat', 'cat'], dtype=object)]

In [50]:
encoder.inverse_transform(X_)

array([['ant'],
       ['bat'],
       ['cat'],
       ['bat'],
       ['ant'],
       ['cat']], dtype=object)

In [51]:
import pandas as pd

In [52]:
df = pd.DataFrame(X, columns=['animal'])

In [53]:
df

Unnamed: 0,animal
0,ant
1,bat
2,cat
3,bat
4,ant
5,cat


In [54]:
pd.get_dummies(df)

Unnamed: 0,animal_ant,animal_bat,animal_cat
0,True,False,False
1,False,True,False
2,False,False,True
3,False,True,False
4,True,False,False
5,False,False,True


In [55]:
# dtype=int makes the indicator columns 0/1 integers instead of True/False.
pd.get_dummies(df, dtype=int)

Unnamed: 0,animal_ant,animal_bat,animal_cat
0,1,0,0
1,0,1,0
2,0,0,1
3,0,1,0
4,1,0,0
5,0,0,1


In [56]:
# Binary categorical feature, built column-wise from a dict.
df = pd.DataFrame({'gender': ['male', 'female', 'female', 'male']})

In [58]:
df_ = pd.get_dummies(df, dtype=int)

In [59]:
df_.columns

Index(['gender_female', 'gender_male'], dtype='object')

In [60]:
# A binary feature needs only one indicator column, so drop the first dummy
# ('gender_female'). pd.get_dummies(df, dtype=int, drop_first=True) gives the
# same frame in a single step.
df_.drop(columns=df_.columns[0])

Unnamed: 0,gender_male
0,1
1,0
2,0
3,1


## MultiLabel Binarizer

In [61]:
from sklearn.preprocessing import MultiLabelBinarizer

In [62]:
encoder = MultiLabelBinarizer()

In [63]:
# Each sample carries a variable-length list of genre labels.
X = [
    ['sci-fi', 'comedy'],
    ['comedy'],
    ['drama', 'romance'],
    ['sci-fi', 'drama', 'action'],
]

In [64]:
encoder.fit(X)

In [65]:
X_ = encoder.transform(X)

In [66]:
X_

array([[0, 1, 0, 0, 1],
       [0, 1, 0, 0, 0],
       [0, 0, 1, 1, 0],
       [1, 0, 1, 0, 1]])

In [67]:
encoder.inverse_transform(X_)

[('comedy', 'sci-fi'),
 ('comedy',),
 ('drama', 'romance'),
 ('action', 'drama', 'sci-fi')]

In [68]:
encoder.classes_

array(['action', 'comedy', 'drama', 'romance', 'sci-fi'], dtype=object)

## K Bins Discretizer

In [69]:
from sklearn.preprocessing import KBinsDiscretizer

In [70]:
# One continuous feature, one single-element row per sample.
X = [[value] for value in (10, 11, 12, 16, 21, 22, 35)]

In [71]:
# Only n_bins is given, so the defaults apply: encode='onehot' (sparse one-hot
# output) and strategy='quantile'. The bin assignments shown below match the
# explicit strategy='quantile' run later in this section.
encoder = KBinsDiscretizer(n_bins=3)

In [72]:
encoder.fit(X)

In [76]:
X_ = encoder.transform(X)

In [77]:
X_

<7x3 sparse matrix of type '<class 'numpy.float64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [78]:
# Densify the sparse one-hot bin indicators for display. toarray() returns a
# regular ndarray, whereas todense() returns the legacy np.matrix type that
# NumPy no longer recommends using.
X_.toarray()

array([[1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]])

In [79]:
encoder.bin_edges_

array([array([10., 12., 21., 35.])], dtype=object)

In [80]:
# Discretization is lossy: the inverse returns the midpoint of each sample's bin
# (e.g. the bin with edges 10 and 12 comes back as 11), not the original value.
encoder.inverse_transform(X_)

array([[11. ],
       [11. ],
       [16.5],
       [16.5],
       [28. ],
       [28. ],
       [28. ]])

In [81]:
# encode='ordinal' returns a single bin index per value instead of one-hot columns.
# strategy='quantile' picks edges so the bins hold similar numbers of samples.
encoder = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='quantile')

In [82]:
encoder.fit_transform(X)

array([[0.],
       [0.],
       [1.],
       [1.],
       [2.],
       [2.],
       [2.]])

In [83]:
# strategy='uniform' uses equal-width bins between the min and max of the feature,
# so the number of samples per bin can be uneven (4 / 2 / 1 for this data).
encoder = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')

In [84]:
encoder.fit_transform(X)



array([[0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [1.],
       [2.]])

## MinMax Scaler

In [85]:
from sklearn.preprocessing import MinMaxScaler

In [91]:
# One numeric feature (ints plus a single float), one row per sample.
X = [[value] for value in (1, 2, 3.9, 4, 5)]

In [92]:
scaler = MinMaxScaler()

In [93]:
scaler.fit(X)

In [94]:
X_ = scaler.transform(X)

In [95]:
X_

array([[0.   ],
       [0.25 ],
       [0.725],
       [0.75 ],
       [1.   ]])

In [97]:
scaler.inverse_transform(X_)

array([[1. ],
       [2. ],
       [3.9],
       [4. ],
       [5. ]])

## Normalization

In [98]:
from sklearn.preprocessing import Normalizer

In [99]:
# With default settings each row (sample) is rescaled to unit Euclidean length,
# e.g. [4, 1, 2, 2] has length 5 and becomes [0.8, 0.2, 0.4, 0.4].
normalizer = Normalizer()

In [100]:
# Three samples with four features each; rows are normalized independently.
X = [
    [4, 1, 2, 2],
    [1, 3, 9, 3],
    [5, 7, 5, 1],
]

In [101]:
# NOTE(review): each row is rescaled on its own, so fit() presumably learns nothing
# from X and exists only for estimator-API consistency — confirm in the docs.
normalizer.fit(X)

In [102]:
X_ = normalizer.transform(X)

In [103]:
X_

array([[0.8, 0.2, 0.4, 0.4],
       [0.1, 0.3, 0.9, 0.3],
       [0.5, 0.7, 0.5, 0.1]])

In [104]:
# Normalizer has no inverse_transform, so this call is expected to fail.
# Catch the AttributeError and print it so the demonstration stays visible
# without stopping a "Restart & Run All" at this cell.
try:
    normalizer.inverse_transform(X_)
except AttributeError as exc:
    print(f"{type(exc).__name__}: {exc}")

AttributeError: 'Normalizer' object has no attribute 'inverse_transform'

In [105]:
# norm='l1' divides each row by the sum of its absolute values, so every
# normalized row sums to 1 (for this non-negative data).
normalizer = Normalizer(norm='l1')

In [106]:
normalizer.fit(X)

In [107]:
X_ = normalizer.transform(X)

In [108]:
X_

array([[0.44444444, 0.11111111, 0.22222222, 0.22222222],
       [0.0625    , 0.1875    , 0.5625    , 0.1875    ],
       [0.27777778, 0.38888889, 0.27777778, 0.05555556]])

## Standardization

In [109]:
from sklearn.preprocessing import StandardScaler

In [110]:
scaler = StandardScaler()

In [111]:
# Two numeric features per sample; the second column is the first shifted by one.
X = [
    [1, 2],
    [2, 3],
    [3, 4],
    [4, 5],
]

In [112]:
scaler.fit(X)

In [113]:
X_ = scaler.transform(X)

In [114]:
X_

array([[-1.34164079, -1.34164079],
       [-0.4472136 , -0.4472136 ],
       [ 0.4472136 ,  0.4472136 ],
       [ 1.34164079,  1.34164079]])

In [115]:
# Per-column statistics learned by fit(). var_ is the population variance
# (divides by n, not n - 1): 1.25 for the values 1..4.
scaler.mean_, scaler.var_

(array([2.5, 3.5]), array([1.25, 1.25]))

In [116]:
scaler.inverse_transform(X_)

array([[1., 2.],
       [2., 3.],
       [3., 4.],
       [4., 5.]])