In [1]:
import pandas as pd
import numpy as np

In [2]:
# Toy dataset: one row per student as [marks, gender, result].
# Missing marks are stored as np.nan and missing genders as None, so the frame
# has gaps in both a numeric and a categorical column.
# np.nan is used rather than the np.NaN alias, which NumPy 2.0 removed.
students = [
    [85, 'M', 'verygood'],
    [95, 'F', 'excellent'],
    [75, None, 'good'],
    [np.nan, 'M', 'average'],
    [70, 'M', 'good'],
    [np.nan, None, 'verygood'],
    [92, 'F', 'verygood'],
    [98, 'M', 'excellent'],
]

# Name the columns at construction time instead of patching them afterwards.
df = pd.DataFrame(students, columns=['marks', 'gender', 'result'])

In [3]:
# Inspect the raw frame; both marks and gender contain missing entries.
df

Unnamed: 0,marks,gender,result
0,85.0,M,verygood
1,95.0,F,excellent
2,75.0,,good
3,,M,average
4,70.0,M,good
5,,,verygood
6,92.0,F,verygood
7,98.0,M,excellent


In [4]:
from sklearn.impute import SimpleImputer

In [5]:
# Median imputer that treats NaN as the missing marker.
# np.nan replaces the np.NaN alias, which was removed in NumPy 2.0 and raises
# AttributeError there.
imputer = SimpleImputer(missing_values=np.nan, strategy='median')

In [6]:
# Feed marks to the imputer as a single-feature 2-D column, then flatten the
# imputed result back to 1-D and store it next to the original values.
df['marks1'] = imputer.fit_transform(
    df['marks'].to_numpy()[:, np.newaxis]
).ravel()

In [7]:
# Show the frame with the newly added marks1 column.
df

Unnamed: 0,marks,gender,result,marks1
0,85.0,M,verygood,85.0
1,95.0,F,excellent,95.0
2,75.0,,good,75.0
3,,M,average,88.5
4,70.0,M,good,70.0
5,,,verygood,88.5
6,92.0,F,verygood,92.0
7,98.0,M,excellent,98.0


In [8]:
# Rebind `imputer` to a most-frequent-value imputer that treats None as the
# missing marker.
imputer = SimpleImputer(
    strategy='most_frequent',
    missing_values=None,
)

In [10]:
# Feed gender to the imputer as a single-feature 2-D column, then flatten the
# imputed result back to 1-D and store it next to the original values.
df['gender1'] = imputer.fit_transform(
    df['gender'].to_numpy()[:, np.newaxis]
).ravel()

In [11]:
# Show the frame with the newly added gender1 column.
df

Unnamed: 0,marks,gender,result,marks1,gender1
0,85.0,M,verygood,85.0,M
1,95.0,F,excellent,95.0,F
2,75.0,,good,75.0,M
3,,M,average,88.5,M
4,70.0,M,good,70.0,M
5,,,verygood,88.5,M
6,92.0,F,verygood,92.0,F
7,98.0,M,excellent,98.0,M


In [12]:
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

from sklearn import set_config

# Select scikit-learn's 'diagram' display mode for estimator output.
set_config(display='diagram')

In [26]:
# Columns handled by the numeric and the categorical transformers.
num_features = ['marks']
cat_features = ['gender']

# np.nan replaces the np.NaN alias, which NumPy 2.0 removed.
num_imp = SimpleImputer(missing_values=np.nan, strategy='median')
cat_imp = SimpleImputer(missing_values=None, strategy='most_frequent')

# Each entry is applied independently to its listed columns, so the result has
# three columns: imputed marks, imputed gender, and scaled marks.
# NOTE(review): StandardScaler is given the raw 'marks' column, not the
# imputer's output, so its column keeps NaN for rows with missing marks while
# the imputed column stays unscaled. Chain the imputer and the scaler in one
# pipeline if scaled, gap-free marks are wanted.
pre_pl = make_column_transformer(
    (num_imp, num_features),
    (cat_imp, cat_features),
    (StandardScaler(), num_features),
)

data_tf = pre_pl.fit_transform(df)
data_tf
# Left disabled: data_tf has three columns but only two names are supplied.
# df_tf = pd.DataFrame( data_tf, columns=num_features + cat_features )

array([[85.0, 'M', -0.08076079672404915],
       [95.0, 'F', 0.8883687639645462],
       [75.0, 'M', -1.0498903574126446],
       [88.5, 'M', nan],
       [70.0, 'M', -1.5344551377569422],
       [88.5, 'M', nan],
       [92.0, 'F', 0.5976298957579675],
       [98.0, 'M', 1.1791076321711247]], dtype=object)

In [20]:
# Numeric branch: fill gaps with the median, then standardize.
# Steps are named after their lowercased class names.
num_pipeline = Pipeline(steps=[
    ("simpleimputer", SimpleImputer(strategy="median")),
    ("standardscaler", StandardScaler()),
])

num_pipeline

In [21]:
# Categorical branch: replace None with the most frequent value, then one-hot
# encode. Steps are named after their lowercased class names.
cat_pipeline = Pipeline(steps=[
    ("simpleimputer",
     SimpleImputer(strategy="most_frequent", missing_values=None)),
    ("onehotencoder", OneHotEncoder(handle_unknown="ignore")),
])

cat_pipeline

In [22]:
# Combined preprocessor: route the numeric columns through num_pipeline and the
# categorical columns through cat_pipeline. The "num"/"cat" labels name the
# two branches.
pre_pl = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_features),
        ("cat", cat_pipeline, cat_features),
    ],
)

pre_pl

In [23]:
# Re-display the frame before it is passed to the preprocessor.
df

Unnamed: 0,marks,gender,result,marks1,gender1
0,85.0,M,verygood,85.0,M
1,95.0,F,excellent,95.0,F
2,75.0,,good,75.0,M
3,,M,average,88.5,M
4,70.0,M,good,70.0,M
5,,,verygood,88.5,M
6,92.0,F,verygood,92.0,F
7,98.0,M,excellent,98.0,M


In [24]:
# Fit the column transformer on df and transform it in one call; the result is
# kept in df_tf and echoed as the cell output.
df_tf = pre_pl.fit_transform(df)
df_tf

array([[-0.1664741 ,  0.        ,  1.        ],
       [ 0.94335323,  1.        ,  0.        ],
       [-1.27630143,  0.        ,  1.        ],
       [ 0.22196547,  0.        ,  1.        ],
       [-1.8312151 ,  0.        ,  1.        ],
       [ 0.22196547,  0.        ,  1.        ],
       [ 0.61040503,  1.        ,  0.        ],
       [ 1.27630143,  0.        ,  1.        ]])

In [25]:
# Wrap the transformed array in a DataFrame, labelling columns with the
# transformer's output feature names and reusing the row index of df.
df_tf_fr = pd.DataFrame(
    data=df_tf,
    index=df.index,
    columns=pre_pl.get_feature_names_out(),
)

df_tf_fr

Unnamed: 0,num__marks,cat__gender_F,cat__gender_M
0,-0.166474,0.0,1.0
1,0.943353,1.0,0.0
2,-1.276301,0.0,1.0
3,0.221965,0.0,1.0
4,-1.831215,0.0,1.0
5,0.221965,0.0,1.0
6,0.610405,1.0,0.0
7,1.276301,0.0,1.0
