In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder

In [16]:
# Small toy dataset with one missing value in each feature column
# (Country, Age, Salary) so the imputers below have something to fill in.
# 'Purchased' is the target column.
data = dict(
    Country=['France', 'Spain', 'Germany', 'Spain', 'Germany', 'France', np.nan, 'Spain', 'France'],
    Age=[44, 27, 30, 38, 40, 35, 38, np.nan, 48],
    Salary=[72000, 48000, 54000, 61000, np.nan, 58000, 52000, 79000, 83000],
    Purchased=['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No'],
)

# Column-oriented construction: each dict key becomes one column.
df = pd.DataFrame.from_dict(data)

In [4]:
# Display the raw frame to inspect the missing entries before preprocessing.
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,,38.0,52000.0,No
7,Spain,,79000.0,Yes
8,France,48.0,83000.0,No


In [17]:
# Split the frame into the target (y) and the predictor columns (x).
y = df['Purchased']
x = df.drop(columns=['Purchased'])

In [19]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Numeric columns: fill missing values with the column mean, then standardise.
numeric_feature = ['Age', 'Salary']
numeric_pipeline = Pipeline(
    [
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
)

In [12]:
# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' is set so that a category not
# seen during fit does not raise at transform time.
categoric_feature = ['Country']
categoric_pipeline = Pipeline(
    [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

In [14]:
# Route each group of columns through its own pipeline. The transformed
# output lists the numeric columns first, followed by the one-hot columns.
preprocessor = ColumnTransformer(
    transformers=[
        ('numeric_pipeline', numeric_pipeline, numeric_feature),
        ('categorical_processor', categoric_pipeline, categoric_feature),
    ]
)

In [20]:
# Learn the imputation values, scaling statistics and categories from the
# training split only, and transform that split in the same call.
x_train_processed = preprocessor.fit_transform(X_train)

In [21]:
# Transform only (no fit): the test split is processed with the preprocessor
# state fitted on the training split.
X_test_processed = preprocessor.transform(X_test)


In [23]:
# Inspect the transformed training matrix (two scaled numeric columns
# followed by the one-hot Country columns).
x_train_processed

array([[-7.37352914e-01, -5.29067602e-01,  1.00000000e+00,
         0.00000000e+00,  0.00000000e+00],
       [ 9.21691143e-01,  8.59734853e-01,  1.00000000e+00,
         0.00000000e+00,  0.00000000e+00],
       [ 1.65904406e+00,  1.95093678e+00,  1.00000000e+00,
         0.00000000e+00,  0.00000000e+00],
       [-1.65904406e+00, -9.25868303e-01,  0.00000000e+00,
         1.00000000e+00,  0.00000000e+00],
       [ 1.84338229e-01,  7.21776271e-16,  0.00000000e+00,
         1.00000000e+00,  0.00000000e+00],
       [-1.84338229e-01, -2.31467076e-01,  0.00000000e+00,
         0.00000000e+00,  1.00000000e+00],
       [-1.84338229e-01, -1.12426865e+00,  1.00000000e+00,
         0.00000000e+00,  0.00000000e+00]])

In [24]:
# Inspect the transformed test matrix; same column layout as the training output.
X_test_processed

array([[ 0.        ,  1.55413608,  0.        ,  0.        ,  1.        ],
       [-2.21205874, -1.52106936,  0.        ,  0.        ,  1.        ]])