In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [3]:
# Toy customer dataset: three feature columns (Country, Age, Salary), each
# containing exactly one missing value, plus the fully populated target
# column Purchased.
data = dict(
    Country=['France', 'Spain', 'Germany', 'Spain', 'Germany', 'France', np.nan, 'Spain', 'France'],
    Age=[44, 27, 30, 38, 40, 35, 38, np.nan, 48],
    Salary=[72000, 48000, 54000, 61000, np.nan, 58000, 52000, 79000, 83000],
    Purchased=['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No'],
)

# Each dict key becomes a column; the np.nan entries make Age and Salary float columns.
df = pd.DataFrame.from_dict(data)

In [4]:
# Show the whole DataFrame; it has only 9 rows, so the full table stays readable.
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,,38.0,52000.0,No
7,Spain,,79000.0,Yes
8,France,48.0,83000.0,No


In [5]:
# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [6]:
# Dimensions of the DataFrame as (rows, columns).
df.shape

(9, 4)

In [7]:
# Print per-column dtypes and non-null counts to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    8 non-null      object 
 1   Age        8 non-null      float64
 2   Salary     8 non-null      float64
 3   Purchased  9 non-null      object 
dtypes: float64(2), object(2)
memory usage: 420.0+ bytes


In [10]:
# Summary statistics for the numeric columns (Age, Salary); missing entries are
# excluded, so `count` is lower than the number of rows.
df.describe()

Unnamed: 0,Age,Salary
count,8.0,8.0
mean,37.5,63375.0
std,6.88684,13048.67262
min,27.0,48000.0
25%,33.75,53500.0
50%,38.0,59500.0
75%,41.0,73750.0
max,48.0,83000.0


In [11]:
# Count the missing entries in each column (isna is the same check as isnull).
df.isna().sum()

Unnamed: 0,0
Country,1
Age,1
Salary,1
Purchased,0


In [12]:
# Separate the predictors from the label the model should learn.
feature_columns = ['Country', 'Age', 'Salary']
target_column = 'Purchased'

x = df[feature_columns]
y = df[target_column]

In [13]:
# Inspect the feature matrix (Country, Age, Salary); missing values are still present here.
x

Unnamed: 0,Country,Age,Salary
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,
5,France,35.0,58000.0
6,,38.0,52000.0
7,Spain,,79000.0
8,France,48.0,83000.0


In [14]:
# Inspect the target labels (the Purchased column, 'Yes'/'No' strings).
y

Unnamed: 0,Purchased
0,No
1,Yes
2,No
3,No
4,Yes
5,Yes
6,No
7,Yes
8,No


In [15]:
# Numeric columns: replace missing values with the column mean, then
# standardize each column (zero mean, unit variance).
numaric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
)

In [17]:
# Categorical columns: replace missing values with the most frequent category,
# then one-hot encode, dropping the first category as the baseline.
# NOTE(review): with drop='first', a category unseen during fit is encoded as
# all zeros by handle_unknown='ignore' -- the same vector as the dropped
# baseline category. Confirm that conflating the two is acceptable; older
# scikit-learn releases may reject this parameter combination -- verify
# against the installed version.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(drop='first', handle_unknown='ignore')),
    ]
)

In [18]:
# Route each group of columns through its own pipeline. The transformed
# output lists the numeric block first and the encoded Country block second,
# following the order of the `transformers` list.
numeric_features = ['Age', 'Salary']
categorical_features = ['Country']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numaric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

In [19]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle (and therefore the split) reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [21]:
# Fit the preprocessor (imputation values, scaling statistics, one-hot
# categories) on the training rows only and transform them in the same call.
x_train_processed=preprocessor.fit_transform(x_train)
# Apply the already-fitted preprocessor to the test rows without refitting,
# so nothing is learned from the held-out data.
x_test_processed=preprocessor.transform(x_test)

In [22]:
# Inspect the transformed training features: scaled Age, scaled Salary, then
# the one-hot Country indicator columns.
x_train_processed

array([[-7.37352914e-01, -5.29067602e-01,  0.00000000e+00,
         0.00000000e+00],
       [ 9.21691143e-01,  8.59734853e-01,  0.00000000e+00,
         0.00000000e+00],
       [ 1.65904406e+00,  1.95093678e+00,  0.00000000e+00,
         0.00000000e+00],
       [-1.65904406e+00, -9.25868303e-01,  1.00000000e+00,
         0.00000000e+00],
       [ 1.84338229e-01,  7.21776271e-16,  1.00000000e+00,
         0.00000000e+00],
       [-1.84338229e-01, -2.31467076e-01,  0.00000000e+00,
         1.00000000e+00],
       [-1.84338229e-01, -1.12426865e+00,  0.00000000e+00,
         0.00000000e+00]])

In [23]:
# Inspect the transformed test features, produced with the statistics learned
# from the training rows.
x_test_processed

array([[ 0.        ,  1.55413608,  0.        ,  1.        ],
       [-2.21205874, -1.52106936,  0.        ,  1.        ]])