In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
# Toy customer table used throughout the notebook.
# Country, Age and Salary each contain exactly one np.nan so that the
# imputation steps further down have something to fill.
data = dict(
    Country=['France', 'Spain', 'Germany', 'Spain', 'Germany', 'France', np.nan, 'Spain', 'France'],
    Age=[44, 27, 30, 38, 40, 35, 38, np.nan, 48],
    Salary=[72000, 48000, 54000, 61000, np.nan, 58000, 52000, 79000, 83000],
    Purchased=['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No'],
)

# Column order follows the insertion order of the dict above.
df = pd.DataFrame(data)

In [3]:
# Show the whole frame (only 9 rows) to see the raw values and where the NaNs sit.
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,,38.0,52000.0,No
7,Spain,,79000.0,Yes
8,France,48.0,83000.0,No


In [4]:
# Preview the first five rows.
df.head()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes


In [5]:
# Preview the last five rows.
df.tail()

Unnamed: 0,Country,Age,Salary,Purchased
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,,38.0,52000.0,No
7,Spain,,79000.0,Yes
8,France,48.0,83000.0,No


In [6]:
# Print column dtypes and non-null counts to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    8 non-null      object 
 1   Age        8 non-null      float64
 2   Salary     8 non-null      float64
 3   Purchased  9 non-null      object 
dtypes: float64(2), object(2)
memory usage: 420.0+ bytes


In [8]:
# Summary statistics for the numeric columns (Age, Salary); missing values are not counted.
df.describe()

Unnamed: 0,Age,Salary
count,8.0,8.0
mean,37.5,63375.0
std,6.88684,13048.67262
min,27.0,48000.0
25%,33.75,53500.0
50%,38.0,59500.0
75%,41.0,73750.0
max,48.0,83000.0


In [9]:
# Count the missing values in each column of the raw frame.
df.isna().sum()

Unnamed: 0,0
Country,1
Age,1
Salary,1
Purchased,0


In [10]:
# Split the frame into the target (Purchased) and the feature columns (everything else).
y = df['Purchased']
x = df.drop(columns='Purchased')

In [11]:
# Fill missing Age / Salary values with the column mean.
# NOTE(review): the imputer is fitted on every row, before train_test_split is
# called further down, so the rows that later become the test set contribute
# to the fill values (data leakage). Consider fitting on the training split only.
num_imputer = SimpleImputer(strategy='mean').fit(x[['Age', 'Salary']])
x[['Age', 'Salary']] = num_imputer.transform(x[['Age', 'Salary']])

In [15]:
# Fill the missing Country value with the most frequent category.
# NOTE(review): like the numeric imputer, this is fitted on all rows before the
# train/test split, so test rows influence the chosen fill value.
cat_imputer = SimpleImputer(strategy='most_frequent').fit(x[['Country']])
x[['Country']] = cat_imputer.transform(x[['Country']])

In [16]:
# Inspect the feature frame after imputation.
x

Unnamed: 0,Country,Age,Salary
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,63375.0
5,France,35.0,58000.0
6,France,38.0,52000.0
7,Spain,37.5,79000.0
8,France,48.0,83000.0


In [14]:
# Confirm that no missing values remain in the features.
x.isna().sum()

Unnamed: 0,0
Country,0
Age,0
Salary,0


In [21]:
# One-hot encode Country into a dense array.
# drop='first' omits the first category's column to avoid a redundant
# (perfectly collinear) dummy column.
ohe = OneHotEncoder(sparse_output=False, drop='first')
country_encoded = ohe.fit(x[['Country']]).transform(x[['Country']])

In [19]:
# Inspect the encoded array: one row per sample, one column per retained category.
country_encoded

array([[0., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 0.],
       [0., 0.],
       [0., 1.],
       [0., 0.]])

In [23]:
# Ask the fitted encoder for the output column names, prefixed with 'Country'.
encoded_col=ohe.get_feature_names_out(['Country'])

In [24]:
# Show the generated one-hot column names.
encoded_col

array(['Country_Germany', 'Country_Spain'], dtype=object)

In [26]:
# Wrap the encoded array in a DataFrame labelled with the encoder's feature names.
# Reuse x's index: pd.concat(..., axis=1) aligns rows on index labels, so a
# default RangeIndex here would misalign rows (and introduce NaNs) whenever
# x's index is not exactly 0..n-1, e.g. after rows have been filtered out.
country_df = pd.DataFrame(country_encoded, columns=encoded_col, index=x.index)

In [27]:
# Inspect the one-hot encoded Country columns.
country_df

Unnamed: 0,Country_Germany,Country_Spain
0,0.0,0.0
1,0.0,1.0
2,1.0,0.0
3,0.0,1.0
4,1.0,0.0
5,0.0,0.0
6,0.0,0.0
7,0.0,1.0
8,0.0,0.0


In [None]:
# Remove the raw Country column; its one-hot encoding is held in country_df.
# errors='ignore' keeps this cell idempotent: re-running it once 'Country' is
# already gone is a no-op instead of raising KeyError.
x = x.drop(columns='Country', errors='ignore')

In [29]:
# Inspect the remaining numeric features after dropping Country.
x

Unnamed: 0,Age,Salary
0,44.0,72000.0
1,27.0,48000.0
2,30.0,54000.0
3,38.0,61000.0
4,40.0,63375.0
5,35.0,58000.0
6,38.0,52000.0
7,37.5,79000.0
8,48.0,83000.0


In [32]:
# Append the one-hot Country columns to the numeric features.
# Any copies of those columns already present in x are dropped first, so
# re-running this cell does not stack duplicate Country_* columns onto x.
x = pd.concat(
    [x.drop(columns=country_df.columns, errors='ignore'), country_df],
    axis=1,
)

In [33]:
# Inspect the final feature matrix: numeric columns plus one-hot Country columns.
x

Unnamed: 0,Age,Salary,Country_Germany,Country_Spain
0,44.0,72000.0,0.0,0.0
1,27.0,48000.0,0.0,1.0
2,30.0,54000.0,1.0,0.0
3,38.0,61000.0,0.0,1.0
4,40.0,63375.0,1.0,0.0
5,35.0,58000.0,0.0,0.0
6,38.0,52000.0,0.0,0.0
7,37.5,79000.0,0.0,1.0
8,48.0,83000.0,0.0,0.0


In [35]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [38]:
# Sanity-check the size of the test features (rows, columns).
x_test.shape

(2, 4)

In [37]:
# Sanity-check the size of the training features (rows, columns).
x_train.shape

(7, 4)

In [39]:
# The training target should have as many entries as x_train has rows.
y_train.shape

(7,)

In [40]:
# The test target should have as many entries as x_test has rows.
y_test.shape

(2,)

In [42]:
# Standardise only the continuous columns; the one-hot columns stay as 0/1.
cols_to_scale = ['Age', 'Salary']

# Learn mean and standard deviation from the training rows only, then apply
# the same transformation to both splits so the test set does not influence
# the scaling parameters.
scaler = StandardScaler().fit(x_train[cols_to_scale])

x_train[cols_to_scale] = scaler.transform(x_train[cols_to_scale])
x_test[cols_to_scale] = scaler.transform(x_test[cols_to_scale])

In [43]:
# Inspect the training features after scaling Age and Salary.
x_train

Unnamed: 0,Age,Salary,Country_Germany,Country_Spain
5,-0.737353,-0.529658,0.0,0.0
0,0.921691,0.859143,0.0,0.0
8,1.659044,1.950344,0.0,0.0
2,-1.659044,-0.926458,1.0,0.0
4,0.184338,0.003543,1.0,0.0
3,-0.184338,-0.232057,0.0,1.0
6,-0.184338,-1.124858,0.0,0.0


In [44]:
# Inspect the test features, scaled with the statistics learned from the training split.
x_test

Unnamed: 0,Age,Salary,Country_Germany,Country_Spain
7,-0.276507,1.553544,0.0,1.0
1,-2.212059,-1.521658,0.0,1.0
