In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [3]:
# Step 1: Create a synthetic dataset
data = pd.DataFrame({
    'age': [25, np.nan, 35, 45, np.nan, 50, 22, 28],
    'salary': [50000, 60000, 75000, 80000, np.nan, 90000, 40000, np.nan],
    'gender': ['male', 'female', np.nan, 'female', 'male', 'male', 'female', 'female'],
    'purchased': ['no', 'yes', 'no', 'yes', 'no', 'yes', 'no', 'yes']
})

print("Original Dataset:")
print(data)



Original Dataset:
    age   salary  gender purchased
0  25.0  50000.0    male        no
1   NaN  60000.0  female       yes
2  35.0  75000.0     NaN        no
3  45.0  80000.0  female       yes
4   NaN      NaN    male        no
5  50.0  90000.0    male       yes
6  22.0  40000.0  female        no
7  28.0      NaN  female       yes


In [5]:
# Step 2: Handle Missing Values
data['age'].fillna(data['age'].mean(), inplace=True)
data['salary'].fillna(data['salary'].mean(), inplace=True)
data['gender'].fillna(data['gender'].mode()[0], inplace=True)

print("\nDataset after handling missing values:")
print(data)




Dataset after handling missing values:
         age        salary  gender purchased
0  25.000000  50000.000000    male        no
1  34.166667  60000.000000  female       yes
2  35.000000  75000.000000  female        no
3  45.000000  80000.000000  female       yes
4  34.166667  65833.333333    male        no
5  50.000000  90000.000000    male       yes
6  22.000000  40000.000000  female        no
7  28.000000  65833.333333  female       yes


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['age'].fillna(data['age'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['salary'].fillna(data['salary'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we 

In [7]:
# Step 3: Encode Categorical Variables
label_encoder = LabelEncoder()
data['gender'] = label_encoder.fit_transform(data['gender'])
data['purchased'] = label_encoder.fit_transform(data['purchased'])

print("\nDataset after encoding categorical variables:")
print(data)





Dataset after encoding categorical variables:
         age        salary  gender  purchased
0  25.000000  50000.000000       1          0
1  34.166667  60000.000000       0          1
2  35.000000  75000.000000       0          0
3  45.000000  80000.000000       0          1
4  34.166667  65833.333333       1          0
5  50.000000  90000.000000       1          1
6  22.000000  40000.000000       0          0
7  28.000000  65833.333333       0          1


In [9]:
# Step 4: Feature Scaling
scaler = StandardScaler()
data[['age', 'salary']] = scaler.fit_transform(data[['age', 'salary']])

print("\nDataset after feature scaling:")
print(data)


Dataset after feature scaling:
        age    salary  gender  purchased
0 -1.025800 -1.049500       1          0
1  0.000000 -0.386658       0          1
2  0.093255  0.607605       0          0
3  1.212309  0.939026       0          1
4  0.000000  0.000000       1          0
5  1.771836  1.601868       1          1
6 -1.361516 -1.712341       0          0
7 -0.690084  0.000000       0          1
