In [1]:
import pandas as pd
import numpy as np

In [2]:
data = {
    'age': [25, np.nan, 35, 45, np.nan, 55],
    'salary': [50000, 60000, np.nan, 80000, 70000, np.nan],
    'city': ['New York', 'Los Angeles', 'New York', np.nan, 'Chicago', 'Chicago'],
    'purchased': ['No', 'Yes', 'No', 'Yes', 'No', np.nan]
}
df = pd.DataFrame(data)

In [3]:
print("Original DataFrame:")
print(df)

Original DataFrame:
    age   salary         city purchased
0  25.0  50000.0     New York        No
1   NaN  60000.0  Los Angeles       Yes
2  35.0      NaN     New York        No
3  45.0  80000.0          NaN       Yes
4   NaN  70000.0      Chicago        No
5  55.0      NaN      Chicago       NaN


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   age        4 non-null      float64
 1   salary     4 non-null      float64
 2   city       5 non-null      object 
 3   purchased  5 non-null      object 
dtypes: float64(2), object(2)
memory usage: 320.0+ bytes


In [6]:
df.isnull().sum()

age          2
salary       2
city         1
purchased    1
dtype: int64

In [4]:
from sklearn.impute import SimpleImputer

In [7]:
num_imputer = SimpleImputer(strategy='mean')
df[['age', 'salary']] = num_imputer.fit_transform(df[['age', 'salary']])

In [8]:
print("\nDataFrame after handling missing values:")
print(df)


DataFrame after handling missing values:
    age   salary         city purchased
0  25.0  50000.0     New York        No
1  40.0  60000.0  Los Angeles       Yes
2  35.0  65000.0     New York        No
3  45.0  80000.0          NaN       Yes
4  40.0  70000.0      Chicago        No
5  55.0  65000.0      Chicago       NaN


In [9]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [10]:
label_encoder = LabelEncoder()
df['purchased'] = label_encoder.fit_transform(df['purchased'])

In [11]:
df = pd.get_dummies(df, columns=['city'], drop_first=True)

In [12]:
print("\nDataFrame after encoding categorical data:")
print(df)


DataFrame after encoding categorical data:
    age   salary  purchased  city_Los Angeles  city_New York
0  25.0  50000.0          0                 0              1
1  40.0  60000.0          1                 1              0
2  35.0  65000.0          0                 0              1
3  45.0  80000.0          1                 0              0
4  40.0  70000.0          0                 0              0
5  55.0  65000.0          2                 0              0
