In [1]:
#IMPRTING PANDAS AS pd AND NUMPY AS np
import pandas as pd
import numpy as np

In [2]:
#1. Handling Missing Values
#Before creating new features, 
#Have to fix the holes in your data. You can use simple imputation or more strategic fills using NumPy.

In [3]:
# Sample Data
df = pd.DataFrame({'Age': [25, np.nan, 30, 35, np.nan], 
                   'Salary': [50000, 54000, np.nan, 62000, 58000]})

# Fill Age with the mean
df['Age_filled'] = df['Age'].fillna(df['Age'].mean())

# Flag missing values (an often overlooked but powerful feature)
df['Salary_is_missing'] = np.where(df['Salary'].isnull(), 1, 0)

In [4]:
df

Unnamed: 0,Age,Salary,Age_filled,Salary_is_missing
0,25.0,50000.0,25.0,0
1,,54000.0,30.0,0
2,30.0,,30.0,1
3,35.0,62000.0,35.0,0
4,,58000.0,30.0,0


In [5]:
#2. Binning (Transformation)

In [6]:
# Create age groups
conditions = [
    (df['Age_filled'] < 30),
    (df['Age_filled'] >= 30) & (df['Age_filled'] < 60),
    (df['Age_filled'] >= 60)
]
choices = ['Young', 'Adult', 'Senior']

df['Age_Group'] = np.select(conditions, choices, default='Unknown')

In [7]:
df

Unnamed: 0,Age,Salary,Age_filled,Salary_is_missing,Age_Group
0,25.0,50000.0,25.0,0,Young
1,,54000.0,30.0,0,Adult
2,30.0,,30.0,1,Adult
3,35.0,62000.0,35.0,0,Adult
4,,58000.0,30.0,0,Adult


In [8]:
#3. Encoding Categorical Data


In [9]:
#Encoding using Pandas
df_encoded = pd.get_dummies(df, columns=['Age_Group'], prefix='Group')

In [10]:
df

Unnamed: 0,Age,Salary,Age_filled,Salary_is_missing,Age_Group
0,25.0,50000.0,25.0,0,Young
1,,54000.0,30.0,0,Adult
2,30.0,,30.0,1,Adult
3,35.0,62000.0,35.0,0,Adult
4,,58000.0,30.0,0,Adult


In [11]:
#4.Interaction Features

In [12]:
# Creating a 'Salary per Year of Age' feature
df['Salary_per_Age'] = df['Salary'] / df['Age_filled']

In [13]:
df

Unnamed: 0,Age,Salary,Age_filled,Salary_is_missing,Age_Group,Salary_per_Age
0,25.0,50000.0,25.0,0,Young,2000.0
1,,54000.0,30.0,0,Adult,1800.0
2,30.0,,30.0,1,Adult,
3,35.0,62000.0,35.0,0,Adult,1771.428571
4,,58000.0,30.0,0,Adult,1933.333333


In [17]:
df.mean

<bound method DataFrame.mean of     Age   Salary  Age_filled  Salary_is_missing Age_Group  Salary_per_Age
0  25.0  50000.0        25.0                  0     Young     2000.000000
1   NaN  54000.0        30.0                  0     Adult     1800.000000
2  30.0      NaN        30.0                  1     Adult             NaN
3  35.0  62000.0        35.0                  0     Adult     1771.428571
4   NaN  58000.0        30.0                  0     Adult     1933.333333>

In [18]:
df.median

<bound method DataFrame.median of     Age   Salary  Age_filled  Salary_is_missing Age_Group  Salary_per_Age
0  25.0  50000.0        25.0                  0     Young     2000.000000
1   NaN  54000.0        30.0                  0     Adult     1800.000000
2  30.0      NaN        30.0                  1     Adult             NaN
3  35.0  62000.0        35.0                  0     Adult     1771.428571
4   NaN  58000.0        30.0                  0     Adult     1933.333333>