# Feature generation
## Example 1: Generating BMI (Numeric Transformation)

In [1]:
import pandas as pd
import numpy as np

# Toy dataset: one row per person, with height in centimetres and weight in kilograms.
data = dict(
    person_id=[1, 2, 3, 4],
    height_cm=[170, 165, 180, 175],
    weight_kg=[65, 72, 90, 85],
    age=[25, 40, 35, 50],
)

# Each dict key becomes a column; the list values become the rows.
df = pd.DataFrame.from_dict(data)

print(df)

   person_id  height_cm  weight_kg  age
0          1        170         65   25
1          2        165         72   40
2          3        180         90   35
3          4        175         85   50


### Step 1: Convert Height to Meters

In [3]:
# Rescale the centimetre column to metres (100 cm = 1 m) and store it as a new column.
df["height_m"] = df["height_cm"].div(100)

df

Unnamed: 0,person_id,height_cm,weight_kg,age,height_m
0,1,170,65,25,1.7
1,2,165,72,40,1.65
2,3,180,90,35,1.8
3,4,175,85,50,1.75


### Step 2: Generate BMI

$\text{BMI} = \dfrac{\text{weight (kg)}}{\text{height (m)}^2}$

In [4]:
# Row-wise BMI: weight in kilograms divided by the square of height in metres.
df["BMI"] = df["weight_kg"].div(df["height_m"].pow(2))

print(df)

   person_id  height_cm  weight_kg  age  height_m        BMI
0          1        170         65   25      1.70  22.491349
1          2        165         72   40      1.65  26.446281
2          3        180         90   35      1.80  27.777778
3          4        175         85   50      1.75  27.755102


### Step 3: Create BMI Category (Feature from Feature)

In [5]:
def bmi_category(bmi: float) -> "str | None":
    """Map a single BMI value to a weight-category label.

    Bands (lower bound inclusive, upper bound exclusive):

    - ``bmi < 18.5``        -> ``"Underweight"``
    - ``18.5 <= bmi < 25``  -> ``"Normal"``
    - ``25 <= bmi < 30``    -> ``"Overweight"``
    - ``bmi >= 30``         -> ``"Obese"``

    Parameters
    ----------
    bmi : float
        A scalar BMI value. May be missing (``None``, ``NaN`` or ``pd.NA``).

    Returns
    -------
    str or None
        The category label, or ``None`` when ``bmi`` is missing.
    """
    # Every comparison against NaN is False, so without this guard a missing
    # BMI would fall through to the final branch and be labelled "Obese"
    # (and None / pd.NA would raise a TypeError instead).
    if pd.isna(bmi):
        return None

    if bmi < 18.5:
        return "Underweight"
    if bmi < 25:
        return "Normal"
    if bmi < 30:
        return "Overweight"
    return "Obese"

# Derive a categorical feature from the numeric BMI feature, one label per row.
df["BMI_category"] = df["BMI"].map(bmi_category)

print(df)

   person_id  height_cm  weight_kg  age  height_m        BMI BMI_category
0          1        170         65   25      1.70  22.491349       Normal
1          2        165         72   40      1.65  26.446281   Overweight
2          3        180         90   35      1.80  27.777778   Overweight
3          4        175         85   50      1.75  27.755102   Overweight


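Convert `BMI_category` to an ordered `category` dtype so that the levels sort and compare in their natural order (Underweight < Normal < Overweight < Obese) rather than alphabetically.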

In [7]:
# Cast the label column to an ordered categorical; the list order defines the
# ranking (first entry is the lowest level, last entry is the highest).
df['BMI_category'] = df['BMI_category'].astype(
    pd.CategoricalDtype(
        categories=['Underweight', 'Normal', 'Overweight', 'Obese'],
        ordered=True,
    )
)
df.dtypes

person_id          int64
height_cm          int64
weight_kg          int64
age                int64
height_m         float64
BMI              float64
BMI_category    category
dtype: object

## Example 2: Generating Ratio Features

In [10]:
# Toy dataset: one row per employee with their total sales and number of clients.
sales_data = dict(
    employee=["A", "B", "C", "D"],
    total_sales=[50000, 75000, 60000, 90000],
    clients_handled=[50, 60, 40, 75],
)

# Each dict key becomes a column; the list values become the rows.
df_sales = pd.DataFrame.from_dict(sales_data)

print(df_sales)

  employee  total_sales  clients_handled
0        A        50000               50
1        B        75000               60
2        C        60000               40
3        D        90000               75


### Sales per Client (Efficiency Feature)

In [12]:
# Ratio feature: total sales divided by the number of clients handled, per row.
df_sales["sales_per_client"] = df_sales["total_sales"].div(df_sales["clients_handled"])

print(df_sales)

  employee  total_sales  clients_handled  sales_per_client
0        A        50000               50            1000.0
1        B        75000               60            1250.0
2        C        60000               40            1500.0
3        D        90000               75            1200.0


### Log Transformation (Handling Skewness)
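- Reduces right (positive) skewness by compressing large values more than small ones
- Makes heavy-tailed distributions (e.g. roughly log-normal sales or income figures) more symmetric and closer to normal
- Common in economics and business
- Requires strictly positive values: $\log(0) = -\infty$, so use `np.log1p` when zeros can occur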

In [13]:
# Natural logarithm of total sales, computed element-wise and stored as a new column.
df_sales["log_total_sales"] = df_sales["total_sales"].pipe(np.log)

print(df_sales)

  employee  total_sales  clients_handled  sales_per_client  log_total_sales
0        A        50000               50            1000.0        10.819778
1        B        75000               60            1250.0        11.225243
2        C        60000               40            1500.0        11.002100
3        D        90000               75            1200.0        11.407565
