In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
# Load the insurance training table that every later cell reads and extends as `df`.
train_csv_path = 'data/insurance_train.csv'
df = pd.read_csv(train_csv_path)

In [3]:
# Column roles for the raw table: name of the target column, plus the raw
# numeric and categorical input columns.
target_col = 'expenses'
numerical_cols = [
    'age',
    'bmi',
    'children',
]
categorical_cols = [
    'sex',
    'smoker',
    'region',
]

In [4]:
# Feature engineering: every derived column is added to `df` in place.
# Columns are created in a fixed order so that code which builds lists from
# `df.columns` sees a stable layout.

# --- Age: polynomial / root / log transforms and coarse age bands ---
df['age_squared'] = df['age'] ** 2
df['age_cubed'] = df['age'] ** 3
df['sqrt_age'] = np.sqrt(df['age'])
df['log_age'] = np.log1p(df['age'])  # log(1 + age)

# pd.cut bins are right-closed by default: (0, 18], (18, 25], ..., (65, 100].
# NOTE(review): '51-65' therefore contains age 65 and '65+' only starts above 65;
# the labels are kept as-is because downstream code may depend on them.
df['age_group'] = pd.cut(
    df['age'],
    bins=[0, 18, 25, 35, 50, 65, 100],
    labels=['0-18', '19-25', '26-35', '36-50', '51-65', '65+'],
)

# --- BMI: categories, obesity flag and simple transforms ---
obese_bmi_threshold = 30

# right=False makes the bins left-closed ([0, 18.5), [18.5, 25), [25, 30), [30, 100)),
# so a BMI of exactly 30 is labelled 'Obese' and agrees with `is_obese` below.
# With the default right-closed bins it was labelled 'Overweight' while `is_obese` was 1.
df['bmi_category'] = pd.cut(
    df['bmi'],
    bins=[0, 18.5, 25, obese_bmi_threshold, 100],
    labels=['Underweight', 'Normal', 'Overweight', 'Obese'],
    right=False,
)
is_obese = df['bmi'] >= obese_bmi_threshold
df['is_obese'] = is_obese.astype(int)

df['bmi_squared'] = df['bmi'] ** 2
df['log_bmi'] = np.log1p(df['bmi'])

# --- Age / BMI / smoker interactions ---
df['age_bmi'] = df['age'] * df['bmi']

# The 'smoker' column is required: everything below reads it, so there is no point in
# guarding only part of the smoker features behind an `if 'smoker' in df.columns` check.
is_smoker = df['smoker'] == 'yes'
df['smoker_numeric'] = is_smoker.astype(int)
df['age_smoker'] = df['age'] * df['smoker_numeric']

df['age_bmi_smoker'] = df['age_bmi'] * df['smoker_numeric']

# "High risk" = obese smoker. Uses the same >= 30 obesity cut-off as `is_obese`
# (previously a strict > 30, which disagreed with `is_obese` at exactly 30).
df['is_high_risk'] = (is_smoker & is_obese).astype(int)
df['is_young_smoker'] = (is_smoker & (df['age'] < 30)).astype(int)
df['is_old_smoker'] = (is_smoker & (df['age'] > 50)).astype(int)

# --- Children ---
df['has_children'] = (df['children'] > 0).astype(int)
df['many_children'] = (df['children'] >= 3).astype(int)

# +1 in the denominator avoids division by zero for rows with no children.
df['age_per_child'] = df['age'] / (df['children'] + 1)
df['children_bmi'] = df['children'] * df['bmi']

# --- Region ---
# NOTE(review): this is a target encoding computed from 'expenses' over all rows of `df`,
# including each row's own value, so it leaks the target into the features and cannot be
# recomputed on data without 'expenses'. Consider fitting the mapping on training folds
# only and applying it to held-out data.
df['region_avg_expense'] = df.groupby('region')['expenses'].transform('mean')

# One indicator per region, plus a region-by-smoker interaction indicator.
for region in df['region'].unique():
    in_region = df['region'] == region
    df[f'is_{region}'] = in_region.astype(int)
    df[f'smoker_{region}'] = (in_region & is_smoker).astype(int)

# Quintile buckets of BMI; the bin edges are computed from the values in `df`.
df['bmi_risk_tier'] = pd.qcut(df['bmi'], 5, labels=['Very Low', 'Low', 'Medium', 'High', 'Very High'])

# --- Hand-crafted composite scores (coefficients are heuristic constants) ---
df['age_premium_factor'] = 1 + (df['age'] / 50) ** 1.5

# BMI scaled up by 20% per child and by a further 1.5x for smokers.
df['family_risk'] = df['bmi'] * (1 + 0.2 * df['children']) * np.where(is_smoker, 1.5, 1)

# 0 for non-smokers; for smokers grows linearly with age (1 + age / 100).
df['smoker_severity'] = df['smoker_numeric'] * (1 + (df['age'] / 100))

In [5]:
# Rebuild the column-role lists now that the engineered features exist.
all_columns = df.columns.tolist()

categorical_cols = ['sex', 'smoker', 'region', 'bmi_category', 'age_group', 'bmi_risk_tier']

# Every numeric column that is neither categorical nor the target, in column order.
# select_dtypes(include='number') matches all integer/float widths, so the
# `astype(int)` indicator columns are kept even where the platform default int is
# int32; an exact match against 'int64'/'float64' would silently drop them there.
excluded_cols = set(categorical_cols) | {target_col}
numerical_cols = [
    col
    for col in df.select_dtypes(include='number').columns
    if col not in excluded_cols
]

In [6]:
# Display the current list of numeric feature columns.
numerical_cols

['age',
 'bmi',
 'children',
 'age_squared',
 'age_cubed',
 'sqrt_age',
 'log_age',
 'is_obese',
 'bmi_squared',
 'log_bmi',
 'age_bmi',
 'smoker_numeric',
 'age_smoker',
 'age_bmi_smoker',
 'is_high_risk',
 'is_young_smoker',
 'is_old_smoker',
 'has_children',
 'many_children',
 'age_per_child',
 'children_bmi',
 'region_avg_expense',
 'is_northwest',
 'smoker_northwest',
 'is_southwest',
 'smoker_southwest',
 'is_southeast',
 'smoker_southeast',
 'is_northeast',
 'smoker_northeast',
 'age_premium_factor',
 'family_risk',
 'smoker_severity']

In [7]:
# Display the current list of categorical feature columns.
categorical_cols

['sex', 'smoker', 'region', 'bmi_category', 'age_group', 'bmi_risk_tier']

In [8]:
# Preview the first rows of `df`, including the engineered columns.
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses,age_squared,age_cubed,sqrt_age,...,is_southwest,smoker_southwest,is_southeast,smoker_southeast,is_northeast,smoker_northeast,bmi_risk_tier,age_premium_factor,family_risk,smoker_severity
0,19,male,35.5,0,no,northwest,1646.43,361,6859,4.358899,...,0,0,0,0,0,0,High,1.234248,35.5,0.0
1,57,male,31.5,0,no,northwest,11353.23,3249,185193,7.549834,...,0,0,0,0,0,0,Medium,2.217187,31.5,0.0
2,51,male,37.0,0,no,southwest,8798.59,2601,132651,7.141428,...,1,0,0,0,0,0,Very High,2.03015,37.0,0.0
3,49,female,36.6,3,no,southeast,10381.48,2401,117649,7.0,...,0,0,1,0,0,0,Very High,1.970151,58.56,0.0
4,21,male,22.3,1,no,southwest,2103.08,441,9261,4.582576,...,1,0,0,0,0,0,Very Low,1.272191,26.76,0.0


In [9]:
# Write the engineered (not yet encoded or scaled) frame to disk;
# index=False leaves the row index out of the CSV.
engineered_csv_path = 'data/engineered_features_unprocessed.csv'
df.to_csv(engineered_csv_path, index=False)