In [1]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
import joblib

In [3]:
# Load the test-set CSV that predictions are generated for.
TEST_CSV_PATH = 'data/insurance_test.csv'
test_df = pd.read_csv(TEST_CSV_PATH)

In [5]:
# Restore the persisted preprocessing object from disk.
PREPROCESSOR_PATH = 'models/preprocessor.pkl'
preprocessor = joblib.load(PREPROCESSOR_PATH)

In [None]:
# These column lists mirror the feature engineering done on the training set.

# Categorical inputs: raw string columns plus the binned age/BMI features.
categorical_cols = [
    'sex',
    'smoker',
    'region',
    'bmi_category',
    'age_group',
    'bmi_risk_tier',
]

# Numerical inputs, grouped by the kind of feature they represent.
numerical_cols = [
    # raw columns
    'age', 'bmi', 'children',
    # age transforms
    'age_squared', 'age_cubed', 'sqrt_age', 'log_age',
    # BMI transforms
    'is_obese', 'bmi_squared', 'log_bmi',
    # interactions with age / BMI / smoking
    'age_bmi', 'smoker_numeric', 'age_smoker', 'age_bmi_smoker',
    # smoker risk flags
    'is_high_risk', 'is_young_smoker', 'is_old_smoker',
    # children-based features
    'has_children', 'many_children', 'age_per_child', 'children_bmi',
    # region features
    'region_avg_expense',
    'is_northwest', 'smoker_northwest',
    'is_southwest', 'smoker_southwest',
    'is_southeast', 'smoker_southeast',
    'is_northeast', 'smoker_northeast',
    # composite scores
    'age_premium_factor', 'family_risk', 'smoker_severity',
]

# Short aliases for the raw columns every derived feature is built from.
age = test_df['age']
bmi = test_df['bmi']
children = test_df['children']
is_smoker = test_df['smoker'] == 'yes'

# --- Age transforms ---
test_df['age_squared'] = age ** 2
test_df['age_cubed'] = age ** 3
test_df['sqrt_age'] = np.sqrt(age)
test_df['log_age'] = np.log1p(age)

# Fixed age bands (pd.cut's default intervals are closed on the right).
age_band_edges = [0, 18, 25, 35, 50, 65, 100]
age_band_labels = ['0-18', '19-25', '26-35', '36-50', '51-65', '65+']
test_df['age_group'] = pd.cut(age, bins=age_band_edges, labels=age_band_labels)

# --- BMI transforms ---
test_df['bmi_squared'] = bmi ** 2
test_df['log_bmi'] = np.log1p(bmi)

bmi_class_edges = [0, 18.5, 25, 30, 35, 100]
bmi_class_labels = ['Underweight', 'Normal', 'Overweight', 'Obese', 'Severely_Obese']
test_df['bmi_category'] = pd.cut(bmi, bins=bmi_class_edges, labels=bmi_class_labels)

# NOTE(review): qcut derives the quintile edges from the BMI values of the test
# rows themselves, so the tier boundaries depend on this file's distribution.
# Presumably the training notebook's edges differ -- confirm, and consider
# reusing the training bin edges with pd.cut instead.
bmi_tier_labels = ['Very Low', 'Low', 'Medium', 'High', 'Very High']
test_df['bmi_risk_tier'] = pd.qcut(bmi, 5, labels=bmi_tier_labels)
test_df['is_obese'] = (bmi >= 30).astype(int)

# --- Smoking indicator and interaction terms ---
test_df['smoker_numeric'] = is_smoker.astype(int)
smoker_flag = test_df['smoker_numeric']

test_df['age_bmi'] = age * bmi
test_df['age_smoker'] = age * smoker_flag
# bmi_smoker is computed here but is not listed in numerical_cols, so it is
# never passed to the preprocessor.
test_df['bmi_smoker'] = bmi * smoker_flag
test_df['age_bmi_smoker'] = age * bmi * smoker_flag

# --- Smoker risk flags (note: strict > 30 here, while is_obese uses >= 30) ---
test_df['is_high_risk'] = (is_smoker & (bmi > 30)).astype(int)
test_df['is_young_smoker'] = (is_smoker & (age < 30)).astype(int)
test_df['is_old_smoker'] = (is_smoker & (age > 50)).astype(int)

# --- Children-based features ---
test_df['has_children'] = (children > 0).astype(int)
test_df['many_children'] = (children >= 3).astype(int)
test_df['age_per_child'] = age / (children + 1)  # +1 avoids division by zero
test_df['children_bmi'] = children * bmi

# Per-region average expense. These values come from the training set, since
# the target is unknown for the test set.
region_avg_expense = {
    'northwest': 12367.27247,
    'northeast': 13625.194749,
    'southwest': 12486.716527,
    'southeast': 14464.178021,
}
test_df['region_avg_expense'] = test_df['region'].map(region_avg_expense)

# One membership flag and one smoker-in-region flag per region.
for region in ['northwest', 'southwest', 'southeast', 'northeast']:
    in_region = test_df['region'] == region
    test_df[f'is_{region}'] = in_region.astype(int)
    test_df[f'smoker_{region}'] = (in_region & is_smoker).astype(int)

# --- Composite scores ---
test_df['age_premium_factor'] = 1 + (age / 50) ** 1.5
test_df['smoker_severity'] = smoker_flag * (1 + (age / 100))
smoker_multiplier = np.where(is_smoker, 1.5, 1)
test_df['family_risk'] = bmi * (1 + 0.2 * children) * smoker_multiplier

# Select the model inputs (numerical columns first, then categorical) and pass
# them through the loaded preprocessor. Only transform() is called here.
model_input_cols = numerical_cols + categorical_cols
X_test = test_df[model_input_cols]
X_test_processed = preprocessor.transform(X_test)

In [24]:
# Restore the two persisted regressors.
rf_model = joblib.load('random_forest_model.pkl')
xgb_model = joblib.load('tuned_xgboost_model.pkl')

# Score the preprocessed test matrix with each model (random forest first).
rf_predictions, xgb_predictions = (
    fitted.predict(X_test_processed) for fitted in (rf_model, xgb_model)
)

In [None]:
# Blend weights carried over from notebook 4: random forest, then XGBoost.
rf_w, xgb_w = 0.4943, 0.5057

In [None]:
# Candidate prediction vectors, keyed by strategy name.
submissions = {
    'rf': rf_predictions,
    'xgb': xgb_predictions,
    'avg_ensemble': (rf_predictions + xgb_predictions) / 2,
    'weighted_ensemble': rf_w * rf_predictions + xgb_w * xgb_predictions,
}

# Strategy whose predictions are written to disk.
chosen_model = 'weighted_ensemble'

# Reuse the dataset's own ids when it has an 'id' column; otherwise fall back
# to 1-based row numbers derived from the index.
if 'id' in test_df.columns:
    submission_ids = test_df['id']
else:
    submission_ids = test_df.index + 1

submission_df = pd.DataFrame({
    'id': submission_ids,
    'expenses_pred': submissions[chosen_model],
})

# Write to submission.csv, not hardy_predictions.csv: the notes at the end of
# this notebook say hardy_predictions.csv holds the original competition
# predictions, so re-running this cell must not overwrite that file.
submission_df.to_csv('submission.csv', index=False)

The submissions are not completely identical to my competition ones because I didn't set a random seed during the competition; I only added one in retrospect.

`hardy_predictions.csv` contains the competition predictions.

`submission.csv` is generated by this code (identical, except that it uses random seed 1004).