In [1]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, OrdinalEncoder, StandardScaler
from dataclasses import dataclass, field, fields


@dataclass
class Features:
    """Column names grouped by feature type.

    Each group is a list of column names and defaults to an empty list.
    """

    numeric: list[str] = field(default_factory=list)
    categorical: list[str] = field(default_factory=list)
    ordinal: list[str] = field(default_factory=list)
    numeric_log: list[str] = field(default_factory=list)

    @property
    def names(self):
        """Return every column name as one flat list.

        Groups are visited in field-declaration order (numeric, categorical,
        ordinal, numeric_log); a group that is empty or otherwise falsy
        contributes nothing.
        """
        groups = (getattr(self, spec.name) for spec in fields(self))
        # `group or ()` keeps falsy groups (e.g. None) from being iterated.
        return [column for group in groups for column in (group or ())]


def get_feat_columns():
    """Return the `Features` grouping of the dataset's input columns.

    The groups are: plain numeric columns, numeric columns intended for a log
    transform, categorical columns, and ordinal columns.
    """
    return Features(
        numeric=[
            "Age",
            "Health Score",
            "Credit Score",
            "Insurance Duration",
            "Number of Dependents",
            "Vehicle Age",
        ],
        numeric_log=["Annual Income"],
        categorical=[
            "Gender",
            "Marital Status",
            "Education Level",
            "Occupation",
            "Location",
            "Policy Type",
            "Customer Feedback",
            "Smoking Status",
            "Exercise Frequency",
            "Property Type",
        ],
        ordinal=["Previous Claims"],
    )


In [20]:
# Column groups that decide which transformer each feature is routed to.
feat_cols = get_feat_columns()

# Plain numeric features: fill missing values with the column median, then standardize.
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)


def log1p(df: pd.DataFrame) -> pd.DataFrame:
    """Apply ``log(1 + x)`` element-wise to ``df`` via ``np.log1p``.

    NOTE(review): the pipeline built in this cell passes ``np.log1p`` to
    ``FunctionTransformer`` directly, so this wrapper appears unused here --
    confirm whether another cell relies on it.
    """
    transformed = np.log1p(df)
    return transformed


# Log-scaled numeric features: median-impute, apply log(1 + x), then standardize.
log_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("log", FunctionTransformer(np.log1p, validate=True, feature_names_out="one-to-one")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical features are only imputed, not one-hot encoded: the downstream
# XGBoost model is expected to handle them itself.
# NOTE(review): XGBoost's native categorical support needs `category` dtype and
# enable_categorical=True -- confirm that conversion happens before fitting.
cat_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
    ]
)

# Ordinal features: impute the most frequent value, then map values to integer
# codes; values unseen during fit are encoded as -1.
# NOTE(review): the recorded output shows codes such as 348080.0 for
# 'Previous Claims', which suggests a very large number of distinct values --
# confirm ordinal encoding is intended for this column.
ord_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ordinal", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
    ]
)

# Register a transformer only for groups that actually contain columns.
transformers = [
    (label, transformer, columns)
    for label, transformer, columns in (
        ("num", numeric_transformer, feat_cols.numeric),
        ("num_log", log_transformer, feat_cols.numeric_log),
        ("cat", cat_transformer, feat_cols.categorical),
        ("ord", ord_transformer, feat_cols.ordinal),
    )
    if columns
]

# Columns not claimed by any group are passed through untouched, and output
# names are kept without a transformer prefix.
preprocessor = ColumnTransformer(
    transformers=transformers,
    remainder="passthrough",
    verbose_feature_names_out=False,
)

pipeline = Pipeline(steps=[("preprocessor", preprocessor)])
# Have transform() return pandas DataFrames instead of bare arrays.
pipeline.set_output(transform="pandas")


In [21]:
# Load the prepared dataset from a Feather file; the path is relative to the
# notebook's working directory.
df = pd.read_feather("../data/prepared/prepared_imputed_data.feather")

In [22]:
# Fit the preprocessing pipeline on the full frame and list its output column names.
# NOTE(review): remainder="passthrough" forwards 'Policy Start Date' and
# 'Premium Amount' unchanged; if 'Premium Amount' is the prediction target,
# confirm it is split off before this output is fed to a model.
pipeline.fit(df).get_feature_names_out()

array(['Age', 'Health Score', 'Credit Score', 'Insurance Duration',
       'Number of Dependents', 'Vehicle Age', 'Annual Income', 'Gender',
       'Marital Status', 'Education Level', 'Occupation', 'Location',
       'Policy Type', 'Customer Feedback', 'Smoking Status',
       'Exercise Frequency', 'Property Type', 'Previous Claims',
       'Policy Start Date', 'Premium Amount'], dtype=object)

In [23]:
# Refit the pipeline and transform the same frame in one call to inspect the
# preprocessed result (the notebook display truncates the rows shown).
pipeline.fit_transform(df)

Unnamed: 0,Age,Health Score,Credit Score,Insurance Duration,Number of Dependents,Vehicle Age,Annual Income,Gender,Marital Status,Education Level,Occupation,Location,Policy Type,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Previous Claims,Policy Start Date,Premium Amount
372605,0.808151,-1.476464,-0.681503,-0.777935,-0.006681,1.459463,-0.872407,Female,Married,Master's,Employed,Rural,Basic,Average,No,Weekly,Apartment,0.0,20200210,2742.0
551204,1.403654,-0.082155,-1.794158,-0.777935,1.473681,-1.137413,0.609442,Female,Married,Bachelor's,Employed,Suburban,Comprehensive,Poor,No,Weekly,Apartment,0.0,20201016,1347.0
240320,1.254778,0.752953,0.013020,0.763890,-0.006681,1.632588,-1.512614,Female,Married,Bachelor's,Employed,Urban,Comprehensive,Poor,No,Monthly,House,348080.0,20210102,2196.0
1047361,-0.159542,0.058499,1.189457,-0.007023,-1.487043,1.286338,-0.499349,Male,Divorced,Bachelor's,Employed,Rural,Basic,Poor,Yes,Rarely,Condo,260033.0,20220511,684.0
555362,0.435961,0.096581,-1.942984,-0.392479,-0.006681,-0.964288,0.672599,Male,Divorced,PhD,Employed,Suburban,Comprehensive,Poor,Yes,Weekly,Apartment,260033.0,20200728,1714.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110268,-0.829484,-0.810613,-1.879202,-0.007023,1.473681,1.459463,-1.388292,Female,Single,High School,Employed,Suburban,Basic,Good,No,Rarely,Apartment,133240.0,20201008,538.0
259178,-0.457294,-0.138349,-1.808332,1.534802,-0.006681,1.113213,1.062392,Male,Divorced,Bachelor's,Employed,Urban,Comprehensive,Poor,No,Daily,House,0.0,20220115,819.0
131932,0.138209,-0.033678,1.813111,-0.777935,-1.487043,-0.271787,1.186012,Male,Married,Bachelor's,Employed,Urban,Premium,Average,Yes,Daily,Condo,0.0,20230930,403.0
671155,0.584837,1.360687,-1.057113,-1.548847,-1.487043,-1.310538,-0.823763,Male,Divorced,Bachelor's,Employed,Rural,Comprehensive,Good,Yes,Monthly,Condo,0.0,20191011,542.0
