# Part02 Feature Engineering

In [24]:
# import modules

from typing import List

import pandas as pd
import joblib
import yaml
import numpy as np
from sklearn.decomposition import PCA

from src.commons.Utils import impute_scale_and_convert_to_numpy

In [25]:
# Load the interim training data: the one-hot-encoded feature table and the
# churn table stored alongside it.
feature_train: pd.DataFrame = pd.read_csv("../../dataset/01_interim/ohe_train.csv")
churn_train: pd.DataFrame = pd.read_csv("../../dataset/01_interim/churn_train.csv")

In [26]:
# Read MEAN_TOTAL_CHARGES from the project-level YAML config.
with open("../../config.yaml", "r") as file_stream:
    # safe_load is PyYAML's shorthand for yaml.load(..., Loader=yaml.SafeLoader)
    stream_loader = yaml.safe_load(file_stream)

# The parsed config is fully in memory, so the lookup can happen after the file closes.
mean_total_charges: float = stream_loader["MEAN_TOTAL_CHARGES"]

In [27]:
# Hand the training frames to the project helper, which returns the feature and
# churn arrays. Nulls in "TotalCharges" are to be filled with the configured mean;
# the helper's exact scaling behaviour and its use of `scaler_folder` live in
# src.commons.Utils (not shown in this notebook).
feature_np: np.ndarray
churn_np: np.ndarray

feature_np, churn_np = impute_scale_and_convert_to_numpy(
    ohe_df=feature_train,
    columns_with_nulls=["TotalCharges"],
    impute_val=[mean_total_charges],
    scaler_folder="../../models/scaler",
    churn_df=churn_train,
)

# Report both array shapes, separated by a dashed rule.
print(
    f"ohe_np shape {feature_np.shape}",
    "---" * 6,
    f"churn_np shape {churn_np.shape}",
    sep="\n",
)

ohe_np shape (5346, 46)
------------------
churn_np shape (5346, 1)


We have 46 features to predict the two categories of the `churn` target.

Furthermore, since we applied one-hot encoding when preparing the training dataset, the resulting matrix is quite sparse.

Hence, to avoid wasting resources on training with a sparse matrix, we first want to optimize the features.

To do so, we will convert the 46 features into a dense representation using Principal Component Analysis (PCA), whereby the 46 dimensions will be transformed into a lower-dimensional vector.

As a rule of thumb, the lower dimension is about the fourth root of the initial dimension.

Hence, a quick calculation: $46^{1/4} \approx 2.6$, which we round up to 3.

In [28]:
# Reduce `feature_np` to a 3-dimensional dense representation with PCA.
#
# `random_state` is pinned because PCA's default `svd_solver="auto"` may pick
# the randomized SVD solver; without a fixed seed the fitted components (and the
# pickled model) are not guaranteed to be identical across Restart & Run All.
N_PCA_COMPONENTS: int = 3   # fourth-root rule of thumb applied to the 46 input features
PCA_RANDOM_STATE: int = 42  # arbitrary fixed seed, for reproducibility only

pca: PCA = PCA(
    n_components=N_PCA_COMPONENTS,
    random_state=PCA_RANDOM_STATE
)

# Fit on the training features and project them in a single step.
feature_pca: np.ndarray = pca.fit_transform(feature_np)

# Persist the fitted PCA so the same projection can be reloaded later.
joblib.dump(
    value=pca, 
    filename="../../models/feature_pca.pkl"
)

print(f"feature_np shape {feature_np.shape}")
print("---"*6)
print(f"feature_pca shape {feature_pca.shape}")

feature_np shape (5346, 46)
------------------
feature_pca shape (5346, 3)
