In [40]:
import pandas as pd
import numpy as np
from pathlib import Path
from ptb_ltc.config.core import config
from ptb_ltc.processing.processing_utils import(
    feature_engineering_hh,
    feature_engineering_ind
)

### ETL for gold data

In [41]:
# Extract: read the silver-layer dataset from data/silver.csv
silver = pd.read_csv(Path("data") / "silver.csv")

In [42]:
# Feature engineering and selection for household-level variables.
# feature_engineering_hh is imported from ptb_ltc.processing.processing_utils;
# presumably it returns one row per head of household, including the
# "idhogar" key used for the later merge — TODO confirm against the helper.
heads = feature_engineering_hh(silver)
# Feature engineering and selection of individual-level variables.
# The aggregation further down drops "Id" and "Target" and groups by
# "idhogar", so the returned frame is expected to carry those columns.
ind = feature_engineering_ind(silver)

In [43]:
def range_(x):
    """Return the spread of ``x``: its largest value minus its smallest.

    ``x`` only needs to expose ``max()`` and ``min()`` (for example a pandas
    Series). The function is passed to ``.agg`` as a custom aggregation.
    """
    lowest, highest = x.min(), x.max()
    return highest - lowest

# Collapse the individual-level rows to one row per household id (idhogar),
# computing several summary statistics for every remaining column.
ind_to_drop = ["Id", "Target"]  # columns removed before aggregating

ind_agg = ind.drop(ind_to_drop, axis=1).groupby(by="idhogar").aggregate(
    ["min", "max", "sum", "count", "std", range_]
)

In [44]:
# NOTE(review): this repeats the definition from the previous cell verbatim;
# one of the two copies can be removed.
def range_(x):
    """Return the spread of ``x``: its largest value minus its smallest.

    ``x`` only needs to expose ``max()`` and ``min()`` (for example a pandas
    Series). The function is passed to ``.agg`` as a custom aggregation.
    """
    lowest, highest = x.min(), x.max()
    return highest - lowest

# Collapse the individual-level rows to one row per household id (idhogar),
# computing several summary statistics for every remaining column.
# NOTE(review): this recomputes exactly what the previous cell computed.
ind_to_drop = ["Id", "Target"]  # columns removed before aggregating
ind_agg = ind.drop(ind_to_drop, axis=1).groupby(by="idhogar").aggregate(
    ["min", "max", "sum", "count", "std", range_]
)

In [45]:
# Flatten the two-level (variable, statistic) column index into single
# "variable-statistic" labels.
# Each label is built from the column's own (variable, statistic) pair rather
# than from the product of ``columns.levels``: the levels only hold the unique
# labels of each level, and their order (or even their count) is not
# guaranteed to match the actual column order, which could attach names to
# the wrong data or fail with a length mismatch.
new_col = [f"{variable}-{stat}" for variable, stat in ind_agg.columns]
ind_agg.columns = new_col
        

In [46]:
# Aggregating individual-level variables to household-level: in order to incorporate 
# the individual data into the household data, we need to aggregate it for each household.
# The simplest way to do this is to groupby the family id idhogar and then agg the data.
# The overall strategy is to use a set of aggregation functions for each individual-level variable,
# and then use model-based feature selection methods to filter the most predictive features.

# NOTE(review): range_ is already defined in earlier cells; this copy
# shadows them with identical behaviour.
def range_(x):
    """Return the spread of ``x``: its largest value minus its smallest.

    ``x`` only needs to expose ``max()`` and ``min()`` (for example a pandas
    Series). The function is passed to ``.agg`` as a custom aggregation.
    """
    lowest, highest = x.min(), x.max()
    return highest - lowest

# Collapse the individual-level rows to one row per household id (idhogar),
# computing several summary statistics for every remaining column.
ind_to_drop = ["Id", "Target"]  # columns removed before aggregating

ind_agg = ind.drop(ind_to_drop, axis=1).groupby(by="idhogar").aggregate(
    ["min", "max", "sum", "count", "std", range_]
)

# Flatten the two-level (variable, statistic) column index into single
# "variable-statistic" labels.
# Each label is built from the column's own (variable, statistic) pair rather
# than from the product of ``columns.levels``: the levels only hold the unique
# labels of each level, and their order (or even their count) is not
# guaranteed to match the actual column order, which could attach names to
# the wrong data or fail with a length mismatch.
new_col = [f"{variable}-{stat}" for variable, stat in ind_agg.columns]
ind_agg.columns = new_col
        
# Select the aggregated features: keep only the columns listed in the project
# configuration (config.processing.SELECTED_IND_AGGREGATED_FEATURES).
# .copy() gives an independent frame rather than a slice of the full aggregate.
# NOTE(review): this overwrites ind_agg, so the selection cannot be re-run
# without recomputing the aggregation above.
ind_agg = ind_agg.loc[:, config.processing.SELECTED_IND_AGGREGATED_FEATURES].copy()


In [47]:
# Attach the per-household aggregates (ind_agg, one row per household) to the
# household-level frame (heads, whose rows are heads of household) using the
# household id; this yields the final interim dataset. The left join keeps
# every row of heads.
df_merged = heads.merge(ind_agg, how="left", on="idhogar")
# Normalise the column names: lower-case, with "-" replaced by "_"
df_merged.columns = df_merged.columns.str.lower().str.replace("-", "_")

In [48]:
# Post-processing, done as a single chain:
#   1. keep the configured model features plus the target column,
#   2. binarize the target (values of 2 or below become 1, everything else 0),
#   3. rename the columns to the names used in ptb_ltc.
gold = (
    df_merged.loc[:, config.model.SELECTED_FEATURES + ["target"]]
    .assign(target=lambda frame: np.where(frame["target"] <= 2, 1, 0))
    .rename(columns=config.model.SELECTED_FEATURES_RENAME_MAPPING)
)


In [49]:
# Load step of the ETL: write the gold dataset to data/gold.csv
# (the row index is not written)
gold.to_csv(Path("data") / "gold.csv", index=False)