In [11]:
import pandas as pd
from pathlib import Path
from ptb_ltc.config.core import config
from ptb_ltc.processing.processing_utils import(
    obj_to_cont,
    data_preprocessing_silver
)

### ETL for silver data

In [12]:
# Extract
# Read the bronze-layer table from data/bronze.csv (path relative to the
# notebook's working directory).
bronze = pd.read_csv(Path("data") / "bronze.csv")

In [13]:
# Transform
# Build the silver table from bronze in a single chain: recode yes/no fields,
# impute values, cap rez_esc, flag what is still missing, and drop the squared
# columns. `bronze` itself is not modified by the chained calls below, so the
# cell can be re-run safely (assuming obj_to_cont returns a new frame -- TODO confirm).
silver = (
    bronze
    # Recode "yes"/"no" to 1/0 in dependency, edjefa and edjefe
    .pipe(
        lambda frame: obj_to_cont(
            df=frame,
            vars=["dependency", "edjefa", "edjefe"],
            mapping={"yes": 1, "no": 0},
        )
    )
    # The three imputations below are independent of one another, so they
    # share a single assign call.
    .assign(
        # v18q1 (number of tablets): missing values become 0
        v18q1=lambda frame: frame["v18q1"].fillna(0),
        # v2a1 (monthly rent payment): set to 0 on every row where
        # tipovivi1 == 1 (this overwrites any existing value on those rows)
        v2a1=lambda frame: frame["v2a1"].mask(frame["tipovivi1"] == 1, 0),
        # rez_esc: a missing value is set to 0 when the individual is older
        # than 19 or younger than 7. Per the variable description the maximum
        # value is 5, so anything above 5 is then capped at 5.
        rez_esc=lambda frame: (
            frame["rez_esc"]
            .mask(
                frame["rez_esc"].isnull()
                & ((frame["age"] < 7) | (frame["age"] > 19)),
                0,
            )
            .clip(upper=5)
        ),
    )
    # Missing-value flags for v2a1 and rez_esc, computed after the imputations
    # above so they mark only the values that are still missing. Kept as two
    # assign calls so the new columns are appended in this exact order.
    .assign(v2a1_missing=lambda frame: frame["v2a1"].isnull())
    .assign(rez_esc_missing=lambda frame: frame["rez_esc"].isnull())
    # Remove the squared variables listed in the project config
    .drop(columns=config.processing.SQUARED_VARS)
)
 

In [14]:
# Load
# Persist the silver table to data/silver.csv; the index is not written.
silver.to_csv(Path("data") / "silver.csv", index=False)