In [1]:
import pandas as pd
import numpy as np

## Data preprocessing

In [None]:
def _label_known(values, condition, if_true, if_false):
    """Label each row by `condition`, keeping rows whose `values` are missing as NaN.

    A bare ``np.where(condition, a, b)`` pushes missing inputs into the
    else-branch, because any comparison against NaN evaluates to False.

    Parameters
    ----------
    values : pd.Series
        The column the condition was derived from; its NaNs mark unknown rows.
    condition : pd.Series of bool
        Row-wise test, aligned with `values`.
    if_true, if_false : str
        Labels for rows where the condition holds / does not hold.

    Returns
    -------
    pd.Series
        Object-dtype labels indexed like `values`, NaN where `values` is NaN.
    """
    labels = pd.Series(np.where(condition, if_true, if_false),
                       index=values.index, dtype=object)
    return labels.where(values.notna())


def preprocessing(sav_path='data/all.sav'):
    """Load the SPSS survey file and build the analysis-ready DataFrame.

    Steps: read the file, normalise 'NA'/blank strings to missing, rename the
    questionnaire columns, turn the masked codes 8888/999 into NaN, derive
    PFAS / PFAS_class / age_class / BMI, recode sex and htn_med, mean-impute
    height and weight, keep the selected columns and convert the categorical
    ones. Finally prints which of the returned columns still contain one of
    the masked codes 7777/8888/9999/999.

    Parameters
    ----------
    sav_path : str, default 'data/all.sav'
        Path of the SPSS file handed to ``pd.read_spss``.

    Returns
    -------
    pd.DataFrame
        One column per selected variable (unique column labels).

    Raises
    ------
    KeyError
        If an expected column is absent from the input file.
    """
    # NOTE(review): pd.read_spss converts value-labelled variables to
    # categoricals by default; the numeric comparisons below (== 1, < 20)
    # assume these columns arrive as plain numeric codes — confirm.
    all_read = pd.read_spss(sav_path)
    all_read = all_read.replace('NA', np.nan)
    all_read = all_read.replace(r'^\s*$', pd.NA, regex=True)

    all_data = all_read.rename(columns={
        'ques_50_01': 'height',
        'ques_50_02': 'weight',
        'ques_50_03': 'waist',
        'ques_51_01': 'sbp',
        'ques_51_02': 'dbp',
        'ques_51_02_01': 'htn_med',
        'ques_08': 'fridge_case',
        'ques_08_01': 'freezer_case',
        'ques_08_04': 'fryingpan',
        'ques_08_05': 'coatingpot',
        'ques_08_06': 'coatingcase',
        'ques_08_07': 'ricepot',
        'ques_28_01': 'water',
        'ques_17': 'smoking',
        'ques_28_02': 'water_type',
        'ques_29_01': 'cupnoodle',
        'ques_29_02': 'fast_food', # added
        'ques_29_03': 'microwave_food',
        'ques_29_04': 'popcorn',
        'ques_29_05': 'can_food',
        'ques_31_01': 'large_fish',
        'ques_31_02': 'fish',
        'ques_31_03': 'crustacean',
        'ques_31_05': 'shellfish',
        'ques_35_01': 'wrapped_food', # added
        'ques_35_02': 'paper_cup', # added
        'ques_35_03': 'pet_drink', # added
        'gender': 'sex',
    })

    # Mask the sentinel codes BEFORE deriving anything from the raw values;
    # otherwise a 999/8888 leaks into the PFAS sum, BMI, the class thresholds
    # and the means used for imputation below.
    all_data = all_data.replace([8888, 999], np.nan)

    all_data['PFAS'] = all_data['PFOA'] + all_data['PFOS'] + all_data['PFHxS'] + all_data['PFNA'] + all_data['PFDeA']
    all_data['PFAS_class'] = _label_known(all_data['PFAS'], all_data['PFAS'] < 20, "none", "expose")
    all_data['age_class'] = _label_known(all_data['age'], all_data['age'] < 20, "mh", "adult")
    # BMI is computed from the measured values only, i.e. before imputation,
    # so rows with unknown height or weight keep a missing BMI.
    all_data['BMI'] = all_data['weight'] / (all_data['height'] / 100)**2
    all_data['sex'] = _label_known(all_data['sex'], all_data['sex'] == 1, "Male", "Female")
    # NOTE(review): a missing htn_med answer now stays missing instead of
    # being recorded as "yes" — confirm this matches the survey's skip logic.
    all_data['htn_med'] = _label_known(all_data['htn_med'], all_data['htn_med'] == 1, "no", "yes")

    all_data['height'] = all_data['height'].fillna(all_data['height'].mean())
    all_data['weight'] = all_data['weight'].fillna(all_data['weight'].mean())

    # Select columns (each label listed once so the result has unique columns)
    selected_columns = [
        'sex', 'age', 'height', 'weight', 'waist',
        'sbp', 'dbp',
        'htn_med',
        'BPb', 'BHg', 'Uhg', 'MEHHP', 'MEOHP', 'MCPP', 'MEP', 'BPA', 'MP', 'BP_3', 'COT', 'tt_MA', 'PBA',
        'U_crea', 'S_crea',
        'WBC', 'EOS', 'RBC', 'PLT', 'HB', 'HCT',
        'HbA1c', 'ALT', 'AST', 'GGT', 'CHOL', 'HDL', 'TG', 'T_lipid', 'SG', 'T_IgE',
        'fryingpan', 'coatingpot', 'coatingcase', 'ricepot', 'fridge_case', 'freezer_case',
        'water',
        'PFAS_class',
        'age_class',
        'smoking', 'cupnoodle', 'microwave_food', 'popcorn', 'can_food', 'large_fish', 'fish', 'crustacean', 'shellfish',
        'OHP', 'NAP', 'OHFlu', 'OHPhe', 'MnBP', 'MECPP', 'MBzP', 'BPF', 'BPS', 'TCS', 'EP', 'PP', 'BP', 'BMA', # added
        'fast_food', 'wrapped_food', 'paper_cup', 'pet_drink', # added
        'BMI', # added
    ]
    # Explicit copy: the column assignments below would otherwise be made on
    # a selection of the wider frame and raise SettingWithCopyWarning.
    all_data = all_data[selected_columns].copy()

    # Convert categorical variables
    water_map = {1: "tap", 2: "tap", 3: "filter", 4: "filter", 5: "bottled",
                 6: "spring", 7: "well", 8: "well", 9: "others"}
    all_data['water'] = all_data['water'].map(water_map).astype('category')

    # fridge_case & freezer_case share one code table
    container_map = {
        1: 'glass', 2: 'metal', 3: 'plastic',
        4: 'ziploc', 5: 'others', 6: 'others'
    }
    for column in ['fridge_case', 'freezer_case']:
        all_data[column] = all_data[column].map(container_map).astype('category')

    all_data['PFAS_class'] = pd.Categorical(all_data['PFAS_class'], categories=["none", "expose"])
    all_data['sex'] = pd.Categorical(all_data['sex'])
    all_data['htn_med'] = pd.Categorical(all_data['htn_med'], categories=['no', 'yes'], ordered=False)
    all_data['age_class'] = pd.Categorical(all_data['age_class'], categories=['mh', 'adult'], ordered=False)

    # Report any masked code that is still present in the returned columns
    # (7777 and 9999 are only reported, never replaced).
    target_values = [7777, 8888, 9999, 999]
    columns_with_any_value = all_data.columns[all_data.isin(target_values).any()].tolist()
    print(f"Columns containing masked values: {columns_with_any_value}")

    return all_data

# Run the preprocessing pipeline once; the call also prints which returned
# columns (if any) still contain one of the masked codes.
all_data = preprocessing()

Columns containing masked values: []


In [3]:
import pickle

# Persist the preprocessed frame in two formats: a pickle and a CSV.
pickle_path = 'data/all_data.pkl'
csv_path = 'data/all_data.csv'
all_data.to_pickle(pickle_path)
all_data.to_csv(csv_path)