In [1]:
import csv
import pandas as pd
import json
import os
import os.path as pathlib
import numpy as np
from copy import deepcopy
from glob import glob
from finalize_table_common_fun import table_one_hot_encoding
from sklearn.impute import SimpleImputer

In [2]:
def do_conversion(dataset_type='random', toproot_dir='/ssd/kangdang/mmc_project_2023_submission'):
    """Impute missing values in the train/val/test tables and save one-hot encoded CSVs.

    For each split (train, val, test) the function reads
    ``<split>_table_orig_<dataset_type>_12_vars_with_nan_intermediate_table_patient_level.csv``
    from ``<toproot_dir>/main_exp_code/step1_preprocessing_redo/preprocessed_data/<dataset_type>_processed_final_csv``,
    fills NaNs using statistics fitted on the training split only (column mean
    for the numerical columns, most frequent value for the categorical ones),
    passes the result through ``table_one_hot_encoding`` and writes it into the
    same directory with ``with_nan_intermediate_table`` replaced by
    ``fusion_simple_imputed`` in the file name.

    Args:
        dataset_type: Name of the dataset split scheme; used in both the
            directory name and the CSV file names.
        toproot_dir: Root directory of the project checkout. Defaults to the
            location the notebook was originally run from.

    Returns:
        None. The results are written to disk.

    Raises:
        ValueError: If a column to impute has no observed value in the training
            table, so no fill value can be computed for it.
    """
    root_dir = pathlib.join(toproot_dir, 'main_exp_code', 'step1_preprocessing_redo')
    output_dir = pathlib.join(root_dir, 'preprocessed_data', str(dataset_type) + '_processed_final_csv')
    # Inputs are read from this same directory; makedirs only guarantees that it exists.
    os.makedirs(output_dir, exist_ok=True)

    splits = ('train', 'val', 'test')
    name_prefix = '_table_orig_' + str(dataset_type) + '_12_vars_'
    name_suffix = '_patient_level.csv'

    def split_path(split, stage_tag):
        # Build the CSV path of one split for a given processing stage.
        return pathlib.join(output_dir, split + name_prefix + stage_tag + name_suffix)

    tables = {
        split: pd.read_csv(split_path(split, 'with_nan_intermediate_table'))
        for split in splits
    }

    # Variables described by the original (translated) note:
    #   sex, age, diastolic blood pressure, systolic blood pressure,
    #   body mass index (BMI), disease duration (months),
    #   diagnosed hypertension?, hyperlipidemia?, cardiovascular disease?,
    #   smoking category,
    #   drinking category [non-drinker, former drinker (light or heavy),
    #                      current drinker (light or heavy)]
    # NOTE(review): the exact mapping of these names onto param1..param12 is
    # not visible here -- confirm against the upstream preprocessing step.
    numerical_vars = ['param2', 'param3', 'param4', 'param5', 'param6', 'param7']
    categorical_vars = ['param1', 'param8', 'param9', 'param10', 'param11', 'param12']

    # Numerical columns are imputed first, then categorical ones.
    imputation_plan = (
        (numerical_vars, 'mean'),
        (categorical_vars, 'most_frequent'),
    )
    train_table = tables['train']
    for columns, strategy in imputation_plan:
        # SimpleImputer drops columns that are entirely missing at fit time,
        # which would otherwise surface as a confusing shape mismatch on assignment.
        empty_columns = [col for col in columns if train_table[col].isna().all()]
        if empty_columns:
            raise ValueError(
                'Cannot impute columns with no observed values in the training table: '
                + ', '.join(empty_columns)
            )
        imputer = SimpleImputer(missing_values=np.nan, strategy=strategy)
        # Fit on the training split only so val/test statistics never leak in.
        imputer.fit(train_table[columns])
        for table in tables.values():
            table[columns] = imputer.transform(table[columns])

    # Encode every split before writing anything, so a failure in the encoding
    # step does not leave a partial set of output files behind.
    encoded_tables = {split: table_one_hot_encoding(tables[split]) for split in splits}
    for split in splits:
        encoded_tables[split].to_csv(split_path(split, 'fusion_simple_imputed'), index=False)

In [3]:
# Run the imputation and one-hot encoding pipeline for the 'random' dataset.
do_conversion('random')