# Correct the input data

## 0. Libraries and global variables

In [1]:
import pandas as pd

In [2]:
# Selects the input file loaded in section 1: True -> "patient_imputed.pkl", False -> "patient.pkl"
imputed_dataframe = True # Numerical variables imputed or not

## 1. Load data

Load data from each hospital unit. The data is available in both CSV and pickle formats, with or without imputation for numerical variables. The imputed dataframe will be used in the analysis.

In [3]:
import os

# Pick the input pickle according to the `imputed_dataframe` flag
path = "../../0_get_data/data/out/" + (
    "patient_imputed.pkl" if imputed_dataframe else "patient.pkl"
)

# NOTE: unpickling can execute arbitrary code -- only load files that come from a trusted source.
# TODO: set the correct path
# Load the pickled dataframe and replace its stored index with a fresh 0..n-1 range
df = pd.read_pickle(path).reset_index(drop=True)

# DataFrame.info() prints its report itself and returns None,
# so wrapping it in print() would add a stray "None" line to the output
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49970 entries, 0 to 49969
Data columns (total 57 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   lab_potassium              49970 non-null  float64       
 1   lab_rbc                    49970 non-null  float64       
 2   lab_leukocyte              49970 non-null  float64       
 3   lab_crp                    49970 non-null  float64       
 4   lab_mcv                    49970 non-null  float64       
 5   lab_sodium                 49970 non-null  float64       
 6   lab_urea                   49970 non-null  float64       
 7   lab_platelet               49970 non-null  float64       
 8   lab_hct                    49970 non-null  float64       
 9   lab_hemoglobin             49970 non-null  float64       
 10  lab_neutrophil             49970 non-null  float64       
 11  lab_lymphocyte_percentage  49970 non-null  float64       
 12  lab_

## 2. Perform minor corrections to df

Rename some columns

In [4]:
# Mapper ("old_name": "new_name")
new_col_names = {"COD_FEC_FALLECIMIENTO": "death_datetime",
                 "NUHSA_ENCRIPTADO": "id",
                 "centro": "center",
                 "uci": "icu",
                 "provincia": "province",
                 "periodo_1": "wave_1",
                 "periodo_2": "wave_2",
                 "periodo_3": "wave_3",
                 "periodo_4": "wave_4",
                 "periodo_5": "wave_5",
                 "periodo_6": "wave_6",
                 "periodo_7": "wave_7"
                 }

# perform the rename
df.rename(columns=new_col_names, inplace=True)

The removal of waves 1 and 2 is justified by the unavailability of vaccines at that time. These initial waves are omitted to ensure accurate analysis and representation of the data.

In [5]:
# List of wave columns
wave_columns = ["wave_1", "wave_2"]

# Iterate over each wave column
for wave_column in wave_columns:
    print(f"Rows removed from {wave_column}: {df[df[wave_column] == 1].shape[0]}")
    df = df[df[wave_column] != 1]

# Drop the wave columns
df.drop(columns=wave_columns, inplace=True)

Rows removed from wave_1: 17
Rows removed from wave_2: 12481


Reorder the columns

In [6]:
# General (non-prefixed) variables, in the order they should appear first
general_var_order = [
    "id",
    "sex",
    "age",
    "center",
    "type_center",
    "num_shots",
    "icu",
    "province",
    "reinfected",
    "inpatient_days",
    "admission_datetime",
    "discharge_datetime",
    "hospital_outcome",
    "death_datetime",
    "delta_days_death",
]

# Prefixed variable families, each sorted alphabetically
waves_cols = sorted(col for col in df.columns if col.startswith('wave'))
pmhx_cols = sorted(col for col in df.columns if col.startswith('pmhx'))
lab_cols = sorted(col for col in df.columns if col.startswith('lab'))

# Define the desired order
desired_orders = general_var_order + waves_cols + pmhx_cols + lab_cols

# A reorder must not delete data: any column not covered above is kept at the end
# instead of being silently dropped by the selection below
already_placed = set(desired_orders)
desired_orders += [col for col in df.columns if col not in already_placed]

# Perform the order (raises KeyError if a general variable is missing from df)
df = df[desired_orders]

# info() prints by itself and returns None, so it is not wrapped in print()
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 37472 entries, 0 to 49968
Data columns (total 55 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   id                         37472 non-null  object        
 1   sex                        37472 non-null  int64         
 2   age                        37472 non-null  int64         
 3   center                     37472 non-null  object        
 4   type_center                37472 non-null  category      
 5   num_shots                  37472 non-null  int64         
 6   icu                        37472 non-null  int64         
 7   province                   37472 non-null  object        
 8   reinfected                 37472 non-null  category      
 9   inpatient_days             37472 non-null  int64         
 10  admission_datetime         37472 non-null  datetime64[ns]
 11  discharge_datetime         37472 non-null  datetime64[ns]
 12  hospital_

Correct dtypes

In [7]:
# Define the type of variables to change
mapper = {"sex": "category",
          "center": "category",
          "icu": "category",
          "province": "category",
          "hospital_outcome": "category",
          "delta_days_death": "Int64", #Int64 (accepts NA) not int64
          "wave_1": "category",
          "wave_2": "category",
          "wave_3": "category",
          "wave_4": "category",
          "wave_5": "category",
          "wave_6": "category",
          "wave_7": "category",
          "pmhx_activecancer": "category",
          "pmhx_asthma": "category",
          "pmhx_chf": "category",
          "pmhx_chronicliver": "category",
          "pmhx_ckd": "category",
          "pmhx_copd": "category",
          "pmhx_dementia": "category",
          "pmhx_diabetes": "category",
          "pmhx_hld": "category",
          "pmhx_htn": "category",
          "pmhx_ihd": "category",
          "pmhx_obesity": "category",
          "pmhx_stroke": "category",
          }

# Check the columns exists
filtered_mapper = {column: dtype for column, dtype in mapper.items() if column in df.columns}

#perform the change
df = df.astype(filtered_mapper)

In [8]:
# info() prints its report itself and returns None; print() around it would add a stray "None"
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 37472 entries, 0 to 49968
Data columns (total 55 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   id                         37472 non-null  object        
 1   sex                        37472 non-null  category      
 2   age                        37472 non-null  int64         
 3   center                     37472 non-null  category      
 4   type_center                37472 non-null  category      
 5   num_shots                  37472 non-null  int64         
 6   icu                        37472 non-null  category      
 7   province                   37472 non-null  category      
 8   reinfected                 37472 non-null  category      
 9   inpatient_days             37472 non-null  int64         
 10  admission_datetime         37472 non-null  datetime64[ns]
 11  discharge_datetime         37472 non-null  datetime64[ns]
 12  hospital_

After a preliminary analysis with clinical experts, the variable `reinfected` is discarded because it has a very small n and does not show a significant improvement. In addition, the variables `province` and `center` are removed because they do not provide any value to the survival analysis. They will be stored in a separate file to be presented as appendices.

In [9]:
# Columns set aside for the appendix; they are saved separately at the end of the notebook
annex_columns = ["province", "center"]
df_annexed = df.loc[:, annex_columns]

# Remove the discarded variable and the annexed ones from the analysis dataframe
df = df.drop(columns=["reinfected"] + annex_columns)

We add a categorical column that indicates whether the patient was vaccinated 14 days before the admission.

In [10]:
# 1 when the patient has at least one shot recorded, 0 otherwise, stored as a categorical.
# Vectorised comparison instead of a per-row Python lambda; the result is the same.
# NOTE(review): presumably `num_shots` only counts doses given >= 14 days before admission -- confirm upstream.
df["vaccinated"] = df["num_shots"].gt(0).astype("int64").astype("category")

In [11]:
# info() prints its report itself and returns None, so it is called directly;
# a bare print() provides the blank line that separates the two reports
df.info()
print()
df_annexed.info()

<class 'pandas.core.frame.DataFrame'>
Index: 37472 entries, 0 to 49968
Data columns (total 53 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   id                         37472 non-null  object        
 1   sex                        37472 non-null  category      
 2   age                        37472 non-null  int64         
 3   type_center                37472 non-null  category      
 4   num_shots                  37472 non-null  int64         
 5   icu                        37472 non-null  category      
 6   inpatient_days             37472 non-null  int64         
 7   admission_datetime         37472 non-null  datetime64[ns]
 8   discharge_datetime         37472 non-null  datetime64[ns]
 9   hospital_outcome           37472 non-null  category      
 10  death_datetime             9724 non-null   datetime64[ns]
 11  delta_days_death           9724 non-null   Int64         
 12  wave_3   

Let's reorder the variables.

In [12]:
# Requested column order: general variables, waves, medical history (pmhx), lab values
new_order = [
    'id',
    'sex',
    'age',
    'num_shots',
    "type_center",
    'vaccinated',
    'icu',
    'inpatient_days',
    'admission_datetime',
    'discharge_datetime',
    'hospital_outcome',
    'death_datetime',
    'delta_days_death',
    'wave_1',
    'wave_2',
    'wave_3',
    'wave_4',
    'wave_5',
    'wave_6',
    'wave_7',
    'pmhx_activecancer',
    'pmhx_asthma',
    'pmhx_chf',
    'pmhx_chronicliver',
    'pmhx_ckd',
    'pmhx_copd',
    'pmhx_dementia',
    'pmhx_diabetes',
    'pmhx_hld',
    'pmhx_htn',
    'pmhx_ihd',
    'pmhx_obesity',
    'pmhx_stroke',
    'lab_alt',
    'lab_ast',
    'lab_creatinine',
    'lab_crp',
    'lab_ddimer',
    'lab_glucose',
    'lab_hct',
    'lab_hemoglobin',
    'lab_inr',
    'lab_ldh',
    'lab_leukocyte',
    'lab_lymphocyte',
    'lab_lymphocyte_percentage',
    'lab_mch',
    'lab_mcv',
    'lab_neutrophil',
    'lab_neutrophil_percentage',
    'lab_platelet',
    'lab_potassium',
    'lab_rbc',
    'lab_sodium',
    'lab_urea',
]

# Listed columns that exist in df, in the requested order
# (names absent from df, such as the removed wave_1/wave_2, are skipped)
columns_to_reorder = [col for col in new_order if col in df.columns]

# A reorder must not delete data: columns missing from `new_order` are appended at the end
# instead of being silently dropped by the selection below
listed = set(new_order)
columns_to_reorder += [col for col in df.columns if col not in listed]

# Reorder the DataFrame columns
df = df[columns_to_reorder]

Change the levels of `type_center` to English

In [13]:
# Spanish level -> English level
type_center_EN = {
    'Hospital comarcal': 'County Hospital',
    'Hospital de especialidades': 'Specialized Hospital',
    'Hospital regional': 'Regional Hospital'
}

# Rename the categorical levels instead of mapping the values: Series.map(dict) turns every
# level that is missing from the dict into NaN, so an unexpected level -- or simply running
# this cell a second time, when the levels are already English -- would silently blank the column.
# rename_categories leaves levels that are not in the mapping untouched.
df['type_center'] = (
    df['type_center']
    .astype("category")  # no-op when the column is already categorical
    .cat.rename_categories(type_center_EN)
)


## 3. Save corrected dataframe

In [14]:
# to_pickle does not create missing parent folders, so make sure the output folder exists
os.makedirs("./data", exist_ok=True)

# The corrected dataframe keeps the file name of the input it was loaded from
save_path = "./data/" + os.path.basename(path)

df.to_pickle(save_path)

# Annexed `province`/`center` columns, saved separately
df_annexed.to_pickle('./data/geocount_df.pkl')