# Correct the input data

## 0. Libraries and global variables

In [1]:
import pandas as pd
import numpy as np
import datetime

In [2]:
imputed_dataframe = True # Numerical variables imputed or not

## 1. Load data

Load data from each hospital unit. The data is available in both CSV and pickle formats, with or without imputation for numerical variables. Imputed dataframe will be used in the analysis

In [3]:
import os
# Get the path of imputed/not imputed data
path = "../../0_get_data/data/out/"
if imputed_dataframe:
    path += "patient_imputed.pkl"
else:
    path += "patient.pkl"
print(os.listdir())
# Load the pickle dataframe and reset index
df = pd.read_pickle(path).reset_index(drop=True) #TODO PONER PATH CORRECTO

print(df.info())

['01_correct_input_df-checkpoint.ipynb']


FileNotFoundError: [Errno 2] No such file or directory: '../../0_get_data/data/out/patient_imputed.pkl'

## 2. Perform minor correctios to df

Rename some columns

In [None]:
# Mapper ("old_name": "new_name")
new_col_names = {"COD_FEC_FALLECIMIENTO": "death_datetime",
                 "NUHSA_ENCRIPTADO": "id",
                 "centro": "center",
                 "uci": "icu",
                 "provincia": "province",
                 "periodo_1": "wave_1",
                 "periodo_2": "wave_2",
                 "periodo_3": "wave_3",
                 "periodo_4": "wave_4",
                 "periodo_5": "wave_5",
                 "periodo_6": "wave_6",
                 "periodo_7": "wave_7"
                 }

# perform the rename
df.rename(columns=new_col_names, inplace=True)

The first record in which the admitted patient is vaccinated is the one indicated in the output of the next cell, records from pandemic periods in which there were no vaccinations available will be deleted.

In [None]:
first_vacc = pd.read_pickle("../../0_get_data/data/vacunas.pkl")["FEC_VACUNACION"].sort_values().reset_index().iloc[0]["FEC_VACUNACION"]
print("first vaccunation in the dataset:", first_vacc.strftime('%d/%m/%Y'))

waves_dates = pd.read_csv("../../0_get_data/data/05_Periodos_pandemicos.csv", sep=";")
waves_dates["fecha_inicio"] = pd.to_datetime(waves_dates["fecha_inicio"])
waves_dates["fecha_fin"] = pd.to_datetime(waves_dates["fecha_fin"])
# Create a dictionary to map periods to waves
period_to_wave = {
    "Periodo 1": "wave_1",
    "Periodo 2": "wave_2",
    "Periodo 3": "wave_3",
    "Periodo 4": "wave_4",
    "Periodo 5": "wave_5",
    "Periodo 6": "wave_6",
    "Periodo 7": "wave_7"
}

# Apply the dictionary to the 'nombre_periodo' column
waves_dates['nombre_periodo'] = waves_dates['nombre_periodo'].apply(lambda period: period_to_wave.get(period))

waves_dates['vacc_available'] = waves_dates["fecha_fin"] >= first_vacc

waves_drop = waves_dates[waves_dates["vacc_available"]==False]["nombre_periodo"].to_list()

In [None]:
for wave_i in waves_drop:
    index_to_drop = df[df[wave_i]==1].index
    print(f"Rows removed from {wave_i}: {len(index_to_drop)}")
    df.drop(index_to_drop, inplace=True)
    df.drop(columns=wave_i, inplace=True)

Reorder the columns

In [None]:
general_var_order = ["id",
                     "sex",
                     "age",
                     "center",
                     "type_center",
                     "num_shots", 
                     "icu", 
                     "province", 
                     "reinfected",
                     "inpatient_days",                 
                     "admission_datetime",
                     "discharge_datetime", 
                     "hospital_outcome",
                     "death_datetime",
                     "delta_days_death"
                     ]

waves_cols =  sorted([col for col in df.columns if col.startswith('wave')])
pmhx_cols = sorted([col for col in df.columns if col.startswith('pmhx')])
lab_cols = sorted([col for col in df.columns if col.startswith('lab')])

# Define the desired order
desired_orders = general_var_order + waves_cols + pmhx_cols + lab_cols

# Perform the order
df = df[desired_orders]
print(df.info())

Correct dtypes

In [None]:
# Define the type of variables to change
mapper = {"sex": "category",
          "center": "category",
          "icu": "category",
          "province": "category",
          "hospital_outcome": "category",
          "delta_days_death": "Int64", #Int64 (accepts NA) not int64
          #"wave_1": "category", Removed no vacc available
          #"wave_2": "category", Removed no vacc available
          "wave_3": "category",
          "wave_4": "category",
          "wave_5": "category",
          "wave_6": "category",
          "wave_7": "category",
          "pmhx_activecancer": "category",
          "pmhx_asthma": "category",
          "pmhx_chf": "category",
          "pmhx_chronicliver": "category",
          "pmhx_ckd": "category",
          "pmhx_copd": "category",
          "pmhx_dementia": "category",
          "pmhx_diabetes": "category",
          "pmhx_hld": "category",
          "pmhx_htn": "category",
          "pmhx_ihd": "category",
          "pmhx_obesity": "category",
          "pmhx_stroke": "category",
          }

#perform the change
df = df.astype(mapper)

In [None]:
df.info()

After a preliminary analysis with clinical experts, the variables `reinfected` are discarded because they have a very small n and do not show a significant improvement. In addition, the variables `province` and `center` are eliminated because they do not provide any value to the survival analysis. They will be stored in separate files to be presented as appendices.

In [None]:
df.drop(columns=["reinfected"], inplace=True)
df_annexed = df.loc[:,["province", "center"]]
df.drop(columns=["province", "center"], inplace=True)

We add a categorical column that indicate if the patient is vaccinated 14 days before the admission.

In [None]:
df["vaccinated"] = df["num_shots"].apply(lambda x: 1 if x > 0 else 0).astype("category")

In [None]:

print(df.info(), end='\n\n')
print(df_annexed.info())

Let's reorder the variables.

In [None]:
new_order = ['id', 'sex', 'age', 'num_shots', "type_center", 'vaccinated', 'icu', 'inpatient_days',
            'admission_datetime', 'discharge_datetime', 'hospital_outcome',
            'death_datetime', 'delta_days_death', #'wave_1', 'wave_2', removed no vacc available
            'wave_3','wave_4', 'wave_5', 'wave_6', 'wave_7', 'pmhx_activecancer','pmhx_asthma',
            'pmhx_chf', 'pmhx_chronicliver', 'pmhx_ckd', 'pmhx_copd','pmhx_dementia', 
            'pmhx_diabetes', 'pmhx_hld', 'pmhx_htn', 'pmhx_ihd','pmhx_obesity', 
            'pmhx_stroke', 'lab_alt', 'lab_ast', 'lab_creatinine','lab_crp', 
            'lab_ddimer', 'lab_glucose', 'lab_hct', 'lab_hemoglobin','lab_inr', 
            'lab_ldh', 'lab_leukocyte', 'lab_lymphocyte','lab_lymphocyte_percentage', 
            'lab_mch', 'lab_mcv', 'lab_neutrophil','lab_neutrophil_percentage', 
            'lab_platelet', 'lab_potassium', 'lab_rbc','lab_sodium', 'lab_urea']

df = df[new_order]

## 3. Save corrected dataframe

In [None]:
save_path = "./data/" + os.path.basename(path)

df.to_pickle(save_path)
df_annexed.to_pickle('./data/geocount_df.pkl')