In [2]:
import pandas as pd
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder

In [3]:
nw_hosp_admissions = pd.read_csv('../data/nw_hosp/admissions.csv')
nw_icu_stays = pd.read_csv('../data/nw_icu/icustays.csv')
icu_ditems = pd.read_csv('../data/nw_icu/d_items.csv')
hosp_lab_items = pd.read_csv('../data/nw_hosp/d_labitems.csv')
chart_events = pd.read_csv('../data/nw_icu/chartevents.csv')
lab_events = pd.read_csv('../data/nw_hosp/labevents.csv')
prescriptions = pd.read_csv('../data/nw_hosp/prescriptions.csv')
procedure_events = pd.read_csv('../data/nw_icu/procedureevents.csv')

In [None]:
chart_events.head()

In [8]:
cols = ['admission_type', 'admission_location', 'discharge_location', 'insurance', 'language', 'marital_status','race']

#Diccionario de Encoders. Un encoder para cada columna que queremos codificar
d = defaultdict(LabelEncoder)

#Aplicamos el encoder a cada columna del tirón con apply
nw_hosp_admissions[cols] = nw_hosp_admissions[cols].apply(
    lambda x: d[x.name].fit_transform(x)
)

#Sacamos los mapeos de cada columna. Estamos volviendo a codificar, pero solo los valores únicos de cada columna, no toda la columna 
hosp_mappings = {col: dict(zip(d[col].classes_, d[col].transform(d[col].classes_))) 
            for col in cols}


# Para ver los mapeos de cada columna
for col, mapeo in hosp_mappings.items():
    print(f"\nMapeo para '{col}':")
    for valor, codigo in mapeo.items():
        print(f"  {valor} -> {codigo}")







Mapeo para 'admission_type':
  ELECTIVE -> 0
  ELECTIVE-ROUTINE -> 1
  EMERGENCY -> 2
  TRAUMA -> 3
  URGENT -> 4
  nan -> 5

Mapeo para 'admission_location':
  CARDIOLOGY -> 0
  EMERGENCY MEDICINE -> 1
  GASTROENTEROLOGY -> 2
  INTENSIVE CARE -> 3
  MEDICINE -> 4
  NEUROLOGICAL INTENSIVE CARE -> 5
  NEUROLOGY -> 6
  OBSTETRICS -> 7
  OBSTETRICS AND GYNECOLOGY -> 8
  ORTHOPAEDIC SURGERY -> 9
  PEDIATRIC INTENSIVE CARE -> 10
  PEDIATRICS -> 11
  RADIOLOGY -> 12
  RESEARCH -> 13
  SURGERY -> 14
  UNKNOWN -> 15
  nan -> 16

Mapeo para 'discharge_location':
  ACUTE CARE HOSPITAL -> 0
  ACUTE INPATIENT REHABILITATION -> 1
  ADMITTED TO L&D -> 2
  AGAINST MEDICAL ADVICE (AMA) OR ELOPEMENT -> 3
  CANCER CENTER OR CHILDREN'S HOSPITAL -> 4
  CRITICAL ACCESS HOSPITAL -> 5
  DESIGNATED DISASTER ALTERNATIVE CARE SITE -> 6
  ED DISMISSED-NEVER ARRIVED -> 7
  EXPIRED -> 8
  EXPIRED - HOSPICE -> 9
  GIFT OF HOPE / STILL A PATIENT -> 10
  GROUP HOME -> 11
  HOME OR SELF CARE -> 12
  HOME WITH EQUIPME

In [None]:
nw_hosp_admissions.head()


61843

In [3]:
cols = ['first_careunit', 'last_careunit']

#Diccionario de Encoders. Un encoder para cada columna que queremos codificar
d = defaultdict(LabelEncoder)

#Aplicamos el encoder a cada columna del tirón con apply
nw_icu_stays[cols] = nw_icu_stays[cols].apply(
    lambda x: d[x.name].fit_transform(x)
)

#Sacamos los mapeos de cada columna. Estamos volviendo a codificar, pero solo los valores únicos de cada columna, no toda la columna 
icu_mappings = {col: dict(zip(d[col].classes_, d[col].transform(d[col].classes_))) 
            for col in cols}


# Para ver los mapeos de cada columna
for col, mapeo in icu_mappings.items():
    print(f"\nMapeo para '{col}':")
    for valor, codigo in mapeo.items():
        print(f"  {valor} -> {codigo}")


Mapeo para 'first_careunit':
  BOARDERS -> 0
  CCU -> 1
  CICU -> 2
  CTICU -> 3
  CTU -> 4
  ICU -> 5
  ICU OVERFLOW -> 6
  IMCU -> 7
  INPATIENT -> 8
  MICU -> 9
  MTU -> 10
  NEUROSCIENCE INPATIENT -> 11
  NICU -> 12
  NSICU -> 13
  ORTHOPEDICS -> 14
  PICU -> 15
  RADIOLOGY -> 16
  SICU -> 17

Mapeo para 'last_careunit':
  BOARDERS -> 0
  CCU -> 1
  CICU -> 2
  CTICU -> 3
  CTU -> 4
  ICU -> 5
  ICU OVERFLOW -> 6
  IMCU -> 7
  INPATIENT -> 8
  MICU -> 9
  MTU -> 10
  NEUROSCIENCE INPATIENT -> 11
  NICU -> 12
  NSICU -> 13
  ORTHOPEDICS -> 14
  PICU -> 15
  SICU -> 16


In [4]:
nw_icu_stays.head()

Unnamed: 0,subject_id,hadm_id,stay_id,first_careunit,last_careunit,intime,outtime,los
0,30000238,44714428,59801852,3,3,2195-10-14 15:12:00,2195-10-16 14:32:00,1.972222
1,30000246,40653169,55154064,2,2,2104-05-05 18:23:00,2104-05-07 14:26:00,1.835417
2,30000521,46912110,58492193,17,16,2188-04-17 16:46:00,2188-04-18 18:16:00,1.0625
3,30000590,44608425,50707570,13,13,2177-12-09 12:01:00,2177-12-10 16:43:00,1.195833
4,30000828,40615900,57131111,9,1,2124-10-17 23:12:00,2124-10-18 11:57:00,0.53125


In [19]:
cols = ['label']

#Diccionario de Encoders. Un encoder para cada columna que queremos codificar
d = defaultdict(LabelEncoder)

#Aplicamos el encoder a cada columna del tirón con apply
icu_ditems[cols] = icu_ditems[cols].apply(
    lambda x: d[x.name].fit_transform(x)
)

#Sacamos los mapeos de cada columna. Estamos volviendo a codificar, pero solo los valores únicos de cada columna, no toda la columna 
ditems_mapping = {col: dict(zip(d[col].classes_, d[col].transform(d[col].classes_))) 
            for col in cols}


# Para ver los mapeos de cada columna
for col, mapeo in ditems_mapping.items():
    print(f"\nMapeo para '{col}':")
    for valor, codigo in mapeo.items():
        print(f"  {valor} -> {codigo}")


Mapeo para 'label':
  ACTIVITY / POSITIONING WITH MOBILITY -> 0
  ADMIT TO INPATIENT -> 1
  ADMIT TO INPATIENT - CENTRAL -> 2
  ADVANCE DIET AS TOLERATED -> 3
  AIRBORNE PRECAUTIONS -> 4
  AIRWAY CLEARANCE THERAPY -> 5
  AMBULATE PATIENT -> 6
  AMBULATORY OXIMETRY -> 7
  ANTICOAGULATION MANAGEMENT REFERRAL -> 8
  APPLY ICE TO AFFECTED AREA -> 9
  ARTERIAL LINE INSERTION -> 10
  ASPIRATION PRECAUTIONS -> 11
  AVS INSTRUCTIONS: SUGAMMADEX ADMINISTERED -> 12
  BEDREST -> 13
  BP DIASTOLIC -> 14
  BP SYSTOLIC -> 15
  BRONCHOSCOPY -> 16
  CAPNOGRAPHY -> 17
  CARDIAC MONITORING -> 18
  CARDIAC REHAB REFERRAL -> 19
  CARDIAC SURGERY REFERRAL -> 20
  CARDIOLOGY REFERRAL -> 21
  CARDIOVERSION -> 22
  CATH CASE REQUEST -> 23
  CATHETER CARE -> 24
  CENTRAL LINE -> 25
  CENTRAL LINE CARE & MAINTENANCE -> 26
  CHANGE TUBING -> 27
  CHEST TUBE REMOVAL -> 28
  CHEST TUBE TO CONTINUOUS SUCTION -> 29
  CHEST TUBE TO WATER SEAL -> 30
  CLINICAL MANAGEMENT OF PAIN, AGITATION, DELIRIUM IN THE MECHANICAL

In [20]:
icu_ditems.head()

Unnamed: 0,itemid,label,abbreviation,linksto,category,unitname,param_type,lownormalvalue,highnormalvalue
0,300001,270,BMI,chartevents,General,Kg/m^2,Numeric,,
1,320045,266,HR,chartevents,Routine Vital Signs,bpm,Numeric,,
2,320050,268,ABPs,chartevents,Routine Vital Signs,mmHg,Numeric,,
3,320050,269,ABPs,chartevents,Routine Vital Signs,mmHg,Numeric,,
4,320051,268,ABPd,chartevents,Routine Vital Signs,mmHg,Numeric,,


In [21]:
cols = ['label', 'fluid', 'category']

#Diccionario de Encoders. Un encoder para cada columna que queremos codificar
d = defaultdict(LabelEncoder)

#Aplicamos el encoder a cada columna del tirón con apply
hosp_lab_items[cols] = hosp_lab_items[cols].apply(
    lambda x: d[x.name].fit_transform(x)
)

#Sacamos los mapeos de cada columna. Estamos volviendo a codificar, pero solo los valores únicos de cada columna, no toda la columna 
lab_items_mapping = {col: dict(zip(d[col].classes_, d[col].transform(d[col].classes_))) 
            for col in cols}


# Para ver los mapeos de cada columna
for col, mapeo in lab_items_mapping.items():
    print(f"\nMapeo para '{col}':")
    for valor, codigo in mapeo.items():
        print(f"  {valor} -> {codigo}")


Mapeo para 'label':
  % Hemoglobin A1c -> 0
  25-OH Vitamin D -> 1
  Absolute Basophil Count -> 2
  Absolute Eosinophil Count -> 3
  Absolute Lymphocyte Count -> 4
  Absolute Monocyte Count -> 5
  Absolute Neutrophil Count -> 6
  Acetaminophen -> 7
  Alanine Aminotransferase (ALT) -> 8
  Albumin -> 9
  Albumin, Body Fluid -> 10
  Albumin, Urine -> 11
  Alkaline Phosphatase -> 12
  Alpha-Fetoprotein -> 13
  Ammonia -> 14
  Amorphous Crystals -> 15
  Amylase -> 16
  Amylase, Body Fluid -> 17
  Anion Gap -> 18
  Anti-SARS-CoV-2 IgG -> 19
  Anti-Thyroglobulin Antibodies -> 20
  Asparate Aminotransferase (AST) -> 21
  Bacteria -> 22
  Bands -> 23
  Basophils -> 24
  Beta Hydroxybutyrate -> 25
  Beta-2 Microglobulin -> 26
  Bilirubin, Direct -> 27
  Bilirubin, Total -> 28
  Bilirubin, Total, Ascites -> 29
  Bilirubin, Total, Body Fluid -> 30
  Bilirubin, Total, Pleural -> 31
  Blasts -> 32
  C-Reactive Protein -> 33
  C. diff PCR -> 34
  C3 -> 35
  C4 -> 36
  CA 19-9 -> 37
  CA-125 -> 38
  

In [22]:
chart_events.head()

Unnamed: 0,subject_id,hadm_id,stay_id,caregiver_id,charttime,storetime,itemid,value,valuenum,valueuom,warning
0,30000246,40653169,55154064,,2104-05-05 18:40:00,2104-05-05 18:47:00,320045,58.0,58.0,bpm,
1,30000246,40653169,55154064,,2104-05-05 18:40:00,2104-05-05 18:47:00,320179,134.0,134.0,mmHg,
2,30000246,40653169,55154064,,2104-05-05 18:40:00,2104-05-05 18:47:00,320180,81.0,81.0,mmHg,
3,30000246,40653169,55154064,,2104-05-05 18:40:00,2104-05-05 18:47:00,320210,13.0,13.0,insp/min,
4,30000246,40653169,55154064,,2104-05-05 18:40:00,2104-05-05 18:47:00,320277,96.0,96.0,%,


In [None]:
prescriptions.head()

In [23]:
distinct_patients_hosp = nw_hosp_admissions['subject_id'].nunique()
print(f"Number of distinct hospital patients: {distinct_patients_hosp}")

distinct_patients_icu = nw_icu_stays['subject_id'].nunique()
print(f"Number of distinct ICU patients: {distinct_patients_icu}")

common_patients = set(nw_hosp_admissions['subject_id']).intersection(set(nw_icu_stays['subject_id']))
print(f"Number of patients in both datasets: {len(common_patients)}")


Number of distinct hospital patients: 25923
Number of distinct ICU patients: 23204
Number of patients in both datasets: 23204


In [24]:

cols_hosp = ['subject_id', 'hadm_id']
hosp_admissions = nw_hosp_admissions.loc[:, cols_hosp]

hosp_admissions.head()

Unnamed: 0,subject_id,hadm_id
0,30000238,44714428
1,30000246,40653169
2,30000246,43974039
3,30000246,45869139
4,30000521,40796332


In [25]:
cols_icu = ['subject_id', 'stay_id']
icu_stays = nw_icu_stays.loc[:, cols_icu]
icu_stays.head()

Unnamed: 0,subject_id,stay_id
0,30000238,59801852
1,30000246,55154064
2,30000521,58492193
3,30000590,50707570
4,30000828,57131111


In [26]:
joined_data = pd.merge(hosp_admissions, icu_stays, on='subject_id', how='inner')
joined_data.head()

Unnamed: 0,subject_id,hadm_id,stay_id
0,30000238,44714428,59801852
1,30000246,40653169,55154064
2,30000246,43974039,55154064
3,30000246,45869139,55154064
4,30000521,40796332,58492193


In [27]:
number_of_rows = joined_data.shape[0]
print(f"Number of rows in the joined dataset: {number_of_rows}")

number_of_unique_patients = joined_data['subject_id'].nunique()
print(f"Number of unique patients in the joined dataset: {number_of_unique_patients}")


Number of rows in the joined dataset: 82116
Number of unique patients in the joined dataset: 23204


In [28]:
cols = joined_data.columns
hadm_col = 'hadm_id' if 'hadm_id' in cols else next((c for c in cols if 'hadm' in c), None)
stay_col = 'stay_id' if 'stay_id' in cols else next((c for c in cols if 'stay' in c or 'icustay' in c), None)

aggs = {}
if hadm_col:
    aggs['hospital_stay_count'] = (hadm_col, 'nunique')
if stay_col:
    aggs['icu_stay_count'] = (stay_col, 'nunique')

patient_counts = joined_data.groupby('subject_id').agg(**aggs).reset_index()

# asegurar columnas y tipos
if 'hospital_stay_count' not in patient_counts.columns:
    patient_counts['hospital_stay_count'] = 0
if 'icu_stay_count' not in patient_counts.columns:
    patient_counts['icu_stay_count'] = 0

patient_counts['hospital_stay_count'] = patient_counts['hospital_stay_count'].astype(int)
patient_counts['icu_stay_count'] = patient_counts['icu_stay_count'].astype(int)

patient_counts = patient_counts.sort_values(by=['hospital_stay_count', 'icu_stay_count'], ascending=False)


patient_counts.head()

Unnamed: 0,subject_id,hospital_stay_count,icu_stay_count
9687,34172231,47,1
12868,35568721,42,1
22630,39757072,41,2
7645,33282566,35,10
752,30311241,34,2


In [29]:
multiple_hosp_stays = patient_counts[patient_counts['hospital_stay_count'] > 1]
print(f"Number of patients with multiple hospital stays: {multiple_hosp_stays.shape[0]}")
multiple_hosp_stays.head()



Number of patients with multiple hospital stays: 12322


Unnamed: 0,subject_id,hospital_stay_count,icu_stay_count
9687,34172231,47,1
12868,35568721,42,1
22630,39757072,41,2
7645,33282566,35,10
752,30311241,34,2


In [30]:
# 597 pacientes con múltiples estancias en la UCI tienen una unica estancia hospitalaria
multiple_icu_stays = patient_counts[patient_counts['icu_stay_count'] > 1]
print(f"Number of patients with multiple ICU stays: {multiple_icu_stays.shape[0]}")
multiple_icu_stays.head()

Number of patients with multiple ICU stays: 3860


Unnamed: 0,subject_id,hospital_stay_count,icu_stay_count
22630,39757072,41,2
7645,33282566,35,10
752,30311241,34,2
22239,39585304,32,20
12589,35452644,32,3


In [31]:

one_hosp_stay_multiple_icu = multiple_icu_stays[multiple_icu_stays['hospital_stay_count'] == 1]
one_hosp_stay_multiple_icu.head()

Unnamed: 0,subject_id,hospital_stay_count,icu_stay_count
8618,33701011,1,5
18858,38127711,1,5
20346,38746252,1,5
1436,30615230,1,4
9282,33974685,1,4


In [32]:
#De los 597, un 36% tienen fecha de fallecimiento asociada a la estancia hospitalaria
hosp_reports = pd.merge(nw_hosp_admissions, one_hosp_stay_multiple_icu[['subject_id']], on='subject_id', how='inner')
hosp_reports.head()

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admit_provider_id,admission_location,discharge_location,insurance,language,marital_status,race,edregtime,edouttime,hospital_expire_flag
0,30010964,43002475,2130-02-14 06:45:00,2130-03-16 19:21:00,2130-03-16 18:30:00,2,,1,8,5,15,6,7,2130-02-14 01:03:00,2130-02-14 06:45:00,0
1,30043338,43548391,2175-07-11 03:00:00,2175-08-21 21:39:00,2175-08-21 17:33:00,4,,16,8,5,15,6,2,,,0
2,30081242,42067774,2175-05-29 20:56:00,2175-06-16 14:48:00,,4,,16,28,5,15,9,3,,,0
3,30086735,44565426,2185-03-06 13:39:00,2185-03-21 21:06:00,2185-04-01 19:29:00,2,,1,28,5,15,9,7,2185-03-06 11:01:00,2185-03-06 17:02:00,0
4,30088775,45068329,2122-01-11 14:39:00,2122-01-19 14:00:00,,4,,16,12,5,15,2,7,,,0
