In [1]:
import pandas as pd
import psycopg2
from sqlalchemy import create_engine

In [None]:
## Create Data Type Dictionary (Not necessary - marked for potential deletion):

# Explicit dtypes to hand to pd.read_csv(dtype=...).
# The mappings follow the DataFrame.info() / head() output shown in this
# notebook: identifier and counter columns are int, text columns are str, and
# measurements that contain missing values are float (a plain int column
# cannot hold NaN, so typing them as int makes the read fail).
dtype_dict_ops = {
    'op_id': int,
    'subject_id': int,
    'hadm_id': int,
    'opdate': int,
    'age': int,
    'sex': str,
    'weight': float,
    'height': float,
    'race': str,
    'asa': float,
    'emop': int,
    'department': str,
    'antype': str,
    'icd10_pcs': str,
    'category_desc': str,
    'desc_short': str,
    # 'category_id' and the *_time columns are intentionally not listed: their
    # dtypes are not visible in the notebook output, so pandas infers them.
    # NOTE(review): add them here once their dtypes are confirmed.
    # The former merged-table keys (subject_id_y, chart_time, item_name, value,
    # orout_time_y) were removed: they are not columns of operation_pcd.csv,
    # and 'value' was listed twice.
}

# Long-format vitals: one reading per row.
dtype_dict_vital = {
    'op_id': int,
    'subject_id': int,
    'chart_time': int,
    'item_name': str,   # vital-sign name such as 'hr' or 'spo2'
    'value': float,     # readings are decimal numbers (e.g. 64.0)
}

# Long-format labs: one result per row.
dtype_dict_lab = {
    'subject_id': int,
    'chart_time': int,
    'item_name': str,   # lab name such as 'creatinine'
    'value': float,     # results are decimal numbers (e.g. 0.95)
}


## LOAD - Operations

In [16]:
## Load Operations
# Read the operations table, then show its column index, its (rows, columns)
# shape and the info() report in one go. info() prints its report and returns
# None, which is why the displayed tuple ends with None.
operations_df = pd.read_csv(filepath_or_buffer='../_data/operation_pcd.csv')
(operations_df.columns, operations_df.shape, operations_df.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128031 entries, 0 to 128030
Data columns (total 30 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   op_id              128031 non-null  int64  
 1   subject_id         128031 non-null  int64  
 2   hadm_id            128031 non-null  int64  
 3   opdate             128031 non-null  int64  
 4   age                128031 non-null  int64  
 5   sex                128031 non-null  object 
 6   weight             126611 non-null  float64
 7   height             127269 non-null  float64
 8   race               128031 non-null  object 
 9   asa                124636 non-null  float64
 10  emop               128031 non-null  int64  
 11  department         128031 non-null  object 
 12  antype             128031 non-null  object 
 13  icd10_pcs          128031 non-null  object 
 14  category_desc      128031 non-null  object 
 15  desc_short         128031 non-null  object 
 16  ca

(Index(['op_id', 'subject_id', 'hadm_id', 'opdate', 'age', 'sex', 'weight',
        'height', 'race', 'asa', 'emop', 'department', 'antype', 'icd10_pcs',
        'category_desc', 'desc_short', 'category_id', 'orin_time', 'orout_time',
        'opstart_time', 'opend_time', 'admission_time', 'discharge_time',
        'anstart_time', 'anend_time', 'cpbon_time', 'cpboff_time', 'icuin_time',
        'icuout_time', 'inhosp_death_time'],
       dtype='object'),
 (128031, 30),
 None)

## Vitals
### LOAD - Vitals

In [2]:
## Load VITALS
# Read the long-format vitals file (one reading per row: op_id, subject_id,
# chart_time, item_name, value, nearest_orout) and preview the first five rows.
vitals_df = pd.read_csv(filepath_or_buffer='../_data/vitals_in_hospital_filter.csv')
vitals_df.head(5)

Unnamed: 0,#,op_id,subject_id,chart_time,item_name,value,nearest_orout
0,1,456749370,100000842,1990,hr,64.0,1990
1,2,456749370,100000842,1990,pip,4.0,1990
2,3,456749370,100000842,1990,pmean,1.0,1990
3,4,456749370,100000842,1990,rr,9.0,1990
4,5,456749370,100000842,1990,spo2,100.0,1990


### Pivot - Vitals

In [25]:
# Reshape vitals from long to wide: one row per (op_id, subject_id, chart_time)
# and one column per vital sign named in `item_name`. pivot_table aggregates
# with its default (mean) when a key/item pair occurs more than once.
pivoted_vitals = vitals_df.pivot_table(
    values='value',
    index=['op_id', 'subject_id', 'chart_time'],
    columns='item_name',
)
pivoted_vitals.head()  # result is discarded: only the cell's last expression is displayed
pivoted_vitals.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 128491 entries, (400000455, 179458020, 2005) to (499999032, 136003154, 2175)
Data columns (total 20 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   art_dbp  36721 non-null   float64
 1   art_mbp  37555 non-null   float64
 2   art_sbp  36808 non-null   float64
 3   bis      1963 non-null    float64
 4   bt       27719 non-null   float64
 5   ci       53 non-null      float64
 6   cvp      2691 non-null    float64
 7   ffp      230 non-null     float64
 8   ftn      94 non-null      float64
 9   hr       112146 non-null  float64
 10  pap_dbp  322 non-null     float64
 11  pap_mbp  366 non-null     float64
 12  pap_sbp  325 non-null     float64
 13  pip      83815 non-null   float64
 14  pmean    79294 non-null   float64
 15  rbc      1011 non-null    float64
 16  rr       98032 non-null   float64
 17  spo2     116990 non-null  float64
 18  uo       4155 non-null    float64
 19  vt       85338 n

### Preliminary EDA - VITALS
1. Drop fields with high null count. 

In [13]:
# Vital-sign columns that are almost entirely null in the pivoted table.
# NOTE(review): `cvp` (2,691 non-null) is kept while `uo` (4,155 non-null) is
# dropped - confirm the selection is intentional.
vitals_todrop = ['bis','ci', 'ffp','ftn','pap_dbp','pap_mbp','pap_sbp','rbc','uo']

# Rebind instead of dropping in place, and ignore names that are already gone,
# so re-running this cell does not raise KeyError on the second execution.
pivoted_vitals = pivoted_vitals.drop(columns=vitals_todrop, errors='ignore')
pivoted_vitals.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 128491 entries, (400000455, 179458020, 2005) to (499999032, 136003154, 2175)
Data columns (total 11 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   art_dbp  36721 non-null   float64
 1   art_mbp  37555 non-null   float64
 2   art_sbp  36808 non-null   float64
 3   bt       27719 non-null   float64
 4   cvp      2691 non-null    float64
 5   hr       112146 non-null  float64
 6   pip      83815 non-null   float64
 7   pmean    79294 non-null   float64
 8   rr       98032 non-null   float64
 9   spo2     116990 non-null  float64
 10  vt       85338 non-null   float64
dtypes: float64(11)
memory usage: 18.0 MB


## LABS
### Load - Labs

In [6]:
## Load LABS
# Read the long-format labs file and show its column index and (rows, columns) shape.
labs_df = pd.read_csv(filepath_or_buffer='../_data/labs_in_hospital_filter.csv')
(labs_df.columns, labs_df.shape)

(Index(['#', 'subject_id', 'chart_time', 'item_name', 'value', 'nearest_orout'], dtype='object'),
 (629055, 6))

### Pivot - Labs

In [7]:
# Reshape labs from long to wide: one row per (subject_id, chart_time) and one
# column per lab named in `item_name`. pivot_table aggregates with its default
# (mean) when a key/item pair occurs more than once.
pivoted_labs = labs_df.pivot_table(
    values='value',
    index=['subject_id', 'chart_time'],
    columns='item_name',
)
pivoted_labs.head(5)

Unnamed: 0_level_0,item_name,alp,alt,ast,chloride,creatinine,crp,glucose,hb,hba1c,hco3,lymphocyte,platelet,potassium,sodium,total_bilirubin,wbc
subject_id,chart_time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
100001820,20595,,,,,,,167.0,,,,,,,,,
100002094,3615,45.0,16.0,21.0,113.0,0.5,,,13.6,,24.4,5.1,184.0,3.3,145.0,1.0,9.1
100002094,3528960,,,,,,,100.0,,,,,,,,,
100002234,2425,,,,,0.95,,,,,,,,,,,
100002413,1165,,,,,0.85,,,,,,,,,,,


### Preliminary EDA - Labs
1. Drop fields with high NA

In [8]:
# Print per-column non-null counts of the pivoted labs to spot sparsely populated labs.
pivoted_labs.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 95322 entries, (100001820, 20595) to (199999413, 3855)
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   alp              34769 non-null  float64
 1   alt              36429 non-null  float64
 2   ast              36448 non-null  float64
 3   chloride         44877 non-null  float64
 4   creatinine       40262 non-null  float64
 5   crp              9170 non-null   float64
 6   glucose          46707 non-null  float64
 7   hb               46898 non-null  float64
 8   hba1c            507 non-null    float64
 9   hco3             32364 non-null  float64
 10  lymphocyte       36306 non-null  float64
 11  platelet         44989 non-null  float64
 12  potassium        63067 non-null  float64
 13  sodium           62797 non-null  float64
 14  total_bilirubin  33308 non-null  float64
 15  wbc              44500 non-null  float64
dtypes: float64(16)
memory usage: 

In [14]:
# Lab columns with the fewest non-null values in the pivoted table
# (crp: 9,170 and hba1c: 507 out of 95,322 rows).
labs_todrop = ['crp','hba1c']

# Rebind instead of dropping in place, and ignore names that are already gone,
# so re-running this cell does not raise KeyError on the second execution.
pivoted_labs = pivoted_labs.drop(columns=labs_todrop, errors='ignore')
pivoted_labs.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 95322 entries, (100001820, 20595) to (199999413, 3855)
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   alp              34769 non-null  float64
 1   alt              36429 non-null  float64
 2   ast              36448 non-null  float64
 3   chloride         44877 non-null  float64
 4   creatinine       40262 non-null  float64
 5   glucose          46707 non-null  float64
 6   hb               46898 non-null  float64
 7   hco3             32364 non-null  float64
 8   lymphocyte       36306 non-null  float64
 9   platelet         44989 non-null  float64
 10  potassium        63067 non-null  float64
 11  sodium           62797 non-null  float64
 12  total_bilirubin  33308 non-null  float64
 13  wbc              44500 non-null  float64
dtypes: float64(14)
memory usage: 11.5 MB


In [None]:
## CHUNKING IS QUESTIONABLE - SKIP TO BELOW FOR loading the full csv
# pandas is already imported in the first cell, so it is not re-imported here.

# Number of rows per chunk.
chunk_size = 10000

# Options shared by all three readers. With `chunksize` set, pd.read_csv
# returns an iterator that yields DataFrames of up to `chunk_size` rows
# instead of a single DataFrame, so the *_chunk names below must be iterated
# (or concatenated) before they can be merged.
chunk_read_options = {
    'chunksize': chunk_size,
    'na_values': ['NA', 'N/A', 'NaN'],
}

operations_df_chunk = pd.read_csv('../_data/operation_pcd.csv', dtype=dtype_dict_ops, **chunk_read_options)
vitals_df_chunk = pd.read_csv('../_data/vitals_in_hospital_filter.csv', dtype=dtype_dict_vital, **chunk_read_options)
labs_df_chunk = pd.read_csv('../_data/labs_in_hospital_filter.csv', dtype=dtype_dict_lab, **chunk_read_options)


## MERGE 
### Operations_pcd with Vitals, then Labs.

In [18]:
# Left-join the wide vitals onto the operations table by `op_id`, which is a
# column of operations_df and an index level of pivoted_vitals.
# The row count grows from 128,031 to 128,494, so some operations match more
# than one vitals row.
# NOTE(review): the info() report shows 41 columns (30 operations + 11 vitals),
# i.e. the vitals `subject_id` / `chart_time` index levels are not present as
# columns afterwards - confirm that losing the vitals chart_time is intended.
operations_w_vitals_df = operations_df.merge(pivoted_vitals, how='left', on='op_id')
operations_w_vitals_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128494 entries, 0 to 128493
Data columns (total 41 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   op_id              128494 non-null  int64  
 1   subject_id         128494 non-null  int64  
 2   hadm_id            128494 non-null  int64  
 3   opdate             128494 non-null  int64  
 4   age                128494 non-null  int64  
 5   sex                128494 non-null  object 
 6   weight             127068 non-null  float64
 7   height             127726 non-null  float64
 8   race               128494 non-null  object 
 9   asa                124952 non-null  float64
 10  emop               128494 non-null  int64  
 11  department         128494 non-null  object 
 12  antype             128494 non-null  object 
 13  icd10_pcs          128494 non-null  object 
 14  category_desc      128494 non-null  object 
 15  desc_short         128494 non-null  object 
 16  ca

In [19]:
# Preview the first rows of the operations + vitals merge.
operations_w_vitals_df.head()

Unnamed: 0,op_id,subject_id,hadm_id,opdate,age,sex,weight,height,race,asa,...,art_mbp,art_sbp,bt,cvp,hr,pip,pmean,rr,spo2,vt
0,484069807,178742874,229842382,0,30,F,48.0,153.0,Asian,,...,,,36.4,,90.0,10.0,,28.0,,288.0
1,446270725,158995752,257857903,0,70,M,43.0,169.0,Asian,,...,60.0,78.0,24.6,,76.0,13.0,,14.0,100.0,480.0
2,478413008,133278262,277235295,0,35,F,54.0,,Asian,,...,,,,,,,,,100.0,
3,468516791,116924034,299190423,17280,45,F,47.0,152.0,Asian,1.0,...,,,,,94.0,11.0,,21.5,100.0,100.0
4,493866243,174229093,275813505,1440,50,F,76.0,160.0,Asian,2.0,...,,,,,68.0,,,19.0,100.0,


In [23]:
# Left-join the wide labs onto the operations + vitals table by `subject_id`,
# which is a column on the left and an index level of pivoted_labs.
# NOTE(review): the join key is the subject only, so every operation row is
# paired with every lab chart_time of that subject (rows grow from 128,494 to
# 200,593) - confirm labs should not be restricted to a time window around
# the operation.
operations_vitals_labs_df = operations_w_vitals_df.merge(pivoted_labs, how='left', on='subject_id')

# info() prints its report and returns None, hence the None in the displayed tuple.
(operations_vitals_labs_df.shape, operations_vitals_labs_df.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200593 entries, 0 to 200592
Data columns (total 55 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   op_id              200593 non-null  int64  
 1   subject_id         200593 non-null  int64  
 2   hadm_id            200593 non-null  int64  
 3   opdate             200593 non-null  int64  
 4   age                200593 non-null  int64  
 5   sex                200593 non-null  object 
 6   weight             198158 non-null  float64
 7   height             199124 non-null  float64
 8   race               200593 non-null  object 
 9   asa                194133 non-null  float64
 10  emop               200593 non-null  int64  
 11  department         200593 non-null  object 
 12  antype             200593 non-null  object 
 13  icd10_pcs          200593 non-null  object 
 14  category_desc      200593 non-null  object 
 15  desc_short         200593 non-null  object 
 16  ca

((200593, 55), None)

## Export to CSV

In [24]:
# Destination of the fully merged table (operations + vitals + labs).
output_csv_file = '../_data/operations_fulldata.csv'

# Write the merged frame as CSV, leaving out the positional row index.
operations_vitals_labs_df.to_csv(path_or_buf=output_csv_file, index=False)

In [None]:
# Count the distinct operations present after the operations -> vitals left
# merge, to check that no operation was lost.
# `ops_vitals_merged_df` is never defined in this notebook; the merged frame
# produced by the merge cell is `operations_w_vitals_df`.
distinct_count_w_details = operations_w_vitals_df['op_id'].nunique()
print(distinct_count_w_details)

At least nothing is being dropped from the Operations table (see the ~128k count above).  
I still doubt that the Vitals are coming over correctly - especially given how many records there are.

TESTING - Are records getting merged correctly? 

In [None]:

# Keep only merged rows that actually received vitals data.
# `ops_vitals_merged_df` is never defined in this notebook, and after the pivot
# there is no `item_name` column any more: each former item_name value is its
# own column. "item_name is not null" therefore becomes "at least one vitals
# column is not null" on the merged frame.
vitals_columns = [col for col in pivoted_vitals.columns if col in operations_w_vitals_df.columns]
testing_df = operations_w_vitals_df[operations_w_vitals_df[vitals_columns].notna().any(axis=1)]
testing_df

In [None]:
# Number of rows that passed the filter in `testing_df`.
record_count = len(testing_df)
print("Number of records with non-null 'item_name':", record_count)

In [None]:
# List the dtype of every column of the filtered frame.
data_types = testing_df.dtypes
print(data_types)