In [1]:
import pandas as pd
import psycopg2
from sqlalchemy import create_engine

## LOAD - Operations

In [2]:
## Load Operations
# Read the full operations table from CSV.
operations_csv = '../_data/operation_pcd.csv'
operations_df = pd.read_csv(operations_csv)

# Print the per-column non-null counts and dtypes. This is the only mid-cell
# statement that produces output; a bare expression is shown only when it is
# the last line of the cell.
operations_df.info()

# Select the rows for one specific op_id and display them.
is_sample_op = operations_df['op_id'] == 494869962
test = operations_df[is_sample_op]
test

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128031 entries, 0 to 128030
Data columns (total 30 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   op_id              128031 non-null  int64  
 1   subject_id         128031 non-null  int64  
 2   hadm_id            128031 non-null  int64  
 3   opdate             128031 non-null  int64  
 4   age                128031 non-null  int64  
 5   sex                128031 non-null  object 
 6   weight             126611 non-null  float64
 7   height             127269 non-null  float64
 8   race               128031 non-null  object 
 9   asa                124636 non-null  float64
 10  emop               128031 non-null  int64  
 11  department         128031 non-null  object 
 12  antype             128031 non-null  object 
 13  icd10_pcs          128031 non-null  object 
 14  category_desc      128031 non-null  object 
 15  desc_short         128031 non-null  object 
 16  ca

Unnamed: 0,op_id,subject_id,hadm_id,opdate,age,sex,weight,height,race,asa,...,opend_time,admission_time,discharge_time,anstart_time,anend_time,cpbon_time,cpboff_time,icuin_time,icuout_time,inhosp_death_time
5213,494869962,121939743,273337113,27360,60,M,70.0,179.0,Asian,2.0,...,28660.0,0,136795,28540.0,28660.0,,,,,


## Vitals
### LOAD - Vitals

In [3]:
## Load VITALS
# Read the vitals table from CSV.
vitals_csv = '../_data/vitals_in_hospital_filter.csv'
vitals_df = pd.read_csv(vitals_csv)

# Summarise the vitals rows recorded for one specific op_id.
sample_op_mask = vitals_df['op_id'].eq(494869962)
test_vital = vitals_df.loc[sample_op_mask]
test_vital.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, 160453 to 160457
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   #              5 non-null      int64  
 1   op_id          5 non-null      int64  
 2   subject_id     5 non-null      int64  
 3   chart_time     5 non-null      int64  
 4   item_name      5 non-null      object 
 5   value          5 non-null      float64
 6   nearest_orout  5 non-null      int64  
dtypes: float64(1), int64(5), object(1)
memory usage: 320.0+ bytes


### Pivot - Vitals

In [4]:
# Reshape vitals from long to wide: one row per (op_id, subject_id, chart_time)
# and one column per item_name, filled from 'value'.
# aggfunc='mean' is pivot_table's default, spelled out here: several readings
# sharing the same key and item_name are averaged into one cell.
vitals_keys = ['op_id', 'subject_id', 'chart_time']
pivoted_vitals = vitals_df.pivot_table(
    index=vitals_keys,
    columns='item_name',
    values='value',
    aggfunc='mean',
)
pivoted_vitals.info()


<class 'pandas.core.frame.DataFrame'>
MultiIndex: 128491 entries, (400000455, 179458020, 2005) to (499999032, 136003154, 2175)
Data columns (total 20 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   art_dbp  36721 non-null   float64
 1   art_mbp  37555 non-null   float64
 2   art_sbp  36808 non-null   float64
 3   bis      1963 non-null    float64
 4   bt       27719 non-null   float64
 5   ci       53 non-null      float64
 6   cvp      2691 non-null    float64
 7   ffp      230 non-null     float64
 8   ftn      94 non-null      float64
 9   hr       112146 non-null  float64
 10  pap_dbp  322 non-null     float64
 11  pap_mbp  366 non-null     float64
 12  pap_sbp  325 non-null     float64
 13  pip      83815 non-null   float64
 14  pmean    79294 non-null   float64
 15  rbc      1011 non-null    float64
 16  rr       98032 non-null   float64
 17  spo2     116990 non-null  float64
 18  uo       4155 non-null    float64
 19  vt       85338 n

As a result of the pivot, the bolded columns below were converted into the index (`#` was not one of the index keys, so it is simply absent from the pivoted table):  
- 0   #              5 non-null      int64  
- 1   **op_id**          5 non-null      int64  
- 2   **subject_id**     5 non-null      int64  
- 3   **chart_time**     5 non-null      int64  

We need to reset the index so the table can be used normally again. 

In [5]:
# Confirm that the pivot moved the key columns into the index.
# (Not displayed: only the last expression of a cell is shown.)
pivoted_vitals.index

# Move op_id / subject_id / chart_time back into ordinary columns.
# Guarded and rebound (no inplace=True) so the cell is safe to re-run:
# calling reset_index() again on the already-flat frame would insert a
# spurious 'index' column.
if isinstance(pivoted_vitals.index, pd.MultiIndex):
    pivoted_vitals = pivoted_vitals.reset_index()

# Confirm again: this should now be a plain RangeIndex.
pivoted_vitals.index

RangeIndex(start=0, stop=128491, step=1)

In [6]:

## Test pivoted_vitals: check that only one record remains for this op_id
## (the pivot yields one row per op_id / subject_id / chart_time combination)
sample_rows = pivoted_vitals['op_id'].eq(494869962)
test_ppivot = pivoted_vitals.loc[sample_rows]
test_ppivot


item_name,op_id,subject_id,chart_time,art_dbp,art_mbp,art_sbp,bis,bt,ci,cvp,...,pap_dbp,pap_mbp,pap_sbp,pip,pmean,rbc,rr,spo2,uo,vt
121792,494869962,121939743,28655,,,,,,,,...,,,,26.0,,,22.5,100.0,,296.0


### Preliminary EDA - VITALS
1. Drop fields with high null count. 

In [None]:
# Vitals columns to discard because of their high null count.
vitals_todrop = ['bis','ci', 'ffp','ftn','pap_dbp','pap_mbp','pap_sbp','rbc','uo']

# Rebind instead of dropping in place, and ignore labels that are already gone,
# so re-running this cell does not raise KeyError.
# NOTE(review): errors='ignore' also hides a misspelled column name; check the
# info() output below to confirm the expected columns were removed.
pivoted_vitals = pivoted_vitals.drop(columns=vitals_todrop, errors='ignore')
pivoted_vitals.info()

## LABS
### Load - Labs

In [None]:
## Load LABS
# Read the labs table, then show its column labels and (rows, columns) shape.
labs_csv = '../_data/labs_in_hospital_filter.csv'
labs_df = pd.read_csv(labs_csv)
(labs_df.columns, labs_df.shape)

### Pivot - Labs

In [None]:
# Reshape labs from long to wide: one row per (subject_id, chart_time) and one
# column per item_name, filled from 'value'.
# aggfunc='mean' is pivot_table's default, spelled out here: several results
# sharing the same key and item_name are averaged into one cell.
labs_keys = ['subject_id', 'chart_time']
pivoted_labs = labs_df.pivot_table(
    index=labs_keys,
    columns='item_name',
    values='value',
    aggfunc='mean',
)
pivoted_labs.head(5)

### Preliminary EDA - Labs
1. Drop fields with high NA

In [None]:
# Print per-column non-null counts and dtypes for the pivoted labs, to see
# which lab fields have a high share of missing values.
pivoted_labs.info()

In [None]:
# Lab columns to discard because of their high NA count.
labs_todrop = ['crp','hba1c']

# Rebind instead of dropping in place, and ignore labels that are already gone,
# so re-running this cell does not raise KeyError.
# NOTE(review): errors='ignore' also hides a misspelled column name; check the
# info() output below to confirm the expected columns were removed.
pivoted_labs = pivoted_labs.drop(columns=labs_todrop, errors='ignore')
pivoted_labs.info()

In [None]:
## CHUNKING IS QUESTIONABLE - SKIP TO BELOW FOR loading the full csv
# Alternative loading path: open each CSV as a lazy chunk reader instead of
# reading it whole. No visible cell in this notebook consumes these readers.

chunk_size = 10000
# Extra strings to treat as missing values when parsing.
na_markers = ['NA', 'N/A', 'NaN']

# The dtype maps are not defined in any visible cell, so referencing them
# directly raises NameError on a fresh kernel. Use them when an earlier cell
# has defined them; otherwise fall back to None, which lets pandas infer dtypes.
dtype_dict_ops = globals().get('dtype_dict_ops')
dtype_dict_vital = globals().get('dtype_dict_vital')
dtype_dict_lab = globals().get('dtype_dict_lab')

# With chunksize set, read_csv returns an iterator of DataFrames of up to
# chunk_size rows each rather than a single DataFrame.
operations_df_chunk = pd.read_csv('../_data/operation_pcd.csv', dtype=dtype_dict_ops, chunksize=chunk_size, na_values=na_markers)
vitals_df_chunk = pd.read_csv('../_data/vitals_in_hospital_filter.csv', dtype=dtype_dict_vital, chunksize=chunk_size, na_values=na_markers)
labs_df_chunk = pd.read_csv('../_data/labs_in_hospital_filter.csv', dtype=dtype_dict_lab, chunksize=chunk_size, na_values=na_markers)


## MERGE 
### Operations_pcd with Vitals, then Labs.

In [None]:
# Attach the wide-format vitals to each operation.
# Both frames carry subject_id. Joining on op_id alone would leave it duplicated
# as subject_id_x / subject_id_y, and the later labs merge on 'subject_id' would
# then fail with KeyError. Joining on both keys keeps a single subject_id column.
# how='left' keeps every operation, including those without any vitals row.
# NOTE(review): assumes subject_id agrees between the two files for a given
# op_id; a mismatch would leave that operation's vitals unmatched.
operations_w_vitals_df = pd.merge(
    operations_df,
    pivoted_vitals,
    on=['op_id', 'subject_id'],
    how='left',
)
operations_w_vitals_df.info()

In [None]:
# Preview the first rows of the operations + vitals merge.
operations_w_vitals_df.head()

In [None]:
# Attach labs to every operation row for the same subject.
# pivoted_labs is indexed by (subject_id, chart_time). Merging on only one level
# of a MultiIndex drops the remaining level, which would silently lose the lab
# timestamp. Move the index into columns first, and give the lab timestamp its
# own name so it does not collide with the chart_time column from the vitals.
# NOTE(review): the join key is subject_id only, so each operation row is
# repeated for every lab chart_time of that subject, whichever operation the
# lab relates to. Confirm this fan-out is intended.
labs_for_merge = pivoted_labs.reset_index().rename(columns={'chart_time': 'lab_chart_time'})
operations_vitals_labs_df = pd.merge(operations_w_vitals_df, labs_for_merge, on='subject_id', how='left')

# info() prints its summary and returns None, so call it on its own and leave
# the shape as the cell's displayed value.
operations_vitals_labs_df.info()
operations_vitals_labs_df.shape


## Export to CSV

In [None]:
# Destination path for the merged operations + vitals + labs table.
output_csv_file = '../_data/operations_fulldata.csv'

# Write the merged frame to disk; index=False leaves the row index out of the file.
operations_vitals_labs_df.to_csv(path_or_buf=output_csv_file, index=False)

In [None]:
# Count the distinct op_id values in the operations + vitals merge, to confirm
# the left join kept every operation.
# The merged frame is operations_w_vitals_df; ops_vitals_merged_df is not
# defined in any visible cell and raised NameError.
distinct_count_w_details = operations_w_vitals_df['op_id'].nunique()
print(distinct_count_w_details)

At least there is nothing being dropped from the Operations table (see 128k above).   
I still doubt that the Vitals are coming over - especially given how many records there are. 

TESTING - Are records getting merged correctly? 

In [None]:

# Keep only the merged rows that actually received vitals.
# The merged frame is operations_w_vitals_df (ops_vitals_merged_df is not
# defined in any visible cell). After the pivot, item_name is no longer a
# column, because its values became the column headers. Filter on chart_time
# instead: it comes from the vitals side of the left join and is NaN for
# operations with no matching vitals row.
# NOTE(review): assumes the operations table has no chart_time column of its
# own; otherwise the merge would suffix the name. Confirm against its columns.
testing_df = operations_w_vitals_df[operations_w_vitals_df['chart_time'].notna()]
testing_df

In [None]:
# Number of rows in the filtered test frame.
record_count = len(testing_df)
print("Number of records with non-null 'item_name':", record_count)

In [None]:
# Collect the dtype of every column in the filtered test frame and print the
# listing as plain text.
data_types = testing_df.dtypes
print(data_types)