In [1]:
import pandas as pd
import psycopg2
from sqlalchemy import create_engine

In [None]:
## Create Data Type Dictionary:

# Column -> dtype maps handed to pd.read_csv(dtype=...).
#
# Every column used to be declared as plain `int`. read_csv raises with that
# declaration as soon as a column holds text (item_name is e.g. 'spo2', 'hr'),
# a decimal (lab values such as 3.3 or 7.99) or a blank cell, so:
#   * text columns are declared `str`
#   * measurements are declared `float`
#   * whole-number columns that may be blank use the nullable 'Int64' dtype
# 'value' was also listed twice in dtype_dict_ops; it now appears once.
#
# NOTE(review): the operations CSV itself is not shown in this notebook, so the
# str / float / 'Int64' choice for its columns is inferred from the column
# names - confirm against the file and tighten where possible.
dtype_dict_ops = {
    # identifiers
    'subject_id': int,
    'hadm_id': int,
    # demographics / case description
    'opdate': 'Int64',
    'age': 'Int64',
    'sex': str,
    'weight': float,
    'height': float,
    'race': str,
    'asa': 'Int64',
    'emop': 'Int64',
    'department': str,
    'antype': str,
    'icd10_pcs': str,       # ICD-10-PCS codes are alphanumeric
    'category_desc': str,
    'desc_short': str,
    'category_id': 'Int64',
    # event times; presumably blank when the event did not happen - TODO confirm
    'orin_time': 'Int64',
    'orout_time': 'Int64',
    'opstart_time': 'Int64',
    'opend_time': 'Int64',
    'admission_time': 'Int64',
    'discharge_time': 'Int64',
    'anstart_time': 'Int64',
    'anend_time': 'Int64',
    'cpbon_time': 'Int64',
    'cpboff_time': 'Int64',
    'icuin_time': 'Int64',
    'icuout_time': 'Int64',
    'inhosp_death_time': 'Int64',
    # NOTE(review): these look like columns of an already-merged table rather
    # than of the raw operations file - verify they are needed here.
    'subject_id_y': 'Int64',
    'chart_time': 'Int64',
    'item_name': str,
    'value': float,
    'orout_time_y': 'Int64',
}

# Long-format vitals: one measurement per row.
dtype_dict_vital = {
    'op_id': int,
    'subject_id': int,
    'chart_time': int,
    'item_name': str,    # measurement name, e.g. 'spo2'
    'value': float,      # measured value, e.g. 100.0
}

# Long-format labs: one result per row.
dtype_dict_lab = {
    'subject_id': int,
    'chart_time': int,
    'item_name': str,    # lab name, e.g. 'glucose'
    'value': float,      # lab result, e.g. 3.3
}


In [None]:
## Load Operations

# Read the whole operations table into memory.
operations_df = pd.read_csv('../_data/operation_pcd.csv')

# info is a method: without the call parentheses the cell only echoed the
# bound-method repr. Calling it prints every column with its dtype and
# non-null count, which also covers the column listing.
operations_df.info()

# (rows, columns) shown as the cell output.
operations_df.shape


In [17]:
## Load VITALS
# Read the long-format vitals file in one go and preview the first rows.
vitals_path = '../_data/vitals_in_hospital_filter.csv'
vitals_df = pd.read_csv(vitals_path)
vitals_df.head()

Unnamed: 0,op_id,subject_id,chart_time,item_name,value
0,400000455,179458020,2005,spo2,100.0
1,400000455,179458020,2005,vt,344.0
2,400000455,179458020,2005,hr,50.0
3,400000455,179458020,2005,rr,12.0
4,400000455,179458020,2005,pmean,7.0


In [20]:
# Reshape long -> wide: one row per (op_id, subject_id, chart_time) and one
# column per item_name. pivot_table's default aggregation (mean) is applied
# when several values land in the same cell.
vitals_index_cols = ['op_id', 'subject_id', 'chart_time']
pivoted_vitals = vitals_df.pivot_table(
    index=vitals_index_cols,
    columns='item_name',
    values='value',
)
pivoted_vitals.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,item_name,art_dbp,art_mbp,art_sbp,bis,bt,ci,cvp,ffp,ftn,hr,pap_dbp,pap_mbp,pap_sbp,pip,pmean,rbc,rr,spo2,uo,vt
op_id,subject_id,chart_time,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
400000455,179458020,1990,,,,,,,,,,,,,,,7.0,,12.0,100.0,,344.0
400000455,179458020,1995,,,,,,,,,,48.0,,,,13.0,7.0,,12.0,100.0,,336.0
400000455,179458020,2000,,,,,,,,,,48.0,,,,13.0,7.0,,12.0,100.0,,336.0
400000455,179458020,2005,,,,,,,,,,50.0,,,,13.0,7.0,,12.0,100.0,,344.0
400000790,119508072,2210,,,,,,,,,,70.0,,,,,,,,99.0,,496.0


In [22]:
## Load LABS
# Read the long-format labs file in one go, then show its columns and size.
labs_path = '../_data/labs_in_hospital_filter.csv'
labs_df = pd.read_csv(labs_path)
labs_df.columns, labs_df.shape

(Index(['subject_id', 'chart_time', 'item_name', 'value'], dtype='object'),
 (1392383, 4))

In [23]:
# Reshape long -> wide: one row per (subject_id, chart_time) and one column per
# item_name. pivot_table's default aggregation (mean) is applied when several
# values land in the same cell.
labs_index_cols = ['subject_id', 'chart_time']
pivoted_labs = labs_df.pivot_table(
    index=labs_index_cols,
    columns='item_name',
    values='value',
)
pivoted_labs.head()

Unnamed: 0_level_0,item_name,alp,alt,ast,chloride,creatinine,crp,glucose,hb,hba1c,hco3,lymphocyte,platelet,potassium,sodium,total_bilirubin,wbc
subject_id,chart_time,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
100001820,20595,,,,,,,167.0,,,,,,,,,
100002094,3510,,,,,,,160.0,,,,,,3.3,144.0,,
100002094,3615,45.0,16.0,21.0,113.0,0.5,,,13.6,,24.4,5.1,184.0,3.3,145.0,1.0,9.1
100002094,3528960,,,,,,,100.0,,,,,,,,,
100002094,3528975,,,,,,,,13.0,,,,198.0,,,,7.99


Import and Merge with chunking. 


In [None]:
## CHUNKING IS QUESTIONABLE - SKIP TO BELOW FOR loading the full csv
import pandas as pd

# Number of rows each reader hands out per iteration.
chunk_size = 10000

# Options shared by the three chunked readers below.
chunked_read_options = {
    'chunksize': chunk_size,
    'na_values': ['NA', 'N/A', 'NaN'],
}

# With chunksize set, read_csv returns an iterator of DataFrame chunks rather
# than one DataFrame; each reader is used up as it is iterated.
operations_df_chunk = pd.read_csv('../_data/operation_pcd.csv', dtype=dtype_dict_ops, **chunked_read_options)
vitals_df_chunk = pd.read_csv('../_data/vitals_in_hospital_filter.csv', dtype=dtype_dict_vital, **chunked_read_options)
labs_df_chunk = pd.read_csv('../_data/labs_in_hospital_filter.csv', dtype=dtype_dict_lab, **chunked_read_options)


VITALS + OPS

In [None]:
## Join Vitals and OPS on OP_ID

# The earlier version zipped the operations reader with the vitals reader, so
# chunk i of one file was only ever compared with chunk i of the other, and the
# loop ended as soon as the shorter reader ran out. A vitals row whose
# operation sat in a different chunk could never match.
# Here the operations table is held in full (operations_df, loaded earlier in
# this notebook) and only the vitals file is streamed chunk by chunk.
ops_vitals_merged_chunks = []
matched_op_ids = set()
vitals_columns_stub = None  # zero-row slice of a vitals chunk, kept for its columns

for vitals_chunk in vitals_df_chunk:
    vitals_columns_stub = vitals_chunk.iloc[:0]
    # Inner join per chunk: only operations that have vitals in this chunk.
    merged_chunk = pd.merge(operations_df, vitals_chunk, on='op_id', how='inner')
    matched_op_ids.update(merged_chunk['op_id'].unique())
    ops_vitals_merged_chunks.append(merged_chunk)

# Keep how='left' semantics: operations without any vitals row still appear
# once, with empty vitals columns. Merging against the zero-row stub gives
# those rows the same (suffixed) column names as the matched chunks.
# Row order differs from a single left merge; the set of rows does not.
unmatched_ops = operations_df[~operations_df['op_id'].isin(matched_op_ids)]
if vitals_columns_stub is not None:
    unmatched_ops = pd.merge(unmatched_ops, vitals_columns_stub, on='op_id', how='left')
ops_vitals_merged_chunks.append(unmatched_ops)

In [None]:
# Stack the per-chunk merge results into one DataFrame; each piece keeps its
# own index labels, so labels can repeat in the result.
ops_vitals_merged_df = pd.concat(ops_vitals_merged_chunks)

In [None]:
# Column names and (rows, columns) of the merged operations + vitals frame.
ops_vitals_merged_df.columns, ops_vitals_merged_df.shape

In [None]:
# Destination of the merged operations + vitals export.
output_csv_file = '../_data/operation_vitals.csv'

# Write the merged frame without its index column.
ops_vitals_merged_df.to_csv(path_or_buf=output_csv_file, index=False)

In [None]:
# How many distinct operations are present in the merged frame.
merged_op_ids = ops_vitals_merged_df['op_id']
distinct_count_w_details = merged_op_ids.nunique()
print(distinct_count_w_details)

At least nothing is being dropped from the Operations table (see the 128k count above).   
It is still doubtful that the Vitals rows are coming over, especially given how many records there are.

TESTING - Are records getting merged correctly? 

In [None]:
# Re-load the exported operations + vitals CSV from disk.
testing_df= pd.read_csv('../_data/operation_vitals.csv')


In [None]:

# Keep only the rows that carry a vitals measurement. The filter is applied to
# testing_df (the frame re-loaded from the exported CSV) rather than to the
# in-memory merge result: previously the re-loaded frame was overwritten here
# without being used, so the exported file was never actually checked.
testing_df = testing_df[testing_df['item_name'].notna()]
testing_df

In [None]:
# Row count of the filtered frame, i.e. rows that have an item_name.
record_count = len(testing_df)
print("Number of records with non-null 'item_name':", record_count)

In [None]:
# Spot-check one operation: compare the two orout_time columns side by side.
is_sample_op = testing_df['op_id'] == 400000455
sample = testing_df[is_sample_op]
orout_columns = ['orout_time_y', 'orout_time_x']
bleh = sample[orout_columns]
bleh

In [None]:
# Per-column dtypes of the filtered test frame, printed as plain text.
data_types = testing_df.dtypes
print(data_types)

LABS + OPS

In [None]:
## Join LABS and OPS on SUBJECT_ID

# The earlier version zipped operations_df_chunk with the labs reader. That
# pairs chunk i of one file only with chunk i of the other, and
# operations_df_chunk is a single-pass reader that the vitals join may already
# have drained - a drained reader makes zip() yield nothing, which leaves the
# list below empty.
# Here the operations table is held in full (operations_df, loaded earlier in
# this notebook) and only the labs file is streamed chunk by chunk.
ops_labs_merged_chunks = []
matched_subject_ids = set()
labs_columns_stub = None  # zero-row slice of a labs chunk, kept for its columns

for lab_chunk in labs_df_chunk:
    labs_columns_stub = lab_chunk.iloc[:0]
    # Inner join per chunk: only operations whose subject has labs in this chunk.
    merged_chunk = pd.merge(operations_df, lab_chunk, on='subject_id', how='inner')
    matched_subject_ids.update(merged_chunk['subject_id'].unique())
    ops_labs_merged_chunks.append(merged_chunk)

# Keep how='left' semantics: operations whose subject has no lab row at all
# still appear once, with empty lab columns. Merging against the zero-row stub
# gives those rows the same (suffixed) column names as the matched chunks.
# Row order differs from a single left merge; the set of rows does not.
unmatched_ops = operations_df[~operations_df['subject_id'].isin(matched_subject_ids)]
if labs_columns_stub is not None:
    unmatched_ops = pd.merge(unmatched_ops, labs_columns_stub, on='subject_id', how='left')
ops_labs_merged_chunks.append(unmatched_ops)


In [None]:
# Stack the per-chunk merge results into one DataFrame; each piece keeps its
# own index labels, so labels can repeat in the result.
ops_labs_merged_df = pd.concat(ops_labs_merged_chunks)


In [None]:
# (rows, columns) and column names of the merged operations + labs frame.
ops_labs_merged_df.shape, ops_labs_merged_df.columns



In [None]:
# Destination of the merged operations + labs export.
output_csv_file = '../_data/operation_labs.csv'

# Write the merged frame without its index column.
ops_labs_merged_df.to_csv(path_or_buf=output_csv_file, index=False)

TESTING


In [None]:
# Rows of the merged frame that carry a lab result (item_name present).
has_lab_item = ops_labs_merged_df['item_name'].notna()
filter_test_lab = ops_labs_merged_df[has_lab_item]
filter_test_lab

In [None]:
# Row count of the filtered frame, i.e. rows that have an item_name.
record_count_lab = len(filter_test_lab)
print("Number of records with non-null 'item_name':", record_count_lab)