# Notebook For Compressing Healthcare Data

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StringType, StructType, StructField, FloatType, IntegerType
import numpy as np
from rapidfuzz import fuzz
import os  # import os module

# Create the notebook-wide SparkSession (getOrCreate hands back an existing one if present)
spark = (
    SparkSession.builder
    .master("local[*]")
    .enableHiveSupport()
    .appName("Compress Healthcare Data")
    .getOrCreate()
)

25/02/26 22:25:00 WARN Utils: Your hostname, Saganas-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.4.69 instead (on interface en0)
25/02/26 22:25:00 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/26 22:25:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [31]:
############## Helper Functions ################

def read_all_files_in_data_dir(directory):
    """
    Read every regular, non-hidden file in a directory into a pyspark dataframe

    Each file is read as CSV with its first row treated as the header, using
    the notebook-level `spark` session. Sub-directories and hidden files
    (names starting with '.', e.g. macOS '.DS_Store') are skipped.

    :param directory: path of the directory holding the data files; may be
        relative or absolute, nested, with or without a trailing separator
    :return dict -> key: file name up to its first '.' (e.g. 'patients' for
        'patients.csv'): value (pyspark dataframe storing that file's data).
        If two files map to the same key, the one read last wins.
    """
    dataframes = {}
    # The context manager closes the directory handle promptly; sorting by
    # name makes the read order (and the printed log) deterministic instead
    # of depending on filesystem ordering.
    with os.scandir(directory) as entries:
        data_files = sorted(
            (entry for entry in entries
             if entry.is_file() and not entry.name.startswith('.')),
            key=lambda entry: entry.name,
        )
    for entry in data_files:
        # Build the key from the file's own name rather than from a fixed
        # position in the path, so it does not depend on how deep
        # `directory` is or on which path separator the platform uses.
        name_of_file = entry.name.split('.')[0]
        file_path = entry.path
        print(f"Reading File Path: {file_path}")
        dataframes[name_of_file] = spark.read.option("header", True).csv(file_path)
    print(f"SUCCESS: Dataframes Read: {dataframes.keys()}")
    return dataframes

def print_schema_for_all_dataframes(dataframes_list, output_path='all_data_schemas.txt'):
    """
    Print Schema For All Dataframes in dataframe list and write out to file

    :param dataframes_list: dict -> key: dataset name, value: pyspark dataframe
        (any object exposing `.schema` and `.printSchema()`)
    :param output_path: text file the schemas are written to; it is
        overwritten on every call. Defaults to 'all_data_schemas.txt' in the
        current working directory.
    :return None
    """
    # Explicit UTF-8 so the written file does not depend on the platform's
    # default encoding (matters if a column name is non-ASCII).
    with open(output_path, 'w', encoding='utf-8') as file:
        for df_key, df in dataframes_list.items():
            file.write(f"Schema for file: {df_key}\n")
            file.write(f"{df.schema!s}\n\n")
            file.write('----------------------------\n')
            print(f"Schema for file: {df_key}")
            df.printSchema()


In [None]:
# Load each file found in the relative 'data' directory into a dict of Spark dataframes.
dataframes_list = read_all_files_in_data_dir('data')

In [32]:
# Print the schema of every loaded dataframe and also save them to all_data_schemas.txt.
print_schema_for_all_dataframes(dataframes_list)

Schema for file: radiology
root
 |-- note_id: string (nullable = true)
 |-- subject_id: string (nullable = true)
 |-- hadm_id: string (nullable = true)
 |-- note_type: string (nullable = true)
 |-- note_seq: string (nullable = true)
 |-- charttime: string (nullable = true)
 |-- storetime: string (nullable = true)
 |-- text: string (nullable = true)

Schema for file: d_hcpcs
root
 |-- code: string (nullable = true)
 |-- category: string (nullable = true)
 |-- long_description: string (nullable = true)
 |-- short_description: string (nullable = true)

Schema for file: patients
root
 |-- subject_id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- anchor_age: string (nullable = true)
 |-- anchor_year: string (nullable = true)
 |-- anchor_year_group: string (nullable = true)
 |-- dod: string (nullable = true)

Schema for file: diagnoses_icd
root
 |-- subject_id: string (nullable = true)
 |-- hadm_id: string (nullable = true)
 |-- seq_num: string (nullable = true)
 |-- ic

In [None]:
# Preview the first rows of the patients table.
dataframes_list['patients'].show()

+----------+------+----------+-----------+-----------------+----------+
|subject_id|gender|anchor_age|anchor_year|anchor_year_group|       dod|
+----------+------+----------+-----------+-----------------+----------+
|  10000032|     F|        52|       2180|      2014 - 2016|2180-09-09|
|  10000048|     F|        23|       2126|      2008 - 2010|      NULL|
|  10000058|     F|        33|       2168|      2020 - 2022|      NULL|
|  10000068|     F|        19|       2160|      2008 - 2010|      NULL|
|  10000084|     M|        72|       2160|      2017 - 2019|2161-02-13|
|  10000102|     F|        27|       2136|      2008 - 2010|      NULL|
|  10000108|     M|        25|       2163|      2014 - 2016|      NULL|
|  10000115|     M|        24|       2154|      2017 - 2019|      NULL|
|  10000117|     F|        48|       2174|      2008 - 2010|      NULL|
|  10000161|     M|        60|       2163|      2020 - 2022|      NULL|
|  10000178|     F|        59|       2157|      2017 - 2019|    

In [None]:
# Preview two untruncated rows of the diagnoses table. The frame is looked up
# in dataframes_list because no visible cell defines a notebook-level `df`, so
# the bare name only worked from leftover kernel state and would fail on
# Restart & Run All.
# NOTE(review): 'diagnoses_icd' is inferred from the columns in this cell's
# saved output (subject_id, hadm_id, seq_num, icd_code, icd_version) - confirm.
dataframes_list['diagnoses_icd'].show(2, truncate=False)

+----------+--------+-------+--------+-----------+
|subject_id|hadm_id |seq_num|icd_code|icd_version|
+----------+--------+-------+--------+-----------+
|10000032  |22595853|1      |5723    |9          |
|10000032  |22595853|2      |78959   |9          |
+----------+--------+-------+--------+-----------+
only showing top 2 rows

