In [None]:
import os
import findspark
from pyspark.sql import SparkSession

# Respect a SPARK_HOME that is already exported by the environment; the literal below is only a
# machine-specific fallback. Update it to point at your own Spark installation if needed.
os.environ.setdefault(
    'SPARK_HOME',
    'C:/Users/John/AppData/Local/Programs/Python/Python312/Lib/site-packages/pyspark',
)
# Must run after SPARK_HOME is in place and before any SparkSession is created.
findspark.init()

In [2]:
# Build (or reuse) the notebook-wide SparkSession: local mode on all cores,
# Hive support enabled, and 8 GB of driver memory.
spark = (
    SparkSession.builder
    .master("local[*]")
    .enableHiveSupport()
    .config("spark.driver.memory", "8g")
    .appName("Compress Healthcare Data")
    .getOrCreate()
)

In [None]:
def read_all_files_in_data_dir(directory):
    """
    Read every .csv file found directly in a directory into its own PySpark DataFrame.

    The directory is scanned non-recursively. Each file is loaded through the
    module-level ``spark`` session with ``header=True`` and ``inferSchema=True``.

    :param directory: path of the directory to scan
    :return: dict -> key: file base name (no directory part, no ``.csv`` extension),
             value: the PySpark DataFrame read from that file
    """
    dataframes_list = {}
    # Use the context manager so the directory handle is closed promptly, and sort
    # by name so the load order (and dict order) does not depend on the filesystem.
    with os.scandir(directory) as entries:
        csv_entries = sorted(
            (entry for entry in entries if entry.is_file() and entry.name.endswith('.csv')),
            key=lambda entry: entry.name,
        )

    for entry in csv_entries:
        file_path = entry.path
        # entry.name is the bare file name, so the key is correct regardless of whether
        # the OS joins paths with '/' or '\\'; splitext strips only the final extension.
        name_of_file = os.path.splitext(entry.name)[0]
        print(f"Reading File Path: {file_path}")
        dataframes_list[name_of_file] = spark.read.csv(file_path, header=True, inferSchema=True)

    print("SUCCESS: All .csv Data in Data Directory Read")
    return dataframes_list

def print_schema_for_all_dataframes(dataframes_list, output_path='all_data_schemas.txt'):
    """
    Write the schema tree and row count of every DataFrame to a text report.

    For each entry the report contains the entry's key, its row count, the schema
    tree string and a separator line. An existing report file is overwritten.

    :param dataframes_list: dict -> key: name, value: PySpark DataFrame
    :param output_path: file to write the report to; defaults to
                        'all_data_schemas.txt' in the current working directory
    """
    # Explicit UTF-8 so non-ASCII column names cannot fail under a platform default
    # code page (e.g. cp1252 on Windows).
    with open(output_path, 'w', encoding='utf-8') as file:
        for df_key, df in dataframes_list.items():
            # NOTE(review): _jdf is a private PySpark handle to the JVM DataFrame; it is
            # used here to get the schema tree as a string instead of printing it.
            schema_string = df._jdf.schema().treeString()
            # count() is evaluated once per DataFrame before anything is written for it.
            row_count = df.count()
            file.write(f"Schema for file: {df_key}\n")
            file.write(f"Row count: {row_count}\n")
            file.write(schema_string)
            file.write('\n----------------------------\n')
    print(f"SUCCESS: Schemas and row counts for all dataframes have been written to {output_path}")

In [4]:
# Load every .csv found directly under ../sql_data; the result is a dict of Spark DataFrames
# keyed by a name derived from each file's path.
dataframes_list = read_all_files_in_data_dir('../sql_data')

Reading File Path: ../sql_data\admissions.csv
Reading File Path: ../sql_data\diagnoses_icd.csv
Reading File Path: ../sql_data\discharge.csv
Reading File Path: ../sql_data\discharge_detail.csv
Reading File Path: ../sql_data\drgcodes.csv
Reading File Path: ../sql_data\d_hcpcs.csv
Reading File Path: ../sql_data\d_icd_diagnoses.csv
Reading File Path: ../sql_data\d_icd_procedures.csv
Reading File Path: ../sql_data\d_labitems.csv
Reading File Path: ../sql_data\emar.csv
Reading File Path: ../sql_data\emar_detail.csv
Reading File Path: ../sql_data\hcpcsevents.csv
Reading File Path: ../sql_data\labevents.csv
Reading File Path: ../sql_data\microbiologyevents.csv
Reading File Path: ../sql_data\omr.csv
Reading File Path: ../sql_data\patients.csv
Reading File Path: ../sql_data\pharmacy.csv
Reading File Path: ../sql_data\prescriptions.csv
Reading File Path: ../sql_data\procedures_icd.csv
Reading File Path: ../sql_data\radiology.csv
Reading File Path: ../sql_data\radiology_detail.csv
Reading File Pat

In [5]:
# Write each loaded DataFrame's schema and row count to all_data_schemas.txt
# (a count() is run for every DataFrame).
print_schema_for_all_dataframes(dataframes_list)

SUCCESS: Schemas and row counts for all dataframes have been written to all_data_schemas.txt


In [6]:
# Shut down the SparkSession now that the schema report has been produced.
spark.stop()