# Notebook For Compressing Healthcare Data

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StringType, StructType, StructField, FloatType, IntegerType
import numpy as np
from rapidfuzz import fuzz
import os  # import os module

# Get or create the SparkSession used by every cell below:
# local master, Hive support enabled, 8g of driver memory.
spark = (
    SparkSession.builder
    .master("local[*]")
    .enableHiveSupport()
    .config("spark.driver.memory", "8g")
    .appName("Compress Healthcare Data")
    .getOrCreate()
)

In [None]:
############## Helper Functions ################

def read_all_files_in_data_dir(directory):
    """
    Read every regular file directly inside ``directory`` into a pyspark dataframe

    Each file is loaded with ``spark.read.csv(file_path, header=True)``.
    Sub-directories are not traversed, and the files named ``.DS_Store`` and
    ``data.zip`` are skipped.

    :param directory: path of the directory to scan (any relative or absolute path)
    :return dict -> key:(file name up to its first '.'): value (pyspark dataframe storing data)
    """
    # Matched against the bare file name so the skip works for any `directory`,
    # not only when it is literally 'data'.
    skipped_file_names = {'.DS_Store', 'data.zip'}
    dataframes_list = {}
    # The context manager closes the directory iterator even if a read fails.
    with os.scandir(directory) as entries:
        for entry in entries:
            if not entry.is_file() or entry.name in skipped_file_names:
                continue
            file_path = entry.path
            # entry.name is the final path component, so the key no longer depends
            # on how many separators (or which separator) `directory` contains.
            name_of_file = entry.name.split('.')[0]
            print(f"Reading File Path: {file_path}")
            # Read File into pyspark dataframe
            dataframes_list[name_of_file] = spark.read.csv(file_path, header=True)
    print("SUCCESS: All Data in Data Directory Read")
    return dataframes_list

def print_schema_for_all_dataframes(dataframes_list):
    """
    Dump the schema of every dataframe into 'all_data_schemas.txt'

    The file is opened in 'w' mode, so it is recreated on every call. For each
    entry a header line naming the key, the schema string and a dashed
    separator are written. Nothing is printed to stdout.

    :param dataframes_list: dict -> key:(name): value (pyspark dataframe)
    """
    separator = '\n----------------------------\n'
    with open('all_data_schemas.txt', 'w') as schema_file:
        for name, frame in dataframes_list.items():
            # Schema rendered as a string by treeString() on the private _jdf handle
            tree = frame._jdf.schema().treeString()
            schema_file.writelines((f"Schema for file: {name}\n", tree, separator))

def convert_add_pyspark_df_to_pandas(dataframes_list):
    """
    Write every pyspark dataframe in the mapping out as parquet

    Despite the function name, no pandas dataframe or pickle file is produced:
    each dataframe is written to parquet_files/<key> in 'overwrite' mode.

    :param dataframes_list: dict -> key:(name): value (pyspark dataframe)
    """
    for dataset_name, spark_df in dataframes_list.items():
        parquet_writer = spark_df.write.mode('overwrite')
        parquet_writer.parquet(f'parquet_files/{dataset_name}')
    print("SUCCESS: All dataframes have been written as parquet files")

In [None]:
# Load every file in the local 'data' directory into a dict of Spark dataframes keyed by file name
dataframes_list = read_all_files_in_data_dir('data')

In [None]:
# Write the schema of every loaded dataframe to all_data_schemas.txt
print_schema_for_all_dataframes(dataframes_list)

In [None]:
# Save each dataframe as parquet under parquet_files/<name> (no pandas conversion happens here)
convert_add_pyspark_df_to_pandas(dataframes_list)

In [None]:
# Stop the Spark session once the cells above have finished
spark.stop()

In [None]:
# Example of how to read the parquet data back with pandas

# import pandas as pd
# import pyarrow

# df = pd.read_parquet('parquet_files/d_hcpcs/')
# df.head()