In [9]:
from data_generator.csv_data_processor import CSVDataProcessor
from utils.util_funcs import get_row_count, calculate_age, display_df
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session used by every DataFrame in this notebook.
spark = (
    SparkSession.builder
    .appName("ETL")
    .getOrCreate()
)

# Load the healthcare CSV through the project's reader; `df` is the working
# frame that the following cells keep extending.
csv_reader = CSVDataProcessor(spark, "data/healthcare_dataset.csv")
df = csv_reader.run()


In [10]:
from data_generator.admission_data_processor import AdmissionProcessor

# Replace the source "age" column with a "DOB" column built from the
# expression returned by the project's AdmissionProcessor helper.
dob_column = AdmissionProcessor.generate_dob()
df = df.withColumn("DOB", dob_column).drop("age")

In [11]:

from pyspark.sql.functions import to_date, col, floor, datediff, current_date, lower, lit, array, rand
from constants.admission_types_tests_dataset import admission_mapping, admission_tests
from pyspark.sql.functions import col, rand, collect_list, size, array, floor


def _flatten_admissions(mapping, tests_by_sub_level):
    """Flatten the nested admission mapping into a list of row tuples.

    Produces one ``(top_level, sub_level, stay_type, tests)`` tuple for every
    stay type listed under every sub-level admission. ``tests`` is looked up
    by sub level and falls back to ``["No tests"]`` when the sub level has no
    entry in ``tests_by_sub_level``.
    """
    rows = []
    for top_level, sub_levels in mapping.items():
        for sub_level, stay_types in sub_levels.items():
            for stay_type in stay_types:
                tests = tests_by_sub_level.get(sub_level, ["No tests"])
                rows.append((top_level, sub_level, stay_type, tests))
    return rows


# Rows for the admission lookup table (the DataFrame itself is built later).
flattened = _flatten_admissions(admission_mapping, admission_tests)



In [12]:
# Turn the flattened tuples into a Spark DataFrame; the names below label the
# four tuple positions in order.
mapping_columns = [
    "top_level_admission",
    "sub_level_admission",
    "stay_type",
    "possible_tests",
]
mapping_df = spark.createDataFrame(flattened, schema=mapping_columns)

# Hand the lookup table to the project's display helper for inspection.
display_df(mapping_df)

Unnamed: 0,top_level_admission,sub_level_admission,stay_type,possible_tests
0,emergency,injury_rtc,day_patient,"[X-rays, CT scans, MRI, Ultrasound, Blood tests]"
1,emergency,injury_rtc,inpatient,"[X-rays, CT scans, MRI, Ultrasound, Blood tests]"
2,emergency,self_inflicted,inpatient,"[Psychological assessment, X-rays (for physica..."
3,emergency,cardiology,day_patient,"[ECG, Echocardiogram, Stress tests, Cardiac ca..."
4,emergency,cardiology,inpatient,"[ECG, Echocardiogram, Stress tests, Cardiac ca..."
5,emergency,neurology,inpatient,"[MRI or CT scans of the brain, Electroencephal..."
6,emergency,pulmonology,day_patient,"[Pulmonary function tests, Chest X-ray, CT sca..."
7,emergency,pulmonology,inpatient,"[Pulmonary function tests, Chest X-ray, CT sca..."
8,emergency,infectious_diseases,outpatient,"[Blood cultures, PCR tests, Antibody tests, Im..."
9,emergency,infectious_diseases,inpatient,"[Blood cultures, PCR tests, Antibody tests, Im..."


In [13]:
# Create the joining column to mapping_df: every row gets one of the
# top-level admission types, picked at random, replacing "admission_type".
admission_types = list(admission_mapping.keys())

print(admission_types)

# Spark array literal holding the admission types, indexable per row.
keys_array = array([lit(key) for key in admission_types])

# Fixed seed so that a fresh "Restart & Run All" assigns the same admission
# types again (given the same input partitioning) instead of new ones each run.
ADMISSION_SEED = 42

# rand() yields values in [0, 1), so the floored product is always a valid
# index in 0 .. len(admission_types) - 1.
random_index = floor(rand(seed=ADMISSION_SEED) * len(admission_types))

df = df.withColumn("top_level_admission", keys_array[random_index]).drop("admission_type")


['emergency', 'gp_referral', 'hospital_referral', 'self_referral', 'elective']


In [14]:
# Preview the frame after adding top_level_admission (show() prints the first
# 20 rows by default).
df.show()

[Stage 7:>                                                          (0 + 1) / 1]

+--------------------+------+----------+-----------------+-----------------+------------------+--------------------+------------------+------------------+-----------+--------------+-----------+------------+--------+-------------------+
|                name|gender|blood_type|medical_condition|date_of_admission|            doctor|            hospital|insurance_provider|    billing_amount|room_number|discharge_date| medication|test_results|     DOB|top_level_admission|
+--------------------+------+----------+-----------------+-----------------+------------------+--------------------+------------------+------------------+-----------+--------------+-----------+------------+--------+-------------------+
|     Tiffany Ramirez|Female|        O-|         Diabetes|       2022-11-17|    Patrick Parker|    Wallace-Hamilton|          Medicare| 37490.98336352819|        146|    2022-12-01|    Aspirin|Inconclusive|20000701|  hospital_referral|
|         Ruben Burns|  Male|        O+|           Asthm

                                                                                

In [20]:
# Run the project's calculate_age helper over the frame. Later cells read an
# "Age" column, which is presumably what this helper adds — TODO confirm.
# NOTE(review): df is reassigned from itself, so re-running this cell is only
# safe if calculate_age is idempotent — verify.
df = calculate_age(df)

In [21]:
from datetime import datetime

# --- Plain Python values ---------------------------------------------------
# Wall-clock timestamp captured when this cell runs.
today = datetime.now()
# Presumably the sub-level admissions restricted to female patients
# (NOTE(review): not referenced in the cells visible here — confirm usage).
female_only = ["maternity", "obstetrics"]

# --- Reusable Spark column predicates (lazy: nothing is evaluated here) -----
_age_col = col("Age")
_gender_lower = lower(col("gender"))

# Case-insensitive match on the gender column.
is_female = _gender_lower == "female"
# Patients younger than 18.
is_pediatric = _age_col < 18
# Patients aged 65+ whose sub-level admission is "geriatrics"; this needs a
# "sub_level_admission" column to exist on the frame it is applied to.
is_geriatric = (_age_col >= 65) & (col("sub_level_admission") == "geriatrics")

In [31]:
from pyspark.sql.functions import concat, when
from data_generator.constants import ColConstants

# Flag female and pediatric patients, then prefix the top-level admission type
# of pediatric patients with ColConstants.peds.
peds_prefix = lit(ColConstants.peds)

# True for rows whose admission type already carries the pediatric prefix.
# Without this guard, every re-run of the cell would prepend the prefix again
# (df is reassigned from itself), producing "<peds><peds>..." values.
already_prefixed = col("top_level_admission").startswith(peds_prefix)

df = (df.withColumn("is_female", is_female)
        .withColumn("is_pediatric", is_pediatric)
        .withColumn("top_level_admission",
                    when(col("is_pediatric") & ~already_prefixed,
                            concat(peds_prefix,
                                   col("top_level_admission")
                                   )
                            ).otherwise(col("top_level_admission"))
                    )
      )

In [32]:
# Print up to 2000 rows of the frame, including the Age, is_female and
# is_pediatric columns added above.
# NOTE(review): this embeds a very large table in the saved notebook output; a
# smaller n or a filtered preview would keep the notebook readable.
df.show(n=2000)

[Stage 12:>                                                         (0 + 1) / 1]

+--------------------+------+----------+-----------------+-----------------+--------------------+--------------------+------------------+------------------+-----------+--------------+-----------+------------+----------+--------------------+---+---------+------------+
|                name|gender|blood_type|medical_condition|date_of_admission|              doctor|            hospital|insurance_provider|    billing_amount|room_number|discharge_date| medication|test_results|       DOB| top_level_admission|Age|is_female|is_pediatric|
+--------------------+------+----------+-----------------+-----------------+--------------------+--------------------+------------------+------------------+-----------+--------------+-----------+------------+----------+--------------------+---+---------+------------+
|     Tiffany Ramirez|Female|        O-|         Diabetes|       2022-11-17|      Patrick Parker|    Wallace-Hamilton|          Medicare| 37490.98336352819|        146|    2022-12-01|    Aspirin|I

                                                                                