In [83]:
from data_generator.csv_data_processor import CSVDataProcessor
from utils.util_funcs import get_row_count, display_df
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_date, col, datediff, current_date, lower, lit, rand, array, floor, date_add, concat, when, row_number, expr
    

In [84]:




spark = SparkSession.builder.appName("ETL").getOrCreate()

csv_reader = CSVDataProcessor(spark, "data/healthcare_dataset.csv")

# Read the CSV file
df = csv_reader.run()


In [87]:
min_age_days = 1 * 365  # Minimum age in days (18 years)
max_age_days = 90 * 365  # Maximum age in days (90 years)

df = (df.withColumn("DOB", expr(f"date_sub(current_date(), CAST(round(rand() * ({max_age_days} - {min_age_days}) + {min_age_days}) AS INT))"))
        .withColumn("Age", floor(datediff(current_date(), col("DOB")) / 365.25)))




In [88]:
df.show()

+--------------------+---+------+----------+-----------------+-----------------+------------------+--------------------+------------------+------------------+-----------+--------------+--------------+-----------+------------+----------+
|                name|Age|gender|blood_type|medical_condition|date_of_admission|            doctor|            hospital|insurance_provider|    billing_amount|room_number|admission_type|discharge_date| medication|test_results|       DOB|
+--------------------+---+------+----------+-----------------+-----------------+------------------+--------------------+------------------+------------------+-----------+--------------+--------------+-----------+------------+----------+
|     Tiffany Ramirez| 14|Female|        O-|         Diabetes|       2022-11-17|    Patrick Parker|    Wallace-Hamilton|          Medicare| 37490.98336352819|        146|      Elective|    2022-12-01|    Aspirin|Inconclusive|2010-01-28|
|         Ruben Burns| 25|  Male|        O+|        

In [89]:
df.where(col("Age") < 16).show()

+--------------------+---+------+----------+-----------------+-----------------+--------------------+--------------------+------------------+------------------+-----------+--------------+--------------+-----------+------------+----------+
|                name|Age|gender|blood_type|medical_condition|date_of_admission|              doctor|            hospital|insurance_provider|    billing_amount|room_number|admission_type|discharge_date| medication|test_results|       DOB|
+--------------------+---+------+----------+-----------------+-----------------+--------------------+--------------------+------------------+------------------+-----------+--------------+--------------+-----------+------------+----------+
|     Tiffany Ramirez| 14|Female|        O-|         Diabetes|       2022-11-17|      Patrick Parker|    Wallace-Hamilton|          Medicare| 37490.98336352819|        146|      Elective|    2022-12-01|    Aspirin|Inconclusive|2010-01-28|
|         Amy Roberts|  9|  Male|        B-|

In [29]:
from constants.admission_types_tests_dataset import admission_mapping, admission_tests
# Flatten the mapping and create a DataFrame
flattened = [
    (top_level, sub_level, stay_type, admission_tests.get(sub_level, ["No tests"]))
    for top_level, sub_level_dict in admission_mapping.items()
    for sub_level, stay_types in sub_level_dict.items()
    for stay_type in stay_types
]



In [30]:
mapping_df = spark.createDataFrame(flattened, ["top_level_admission", "sub_level_admission", "stay_type", "possible_tests"])

display_df(mapping_df)

Unnamed: 0,top_level_admission,sub_level_admission,stay_type,possible_tests
0,emergency,injury_rtc,inpatient,"[X-rays, CT scans, MRI, Ultrasound, Blood tests]"
1,emergency,injury_rtc,day_patient,"[X-rays, CT scans, MRI, Ultrasound, Blood tests]"
2,emergency,self_inflicted,inpatient,"[Psychological assessment, X-rays (for physica..."
3,emergency,cardiology,inpatient,"[ECG, Echocardiogram, Stress tests, Cardiac ca..."
4,emergency,cardiology,day_patient,"[ECG, Echocardiogram, Stress tests, Cardiac ca..."
5,emergency,neurology,inpatient,"[MRI or CT scans of the brain, Electroencephal..."
6,emergency,pulmonology,inpatient,"[Pulmonary function tests, Chest X-ray, CT sca..."
7,emergency,pulmonology,day_patient,"[Pulmonary function tests, Chest X-ray, CT sca..."
8,emergency,infectious_diseases,outpatient,"[Blood cultures, PCR tests, Antibody tests, Im..."
9,emergency,infectious_diseases,inpatient,"[Blood cultures, PCR tests, Antibody tests, Im..."


In [31]:
# create joining column to mapping_df
admission_types = list(admission_mapping.keys())

print(admission_types)

keys_array = array([lit(key) for key in admission_types])

df = df.withColumn("top_level_admission", keys_array[floor(rand() * len(admission_types))]).drop("admission_type")


['emergency', 'gp_referral', 'hospital_referral', 'self_referral', 'elective']


In [32]:
df.show()

+--------------------+------+----------+-----------------+-----------------+------------------+--------------------+------------------+------------------+-----------+--------------+-----------+------------+----------+----------+---+-------------------+
|                name|gender|blood_type|medical_condition|date_of_admission|            doctor|            hospital|insurance_provider|    billing_amount|room_number|discharge_date| medication|test_results|RandomDays|       DOB|Age|top_level_admission|
+--------------------+------+----------+-----------------+-----------------+------------------+--------------------+------------------+------------------+-----------+--------------+-----------+------------+----------+----------+---+-------------------+
|     Tiffany Ramirez|Female|        O-|         Diabetes|       2022-11-17|    Patrick Parker|    Wallace-Hamilton|          Medicare| 37490.98336352819|        146|    2022-12-01|    Aspirin|Inconclusive|      6168|1951-11-21| 72|         

In [33]:

# Define constants and conditions
female_only = ['maternity', 'obstetrics']
is_female = lower(col('gender')) == 'female'
is_pediatric = col("Age") < 18
is_geriatric = (col("Age") >= 65) & (col("sub_level_admission") == "geriatrics")

In [34]:
from data_generator.constants import ColConstants

df = (df.withColumn("is_female", is_female)
        .withColumn("is_pediatric", is_pediatric)
        .withColumn("top_level_admission", 
                    when(col("is_pediatric"), 
                            concat(lit(ColConstants.peds), 
                                   col("top_level_admission")
                                   )
                            ).otherwise(col("top_level_admission"))
                    )
      )

In [35]:
df.show(n=9999)

+--------------------+------+----------+-----------------+-----------------+--------------------+--------------------+------------------+------------------+-----------+--------------+-----------+------------+----------+----------+---+--------------------+---------+------------+
|                name|gender|blood_type|medical_condition|date_of_admission|              doctor|            hospital|insurance_provider|    billing_amount|room_number|discharge_date| medication|test_results|RandomDays|       DOB|Age| top_level_admission|is_female|is_pediatric|
+--------------------+------+----------+-----------------+-----------------+--------------------+--------------------+------------------+------------------+-----------+--------------+-----------+------------+----------+----------+---+--------------------+---------+------------+
|     Tiffany Ramirez|Female|        O-|         Diabetes|       2022-11-17|      Patrick Parker|    Wallace-Hamilton|          Medicare| 37490.98336352819|       

In [36]:
mapping_df.show()

+-------------------+-------------------+-----------+--------------------+
|top_level_admission|sub_level_admission|  stay_type|      possible_tests|
+-------------------+-------------------+-----------+--------------------+
|          emergency|         injury_rtc|  inpatient|[X-rays, CT scans...|
|          emergency|         injury_rtc|day_patient|[X-rays, CT scans...|
|          emergency|     self_inflicted|  inpatient|[Psychological as...|
|          emergency|         cardiology|  inpatient|[ECG, Echocardiog...|
|          emergency|         cardiology|day_patient|[ECG, Echocardiog...|
|          emergency|          neurology|  inpatient|[MRI or CT scans ...|
|          emergency|        pulmonology|  inpatient|[Pulmonary functi...|
|          emergency|        pulmonology|day_patient|[Pulmonary functi...|
|          emergency|infectious_diseases| outpatient|[Blood cultures, ...|
|          emergency|infectious_diseases|  inpatient|[Blood cultures, ...|
|        gp_referral|    

In [37]:
df = df.join(mapping_df, "top_level_admission", "inner")

In [38]:
df.select("name", "sub_level_admission", "top_level_admission", "stay_type", "is_pediatric", "Age", "DOB").orderBy("name", "sub_level_admission").where((col("top_level_admission") == "emergency")).show(n=200)

+----------------+-------------------+-------------------+-----------+------------+---+----------+
|            name|sub_level_admission|top_level_admission|  stay_type|is_pediatric|Age|       DOB|
+----------------+-------------------+-------------------+-----------+------------+---+----------+
|     Aaron Davis|         cardiology|          emergency|  inpatient|       false| 62|1961-02-23|
|     Aaron Davis|         cardiology|          emergency|day_patient|       false| 62|1961-02-23|
|     Aaron Davis|infectious_diseases|          emergency| outpatient|       false| 62|1961-02-23|
|     Aaron Davis|infectious_diseases|          emergency|  inpatient|       false| 62|1961-02-23|
|     Aaron Davis|         injury_rtc|          emergency|  inpatient|       false| 62|1961-02-23|
|     Aaron Davis|         injury_rtc|          emergency|day_patient|       false| 62|1961-02-23|
|     Aaron Davis|          neurology|          emergency|  inpatient|       false| 62|1961-02-23|
|     Aaro

In [39]:
from pyspark.sql import Window

from pyspark.sql.functions import when, col, lit, concat

df = (df.withColumn("stay_type", 
                   when(col("stay_type") == "day_patient", lit("out_patient"))
                   .when(col("stay_type") == "outpatient", lit("out_patient"))
                   .otherwise(col("stay_type"))) 
       .withColumn('stay_name', 
                   when(col('stay_type') == 'out_patient', concat(col('name'), lit('_out_patient')))
                   .otherwise(when(col('stay_type') == 'inpatient', concat(col('name'), lit('_inpatient')))
                   .otherwise(col('name')))))




In [46]:
df.where((col("sub_level_admission") == "injury_rtc") & (col('name').like('Aaron Patel'))).show()

+-------------------+-----------+------+----------+-----------------+-----------------+--------------+-------------+------------------+------------------+-----------+--------------+----------+------------+----------+----------+---+---------+------------+-------------------+-----------+--------------------+--------------------+
|top_level_admission|       name|gender|blood_type|medical_condition|date_of_admission|        doctor|     hospital|insurance_provider|    billing_amount|room_number|discharge_date|medication|test_results|RandomDays|       DOB|Age|is_female|is_pediatric|sub_level_admission|  stay_type|      possible_tests|           stay_name|
+-------------------+-----------+------+----------+-----------------+-----------------+--------------+-------------+------------------+------------------+-----------+--------------+----------+------------+----------+----------+---+---------+------------+-------------------+-----------+--------------------+--------------------+
|          em

In [41]:
# Define a window specification that partitions data by 'top_level_admission' (or another unique patient identifier if needed)
windowSpec = Window.partitionBy('name', 'stay_name').orderBy(rand())

# Assign row numbers within each partition in a random order
ranked_df = df.withColumn("is_geriatric", is_geriatric).withColumn("row_num", row_number().over(windowSpec))


In [47]:
ranked_df.orderBy("name", "sub_level_admission").where((col("sub_level_admission") == "injury_rtc") & (col('name') == 'Aaron Patel')).show(n=200)

+-------------------+-----------+------+----------+-----------------+-----------------+--------------+-------------+------------------+------------------+-----------+--------------+----------+------------+----------+----------+---+---------+------------+-------------------+-----------+--------------------+--------------------+------------+-------+
|top_level_admission|       name|gender|blood_type|medical_condition|date_of_admission|        doctor|     hospital|insurance_provider|    billing_amount|room_number|discharge_date|medication|test_results|RandomDays|       DOB|Age|is_female|is_pediatric|sub_level_admission|  stay_type|      possible_tests|           stay_name|is_geriatric|row_num|
+-------------------+-----------+------+----------+-----------------+-----------------+--------------+-------------+------------------+------------------+-----------+--------------+----------+------------+----------+----------+---+---------+------------+-------------------+-----------+--------------

In [48]:
ranked_df.where((col("gender") == "Male") & (col("is_female") == True)).show()

                                                                                

+-------------------+----+------+----------+-----------------+-----------------+------+--------+------------------+--------------+-----------+--------------+----------+------------+----------+---+---+---------+------------+-------------------+---------+--------------+---------+------------+-------+
|top_level_admission|name|gender|blood_type|medical_condition|date_of_admission|doctor|hospital|insurance_provider|billing_amount|room_number|discharge_date|medication|test_results|RandomDays|DOB|Age|is_female|is_pediatric|sub_level_admission|stay_type|possible_tests|stay_name|is_geriatric|row_num|
+-------------------+----+------+----------+-----------------+-----------------+------+--------+------------------+--------------+-----------+--------------+----------+------------+----------+---+---+---------+------------+-------------------+---------+--------------+---------+------------+-------+
+-------------------+----+------+----------+-----------------+-----------------+------+--------+----

In [13]:
ranked_df.show(n=8000)

24/02/03 17:34:04 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

+-------------------+--------------------+------+----------+-----------------+-----------------+--------------------+--------------------+------------------+------------------+-----------+--------------+-----------+------------+----------+----------+---+---------+------------+--------------------+-----------+--------------------+------------+-------+
|top_level_admission|                name|gender|blood_type|medical_condition|date_of_admission|              doctor|            hospital|insurance_provider|    billing_amount|room_number|discharge_date| medication|test_results|RandomDays|       DOB|Age|is_female|is_pediatric| sub_level_admission|  stay_type|      possible_tests|is_geriatric|row_num|
+-------------------+--------------------+------+----------+-----------------+-----------------+--------------------+--------------------+------------------+------------------+-----------+--------------+-----------+------------+----------+----------+---+---------+------------+-----------------

In [49]:
not_geriatric_df = ranked_df.where((col('sub_level_admission')== 'geriatrics') &( col("is_geriatric")== False))

ranked_tot = get_row_count(ranked_df)
geriatric_tot = get_row_count(not_geriatric_df)

filtered_df = ranked_df.join(not_geriatric_df, on= ['sub_level_admission', 'DOB'], how="left_anti")

filtered_tot = get_row_count(filtered_df)

assert filtered_tot == ranked_tot - geriatric_tot, "Row counts do not match expected value"

# checks the above calculation
assert get_row_count(filtered_df.where((col('sub_level_admission')== 'geriatrics') &( col("is_geriatric")== False))) == 0

                                                                                

In [17]:
# TODO filter on is pediatric, geriatric and is_female to be done here and same people with dob? needs  to be considered 

                                                                                

In [50]:
not_female_df = filtered_df.where((col("sub_level_admission").isin(female_only)) & (col("is_female") == False))

assert  not_female_df.where((col("gender") == "Female") & (col("is_female") == True)).count() == 0

In [53]:
not_female_tot = get_row_count(not_female_df)

filtered_df_female = filtered_df.join(not_female_df, on=['sub_level_admission', 'DOB'], how = 'left_anti')

filtered_female_tot = get_row_count(filtered_df_female, True)

assert  filtered_df_female.where((col("gender") == "Male") & (col("is_female") == True)).count() == 0

                                                                                

67623


                                                                                

In [72]:
# filter pediatric patients who cannot be pregnant (based on legal age in the UK, 16) 
under16_female_df = filtered_df_female.where((col("Age") >16) &  (col("is_female") == False)).orderBy("Age")

In [73]:
under16_female_df.show()

                                                                                

+-------------------+----------+-------------------+-----------------+------+----------+-----------------+-----------------+---------------+--------------------+------------------+------------------+-----------+--------------+----------+------------+----------+---+---------+------------+-----------+--------------------+--------------------+------------+-------+
|sub_level_admission|       DOB|top_level_admission|             name|gender|blood_type|medical_condition|date_of_admission|         doctor|            hospital|insurance_provider|    billing_amount|room_number|discharge_date|medication|test_results|RandomDays|Age|is_female|is_pediatric|  stay_type|      possible_tests|           stay_name|is_geriatric|row_num|
+-------------------+----------+-------------------+-----------------+------+----------+-----------------+-----------------+---------------+--------------------+------------------+------------------+-----------+--------------+----------+------------+----------+---+-------

In [54]:
# Filter to keep only the top-ranked row within each partition
ranked_df = filtered_df_female

In [55]:
ranked_df.where(col("name") == "Daniel Mccoy").show(n=8000)

                                                                                

+-------------------+----------+-------------------+------------+------+----------+-----------------+-----------------+-------------+-----------+------------------+------------------+-----------+--------------+----------+------------+----------+---+---------+------------+-----------+--------------------+--------------------+------------+-------+
|sub_level_admission|       DOB|top_level_admission|        name|gender|blood_type|medical_condition|date_of_admission|       doctor|   hospital|insurance_provider|    billing_amount|room_number|discharge_date|medication|test_results|RandomDays|Age|is_female|is_pediatric|  stay_type|      possible_tests|           stay_name|is_geriatric|row_num|
+-------------------+----------+-------------------+------------+------+----------+-----------------+-----------------+-------------+-----------+------------------+------------------+-----------+--------------+----------+------------+----------+---+---------+------------+-----------+--------------------

In [21]:
from pyspark.sql.types import StringType

df = filtered_df_female.select([col(c).cast(StringType()).alias(c) for c in df.columns])
df.write.csv('./temp_data/female/filtered_df_female.csv', mode = 'overwrite', header=True)

                                                                                