In [1]:
import random

from src.data_generator.csv_data_processor import CSVDataProcessor
from src.utils.statistcs.statistical_functions import create_distributed_age_df
from src.utils.read_write import read_postgres_table
from spark_instance import spark
from src.utils.statistcs.data_visualisations import plot_age_distribution_with_sd, plot_kernel_density_age_distribution

from pyspark.sql.functions import col, lower, array, concat_ws, date_format, sum, lit
  

24/04/22 18:58:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [2]:
# Average age that is later handed to plot_age_distribution_with_sd.
average_age = 40.2

# Wrap the UK age/population CSV in the project's CSV processor, then run it
# to obtain the DataFrame used by the preview and plotting cells.
age_csv_path = "data/uk_age_population.csv"
csv_age_file_file = CSVDataProcessor(spark, age_csv_path)
csv_age_uk_df = csv_age_file_file.runner()

                                                                                

In [None]:
# Print the first rows of the DataFrame produced from the UK age CSV.
csv_age_uk_df.show()

In [None]:
# Plot the age distribution of the CSV data; average_age (40.2) is passed as the second argument.
# NOTE(review): how the helper uses the average is not visible in this notebook — confirm.
plot_age_distribution_with_sd(csv_age_uk_df, average_age)


In [None]:
# Second plot of the same CSV age data; presumably a kernel-density estimate, going by the helper's name
# (its implementation is not shown in this notebook).
plot_kernel_density_age_distribution(csv_age_uk_df)


In [2]:
# Inputs for the sampler: the UK age/population CSV and the value passed as its third argument.
# NOTE(review): 5000 is presumably the number of rows to generate — confirm against create_distributed_age_df.
age_population_csv = "data/uk_age_population.csv"
sample_size = 5000

sampled_df = create_distributed_age_df(spark, age_population_csv, sample_size)

                                                                                

In [3]:
# Print the first rows of the sampled DataFrame.
sampled_df.show()

[Stage 9:>                                                          (0 + 1) / 1]

+---+----------+----------+------+------------------+---------+------------+------------+--------------------+
|Age|       DOB|Blood_type|Gender|              Name|is_female|is_pediatric|is_geriatric|           unique_id|
+---+----------+----------+------+------------------+---------+------------+------------+--------------------+
| 63|1960-11-10|        A+|Female|       A. Gonzalez|     true|       false|       false|A_Gonzalez_1960-1...|
| 35|1988-12-15|        B+|Female|           I. Lane|     true|       false|       false|   I_Lane_1988-12-15|
| 34|1989-11-01|        O+|Female|Q. K. B. Wilkinson|     true|       false|       false|Q_K_B_Wilkinson_1...|
| 46|1977-09-13|        O+|Female|           Q. Wong|     true|       false|       false|   Q_Wong_1977-09-13|
| 74|1949-08-31|        O+|Female|           H. Wade|     true|       false|        true|   H_Wade_1949-08-31|
| 49|1974-08-29|        O+|  Male|          H. Cohen|    false|       false|       false|  H_Cohen_1974-08-29|
|

                                                                                

In [None]:
# Count how many sampled rows fall on each distinct Age value (result columns: Age, count).
age_column = col("Age")
sampled_df_aggregated = sampled_df.groupby(age_column).count()

In [None]:
# Order the per-age counts by Age and print up to 200 rows of the result.
ordered_age_counts = sampled_df_aggregated.orderBy("Age")
ordered_age_counts.show(n=200)

In [None]:
# Plot the age distribution of the sampled data using the per-age counts.
# NOTE(review): the earlier call on csv_age_uk_df passes average_age as a second argument; here it is
# omitted — confirm the helper's second parameter is optional.
plot_age_distribution_with_sd(sampled_df_aggregated)

In [4]:
from pyspark.sql.functions import regexp_replace
from pyspark.sql import DataFrame


def create_unique_id(non_unique_id_df: DataFrame) -> DataFrame:
    """
    Add a ``unique_id`` column built from the cleaned ``name`` column and the ``DOB`` column.

    The name is cleaned by deleting every period and replacing each whitespace character
    with an underscore (e.g. "Z. H. Lee" -> "Z_H_Lee"). The cleaned name and ``DOB`` are then
    joined with an underscore (e.g. "Z_H_Lee_2016-10-18").

    Args:
        non_unique_id_df (DataFrame): The input DataFrame that contains the name and date of birth columns.
    Returns:
        DataFrame: The input DataFrame with a ``unique_id`` column added (an existing ``unique_id``
        column is replaced). No other column is added, changed, or removed.
    """
    # Build the cleaned name as a column expression rather than a temporary "clean_name" column,
    # so an input column that already has that name is neither overwritten nor dropped.
    clean_name = regexp_replace(regexp_replace(col("name"), "\\.", ""), "\\s", "_")

    # concat_ws skips NULL inputs, so a row with a NULL name gets an id made of the DOB alone.
    return non_unique_id_df.withColumn(
        "unique_id",
        concat_ws("_", clean_name, col("DOB"))
    )


In [11]:
# Boolean flag expressions derived from the gender and Age columns.
is_female = lower(col("gender")) == "female"   # case-insensitive match on "female"
is_pediatric = col("Age") < 18                 # under 18
is_geriatric = col("Age") >= 65                # 65 and over

# Attach the three flags one column at a time. Despite its name, this DataFrame gains
# flag columns here, not a name column.
sampled_df_with_name = sampled_df.withColumn("is_female", is_female)
sampled_df_with_name = sampled_df_with_name.withColumn("is_pediatric", is_pediatric)
sampled_df_with_name = sampled_df_with_name.withColumn("is_geriatric", is_geriatric)

# Finally add the unique_id column built from the name and DOB columns.
sampled_df_with_unique_id = create_unique_id(sampled_df_with_name)

In [12]:
# Print the first rows, including the flag columns and unique_id added above.
sampled_df_with_unique_id.show()

[Stage 13:>                                                         (0 + 1) / 1]

+---+----------+----------+------+-----------------+---------+------------+------------+--------------------+
|Age|       DOB|Blood_type|Gender|             Name|is_female|is_pediatric|is_geriatric|           unique_id|
+---+----------+----------+------+-----------------+---------+------------+------------+--------------------+
|  7|2016-10-18|        O+|  Male|        Z. H. Lee|    false|        true|       false|  Z_H_Lee_2016-10-18|
| 30|1993-08-23|        O+|Female|      R. Gonzales|     true|       false|       false|R_Gonzales_1993-0...|
| 19|2004-06-10|        A-|Female|       R. Schmidt|     true|       false|       false|R_Schmidt_2004-06-10|
| 15|2008-10-16|        A+|Female|   O. B. Lawrence|     true|        true|       false|O_B_Lawrence_2008...|
| 31|1992-08-16|        B+|Female|         Z. Ayala|     true|       false|       false|  Z_Ayala_1992-08-16|
| 10|2013-09-21|        O+|Female|        M. Oliver|     true|        true|       false| M_Oliver_2013-09-21|
| 55|1969-

                                                                                

In [13]:
# Register the DataFrame as a global temp view (replacing any existing view of that name) so it can be
# queried through SQL as global_temp.sampled_df_with_unique_id_gt.
sampled_df_with_unique_id.createOrReplaceGlobalTempView("sampled_df_with_unique_id_gt")

In [3]:
# Temporary: this healthcare CSV load is due to be scrapped shortly.
healthcare_csv_path = "data/healthcare_dataset.csv"
csv_reader = CSVDataProcessor(spark, healthcare_csv_path)

# Run the processor to read the CSV file into raw_df.
raw_df = csv_reader.runner()


                                                                                

In [4]:
from pyspark.sql.functions import expr, floor, datediff, current_date, months_between

min_age_days = 1 * 365  # Minimum age in days (about 1 year)
max_age_days = 90 * 365  # Maximum age in days (about 90 years)

# DOB: today's date minus a random number of days between min_age_days and max_age_days.
# rand() is unseeded, so re-running this cell produces a different DOB for every row.
# Age: completed years between DOB and today. months_between / 12 is used instead of
# datediff / 365.25 because the latter yields one year too few on an exact birthday
# (e.g. 365 days / 365.25 floors to 0).
raw_df = (raw_df.withColumn("DOB", expr(f"date_sub(current_date(), CAST(round(rand() * ({max_age_days} - {min_age_days}) + {min_age_days}) AS INT))"))
        .withColumn("Age", floor(months_between(current_date(), col("DOB")) / 12)))


In [6]:
# Import through the `src` package, matching the `src.utils.read_write` import of read_postgres_table
# at the top of the notebook, so the module is not loaded a second time under a different name.
from src.utils.read_write import write_postgres_table

# Persist raw_df (with the generated DOB and Age columns) to the "dob_age_raw_data" Postgres table.
write_postgres_table(raw_df, "dob_age_raw_data")

                                                                                

In [7]:
# Read back the "dob_age_raw_data" table that write_postgres_table(raw_df, ...) persists.
# NOTE(review): the write cell above is not actually commented out, so it still runs (with freshly
# randomised DOBs) whenever the notebook is executed — confirm whether it should be disabled.
df = read_postgres_table("dob_age_raw_data")

In [8]:
# Print the first rows of the table read back from Postgres.
df.show()

+--------------------+---+------+----------+-----------------+-----------------+------------------+--------------------+------------------+------------------+-----------+--------------+--------------+-----------+------------+----------+
|                name|Age|gender|blood_type|medical_condition|date_of_admission|            doctor|            hospital|insurance_provider|    billing_amount|room_number|admission_type|discharge_date| medication|test_results|       DOB|
+--------------------+---+------+----------+-----------------+-----------------+------------------+--------------------+------------------+------------------+-----------+--------------+--------------+-----------+------------+----------+
|     Tiffany Ramirez| 82|Female|        O-|         Diabetes|       2022-11-17|    Patrick Parker|    Wallace-Hamilton|          Medicare| 37490.98336352819|        146|      Elective|    2022-12-01|    Aspirin|Inconclusive|1942-01-03|
|         Ruben Burns| 68|  Male|        O+|        

In [14]:
from src.constants.admission_types_dataset import admission_mapping, AdmissionTypes
# Private, seeded RNG: the doctor picked for each sub-level admission is the same on every run,
# and the global `random` state used elsewhere is left untouched.
doctor_rng = random.Random(42)

# One tuple per (top-level admission, sub-level admission) pair found in admission_mapping:
#   (top-level name, sub-level name, list of stay types, tests, one randomly chosen doctor)
flattened = [
    (
        top_level.name,
        sub_level_key.name,
        list(sub_level_info.get("stay_types")),
        sub_level_info.get("tests"),
        doctor_rng.choice(sub_level_info.get("doctors"))
    )
    for top_level, sub_level_dict in admission_mapping.items()
    for sub_level_key, sub_level_info in sub_level_dict.items()
]



In [15]:
# Display the flattened admission tuples as the cell output.
flattened

[('EMERGENCY',
  'INJURY_RTC',
  ['Inpatient', 'Day Patient'],
  ['X-rays', 'CT scans', 'MRI', 'Ultrasound', 'Blood tests'],
  'Dr. Christopher Hendrix'),
 ('EMERGENCY',
  'SELF_INFLICTED',
  ['Inpatient'],
  ['Psychological assessment',
   'X-rays (for physical injuries)',
   'Blood tests',
   'Toxicology screening'],
  'Dr. Tracy Fox'),
 ('EMERGENCY',
  'CARDIOLOGY',
  ['Inpatient', 'Day Patient'],
  ['ECG',
   'Echocardiogram',
   'Stress tests',
   'Cardiac catheterization',
   'Blood tests'],
  'Dr. Diana Klein'),
 ('EMERGENCY',
  'NEUROLOGY',
  ['Inpatient', 'Day Patient', 'Outpatient'],
  ['MRI or CT scans of the brain',
   'Electroencephalogram (EEG)',
   'Lumbar puncture',
   'Nerve conduction studies',
   'Blood tests'],
  'Dr. Steve King'),
 ('EMERGENCY',
  'GASTROENTEROLOGY',
  ['Inpatient', 'Day Patient', 'Outpatient'],
  ['Endoscopy',
   'Colonoscopy',
   'Blood tests',
   'Stool tests',
   'Abdominal ultrasound',
   'CT scan'],
  'Dr. Tracy Stewart'),
 ('EMERGENCY',
  'R

In [16]:
from src.constants.condition_probabilities import condition_age_probability_dict

def _condition_probability_rows(probabilities_by_admission):
    """
    Yield one (sub_admission, condition, gender, age_min, age_max, probability) tuple per age band.

    Each condition maps either to a dict keyed by gender or directly to a list of
    (age_range, probability) pairs; the latter is treated as a single group with gender None.
    """
    for sub_admission, conditions in probabilities_by_admission.items():
        for condition, genders_or_age_prob_list in conditions.items():
            if isinstance(genders_or_age_prob_list, dict):
                gender_groups = genders_or_age_prob_list.items()
            else:
                gender_groups = [(None, genders_or_age_prob_list)]

            for gender, age_prob_list in gender_groups:
                for age_range, probability in age_prob_list:
                    yield (
                        sub_admission,
                        condition,
                        # Any key other than 'male' / 'female' is stored as None.
                        gender if gender in ['male', 'female'] else None,
                        float(age_range[0]),
                        # A one-element age range has no upper bound.
                        float(age_range[1]) if len(age_range) > 1 else float('inf'),
                        float(probability)
                    )


flattened_condition_probabilities = list(_condition_probability_rows(condition_age_probability_dict))



In [17]:
# Display the flattened condition-probability tuples as the cell output.
flattened_condition_probabilities

[('ONCOLOGY', 'Bladder Cancer', None, 0.0, 10.0, 0.0),
 ('ONCOLOGY', 'Bladder Cancer', None, 11.0, 17.0, 0.00018),
 ('ONCOLOGY', 'Bladder Cancer', None, 18.0, 25.0, 0.0002),
 ('ONCOLOGY', 'Bladder Cancer', None, 25.0, 34.0, 0.00224),
 ('ONCOLOGY', 'Bladder Cancer', None, 35.0, 44.0, 0.00448),
 ('ONCOLOGY', 'Bladder Cancer', None, 45.0, 59.0, 0.00673),
 ('ONCOLOGY', 'Bladder Cancer', None, 45.0, 54.0, 0.00897),
 ('ONCOLOGY', 'Bladder Cancer', None, 55.0, 64.0, 0.02018),
 ('ONCOLOGY', 'Bladder Cancer', None, 65.0, 75.0, 0.02242),
 ('ONCOLOGY', 'Bladder Cancer', None, 76.0, 80.0, 0.0426),
 ('ONCOLOGY', 'Brain and Nervous System Cancers', None, 0.0, 10.0, 0.0),
 ('ONCOLOGY', 'Brain and Nervous System Cancers', None, 11.0, 17.0, 0.00023),
 ('ONCOLOGY', 'Brain and Nervous System Cancers', None, 18.0, 25.0, 0.00026),
 ('ONCOLOGY', 'Brain and Nervous System Cancers', None, 25.0, 34.0, 0.00291),
 ('ONCOLOGY', 'Brain and Nervous System Cancers', None, 35.0, 44.0, 0.00581),
 ('ONCOLOGY', 'Brain a

In [18]:
from pyspark.sql.types import StructType, StructField, StringType, FloatType

# Schema for the tuples in flattened_condition_probabilities; field order matches the tuple order
# (admission type, condition, gender, age_min, age_max, probability).
schema = StructType([
    StructField("condition_admission_type", StringType(), True),
    StructField("condition", StringType(), True),
    StructField("condition_gender", StringType(), True),  # 'male', 'female', or null when the probability is not gender-specific
    StructField("age_min", FloatType(), False),
    StructField("age_max", FloatType(), False),
    StructField("probability", FloatType(), False)
])

# Column names are derived from the schema so the list cannot drift from the actual field names
# (the third column is "condition_gender", not "gender").
columns = [field.name for field in schema.fields]

condition_prob_df = spark.createDataFrame(data=flattened_condition_probabilities, schema=schema)


In [14]:
# Print the first rows of the condition-probability DataFrame.
condition_prob_df.show()

[Stage 6:>                                                          (0 + 1) / 1]

+------------------------+--------------------+----------------+-------+-------+-----------+
|condition_admission_type|           condition|condition_gender|age_min|age_max|probability|
+------------------------+--------------------+----------------+-------+-------+-----------+
|                ONCOLOGY|      Bladder Cancer|            NULL|    0.0|   10.0|        0.0|
|                ONCOLOGY|      Bladder Cancer|            NULL|   11.0|   17.0|     1.8E-4|
|                ONCOLOGY|      Bladder Cancer|            NULL|   18.0|   25.0|     2.0E-4|
|                ONCOLOGY|      Bladder Cancer|            NULL|   25.0|   34.0|    0.00224|
|                ONCOLOGY|      Bladder Cancer|            NULL|   35.0|   44.0|    0.00448|
|                ONCOLOGY|      Bladder Cancer|            NULL|   45.0|   59.0|    0.00673|
|                ONCOLOGY|      Bladder Cancer|            NULL|   45.0|   54.0|    0.00897|
|                ONCOLOGY|      Bladder Cancer|            NULL|   55.

                                                                                

In [19]:
# STEP 1
#     Build a DataFrame from the flattened list: one row per top-level / sub-level admission pair
#     taken from admission_mapping, with its stay types, possible tests and the chosen doctor.
mapping_columns = ["top_level_admission", "sub_level_admission", "stay_types", "possible_tests", "doctor"]
mapping_df = spark.createDataFrame(flattened, mapping_columns)

# Left join: every mapping row is kept, and condition probabilities are attached where the
# sub-level admission equals the probability table's admission type.
admission_match = mapping_df.sub_level_admission == condition_prob_df.condition_admission_type
joined_tbl = mapping_df.join(condition_prob_df, on=[admission_match], how="left")

In [20]:
# Load the sampled patients back from the global temp view registered earlier with
# createOrReplaceGlobalTempView("sampled_df_with_unique_id_gt"); that cell must have run first.
gt_df = spark.sql("SELECT * FROM global_temp.sampled_df_with_unique_id_gt")


In [21]:
# STEP 3
# Create a DataFrame from the enum class and hand it, together with the main driver DataFrame
# and the joined mapping table, to the conditions creator.

from src.data_generator.conditions_creator import ConditionsCreator
from src.utils.thread_operations import runner
from src.constants.type_constants import DepartmentTypes

# One row per DepartmentTypes member name, in a single string column called "admission_type".
enum_values = [member.name for member in DepartmentTypes]
single_column_df = spark.createDataFrame(enum_values, StringType())
enum_df = single_column_df.toDF("admission_type")

# NOTE(review): the join onto the driver DataFrame and the condition selection presumably happen
# inside runner / ConditionsCreator; neither implementation is visible in this notebook.
selected_conditions_df = runner(spark, ConditionsCreator, gt_df, joined_tbl, enum_df)




In [35]:
# NOTE(review): imported via `utils.` whereas most project imports in this notebook use the `src.`
# prefix — confirm both paths resolve to the same package.
from utils.util_funcs import get_row_count

# Row count of the selected-conditions DataFrame (get_row_count's implementation is not shown here).
get_row_count(selected_conditions_df)

                                                                                

1272

In [33]:
# Print the first rows of the selected-conditions DataFrame.
selected_conditions_df.show()

+--------------------+--------------------+--------------+-----------------+----------------------+
|           unique_id|           Stay_type|Admission_type| Department_types|Condition_or_Diagnosis|
+--------------------+--------------------+--------------+-----------------+----------------------+
|   I_Bailey_19510420|[Inpatient, Day P...|      ONCOLOGY|      GP_REFERRAL|       Prostate Cancer|
|    D_Smith_19760909|        [Outpatient]|   DERMATOLOGY|    SELF_REFERRAL|               Rosacea|
|F_R_R_Watson_2006...|        [Outpatient]|   DERMATOLOGY|    SELF_REFERRAL|        pediatric_Acne|
| M_F_Wilcox_19990316|        [Outpatient]|   DERMATOLOGY|      GP_REFERRAL|                  Acne|
| C_Cummings_19691125|[Inpatient, Day P...|    CARDIOLOGY|HOSPITAL_REFERRAL|  Hypertension (Hig...|
|    Q_E_Kim_19800219|        [Outpatient]|   DERMATOLOGY|      GP_REFERRAL|               Rosacea|
|G_P_E_Nelson_1977...|        [Outpatient]|   DERMATOLOGY|    SELF_REFERRAL|               Rosacea|


                                                                                

In [None]:
# TODO:  
# tests to be chosen
# admission date to be chosen 
# hospital name 
# patient postcode 
# ethnicity distribution  
# _________________
# Long term 
# investigate which diseases affect specific ethnicities
# blood type udf would eventually need to consider ethnicity as well



In [None]:
# Shut down the Spark session imported from spark_instance now that the notebook has finished.
spark.stop()