In [None]:
import random

from src.data_generator.csv_data_processor import CSVDataProcessor
from src.utils.statistcs.statistical_functions import create_distributed_age_df
from src.utils.read_write import read_postgres_table
from spark_instance import spark
from src.utils.statistcs.data_visualisations import plot_age_distribution_with_sd, plot_kernel_density_age_distribution

from pyspark.sql.functions import col, lower, array, concat_ws, date_format, sum, lit
  

In [None]:
# Reference age handed to plot_age_distribution_with_sd in a later cell.
# NOTE(review): presumably the UK average age — confirm the source of 40.2.
average_age = 40.2

# Wrap the UK age/population CSV in the project's CSV processor.
csv_age_file_file = CSVDataProcessor(spark, "data/uk_age_population.csv")

# runner() produces the object that later cells show and plot
# (presumably a Spark DataFrame — confirm against CSVDataProcessor).
csv_age_uk_df = csv_age_file_file.runner()

In [None]:
# Preview the data loaded from uk_age_population.csv.
csv_age_uk_df.show()

In [None]:
# Call the project's age-distribution plotting helper on the loaded data,
# passing average_age as its second argument.
plot_age_distribution_with_sd(csv_age_uk_df, average_age)


In [None]:
# Call the project's kernel-density plotting helper on the same loaded data.
plot_kernel_density_age_distribution(csv_age_uk_df)


In [None]:
# Build a sampled DataFrame from the same UK age CSV.
# NOTE(review): 5000 is presumably the number of rows to generate — confirm
# against create_distributed_age_df.
sampled_df = create_distributed_age_df(spark,  "data/uk_age_population.csv", 5000)

In [None]:
# Preview the sampled rows.
sampled_df.show()

In [None]:
# Count the sampled rows for each distinct Age value (columns: Age, count).
sampled_df_aggregated = sampled_df.groupby(col("Age")).count()

In [None]:
# Show the per-age counts in ascending Age order, up to 200 rows.
sampled_df_aggregated.orderBy("Age").show(n=200)

In [None]:
# Plot the per-age counts of the sample with the same helper used for the CSV data.
# NOTE(review): the earlier call passes average_age as a second argument; this one
# does not — confirm the helper has a default for it.
plot_age_distribution_with_sd(sampled_df_aggregated)

In [None]:
from pyspark.sql.functions import regexp_replace
from pyspark.sql import DataFrame


def create_unique_id(
    non_unique_id_df: DataFrame,
    name_col: str = "name",
    dob_col: str = "DOB",
    id_col: str = "unique_id",
) -> DataFrame:
    """
    Add an identifier column built from a cleaned name and a date-of-birth column.

    The name is cleaned by deleting every period and replacing every whitespace
    character with an underscore. The cleaned name and the date-of-birth value are
    then joined with an underscore, e.g. name "Dr. Jane Doe" with DOB 1990-01-01
    gives "Dr_Jane_Doe_1990-01-01".

    The cleaned name is computed inline rather than through a temporary column, so
    an input column that happens to be called "clean_name" is left untouched.

    Note: rows sharing the same name and date of birth receive the same identifier.

    Args:
        non_unique_id_df (DataFrame): The input DataFrame that contains the name and date of birth columns.
        name_col (str): Name of the column holding the person's name. Defaults to "name".
        dob_col (str): Name of the column holding the date of birth. Defaults to "DOB".
        id_col (str): Name of the identifier column to add. Defaults to "unique_id".
    Returns:
        DataFrame: The input DataFrame with the identifier column added.
    """
    # Strip periods first, then turn each whitespace character into an underscore.
    without_periods = regexp_replace(col(name_col), "\\.", "")
    clean_name = regexp_replace(without_periods, "\\s", "_")

    return non_unique_id_df.withColumn(
        id_col,
        concat_ws("_", clean_name, col(dob_col))
    )


In [None]:
# Boolean column expressions derived from the sampled gender and Age columns.
is_female = lower(col('gender')) == 'female'  # case-insensitive match on 'female'
is_pediatric = col("Age") < 18
is_geriatric = (col("Age") >= 65)

# Attach the three flags as columns. Despite its name, sampled_df_with_name adds
# flag columns only; the 'name' column is expected to exist on sampled_df already.
sampled_df_with_name = (sampled_df.withColumn("is_female", is_female)
                                  .withColumn("is_pediatric", is_pediatric)
                                  .withColumn("is_geriatric", is_geriatric))
# Add the unique_id column built from the name and DOB columns.
sampled_df_with_unique_id = create_unique_id(sampled_df_with_name)

In [None]:
# Preview the sample with the flag columns and unique_id added.
sampled_df_with_unique_id.show()

In [None]:
# Register the DataFrame as a global temp view so a later cell can query it through
# Spark SQL as global_temp.sampled_df_with_unique_id_gt.
sampled_df_with_unique_id.createOrReplaceGlobalTempView("sampled_df_with_unique_id_gt")

In [None]:
# Temporary: this healthcare-dataset load is slated to be scrapped shortly.
csv_reader = CSVDataProcessor(spark, "data/healthcare_dataset.csv")

# Read the CSV file
raw_df = csv_reader.runner()


In [None]:
from pyspark.sql.functions import expr, floor, datediff, current_date

min_age_days = 1 * 365  # Minimum age in days (1 year). NOTE(review): this comment used to say 18 years — confirm the intended lower bound
max_age_days = 90 * 365  # Maximum age in days (90 years)

# DOB: today's date minus a random whole number of days between min_age_days and max_age_days.
# Age: whole years between DOB and today, using 365.25 days per year.
# Note: raw_df is reassigned here and DOB comes from rand(), so re-running this cell
# produces different DOB/Age values.
raw_df = (raw_df.withColumn("DOB", expr(f"date_sub(current_date(), CAST(round(rand() * ({max_age_days} - {min_age_days}) + {min_age_days}) AS INT))"))
        .withColumn("Age", floor(datediff(current_date(), col("DOB")) / 365.25)))


In [None]:
# Import through the same `src.utils.read_write` path that the notebook's first cell
# uses for read_postgres_table, so both helpers resolve from one package root and the
# cell does not depend on `src/` itself being on sys.path.
from src.utils.read_write import write_postgres_table

# Persist the generated DOB/Age data to the "dob_age_raw_data" table.
# NOTE(review): DOB is generated with rand(), so each run writes different values —
# confirm whether this cell should be skipped once the table exists.
write_postgres_table(raw_df, "dob_age_raw_data")

In [None]:
# Read back the "dob_age_raw_data" table that the write_postgres_table cell persists.
# NOTE(review): this comment used to say the write cell was commented out because the
# data is already saved in the database, but that cell is live in this notebook —
# confirm whether it should still run.
df = read_postgres_table("dob_age_raw_data")

In [None]:
# Preview the data read back from Postgres.
df.show()

In [None]:
from src.constants.admission_types_dataset import admission_mapping, AdmissionTypes
def _flatten_admission_mapping(mapping):
    """
    Turn the two-level admission mapping into a flat list of row tuples.

    Each row is (top-level key name, sub-level key name, list of stay types,
    tests, one randomly chosen doctor). One row is produced per sub-level entry,
    in the mapping's iteration order.
    """
    rows = []
    for department, sub_admissions in mapping.items():
        for sub_admission, details in sub_admissions.items():
            row = (
                department.name,
                sub_admission.name,
                list(details.get("stay_types")),
                details.get("tests"),
                # One doctor is picked at random per sub-level entry (unseeded).
                random.choice(details.get("doctors")),
            )
            rows.append(row)
    return rows


flattened = _flatten_admission_mapping(admission_mapping)



In [None]:
# Display the flattened admission rows as the cell output.
flattened


In [None]:
from src.constants.condition_probabilities import condition_age_probability_dict

def _iter_condition_probability_rows(probability_dict):
    """
    Yield one flat row per (sub-admission, condition, gender, age range) entry.

    Each row is (sub_admission, condition, gender, age_min, age_max, probability).
    A condition maps either to a dict keyed by gender or directly to a list of
    (age_range, probability) pairs; in the latter case gender is None. Gender keys
    other than 'male' / 'female' are also emitted as None. An age range with a
    single element gets an upper bound of infinity.
    """
    for sub_admission, conditions in probability_dict.items():
        for condition, spec in conditions.items():
            if isinstance(spec, dict):
                gendered_entries = spec.items()
            else:
                gendered_entries = [(None, spec)]

            for gender, age_prob_list in gendered_entries:
                gender_value = gender if gender in ['male', 'female'] else None
                for age_range, probability in age_prob_list:
                    age_min = float(age_range[0])
                    if len(age_range) > 1:
                        age_max = float(age_range[1])
                    else:
                        age_max = float('inf')
                    yield (
                        sub_admission,
                        condition,
                        gender_value,
                        age_min,
                        age_max,
                        float(probability),
                    )


flattened_condition_probabilities = list(
    _iter_condition_probability_rows(condition_age_probability_dict)
)



In [None]:
# Display the flattened condition-probability rows as the cell output.
flattened_condition_probabilities

In [None]:
from pyspark.sql.types import StructType, StructField, StringType, FloatType

# NOTE(review): `columns` is not referenced by any visible cell, and it lists "gender"
# where the schema below names the field "condition_gender" — confirm whether it is
# still needed.
columns = ["condition_admission_type", "condition", "gender", "age_min", "age_max", "probability"]

# Explicit schema for the flattened condition-probability rows. The gender field is
# nullable because rows without a gender-specific probability carry None.
schema = (
    StructType()
    .add("condition_admission_type", StringType(), True)
    .add("condition", StringType(), True)
    .add("condition_gender", StringType(), True)
    .add("age_min", FloatType(), False)
    .add("age_max", FloatType(), False)
    .add("probability", FloatType(), False)
)

condition_prob_df = spark.createDataFrame(flattened_condition_probabilities, schema)


In [None]:
# Preview the condition-probability DataFrame.
condition_prob_df.show()

In [None]:
"""
STEP 1 
    From the flattened data list we wish to create a DataFrame. This contains all the possible combinations for the given 
    top level admissions, sub level admissions, stay types and list of tests available from the admission_mapping, stay_type and admission_tests lists or dictionaries in admission_types_test_dataset.py
"""

# One row per sub-level admission, with column names assigned positionally to the
# tuples in `flattened`.
mapping_df = spark.createDataFrame(flattened, ["top_level_admission", "sub_level_admission", "stay_types", "possible_tests", "doctor"])

# Left join keeps every admission row, including those with no matching condition
# probabilities (their condition columns are null).
joined_tbl = mapping_df.join(condition_prob_df, on=[mapping_df.sub_level_admission == condition_prob_df.condition_admission_type], how="left")

In [None]:
# Load every column of the global temp view registered earlier from sampled_df_with_unique_id.
gt_df = spark.sql("SELECT * FROM global_temp.sampled_df_with_unique_id_gt")


In [None]:
"""
STEP 3  
Create Enum class df and join it on to main driver df. 
Then create conditions 
"""

from src.data_generator.conditions_creator import ConditionsCreator
from src.utils.thread_operations import runner
from src.constants.type_constants import DepartmentTypes

# Names of all DepartmentTypes members.
enum_values = [e.name for e in DepartmentTypes]

# Single string-column DataFrame of those names, with the column named "admission_type".
enum_df = spark.createDataFrame(enum_values, StringType()).toDF("admission_type")

# Hand the sampled people, the admission/condition join and the enum DataFrame to the
# project's runner with ConditionsCreator.
# NOTE(review): runner's threading and return semantics are not visible here — confirm
# against src.utils.thread_operations.
selected_conditions_df = runner(spark, ConditionsCreator, gt_df, joined_tbl, enum_df)


In [None]:
# Import through the `src.utils` package root used by the notebook's other project
# imports (e.g. src.utils.read_write, src.utils.thread_operations), so the cell does
# not depend on `src/` itself being on sys.path.
from src.utils.util_funcs import get_row_count

# Report the number of rows in the selected-conditions result.
get_row_count(selected_conditions_df)

In [None]:
# Preview the selected-conditions result.
selected_conditions_df.show()

In [None]:
# TODO:
# - choose the tests
# - choose the admission date
# - hospital name
# - patient postcode
# - ethnicity distribution
# _________________
# Long term:
# - investigate which diseases affect specific ethnicities
# - the blood type UDF will eventually need to take ethnicity into account



In [None]:
# Shut down the Spark session; cells that use `spark` cannot be re-run after this
# without restarting it.
spark.stop()