In [None]:
import random
from data_generator.csv_data_processor import CSVDataProcessor
from utils.util_funcs import get_row_count, display_df, remove_data, verify_ranking, verify_ranking_counts
from utils.read_write import read_postgres_table
from spark_instance import spark
from pyspark.sql import Window

from pyspark.sql.functions import col, lower, rand, array,row_number, concat_ws, date_format, sum, udf
  

In [None]:
# NOTE(review): `average_age` (40.1) is not referenced by any later cell in this notebook;
# the variance and plotting cells use 40.2 instead — confirm which value is intended.
average_age = 40.1

# Wrap the UK age/population CSV in the project's CSV processor, using the shared Spark session.
csv_age_file_file = CSVDataProcessor(spark, "data/uk_age_population.csv")

# runner() yields the frame used by the following cells (it is later read through
# "age" and "population_total" columns); presumably it reads/cleans the CSV — verify in CSVDataProcessor.
csv_age_uk_df = csv_age_file_file.runner()

In [None]:
# Print a preview of the age/population frame returned by the CSV processor.
csv_age_uk_df.show()

In [None]:
import math

# Known mean age used as the centre of the variance calculation.
# NOTE(review): the loading cell sets `average_age = 40.1`, but this calculation has always
# used 40.2 — the value is kept unchanged here; confirm which one is correct.
mean_age = 40.2

# Population-weighted squared deviation of each age from the known mean.
csv_age_uk_sq_df = csv_age_uk_df.withColumn(
    "weighted_squared_diff", (col("age") - mean_age) ** 2 * col("population_total")
)

# Compute both totals in a single aggregation (one Spark job instead of two full scans).
# `sum` is pyspark.sql.functions.sum, imported in the first cell (it shadows the builtin).
totals_row = csv_age_uk_sq_df.agg(
    sum("weighted_squared_diff").alias("total_weighted_squared_diff"),
    sum("population_total").alias("total_population"),
).first()
total_weighted_squared_diff = totals_row["total_weighted_squared_diff"]
total_population = totals_row["total_population"]

# An empty frame aggregates to NULL and an all-zero population to 0; both would otherwise
# surface as an opaque TypeError / ZeroDivisionError on the division below.
if not total_population:
    raise ValueError("population_total is empty or sums to zero; cannot compute a weighted variance")

# Weighted variance, then weighted standard deviation.
weighted_variance = total_weighted_squared_diff / total_population
weighted_standard_deviation = math.sqrt(weighted_variance)

print("Weighted Standard Deviation of Age:", weighted_standard_deviation)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np


In [None]:

# Bring the aggregated age frame to the driver as pandas for plotting.
pandas_df = csv_age_uk_sq_df.toPandas()

mean_age = 40.2

# Force numeric dtypes so the arithmetic and the plot weights behave predictably.
pandas_df['age'] = pandas_df['age'].astype(float)
pandas_df['population_total'] = pandas_df['population_total'].astype(int)
pandas_df['weighted_squared_diff'] = pandas_df['population_total'] * (pandas_df['age'] - mean_age) ** 2

total_weighted_squared_diff = np.sum(pandas_df['weighted_squared_diff'])
total_population = np.sum(pandas_df['population_total'])

weighted_variance = total_weighted_squared_diff / total_population
standard_deviation = np.sqrt(weighted_variance)

# One bin per whole year of age. Bin edges must run to max + 1 (hence range(..., max + 2)):
# with edges stopping at max, the final bin is closed on both sides and would merge the
# two oldest ages into a single bar.
age_min = int(pandas_df['age'].min())
age_max = int(pandas_df['age'].max())
age_bins = range(age_min, age_max + 2)

plt.figure(figsize=(12, 6))
sns.histplot(pandas_df, x='age', weights='population_total', bins=age_bins, color='skyblue', kde=False)

# Mean (red) and mean +/- one weighted standard deviation (green), with the band shaded.
plt.axvline(mean_age, color='red', linestyle='dashed', linewidth=1, label='Mean age')
plt.axvline(mean_age + standard_deviation, color='green', linestyle='dashed', linewidth=1, label='Mean ± 1 SD')
plt.axvline(mean_age - standard_deviation, color='green', linestyle='dashed', linewidth=1)

plt.axvspan(mean_age - standard_deviation, mean_age + standard_deviation, alpha=0.1, color='green')

plt.title('Age Distribution with Standard Deviation')
plt.xlabel('Age')
plt.ylabel('Population Total')
plt.legend()
plt.show()



In [None]:


# Population-weighted kernel density estimate of age, clipped to the observed age range.
plt.figure(figsize=(12, 6))

sns.kdeplot(
    data=pandas_df,
    x='age',
    weights='population_total',
    fill=True,
    common_norm=False,
    bw_adjust=0.5,
    clip=(pandas_df['age'].min(), pandas_df['age'].max()),
)

# Mark the mean with the same `mean_age` variable the histogram cell uses, rather than a
# second hard-coded copy of the number that could drift out of sync.
plt.axvline(x=mean_age, color='r', linestyle='--')

plt.title('Kernel Density Estimate of Age Distribution')
plt.xlabel('Age')
plt.ylabel('Density')

plt.show()


In [None]:
from pyspark import Row
from pyspark.sql.functions import col, lit

# Total population across all age rows (`sum` is pyspark.sql.functions.sum from the first cell).
total_population = csv_age_uk_sq_df.select(sum("population_total")).collect()[0][0]

# Share of the total population found at each age.
csv_age_uk_sq_df = csv_age_uk_sq_df.withColumn("density", col("population_total") / lit(total_population))

# Normalize the density to ensure it sums to 1
total_density = csv_age_uk_sq_df.select(sum("density")).collect()[0][0]
new_csv_age_uk_sq_df = csv_age_uk_sq_df.withColumn("normalized_density", col("density") / lit(total_density))

num_samples = 1000000
# Oversample by a small percentage: int() truncates each age's row count, so without the
# extra headroom the expanded frame could fall short of num_samples.
oversample_factor = 1.1
oversample_num = int(num_samples * oversample_factor)

# Repeat each age in proportion to its normalized density.
sampled_rdd = new_csv_age_uk_sq_df.rdd.flatMap(
    lambda row: [row['age']] * int(row['normalized_density'] * oversample_num)
)

row_rdd = sampled_rdd.map(lambda age: Row(Age=age))
oversampled_df = spark.createDataFrame(row_rdd)

print(oversampled_df.count())

# Shuffle and trim to exactly num_samples rows. The result is cached because rand() is
# re-evaluated on every action: without persisting, each later count()/show()/withColumn
# would be computed over a different random draw.
sampled_df = oversampled_df.orderBy(rand()).limit(num_samples).cache()

# This count also materializes the cache.
print(sampled_df.count())

In [None]:
# Print a preview of the sampled single-column ("Age") frame.
sampled_df.show()

In [None]:
from pyspark.sql.pandas.functions import pandas_udf
from pyspark.sql.types import  DateType, StringType


@pandas_udf(DateType())
def create_random_dob_pandas_udf(ages: pd.Series) -> pd.Series:
    """Generate a random date of birth for each age (in years).

    The DOB is placed a whole number of days before today's date: first far enough back to
    reach the given age, then a further 0-364 random days so birthdays are spread over the year.

    Args:
        ages: Ages in years, one per row.

    Returns:
        A Series of midnight timestamps (one per input row) that Spark stores as dates.
    """
    today = pd.Timestamp('today').normalize()

    # Use the mean calendar-year length (365.25 days) instead of 365 so leap days are not
    # ignored; with 365 the DOB of an 80-year-old lands ~20 days too late. Rounding up to
    # whole days keeps the result at midnight and guarantees that, for whole-number ages,
    # floor(days_since_dob / 365.25) gives back the input age for every random offset below.
    days_to_age = np.ceil(ages * 365.25)
    preliminary_dobs = today - pd.to_timedelta(days_to_age, unit='d')

    # Extra 0..364 days so the person is somewhere within their current year of age.
    random_days = np.random.randint(0, 365, size=len(ages))

    final_dobs = preliminary_dobs - pd.to_timedelta(random_days, unit='d')
    return final_dobs


In [None]:
from faker import Faker
from random import choices

@pandas_udf(StringType())
def create_fake_name_udf(size: pd.Series) -> pd.Series:
    """Produce one fake "initials + surname" string per input row.

    Args:
        size: Any column; its values are ignored and only its length is used to decide
            how many names to generate.

    Returns:
        A Series of names such as "A. B. Smith": one to three random upper-case initials
        (one initial is the most likely, three the least) followed by a Faker surname.
    """
    fake = Faker()

    def _one_name() -> str:
        # Weighted pick of how many initials this name gets.
        n_initials = choices([1, 2, 3], weights=[1, 0.5, 0.2], k=1)[0]
        initials = ' '.join(f"{fake.random_uppercase_letter()}." for _ in range(n_initials))
        surname = fake.last_name()
        return f"{initials} {surname}"

    return pd.Series([_one_name() for _ in range(len(size))])




In [None]:
# Add a "DOB" column by applying the random date-of-birth pandas UDF to each row's "Age".
sampled_df_with_dob = sampled_df.withColumn("DOB", create_random_dob_pandas_udf(col("Age")))


In [None]:
# Print up to 1000 rows to eyeball the generated DOBs against their ages.
sampled_df_with_dob.show(n=1000)

In [None]:
# The name UDF needs some input column to size its output; a constant "dummy" column is
# added for that purpose. Note that `sampled_df_with_dob` itself keeps the "dummy" column
# after this cell — only `sampled_df_with_name` has it dropped.
sampled_df_with_dob = sampled_df_with_dob.withColumn("dummy", lit(1))
sampled_df_with_name = sampled_df_with_dob.withColumn("name", create_fake_name_udf(col("dummy"))).drop("dummy")

In [None]:
# Print up to 20000 rows of the frame with generated names (very large notebook output).
sampled_df_with_name.show(n=20000)

In [None]:
# csv_reader = CSVDataProcessor(spark, "data/healthcare_dataset.csv")
# 
# # Read the CSV file
# raw_df = csv_reader.run()


In [None]:
# min_age_days = 1 * 365  # Minimum age in days (1 year)
# max_age_days = 90 * 365  # Maximum age in days (90 years)
# 
# raw_df = (raw_df.withColumn("DOB", expr(f"date_sub(current_date(), CAST(round(rand() * ({max_age_days} - {min_age_days}) + {min_age_days}) AS INT))"))
#         .withColumn("Age", floor(datediff(current_date(), col("DOB")) / 365.25)))




In [None]:
# The two commented-out cells above are kept for reference only: their result was saved to
# the database, so the patient frame is loaded from the "dob_age_raw_data" Postgres table instead.
df = read_postgres_table("dob_age_raw_data")

In [None]:
from constants.admission_types_dataset import admission_mapping, AdmissionTypes
def _flatten_admission_mapping(mapping):
    """Flatten the nested admission mapping into one tuple per sub-level admission.

    Each tuple is (top-level name, sub-level name, list of stay types, tests, one doctor),
    where the doctor is picked at random from that sub-level's "doctors" entry.
    """
    rows = []
    for top_level, sub_level_dict in mapping.items():
        for sub_level_key, sub_level_info in sub_level_dict.items():
            stay_types = list(sub_level_info.get("stay_types"))
            tests = sub_level_info.get("tests")
            doctor = random.choice(sub_level_info.get("doctors"))
            rows.append((top_level.name, sub_level_key.name, stay_types, tests, doctor))
    return rows


flattened = _flatten_admission_mapping(admission_mapping)



In [None]:
# Display the flattened admission rows as the cell output.
flattened

In [None]:
from constants.condition_probabilities import condition_age_probability_dict

def _flatten_condition_probabilities(probabilities):
    """Flatten the nested condition-probability dict into one tuple per age band.

    Each tuple is (sub admission, condition, gender or None, age_min, age_max, probability).
    A condition maps either to a dict keyed by gender or directly to a list of
    (age_range, probability) pairs; the latter is treated as applying to no specific gender.
    An age range with a single element gets an open upper bound (infinity).
    """
    rows = []
    for sub_admission, conditions in probabilities.items():
        for condition, genders_or_age_prob_list in conditions.items():
            if isinstance(genders_or_age_prob_list, dict):
                gender_groups = genders_or_age_prob_list.items()
            else:
                gender_groups = [(None, genders_or_age_prob_list)]

            for gender, age_prob_list in gender_groups:
                for age_range, probability in age_prob_list:
                    # Anything other than 'male' / 'female' is stored as "no gender".
                    gender_value = gender if gender in ['male', 'female'] else None
                    age_min = float(age_range[0])
                    if len(age_range) > 1:
                        age_max = float(age_range[1])
                    else:
                        age_max = float('inf')
                    rows.append(
                        (sub_admission, condition, gender_value, age_min, age_max, float(probability))
                    )
    return rows


flattened_condition_probabilities = _flatten_condition_probabilities(condition_age_probability_dict)



In [None]:
# Display the flattened condition-probability rows as the cell output.
flattened_condition_probabilities

In [None]:
from pyspark.sql.types import StructType, StructField, StringType, FloatType

# Schema for the flattened condition-probability tuples, in tuple order.
schema = StructType([
    StructField("condition_admission_type", StringType(), True),
    StructField("condition", StringType(), True),
    # Null when the probability is not specific to 'male' or 'female'.
    StructField("condition_gender", StringType(), True),
    StructField("age_min", FloatType(), False),
    # Infinity when the age band has no upper bound.
    StructField("age_max", FloatType(), False),
    StructField("probability", FloatType(), False)
])

# Column names derived from the schema so the two cannot disagree (the previous hand-written
# list said "gender" while the schema column is "condition_gender").
columns = [field.name for field in schema.fields]

condition_prob_df = spark.createDataFrame(data=flattened_condition_probabilities, schema=schema)


In [None]:
# Print a preview of the condition-probability frame.
condition_prob_df.show()

In [None]:
# Print a preview of the patient frame loaded from Postgres.
df.show()

In [None]:
"""
STEP 1 
    From the flattened data list we wish to create a DataFrame. This contains all the possible combinations for the given 
    top level admissions, sub level admissions, stay types and list of tests available from the admission_mapping, stay_type and admission_tests lists or dictionaries in admission_types_test_dataset.py
"""

# One row per sub-level admission, built from the flattened admission tuples.
mapping_columns = ["top_level_admission", "sub_level_admission", "stay_types", "possible_tests", "doctor"]
mapping_df = spark.createDataFrame(flattened, mapping_columns)

# Left join keeps every admission row even when no condition probabilities exist for it.
admission_type_matches = mapping_df.sub_level_admission == condition_prob_df.condition_admission_type
joined_tbl = mapping_df.join(
    condition_prob_df,
    on=[admission_type_matches],
    how="left",
)

In [None]:
"""
STEP 2 
    Create a list of admission_types randomly assign this to the original patient in the original data set, 
    whilst dropping the original admission_type column. Then join with mapping_df on top_level_admission col to give access to possible 
    conditions, mappings and so on.
"""
# Names of every AdmissionTypes enum member.
admission_type_names = [member.name for member in AdmissionTypes]

# Spark array literal of those names.
# NOTE(review): `keys_array` is not used by any later cell in this notebook.
keys_array = array([lit(name) for name in admission_type_names])

# Define constants and conditions
# NOTE(review): `female_only` is not used by any later cell in this notebook.
female_only = ["MATERNITY", 'obstetrics']
is_female = lower(col('gender')) == 'female'
is_pediatric = col("Age") < 18
is_geriatric = (col("Age") >= 65)

# Add the three boolean flags and a "unique_id" of the form "<name>_<DOB as yyyyMMdd>", then
# drop the original clinical columns. `df` is reassigned in place, so the frame loaded from
# Postgres is no longer reachable after this cell. Two patients sharing a name and DOB would
# share a unique_id; a later cell checks for that.
df = (df.withColumn("is_female", is_female)
        .withColumn("is_pediatric", is_pediatric)
        .withColumn("is_geriatric", is_geriatric)
        .withColumn("unique_id", concat_ws("_", "name", date_format("DOB", "yyyyMMdd")))
        .drop("doctor", "medical_condition", "test_results", "medication", "admission_type")
      )



In [None]:
"""
STEP 3  
Create Enum class df and join it on to main driver df. 
Then create conditions 
"""

from data_generator.conditions_creator import ConditionsCreator
from utils.thread_operations import runner
from constants.type_constants import SubAdmissionTypes

# Names of every SubAdmissionTypes enum member.
enum_values = [e.name for e in SubAdmissionTypes]

# Single-column ("admission_type") frame with one row per enum name.
enum_df = spark.createDataFrame(enum_values, StringType()).toDF("admission_type")



# Hand the patient frame, the admission/condition join and the enum frame to the project's
# threaded runner. NOTE(review): presumably this picks a condition per patient via
# ConditionsCreator — confirm in utils.thread_operations / data_generator.conditions_creator.
selected_conditions_df = runner(spark, ConditionsCreator, df, joined_tbl, enum_df)


In [None]:
# Sanity check: every row of selected_conditions_df should carry a distinct unique_id.
total_count = selected_conditions_df.count()

distinct_count = selected_conditions_df.select("unique_id").distinct().count()

is_unique = total_count == distinct_count

# The message names the frame actually checked (it previously said "final_df") and includes
# both counts so a failure shows how many duplicates exist.
print(
    f"selected_conditions_df has a distinct unique_id on every row? {is_unique} "
    f"({distinct_count} distinct ids / {total_count} rows)"
)

In [None]:
# get_row_count(selected_conditions_df)

In [None]:
# selected_conditions_df.orderBy(col("unique_id")).show(n=20000)

In [None]:
# Attach the selected conditions to every patient; the left join keeps patients that have
# no row in selected_conditions_df (their condition columns are null).
joined_selected_conditions_df = df.join(selected_conditions_df, on="unique_id", how="left")


In [None]:
# List (up to 1000) patients for whom no condition was chosen.
joined_selected_conditions_df.where(col("chosen_condition").isNull()).show(n=1000) 

In [None]:
# get_row_count(joined_selected_conditions_df)

In [None]:
from utils.util_funcs import create_fake_name

In [None]:
# Row-wise UDF around the project's create_fake_name helper.
# NOTE: this rebinds `create_fake_name_udf`, replacing the pandas UDF of the same name
# defined earlier in the notebook.
# The UDF takes no input columns, so its output can only vary per row if it is random.
# Spark assumes UDFs are deterministic by default and may then collapse, duplicate or
# re-evaluate the call during optimization; asNondeterministic() tells the optimizer not to.
create_fake_name_udf = udf(create_fake_name, StringType()).asNondeterministic()

# Replace the original "name" column with a freshly generated "new_name".
joined_selected_conditions_df = joined_selected_conditions_df.drop("name")

joined_selected_conditions_df = joined_selected_conditions_df.withColumn("new_name", create_fake_name_udf())



In [None]:
# Print a preview of the joined frame with the regenerated names.
joined_selected_conditions_df.show()

In [None]:
# NOTE(review): `df_renamed` is never assigned anywhere in this notebook, so this cell raises
# NameError on a fresh "Restart & Run All". Several selected columns ("name", "conditions",
# "stay_name", "stay_type", "row_num") are also not created by any visible cell — the cell
# that built `df_renamed` appears to have been deleted and needs to be restored.
(df_renamed.select("name", "DOB", "Age", "gender", "blood_type", "date_of_admission", "discharge_date", "top_level_admission", "sub_level_admission", "possible_tests", "conditions", "doctor", "hospital", "room_number", "insurance_provider", "billing_amount", "stay_type", "is_female", "is_geriatric", "is_pediatric", "stay_name", "row_num", "unique_id")
 .show(n=200))

In [None]:
# NOTE(review): `df_renamed` is not defined by any cell in this notebook — confirm where it
# is meant to come from before relying on this cell.
# Discard the existing row numbering so it can be regenerated below.
df_new = df_renamed.drop("row_num")

# Rows sharing the same stay_name and unique_id form one partition, shuffled randomly.
windowSpec = (
    Window
    .partitionBy('stay_name', 'unique_id')
    .orderBy(rand())
)

# Number the rows 1..n inside each partition, following that random order.
df_new_part = df_new.withColumn("row_num", row_number().over(windowSpec))

# Spot-check a single stay_name.
(
    df_new_part
    .select([
        "name", "DOB", "Age", "gender", "blood_type",
        "date_of_admission", "discharge_date",
        "top_level_admission", "sub_level_admission",
        "possible_tests", "conditions", "doctor", "hospital",
        "room_number", "insurance_provider", "billing_amount",
        "stay_type", "is_female", "is_geriatric", "is_pediatric",
        "stay_name", "row_num", "unique_id",
    ])
    .where(col("stay_name") == "Tiffany Ramirez")
    .show()
)

In [None]:
# Report the row count of the re-numbered frame via the project helper.
get_row_count(df_new_part)

In [None]:
# Reference list of the output columns, in display order.
# A comma was missing between "is_geriatric" and "is_pediatric"; Python silently joined the
# two adjacent literals into the single bogus name "is_geriatricis_pediatric".
output_columns = [
    "name", "DOB", "Age", "gender", "blood_type",
    "date_of_admission", "discharge_date",
    "top_level_admission", "sub_level_admission",
    "possible_tests", "conditions", "doctor", "hospital",
    "room_number", "insurance_provider", "billing_amount",
    "stay_type", "is_female", "is_geriatric", "is_pediatric",
    "stay_name", "row_num", "unique_id",
]
# Bare last expression so the notebook still displays the list as the cell output.
output_columns

In [None]:
# Print up to 2000 rows of the re-numbered frame (large notebook output).
df_new_part.show(n=2000)

In [None]:
# TODO:
# - medical condition to be chosen
# - tests to be chosen
# - admission date to be checked against DOB
# TODO: filtering on is_pediatric, is_geriatric and is_female is to be done here; how to handle different people sharing the same DOB still needs to be considered
# - drop stay_name and unique_id


In [None]:
# df = join_with_condition_prob_df.select([col(c).cast(StringType()).alias(c) for c in join_with_condition_prob_df.columns])
# df.show()
# df.repartition(10).write.csv('./temp_data/join_with_condition_prob_df/renamed.csv', mode = 'overwrite', header=True)

In [None]:
# Shut down the shared Spark session; no Spark cell can run after this without restarting it.
spark.stop()