# imports

In [None]:
# Use as many python cells as you wish to write your code
from pyspark.sql import SparkSession
import pandas as pd
import re

In [ ]:
# Create the shared Spark session for this notebook, or reuse one that is
# already running under the same builder configuration.
spark = (
    SparkSession.builder
    .appName("HealthDataAnalysis")
    .getOrCreate()
)

# Function definitions

In [ ]:
def create_sql_views(csv_files):
    """Load each CSV file into Spark and register it as a temporary SQL view.

    The view name is the file's base name up to its first dot, so both
    ``'user_profiles.csv'`` and ``'data/user_profiles.csv'`` register a view
    named ``user_profiles``.

    Args:
        csv_files: Iterable of CSV file paths. Each file is read with a header
            row and with schema inference enabled.

    Returns:
        None. The views are registered on the module-level ``spark`` session,
        replacing any existing view of the same name.
    """
    import os  # local import: only needed to strip directories from paths

    # Single pass over the input so that one-shot iterables (generators) work;
    # deriving the names in a separate pass would exhaust them first.
    for path in csv_files:
        # Drop any directory prefix before taking the stem; otherwise a path
        # such as 'data/x.csv' would produce the view name 'data/x'.
        view_name = os.path.basename(path).split(".")[0]
        frame = spark.read.csv(path, header=True, inferSchema=True)
        frame.createOrReplaceTempView(view_name)


def _as_str_keep_missing(series):
    """Cast the non-null values of *series* to ``str``; nulls stay missing."""
    # A plain ``astype(str)`` would turn None/NaN into the literal strings
    # 'None'/'nan', which downstream null checks can no longer detect.
    return series.astype(str).where(series.notna())


def merge_all_data(
    user_health_data: str,
    supplement_usage: str,
    experiments: str,
    user_profiles: str,
) -> "pd.DataFrame":
    """Join the four source CSVs into one pandas DataFrame.

    Each CSV is registered as a Spark temp view by ``create_sql_views``. The
    SQL below refers to the views as ``user_health_data``,
    ``supplement_usage``, ``experiments`` and ``user_profiles``, so the files
    must carry those base names. The joined result is also registered as the
    temp view ``data_merge``.

    Args:
        user_health_data: Path to the user health data CSV.
        supplement_usage: Path to the supplement usage CSV.
        experiments: Path to the experiments CSV.
        user_profiles: Path to the user profiles CSV.

    Returns:
        A DataFrame with the columns ``user_id``, ``date``, ``email``,
        ``user_age_group``, ``experiment_name``, ``supplement_name``,
        ``dosage_grams``, ``is_placebo``, ``average_heart_rate``,
        ``average_glucose``, ``sleep_hours`` and ``activity_level``. Text
        columns hold ``str`` values, with nulls kept as missing values rather
        than the string ``'None'``.
    """
    csv_files = [user_health_data, supplement_usage, experiments, user_profiles]
    create_sql_views(csv_files)

    # The WHERE clause drops rows without a health record, so the left join
    # onto user_health_data effectively behaves as an inner join.
    # NOTE(review): ``coalesce(su.dosage / 1000, null)`` in column 7 is a
    # no-op (equivalent to ``su.dosage / 1000``); left untouched here.
    data_merge_sdf = spark.sql('''

        select 
            -- 1
            up.user_id

            -- 2
            , cast(uhd.date as date) as date

            -- 3
            , up.email

            -- 4
            , case 
                when age is null then 'Unknown'
                when age < 18 then 'Under 18'
                when age between 18 and 25 then '18-25'
                when age between 26 and 35 then '26-35'
                when age between 36 and 45 then '36-45'
                when age between 46 and 55 then '46-55'
                when age between 56 and 65 then '56-65'
                else 'Over 65'
            end as user_age_group

            -- 5
            , e.name as experiment_name

            -- 6
            , coalesce(su.supplement_name, 'No intake') as supplement_name

            -- 7
            , coalesce(su.dosage / 1000, null) as dosage_grams

            -- 8
            , su.is_placebo

            -- 9
            , uhd.average_heart_rate

            -- 10
            , uhd.average_glucose

            -- 11
            , lower(uhd.sleep_hours) as sleep_hours

            -- 12
            , uhd.activity_level

        from user_profiles up
            left join user_health_data uhd 
                on up.user_id = uhd.user_id
            left join supplement_usage su
                on up.user_id = su.user_id 
                    and uhd.date = su.date
            left join experiments e 
                on su.experiment_id = e.experiment_id 

        where 
            up.user_id is not null 
            and uhd.date is not null 
            and up.email is not null
    ''')

    # Expose the joined result to later spark.sql() queries.
    data_merge_sdf.createOrReplaceTempView('data_merge')

    df = data_merge_sdf.toPandas()

    df['date'] = pd.to_datetime(df['date'])

    # Dict-form replace maps the literal text 'null' to a real missing value
    # on every pandas version; a positional ``replace('null', None)`` was
    # interpreted as a pad-fill by older releases.
    df['supplement_name'] = df['supplement_name'].replace({'null': None})
    df['is_placebo'] = df['is_placebo'].replace({'null': None}).astype('boolean')

    # Text columns: stringify values while keeping nulls (e.g. rows with no
    # matching supplement or experiment) as missing.
    text_columns = (
        'user_id',
        'email',
        'user_age_group',
        'experiment_name',
        'supplement_name',
        'sleep_hours',
    )
    for column in text_columns:
        df[column] = _as_str_keep_missing(df[column])

    numeric_columns = (
        'dosage_grams',
        'average_heart_rate',
        'average_glucose',
        'activity_level',
    )
    for column in numeric_columns:
        df[column] = pd.to_numeric(df[column])

    return df

# Prototyping (scratch cells)

In [ ]:
# Preview the merged dataset straight from Spark as a pandas DataFrame.
# NOTE: the 'data_merge' temp view is registered inside merge_all_data(), so
# that function must already have been called in this session.
spark.sql('''

    select *
    from data_merge

''').toPandas()

In [ ]:
# Scratch cell: build the merged frame, then re-apply type conversions and
# value normalisation on the pandas side and print the result for inspection.
df = merge_all_data('user_health_data.csv', 'supplement_usage.csv', 'experiments.csv', 'user_profiles.csv')

# Assuming df is the DataFrame returned by merge_all_data
df['date'] = pd.to_datetime(df['date'])
# NOTE(review): astype(int) raises if any user_id is not a plain integer
# string — confirm the id format in user_profiles.csv.
df['user_id'] = df['user_id'].astype(int)
df['email'] = df['email'].astype(str)
df['user_age_group'] = df['user_age_group'].astype(str)

# Clean and standardize 'experiment_name' column
# (trim + lowercase; only the literal text 'null' is replaced, other
# missing-value spellings pass through unchanged)
df['experiment_name'] = df['experiment_name'].str.strip().str.lower().replace('null', None)

# Clean and standardize 'supplement_name' column
# (trim + lowercase; the literal text 'null' becomes 'no intake')
df['supplement_name'] = df['supplement_name'].str.strip().str.lower().replace('null', 'no intake')

# Convert 'dosage_grams' to numeric, handling 'null' and invalid entries
# (errors='coerce' turns anything unparseable into NaN instead of raising)
df['dosage_grams'] = pd.to_numeric(df['dosage_grams'], errors='coerce')

# Convert 'is_placebo' to boolean, handling 'null' values
# (pandas' nullable 'boolean' dtype can hold True, False and <NA>)
df['is_placebo'] = df['is_placebo'].replace('null', None).astype('boolean')

# Convert numeric columns, handling 'null' and invalid entries
# NOTE(review): any sleep_hours value that is not a bare number (e.g. one
# carrying a unit suffix) is coerced to NaN here — confirm the raw format.
numeric_columns = ['average_heart_rate', 'average_glucose', 'sleep_hours', 'activity_level']
for col in numeric_columns:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Clean 'user_age_group' column to match specified categories
# (labels are lowercased first, so the valid list is lowercase too; anything
# outside the list is bucketed as 'unknown')
valid_age_groups = ['under 18', '18-25', '26-35', '36-45', '46-55', '56-65', 'over 65', 'unknown']
df['user_age_group'] = df['user_age_group'].str.strip().str.lower()
df.loc[~df['user_age_group'].isin(valid_age_groups), 'user_age_group'] = 'unknown'

# Inspect the resulting column dtypes and the first ten rows.
print(df.dtypes)
print(df.head(10))


# Merge All Data

In [ ]:
# Run the full pipeline on the four source CSVs. The returned DataFrame is not
# assigned; as the last expression of the cell it is presumably shown as the
# notebook's cell output.
merge_all_data('user_health_data.csv', 'supplement_usage.csv', 'experiments.csv', 'user_profiles.csv')