In [None]:
# Colab-only setup: mount the user's Google Drive into the runtime's
# filesystem at /content/drive so the project CSVs can be read from it.
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import os

# Make the project folder the working directory so that every relative path
# used afterwards (input CSVs, the "wide_format" output folder) resolves
# inside it.
folder_path = '/content/drive/My Drive/FYP'
os.chdir(folder_path)


Mounted at /content/drive


In [None]:
def pivot_patient_wide(user_data):
    """Pivot one patient's long-format records into one row per ``start_date``.

    Parameters
    ----------
    user_data : pandas.DataFrame
        Rows belonging to a single patient. Must contain the columns
        ``type``, ``value``, ``start_date`` and ``end_date``; the two date
        columns must already be datetimes.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``start_date`` (ascending). There is one column
        per record ``type`` holding the first value recorded at that
        timestamp, plus ``basal_duration_minutes`` = ``end_date - start_date``
        (in minutes) of the first basal record starting at that timestamp
        (NaN where no basal record starts there).
    """
    # Step 1: duration of every basal record, in minutes.
    basal_entries = user_data[user_data['type'] == 'basal']
    basal_duration = pd.DataFrame({
        'start_date': basal_entries['start_date'],
        'basal_duration_minutes': (
            basal_entries['end_date'] - basal_entries['start_date']
        ).dt.total_seconds() / 60,
    })
    # The pivot below keeps only the *first* value per (start_date, type).
    # Keep only the first basal duration per start_date as well; otherwise two
    # basal records sharing a start_date would duplicate that row in the merge.
    basal_duration = basal_duration.drop_duplicates(subset='start_date', keep='first')

    # Step 2: pivot the full health data to wide format.
    pivot_df = (
        user_data
        .pivot_table(index='start_date', columns='type', values='value', aggfunc='first')
        .reset_index()
        .sort_values(by='start_date')
    )

    # Step 3: attach the basal durations. `validate` makes pandas raise if the
    # one-row-per-start_date invariant is ever broken on either side.
    return pd.merge(pivot_df, basal_duration, on='start_date', how='left',
                    validate='one_to_one')


# In a notebook `__name__` is always "__main__", so this driver runs exactly as
# a plain top-level cell would. The guard only keeps the file I/O from firing
# if this code is imported as a module (e.g. to test `pivot_patient_wide`).
if __name__ == "__main__":
    # Load data
    users_df = pd.read_csv('users_demographics.csv')
    data_df = pd.read_csv('health_data_points.csv')

    # Ensure datetime format
    data_df['start_date'] = pd.to_datetime(data_df['start_date'])
    data_df['end_date'] = pd.to_datetime(data_df['end_date'])

    # Create output folder
    os.makedirs("wide_format", exist_ok=True)

    # Loop over patients
    for user_id in users_df['user_id'].unique():
        print(f"\nProcessing patient: {user_id}")

        user_data = data_df[data_df['user_id'] == user_id].copy()
        if user_data.empty:
            print("No data for this user.")
            continue

        pivot_df = pivot_patient_wide(user_data)

        # Save to the wide_format folder. `str()` keeps the slice valid even
        # when pandas parsed the ids as numbers.
        # NOTE(review): the file name is built from only the first 8 characters
        # of the id, and the following cell writes to the same
        # 'wide_format/patient_<id>_wide.csv' names, so these files are
        # overwritten when the notebook is run top to bottom.
        filename = f'wide_format/patient_{str(user_id)[:8]}_wide.csv'
        pivot_df.to_csv(filename, index=False)
        print(f"✔ Saved: {filename}")


Processing patient: 60b9c27a-7615-401f-b86d-62a2ac97cbcc
✔ Saved: wide_format/patient_60b9c27a_wide.csv

Processing patient: 47b5aa6b-e0c3-48a9-8feb-8eee68c19f88
✔ Saved: wide_format/patient_47b5aa6b_wide.csv

Processing patient: ab993210-27a1-451a-b8a4-4cddabb329c6
✔ Saved: wide_format/patient_ab993210_wide.csv

Processing patient: b4bdfa85-0735-4868-947b-11cfe178f014
✔ Saved: wide_format/patient_b4bdfa85_wide.csv

Processing patient: 46f119e5-da28-456d-86ed-50bb9452abf9
✔ Saved: wide_format/patient_46f119e5_wide.csv


In [None]:
def expand_basal_per_minute(basal, patient_start):
    """Spread basal records over the whole minutes they cover.

    Parameters
    ----------
    basal : pandas.DataFrame
        Basal records with datetime columns ``start_date`` / ``end_date`` and
        a numeric ``value`` column.
    patient_start : pandas.Timestamp
        Timestamp that corresponds to minute 0.

    Returns
    -------
    pandas.DataFrame
        Columns ``time_min`` (int, unique, ascending) and ``ut_basal``
        (``value / 60`` summed over every record covering that minute).
        Records shorter than one minute (or with a missing end) are ignored.
    """
    empty = pd.DataFrame({'time_min': np.array([], dtype='int64'),
                          'ut_basal': np.array([], dtype=float)})
    if basal.empty:
        return empty

    duration_min = (basal['end_date'] - basal['start_date']).dt.total_seconds() / 60
    basal = basal[duration_min >= 1]  # NaN durations compare False and drop out too
    if basal.empty:
        return empty

    # Minute index (floored) at which each record starts / stops contributing.
    first_min = ((basal['start_date'] - patient_start).dt.total_seconds() // 60).astype(int).to_numpy()
    stop_min = ((basal['end_date'] - patient_start).dt.total_seconds() // 60).astype(int).to_numpy()
    n_minutes = stop_min - first_min  # >= 1 thanks to the duration filter

    # Vectorized equivalent of "for each record: for t in range(first, stop)":
    # repeat each record n_minutes times and add a 0..n-1 offset inside it.
    record_offset = np.cumsum(n_minutes) - n_minutes
    within_record = np.arange(n_minutes.sum()) - np.repeat(record_offset, n_minutes)
    per_minute = pd.DataFrame({
        'time_min': np.repeat(first_min, n_minutes) + within_record,
        # value / 60: hourly rate -> amount per minute (the original author's
        # note says U/hr -> U/min; the unit itself is not visible in this code).
        'ut_basal': np.repeat(basal['value'].to_numpy(dtype=float) / 60, n_minutes),
    })
    # Overlapping records are summed per minute.
    return per_minute.groupby('time_min', as_index=False).sum()


def _sum_per_minute(events, column):
    """Sum ``value`` per ``time_min`` and return it as ``[time_min, column]``."""
    if events.empty:
        return pd.DataFrame({'time_min': np.array([], dtype='int64'),
                             column: np.array([], dtype=float)})
    return (
        events.groupby('time_min', as_index=False)['value'].sum()
        .rename(columns={'value': column})
    )


def build_patient_timeseries(user_data):
    """Build a 1-minute-resolution input/output table for one patient.

    Parameters
    ----------
    user_data : pandas.DataFrame
        Rows of a single patient with columns ``type``, ``value``,
        ``start_date`` and ``end_date`` (dates already parsed). Rows without a
        ``start_date`` cannot be placed on the timeline and are ignored.

    Returns
    -------
    pandas.DataFrame
        Columns ``time_min, ut_basal, ut_bolus, ut, rt, glucose`` with one row
        per minute, where minute 0 is the patient's own earliest
        ``start_date``. ``ut = ut_basal + ut_bolus``; ``rt`` is the per-minute
        sum of 'carbs' values; ``glucose`` is the last reading in that minute
        (NaN when there is none). The table ends at the last glucose reading
        when there is one. An empty table (same columns) is returned when the
        patient has no usable rows.
    """
    columns = ['time_min', 'ut_basal', 'ut_bolus', 'ut', 'rt', 'glucose']

    # A missing start_date would turn into NaN minutes and make astype(int) raise.
    user_data = user_data.dropna(subset=['start_date']).copy()
    if user_data.empty:
        return pd.DataFrame(columns=columns)

    # Minutes since this patient's first record (t = 0 is per patient).
    # NOTE(review): event times are *rounded* to the nearest minute here while
    # basal segments are *floored* in expand_basal_per_minute; kept as in the
    # original, but the two can differ by one minute for sub-minute timestamps.
    patient_start = user_data['start_date'].min()
    user_data['time_min'] = (
        (user_data['start_date'] - patient_start).dt.total_seconds() / 60
    ).round().astype(int)

    def of_type(kind):
        return user_data[user_data['type'] == kind]

    basal_df = expand_basal_per_minute(of_type('basal'), patient_start)
    bolus_df = _sum_per_minute(of_type('bolus'), 'ut_bolus')
    cho_df = _sum_per_minute(of_type('carbs'), 'rt')
    # Keep the last reading of each minute; the stable sort makes the choice
    # deterministic when several readings share the same timestamp.
    glucose_df = (
        of_type('glucose')
        .sort_values(by='start_date', kind='stable')
        .drop_duplicates(subset='time_min', keep='last')[['time_min', 'value']]
        .rename(columns={'value': 'glucose'})
    )

    # Dense minute grid spanning every record's start time.
    combined = pd.DataFrame({
        'time_min': np.arange(user_data['time_min'].min(), user_data['time_min'].max() + 1)
    })
    for frame in (basal_df, bolus_df, cho_df, glucose_df):
        combined = combined.merge(frame, on='time_min', how='left')

    # Minutes without an input get 0; glucose stays NaN where unmeasured.
    combined['ut_basal'] = combined['ut_basal'].fillna(0)
    combined['ut_bolus'] = combined['ut_bolus'].fillna(0)
    combined['ut'] = combined['ut_basal'] + combined['ut_bolus']
    combined['rt'] = combined['rt'].fillna(0)

    # Trim the table so that it ends at the last glucose measurement.
    measured = combined['glucose'].notna()
    if measured.any():
        last_glucose_min = combined.loc[measured, 'time_min'].max()
        combined = combined[combined['time_min'] <= last_glucose_min]

    return combined[columns]


# In a notebook `__name__` is always "__main__", so this driver runs exactly as
# a plain top-level cell would. The guard only keeps the file I/O from firing
# if this code is imported as a module (e.g. to test the functions above).
if __name__ == "__main__":
    # Load data
    users_df = pd.read_csv('users_demographics.csv')
    data_df = pd.read_csv('health_data_points.csv')

    # Convert datetime
    data_df['start_date'] = pd.to_datetime(data_df['start_date'])
    data_df['end_date'] = pd.to_datetime(data_df['end_date'])

    # Earliest timestamp in the whole dataset. It is NOT used as t = 0 below:
    # every patient's clock starts at that patient's own first record.
    reference_time = data_df['start_date'].min()

    # Create output folder
    os.makedirs("wide_format", exist_ok=True)

    print("Unique types in dataset:", data_df['type'].unique())

    # Loop through users
    for user_id in users_df['user_id'].unique():
        print(f"\nProcessing patient: {user_id}")
        user_data = data_df[data_df['user_id'] == user_id]
        combined = build_patient_timeseries(user_data)
        if combined.empty:
            print("No data for this user.")
            continue

        # Save file. `str()` keeps the slice valid even when pandas parsed the
        # ids as numbers.
        filename = f"wide_format/patient_{str(user_id)[:8]}_wide.csv"
        combined.to_csv(filename, index=False)
        print(f"✔ Saved: {filename}")


Unique types in dataset: ['energyBurned' 'bolus' 'basal' 'steps' 'exercise' 'carbs' 'workout'
 'distanceWalkingRunning' 'sleep' 'glucose' 'bodyWeight']

Processing patient: 60b9c27a-7615-401f-b86d-62a2ac97cbcc
✔ Saved: wide_format/patient_60b9c27a_wide.csv

Processing patient: 47b5aa6b-e0c3-48a9-8feb-8eee68c19f88
✔ Saved: wide_format/patient_47b5aa6b_wide.csv

Processing patient: ab993210-27a1-451a-b8a4-4cddabb329c6
✔ Saved: wide_format/patient_ab993210_wide.csv

Processing patient: b4bdfa85-0735-4868-947b-11cfe178f014
✔ Saved: wide_format/patient_b4bdfa85_wide.csv

Processing patient: 46f119e5-da28-456d-86ed-50bb9452abf9
✔ Saved: wide_format/patient_46f119e5_wide.csv
