#CGM Data Curation - Manually Labeling
This notebook takes glucose cgm data which contains logged events (meal, exercise), and calculates features for each food event.
First, we create event windows in glucose signal based on timing of logged food events.
Peak features (max, area under curve, slope rise) are calculated for each food event, based on a 2 hour window from logging the meal.
Using LLM, logged food notes are converted to estimated macronutrients. 

#use cgm (Python 3.11.11) kernel

In [None]:
import os
import pandas as pd
import numpy as np
import numpy as np 
import plotly.graph_objs as go
from sklearn.linear_model import LinearRegression
from process_fxns import create_event_windows, compute_metrics_for_all_windows,estimate_baseline_glucose,calculate_gmi,compute_iqr,compute_sd_of_rate,compute_LBGI_HBGI,zone_tir_metrics,zone_bg_risk

from datetime import timedelta
import matplotlib.pyplot as plt


Generate df_clean, which includes glucose vs time
Generate df_event_windows, include select meal logs and 2 hours window. 
Assumption: Time of log happens right when meal starts so peak length takes up 2 hour window after log.

In [2]:
# Load the CSV file

current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
raw_data_dir = os.path.join(parent_dir, 'data')
#cgm_file_path = os.path.join(raw_data_dir, 'JustinHuang_glucose_7-30-2022.csv')
cgm_file_path = os.path.join(raw_data_dir, 'JustinHuang_glucose_1-13-2025.csv')
#cgm_file_path = os.path.join(raw_data_dir, 'CynthiaLee_glucose_1-13-2025.csv')

df = pd.read_csv(cgm_file_path)

df_raw = pd.read_csv(cgm_file_path, skiprows=1)

# Clean and format
df_raw['timestamp'] = pd.to_datetime(df_raw['Device Timestamp'], errors='coerce')  #returns nan instead of error if conversion fails
df_raw['glucose'] = pd.to_numeric(df_raw['Historic Glucose mg/dL'], errors='coerce')

# Custom Crop
#start_time = pd.to_datetime('2022-07-04 12:00')
#end_time = pd.to_datetime('2022-07-15 00:00')
start_time = pd.to_datetime('2024-09-09 12:00')
end_time = pd.to_datetime('2024-09-24 00:00')

df_raw = df_raw[(df_raw['timestamp'] >= start_time) & (df_raw['timestamp'] <= end_time)]

df_clean = df_raw.dropna(subset=['timestamp', 'glucose']).sort_values('timestamp').reset_index(drop=True)

#Separate dataframe for logged events
df_events = df_raw.copy()
df_events = df_events[df_events['Record Type'].isin([6,7])] #6 is food, 7 is exercise, manually labeled.
df_events = df_events.drop(df_events[df_events['Notes']=='Exercise'].index)
df_events['event_type'] = df_events['Record Type'].map({6: 'food', 7: 'exercise'})

df_events = df_events.dropna(subset=['timestamp', 'Notes']).sort_values('timestamp').reset_index(drop=True)

df_food_event_windows = create_event_windows(df_events[df_events['event_type'] == 'food'].copy()
)
df_food_event_windows

Unnamed: 0,event_time,event_note,window_start,window_end
0,2024-09-09 17:40:00,Chips,2024-09-09 17:40:00,2024-09-09 19:40:00
1,2024-09-09 20:44:00,Chicken soup and salty Chinese greens,2024-09-09 20:44:00,2024-09-09 22:44:00
2,2024-09-10 09:30:00,"Juice with berries, protein powder, chia seeds...",2024-09-10 09:30:00,2024-09-10 11:30:00
3,2024-09-10 13:46:00,"Lunch eggs raw tomato cucumber salad greens, a...",2024-09-10 13:46:00,2024-09-10 15:46:00
4,2024-09-10 19:09:00,"Popeye 3 piece, biscuit, salad greens, Cajun f...",2024-09-10 19:09:00,2024-09-10 21:09:00
5,2024-09-11 12:30:00,"Eggs, salad greens, tomato, rice, chocolate pe...",2024-09-11 12:30:00,2024-09-11 14:30:00
6,2024-09-11 15:30:00,Handful of chocolate almonda,2024-09-11 15:30:00,2024-09-11 17:30:00
7,2024-09-11 19:45:00,"Homemade tomato pasta, shrimp, veggies, meatba...",2024-09-11 19:45:00,2024-09-11 21:45:00
8,2024-09-12 07:20:00,"Tofu, electrolytes, juice",2024-09-12 07:20:00,2024-09-12 09:20:00
9,2024-09-12 10:00:00,Chips,2024-09-12 10:00:00,2024-09-12 12:00:00


In [None]:
#calculate metrics.

baseline=estimate_baseline_glucose(df_clean)
mean_glucose=df_clean['glucose'].mean()
gmi = calculate_gmi(mean_glucose)
iqr = compute_iqr(df_clean['glucose'])
roc_sd = compute_sd_of_rate(df_clean['glucose'])
lbgi, hbgi, bgri = compute_LBGI_HBGI(df_clean['glucose'])
risk_zones = zone_bg_risk(lbgi, hbgi)

print("Estimated Baseline:",baseline)
print("Mean Glucose:", )
print("GMI:", gmi)
print("IQR:", iqr)
print("Rate of Change SD:", roc_sd)
print("LBGI / HBGI / BGRI:", lbgi, hbgi, bgri)
print("Zones:", risk_zones)




SyntaxError: closing parenthesis ')' does not match opening parenthesis '[' (1990671920.py, line 7)

Calculate glucose metrics for each event:  max, min, auc, slope rise (3 points), slope_fall. Also save to file.

In [None]:
gen_data_dir = os.path.join(parent_dir, 'generated_data')
gen_file = 'df_event_metrics.csv'

df_event_metrics = compute_metrics_for_all_windows(df_food_event_windows, df_clean, df_events,baseline)
output_path = os.path.join(gen_data_dir, gen_file)
df_event_metrics.to_csv(output_path, encoding='utf-8')
df_event_metrics.head()

In [5]:
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=df_clean['timestamp'],
    y=df_clean['glucose'],
    mode='lines+markers',
    name='Glucose',
    hovertemplate='Time: %{x}<br>Glucose: %{y} mg/dL'
))

for _, row in df_event_metrics.iterrows():
    mask = (df_clean['timestamp'] >= row['window_start']) & (df_clean['timestamp'] <= row['window_end'])
    segment = df_clean.loc[mask]
    above = segment[segment['glucose'] > baseline]
    if above.empty:
        continue

    fill_x = list(above['timestamp']) + list(above['timestamp'][::-1])
    fill_y = list(above['glucose']) + [baseline] * len(above)

    # Duplicate customdata for each (x, y) point to match fill shape
    n_points = len(fill_x)
    customdata = np.array([[row['event_note'], row['window_start'], row['window_end'],
                            row['glucose_max'], row['glucose_auc']]] * n_points)

    fig.add_trace(go.Scatter(
    x=fill_x,
    y=fill_y,
    fill='toself',
    mode='lines+markers',  # <-- Enable hover events
    marker=dict(size=1, color='rgba(0,0,0,0)'),  # invisible points
    line=dict(color='rgba(255, 165, 0, 0.2)'),
    fillcolor='rgba(255, 165, 0, 0.3)',
    customdata=customdata,
    hovertemplate=(
        "<b>%{customdata[0]}</b><br>" +
        "Start: %{customdata[1]}<br>" +
        "End: %{customdata[2]}<br>" +
        "Max Glucose: %{customdata[3]} mg/dL<br>" +
        "AUC above baseline: %{customdata[4]:.1f}<extra></extra>"
    ),
    showlegend=False
))


fig.update_layout(
    title='Glucose Readings with Highlighted Event Windows',
    xaxis_title='Timestamp',
    yaxis_title='Glucose (mg/dL)',
    hovermode='closest'
)

fig.show()

Chain prompt templates. For now in notebook and instructions. Later can try tools LangChain.
User input:  Raw free-text food log.
User specific portion setting on a scale of 1-5.

## Processing Steps.
### Step 1: Meal classification
Goal:  Determine the nature and context of the food log
**Prompt includes:**

- Raw log text
- Portion setting
- Instructions to infer:
    - `meal_type`: `"meal"`, `"snack"`, `"drink"`, or `"light"`
    - `source`: `"homemade"`, `"restaurant"`, `"processed"`, `"cafe"`, etc.
    - `estimated_total_weight_g`:
        - Default ranges:
            - Meal: 400–700g
            - Snack: 80–300g
            - Drink: 200–400g
        - **Adjusted using:**
            - Portion setting multiplier (e.g., 1.3 for level 4)
            - **Quantity cues** (e.g., “small”, “1 piece”, “two bites”)
        - Resulting portion weight is **not fixed** per category.

**Enhancement 1**: Use LLM to interpret portion hints in the input (e.g., “small KitKat” → 40g instead of 195g)
**Enhancement 2**: Ask LLM to flag:  
    - "Unrecognized food"
    - "Ethnic/niche item with unclear macros"
    - "Low confidence estimate"
    If flagged, then prompt User (optional future UI)
    Allow user to:
    - Add clarification (e.g., "Kuih = pandan rice flour coconut dessert")
    - Select from presets or describe portion and ingredients
    - Upload image or link for context (future feature)

    Suggested prompt snipped
    > Given the following food log and portion setting {setting}/5, infer:
> 
> - meal_type
> - source
> - estimated_total_weight_g (adjust for portion cues like “small”, “1 piece”)
> - if any items are uncommon (ethnic, restaurant-named, branded), flag them
> - return a `llm_notes` field explaining reasoning
> 
> Example log: “Small KitKat”
> 
> Output:
    json
    CopyEdit
    {
    "meal_type": "snack",
    "source": "processed",
    "estimated_total_weight_g": 40,
    "llm_notes": "Small portion of known chocolate bar; standard serving is 2 sticks. Adjusted accordingly.",
    "flag_user_clarification": false}

    Another example:

> Log: “Aman Cafe Roti”

json
CopyEdit
{
  "meal_type": "snack",
  "source": "cafe",
  "estimated_total_weight_g": 180,
  "llm_notes": "Unusual item; likely savory bread with sauce, based on context.",
  "flag_user_clarification": true}
>
>

### Step 2:  Segmentation
**Goal**: Break composite meals into structured list of items

**Prompt includes:**

- The original event_note
- The `meal_type` and `source` from Step 1
- Instructions:
    - If a single cohesive item (e.g., "cookie", "smoothie"), return as a single-item list
    - If multi-item meal (e.g., “chicken pasta and salad”), return individual components

### Step 3:  Portion Allocation Per Component
**Goal**: Assign a weight (in grams) to each segmented item

**Prompt includes:**

- Total meal weight from Step 1
- `segmented_list` from Step 2
- Instructions:
    - Use **food role logic**: heavier for mains (e.g., pasta), lighter for toppings or sides (e.g., salad)
    - Split total weight accordingly
    - Ensure sum ≈ estimated total

### Step 4: Macronutrient Estimation
**Goal**: Estimate:

- `protein_g`, `carbs_g`, `fat_g`, `calories_kcal`
    
    For each item and for the whole meal
    

**Prompt includes:**

- Item name + estimated weight
- Ask for realistic, typical estimates based on general food knowledge or database
- Tag estimates with `"source"`: `"USDA"`, `"LLM estimate"`, or `"user-specified"`

## Outputs

- Per log: structured list of items with weights and macros
- Per log: summary of total protein, carbs, fat, calories
- Traceable intermediate outputs (classification, segmentation, allocation, macros)



Pretend we did LLM macronutrient estimation. We join the estimates back into event log

In [20]:
from io import StringIO
gen_data_dir = os.path.join(parent_dir, 'generated_data')
#gen_file_path = os.path.join(gen_data_dir, 'df_event_metrics_jh_llm_annotated_batch1_utf8.csv')
gen_file_path = os.path.join(gen_data_dir, 'df_event_metrics_jh_llm_annotated_batch2_utf8.csv')
#gen_file_path = os.path.join(gen_data_dir, 'df_event_metrics_cl_llm_annotated.csv')
gen_file = 'df_output.csv'
output_path = os.path.join(gen_data_dir, gen_file)


#have problem with utf-8
with open(gen_file_path, mode='r', encoding='utf-8', errors='replace') as f:
    content = f.read()
df_event_macronutrients=pd.read_csv(StringIO(content))

df_output=df_event_metrics.merge(df_event_macronutrients, how='left', left_on='event_note',right_on='event_note', suffixes=('', '_drop'))
df_output = df_output[[col for col in df_output.columns if not col.endswith('_drop')]]

df_output.to_csv(output_path)
df_output.head()

Unnamed: 0,event_time,event_note,window_start,window_end,glucose_max,glucose_min,glucose_auc,glucose_rate_rise,glucose_rate_fall,exercise_within_3h,meal_time_category,meal_type,source,estimated_total_weight_g,llm_notes,Segmented List,Protein,Carbs,Fat,Calories
0,2024-09-09 17:40:00,Chips,2024-09-09 17:40:00,2024-09-09 19:40:00,136.0,112.0,79.0,38.0,-26.0,0,2,snack,processed,195.0,Single-item processed snack.,"[""chips""]",3.0,35.0,30.0,500.0
1,2024-09-09 17:40:00,Chips,2024-09-09 17:40:00,2024-09-09 19:40:00,136.0,112.0,79.0,38.0,-26.0,0,2,snack,processed,195.0,Classic processed snack.,"[""chips""]",3.0,35.0,30.0,500.0
2,2024-09-09 17:40:00,Chips,2024-09-09 17:40:00,2024-09-09 19:40:00,136.0,112.0,79.0,38.0,-26.0,0,2,snack,processed,195.0,Classic processed snack.,"[""chips""]",3.0,35.0,30.0,500.0
3,2024-09-09 20:44:00,Chicken soup and salty Chinese greens,2024-09-09 20:44:00,2024-09-09 22:44:00,160.0,99.0,94.4,96.0,-84.0,0,3,meal,homemade,585.0,Balanced homemade meal: soup + veg.,"[""chicken soup"", ""salty Chinese greens""]",36.0,12.0,12.0,360.0
4,2024-09-10 09:30:00,"Juice with berries, protein powder, chia seeds...",2024-09-10 09:30:00,2024-09-10 11:30:00,134.0,88.0,54.375,70.0,-70.0,0,0,meal,homemade,585.0,Nutrient-dense smoothie treated as a full blen...,"[""juice"", ""berries"", ""protein powder"", ""chia s...",38.0,55.0,12.0,500.0


In [20]:
analysis_data_dir = os.path.join(parent_dir, 'generated_data')
analysis_file_path = os.path.join(analysis_data_dir, 'df_event_metrics_jh_llm_annotated_combined_utf8.csv')
df_analysis=pd.read_csv(analysis_file_path)
df_analysis.head()

Unnamed: 0,event_time,event_note,window_start,window_end,glucose_max,glucose_min,glucose_auc,glucose_rate_rise,glucose_rate_fall,exercise_within_3h,meal_time_category,meal_type,source,estimated_total_weight_g,llm_notes,Segmented List,Protein,Carbs,Fat,Calories
0,7/4/2022 13:16,Pork bone soup lettuce gnocci with sauce,7/4/2022 13:16,7/4/2022 15:12,158.0,101.0,56.808333,100.0,-73.402367,0,2,meal,homemade,585,"Multi-item cooked dish with protein, vegetable...","[""pork bone soup"", ""lettuce"", ""gnocci"", ""sauce""]",52.0,36.0,36.0,740
1,7/4/2022 15:12,Eat pastry,7/4/2022 15:12,7/4/2022 17:12,135.0,99.0,32.3,22.0,-66.0,1,2,snack,processed,195,"Single baked item, sweet and calorie-dense.","[""pastry""]",7.0,49.0,29.0,546
2,7/4/2022 20:21,Maitake dimpling eggplant spinach masala crack...,7/4/2022 20:21,7/4/2022 22:21,217.0,98.0,138.1,176.0,-104.0,0,4,meal,homemade,585,Several savory items and dessert; interpreted ...,"[""maitake"", ""dimpling"", ""eggplant"", ""spinach"",...",36.0,68.0,40.0,990
3,7/5/2022 14:42,Eat Aman cafe roti,7/5/2022 14:42,7/5/2022 16:42,156.0,95.0,72.25,86.0,2.0,1,2,snack,cafe,195,"Single cafe item, likely bread-based.","[""Aman cafe roti""]",6.0,28.0,7.0,240
4,7/5/2022 21:30,Eat pizza ceviche maitakr,7/5/2022 21:30,7/5/2022 23:30,110.0,101.0,29.516667,12.0,-15.56172,0,5,meal,restaurant,585,"Contains protein, carbs, and multiple items.","[""pizza"", ""ceviche"", ""maitakr""]",56.0,66.0,34.0,970


Do regression on PPGR w/ carbs and calories. Plot slope and R^2. 
Do for different patients on the same graph.
Then do PPGR w/ fat/carbohydrates.
Then do PDP to visualize.