In [None]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import random

# Seed both random streams (NumPy and stdlib) so every run yields the same data
np.random.seed(42)
random.seed(42)

# 1) Generate profiles.csv


N_MEMBERS = 30

first_names = ["Sarah", "Rami", "Nour", "Lina", "Omar", "Jad", "Maya", "Karim", "Layla", "Sami"]
goals = ["stress reduction", "fat loss", "muscle gain", "overall health"]
sex_choices = ["F", "M"]
activity_types = ["low", "moderate", "high"]


def _clamped_normal(mean, spread, lower, upper):
    """Draw one normal sample, clamp it to [lower, upper], round to 2 decimals."""
    sample = np.random.normal(loc=mean, scale=spread)
    return round(float(np.clip(sample, lower, upper)), 2)


profiles = []

for member_id in range(1, N_MEMBERS + 1):
    # Fields are filled in the same order the random draws must happen,
    # which is also the column order of the resulting CSV.
    record = {"member_id": member_id}
    record["name"] = f"{random.choice(first_names)} {member_id}"
    record["age"] = np.random.randint(18, 60)  # upper bound is exclusive
    record["sex"] = random.choice(sex_choices)
    record["goal"] = random.choice(goals)

    # Personal baselines that the check-in simulation starts from
    record["baseline_mood"] = _clamped_normal(3.2, 0.7, 1.5, 4.8)
    record["baseline_stress"] = _clamped_normal(3.0, 0.8, 1.2, 4.8)
    record["baseline_sleep"] = _clamped_normal(7.0, 0.8, 5.0, 8.5)

    record["activity_type"] = random.choice(activity_types)  # low / moderate / high

    # Scale factor applied to the day-to-day noise of this member
    record["volatility"] = _clamped_normal(0.6, 0.2, 0.3, 1.0)

    profiles.append(record)

profiles_df = pd.DataFrame(profiles)
profiles_df.to_csv("profiles.csv", index=False)
print("Saved profiles.csv:", profiles_df.shape)
print(profiles_df.head())


# 2) Generate checkins.csv with realistic dynamics
#
# For every member in profiles_df, simulate a run of consecutive daily
# check-ins ending on END_DATE. Mood, stress and fatigue are carried from
# one day to the next (each clipped to 1..5), driven by that day's sleep
# and activity plus noise scaled by the member's volatility.
#
# NOTE(review): the baselines only seed the initial state; there is no
# pull back toward them afterwards, so stress/fatigue behave like a
# clipped random walk with drift. Confirm whether that is intended.

MIN_DAYS = 40
MAX_DAYS = 70
END_DATE = datetime(2025, 5, 31)

# Loop-invariant lookup data, built once instead of on every simulated day
possible_notes = ["", "tired", "exam", "busy day", "travel", "rest day"]
activity_options = {
    "low": [0, 0, 20],
    "moderate": [0, 20, 40],
    "high": [20, 40, 60],
}

rows = []

for prof in profiles_df.itertuples(index=False):
    member_id = prof.member_id

    baseline_mood = prof.baseline_mood
    baseline_stress = prof.baseline_stress
    baseline_sleep = prof.baseline_sleep
    activity_type = prof.activity_type
    volatility = prof.volatility

    # Any activity_type other than low/moderate is treated as "high"
    minutes_choices = activity_options.get(activity_type, activity_options["high"])

    # Initial state
    mood_prev = np.clip(np.random.normal(baseline_mood, 0.3), 1, 5)
    stress_prev = np.clip(np.random.normal(baseline_stress, 0.3), 1, 5)
    fatigue_prev = np.clip(np.random.normal(loc=3.0, scale=0.7), 1, 5)

    n_days = np.random.randint(MIN_DAYS, MAX_DAYS + 1)

    # The state is carried forward from one iteration to the next, so the
    # calendar must move forward too: the first simulated day is the oldest
    # date and the last one lands on END_DATE. Counting dates backwards
    # would make each day depend on the *following* calendar day once the
    # rows are sorted by date.
    start_date = END_DATE - timedelta(days=n_days - 1)

    for d in range(n_days):
        date = start_date + timedelta(days=d)

        # Sleep around baseline, with noise
        sleep_hours = float(np.clip(
            np.random.normal(loc=baseline_sleep, scale=0.7),
            4.0, 9.0
        ))
        short_sleep = sleep_hours < 6

        # Activity according to activity_type
        activity_minutes = int(np.random.choice(minutes_choices))
        activity_units = activity_minutes / 20.0

        # Random noise scaled by volatility
        noise_mood = np.random.normal(0, 0.3) * volatility
        noise_stress = np.random.normal(0, 0.3) * volatility
        noise_fatigue = np.random.normal(0, 0.3) * volatility

        # ---- Update fatigue (depends on sleep, stress, activity) ----
        fatigue_today = (
            fatigue_prev
            + 0.3 * short_sleep                 # bad sleep -> more fatigue
            + 0.1 * (stress_prev - 3)           # high stress -> more fatigue
            - 0.1 * activity_units              # activity reduces fatigue a bit
            + noise_fatigue
        )
        fatigue_today = float(np.clip(fatigue_today, 1, 5))

        # ---- Update stress (depends on sleep & activity) ----
        stress_today = (
            stress_prev
            + 0.4 * short_sleep                 # short sleep -> more stress
            - 0.1 * activity_units              # activity can reduce stress
            + noise_stress
        )
        stress_today = float(np.clip(stress_today, 1, 5))

        # ---- Update mood (depends on fatigue, sleep, stress) ----
        # Fatigue is centred on the neutral value 3, like stress below.
        # Subtracting the raw 1..5 score would lower mood every single day
        # (even at minimum fatigue) and pin it at 1 within a few days.
        mood_today = (
            mood_prev
            + noise_mood
            - 0.3 * (fatigue_today - 3)         # tired -> lower mood
            + 0.2 * (sleep_hours - 7)           # more sleep -> better mood
            - 0.1 * (stress_prev - 3)           # high stress -> lower mood
        )
        mood_today = float(np.clip(mood_today, 1, 5))

        note = random.choice(possible_notes)

        rows.append({
            "member_id": member_id,
            "date": date.strftime("%Y-%m-%d"),
            "mood": round(mood_today),
            "stress": round(stress_today),
            "fatigue": round(fatigue_today),
            "sleep_hours": round(sleep_hours, 1),
            "activity_minutes": activity_minutes,
            "note": note
        })

        # Update previous for next iteration
        mood_prev = mood_today
        stress_prev = stress_today
        fatigue_prev = fatigue_today

checkins_df = pd.DataFrame(rows)
# ISO "YYYY-MM-DD" strings sort chronologically, so a plain sort is enough
checkins_df.sort_values(by=["member_id", "date"], inplace=True)
checkins_df.reset_index(drop=True, inplace=True)


# 3) Compute bad_day_tomorrow label
#
# A day is "bad" when any of these holds:
#   * average sleep over the last 3 check-ins (including today) is < 6 h
#   * stress >= 4 together with mood <= 2
#   * fatigue >= 4
# Each row is then labelled with whether the member's NEXT check-in is bad.

# The rolling window and the shift below both rely on chronological order
# within each member.
checkins_df = (
    checkins_df
    .sort_values(["member_id", "date"])
    .reset_index(drop=True)
)

# Rolling average of sleep for the last 3 days (including today), computed
# per member so one member's window never spills into the next member's rows
sleep_avg_last3 = checkins_df.groupby("member_id")["sleep_hours"].transform(
    lambda s: s.rolling(window=3, min_periods=1).mean()
)

# Rule for a "bad day" (today)
bad_today = (
    (sleep_avg_last3 < 6)
    | ((checkins_df["stress"] >= 4) & (checkins_df["mood"] <= 2))
    | (checkins_df["fatigue"] >= 4)
).astype(int)

# Shift within each member to get tomorrow's state on today's row
bad_tomorrow = bad_today.groupby(checkins_df["member_id"]).shift(-1)

# The last check-in of every member has no "tomorrow". Its label is unknown,
# not negative, so those rows are dropped instead of being filled with 0
# (which would inject fabricated "good day" labels into the dataset).
has_label = bad_tomorrow.notna()
checkins_df = checkins_df.loc[has_label].reset_index(drop=True)
checkins_df["bad_day_tomorrow"] = (
    bad_tomorrow[has_label].astype(int).to_numpy()
)

checkins_df.to_csv("checkins.csv", index=False)
print("Saved checkins.csv:", checkins_df.shape)
print(checkins_df.head(10))


Saved profiles.csv: (30, 10)
   member_id     name  age sex              goal  baseline_mood  \
0          1   Rami 1   56   F       muscle gain           2.81   
1          2   Lina 2   36   F  stress reduction           3.13   
2          3  Layla 3   20   F    overall health           2.87   
3          4  Sarah 4   29   F          fat loss           3.22   
4          5  Layla 5   33   F          fat loss           3.02   

   baseline_stress  baseline_sleep activity_type  volatility  
0             3.41            7.38           low        0.87  
1             2.26            6.29          high        0.52  
2             3.43            6.63           low        0.51  
3             2.66            6.57           low        0.58  
4             2.87            5.82          high        0.90  
Saved checkins.csv: (1748, 9)
   member_id        date  mood  stress  fatigue  sleep_hours  \
0          1  2025-04-15     1       4        5          7.5   
1          1  2025-04-16     1  