In [1]:
from pathlib import Path

import pandas as pd

# ==============================
# CONFIG
# ==============================
# Location of the raw export CSV (a relative path, resolved against the
# working directory the script/notebook is run from).
DATA_PATH = "../data/export.csv"
# For each trackable type, how many of the most frequently logged names are
# kept; every kept name becomes one column in the daily feature table.
TOP_SYMPTOMS = 100
TOP_CONDITIONS = 100
TOP_TAGS = 100
TOP_FOODS = 100
TOP_TREATMENTS = 100


# ==============================
# 1. LOAD & CLEAN
# ==============================
print("Loading data...")
df = pd.read_csv(DATA_PATH, parse_dates=["checkin_date"], low_memory=False)

# Text columns: trim whitespace and lower-case. astype(str) turns missing
# cells into the literal string "nan", so map that back to a real missing
# value afterwards.
text_columns = ("trackable_type", "trackable_name", "sex", "country")
for col in text_columns:
    if col not in df.columns:
        continue
    normalized = df[col].astype(str).str.strip().str.lower()
    df[col] = normalized.replace({"nan": pd.NA})

# Age: coerce to a number, then blank out values outside the 0-120 range.
if "age" in df.columns:
    df["age"] = pd.to_numeric(df["age"], errors="coerce")
    implausible_age = df["age"].lt(0) | df["age"].gt(120)
    df.loc[implausible_age, "age"] = pd.NA

# Numeric view of the logged value; anything non-numeric becomes NaN.
numeric_value = pd.to_numeric(df["trackable_value"], errors="coerce")
df["trackable_value_numeric"] = numeric_value


# ==============================
# 2. DAILY OVERALL SEVERITY (SYMPTOMS + CONDITIONS)
# ==============================
print("Computing daily overall severity...")

symptom_mask = df["trackable_type"] == "symptom"
condition_mask = df["trackable_type"] == "condition"

# Only entries that carry a numeric value take part in the severity score.
has_numeric_value = df["trackable_value_numeric"].notna()
symptoms = df[symptom_mask & has_numeric_value].copy()
conditions = df[condition_mask & has_numeric_value].copy()

# One row per (user, day): the mean value across that day's symptom and
# condition entries.
scored_entries = pd.concat([symptoms, conditions], axis=0)
daily_mean_severity = (
    scored_entries
    .groupby(["user_id", "checkin_date"])["trackable_value_numeric"]
    .mean()
)
severity_df = daily_mean_severity.rename("overall_symptom_severity").reset_index()


# ==============================
# 3. DAILY SYMPTOM MATRIX (TOP 100)
# ==============================
print("Building symptom matrix (top 100)...")

# Rank symptom names by how often they were logged and keep the leaders.
symptom_frequency = symptoms["trackable_name"].value_counts()
top_symptom_names = symptom_frequency.head(TOP_SYMPTOMS).index

is_top_symptom = symptoms["trackable_name"].isin(top_symptom_names)
symptoms_top = symptoms[is_top_symptom]

# Mean value per (user, day, symptom), then spread symptoms out into columns.
symptom_keys = ["user_id", "checkin_date", "trackable_name"]
symptom_daily_mean = symptoms_top.groupby(symptom_keys)["trackable_value_numeric"].mean()
daily_symptom_matrix = symptom_daily_mean.unstack("trackable_name")

daily_symptom_matrix.columns = list(
    map("symptom_{}".format, daily_symptom_matrix.columns)
)


# ==============================
# 4. DAILY CONDITION MATRIX (TOP 100)
# ==============================
print("Building condition matrix (top 100)...")

# Rank condition names by how often they were logged and keep the leaders.
condition_frequency = conditions["trackable_name"].value_counts()
top_condition_names = condition_frequency.head(TOP_CONDITIONS).index

is_top_condition = conditions["trackable_name"].isin(top_condition_names)
conditions_top = conditions[is_top_condition]

# Mean value per (user, day, condition), then spread conditions into columns.
condition_keys = ["user_id", "checkin_date", "trackable_name"]
condition_daily_mean = conditions_top.groupby(condition_keys)["trackable_value_numeric"].mean()
daily_condition_matrix = condition_daily_mean.unstack("trackable_name")

daily_condition_matrix.columns = list(
    map("condition_{}".format, daily_condition_matrix.columns)
)


# ==============================
# 5-7. DAILY TAGS / FOODS / TREATMENTS (binary, top-N)
# ==============================
def _daily_presence_matrix(top_rows, prefix):
    """Pivot trackable rows into a 0/1 presence matrix per user-day.

    Args:
        top_rows: Rows of one trackable type, already restricted to the
            names to keep. Needs ``user_id``, ``checkin_date`` and
            ``trackable_name`` columns. The frame is not modified.
        prefix: Text prepended (followed by ``_``) to each output column.

    Returns:
        A DataFrame indexed by (user_id, checkin_date) with one
        ``{prefix}_{name}`` column per trackable name, holding 1 where that
        name was logged on that day and 0 otherwise. ``None`` when
        ``top_rows`` has no rows, so callers can skip the join.
    """
    if top_rows.empty:
        return None
    matrix = (
        top_rows.assign(value=1)  # constant marker; max() collapses repeats
        .groupby(["user_id", "checkin_date", "trackable_name"])["value"]
        .max()
        .unstack("trackable_name")
        .fillna(0)  # name not logged on that day
    )
    matrix.columns = [f"{prefix}_{name}" for name in matrix.columns]
    return matrix


# ==============================
# 5. DAILY TAGS (binary, TOP_TAGS most frequent)
# ==============================
print(f"Building tag matrix (top {TOP_TAGS})...")

tags = df[df["trackable_type"] == "tag"].copy()
top_tag_names = tags["trackable_name"].value_counts().head(TOP_TAGS).index
tags_top = tags[tags["trackable_name"].isin(top_tag_names)]
tags_daily = _daily_presence_matrix(tags_top, "tag")


# ==============================
# 6. DAILY FOODS (binary, TOP_FOODS most frequent)
# ==============================
print(f"Building food matrix (top {TOP_FOODS})...")

foods = df[df["trackable_type"] == "food"].copy()
top_food_names = foods["trackable_name"].value_counts().head(TOP_FOODS).index
foods_top = foods[foods["trackable_name"].isin(top_food_names)]
foods_daily = _daily_presence_matrix(foods_top, "food")


# ==============================
# 7. DAILY TREATMENTS (binary, TOP_TREATMENTS most frequent)
# ==============================
print(f"Building treatment matrix (top {TOP_TREATMENTS})...")

treatments = df[df["trackable_type"] == "treatment"].copy()
top_treatment_names = (
    treatments["trackable_name"].value_counts().head(TOP_TREATMENTS).index
)
treatments_top = treatments[treatments["trackable_name"].isin(top_treatment_names)]
treatments_daily = _daily_presence_matrix(treatments_top, "treatment")


# ==============================
# 8. DAILY WEATHER (numeric weather fields)
# ==============================
print("Extracting weather features (if present)...")

weather_cols = [c for c in df.columns if c.startswith("weather_")]
weather_daily = None

if weather_cols:
    # Only numeric/boolean weather fields can be averaged; text-valued ones
    # would make .mean() raise on pandas >= 2.0, so they are left out.
    numeric_weather_cols = (
        df[weather_cols].select_dtypes(include=["number", "bool"]).columns.tolist()
    )
    if numeric_weather_cols:
        # Average over every row of the user-day. Dropping duplicate
        # (user, day) rows first would keep an arbitrary first row, which may
        # hold NaN, and leave the mean with nothing to aggregate.
        weather_daily = (
            df.groupby(["user_id", "checkin_date"])[numeric_weather_cols].mean()
        )


# ==============================
# 9. BUILD MASTER DAILY TABLE
# ==============================
print("Combining all daily-level features...")

# Start from the severity rows and hang every feature block off the same
# (user_id, checkin_date) key with left joins.
daily = (
    severity_df.set_index(["user_id", "checkin_date"])
    .join(daily_symptom_matrix, how="left")
    .join(daily_condition_matrix, how="left")
)

# These blocks are None when the corresponding data was absent.
for optional_block in (tags_daily, foods_daily, treatments_daily, weather_daily):
    if optional_block is None:
        continue
    daily = daily.join(optional_block, how="left")

# A user-day missing from a tag/food/treatment block means "not logged": use 0.
binary_prefixes = ("tag_", "food_", "treatment_")
binary_like_cols = [name for name in daily.columns if name.startswith(binary_prefixes)]
daily[binary_like_cols] = daily[binary_like_cols].fillna(0)


# ==============================
# 10. ADD DEMOGRAPHICS
# ==============================
print("Adding demographics...")

daily = daily.reset_index()

wanted_demo_cols = ("user_id", "age", "sex", "country")
demo_cols = [c for c in wanted_demo_cols if c in df.columns]

# One demographic row per user: the first row seen for that user.
demo = df[demo_cols].drop_duplicates("user_id").set_index("user_id")

# One-hot encode the categorical fields, with an extra column for "missing".
categorical_demo_cols = [c for c in demo.columns if c in ("sex", "country")]
demo = pd.get_dummies(demo, columns=categorical_demo_cols, dummy_na=True)

# Look up each daily row's user in the per-user demographic table.
daily = daily.join(demo, on="user_id", how="left")


# ==============================
# 11. NEXT-DAY LABELING (CONSECUTIVE ONLY)
# ==============================
print("Creating next-day binary label: worse vs not-worse...")

# Chronological order within each user; rows need a baseline severity.
daily = (
    daily
    .sort_values(["user_id", "checkin_date"])
    .dropna(subset=["overall_symptom_severity"])
)

# Look one row ahead inside each user's history.
per_user = daily.groupby("user_id")
following_date = per_user["checkin_date"].shift(-1)
following_severity = per_user["overall_symptom_severity"].shift(-1)
daily["checkin_date_next"] = following_date
daily["next_severity"] = following_severity

# Whole days between a check-in and the user's following one.
daily["gap_days"] = daily["checkin_date_next"].sub(daily["checkin_date"]).dt.days

# Keep a row only when the following check-in is exactly one day later.
is_consecutive = daily["gap_days"].eq(1)
paired = daily.loc[is_consecutive].copy()

print("Number of consecutive-day pairs:", len(paired))

# Change in severity (kept for analysis; the label does not use it).
paired["delta"] = paired["next_severity"].sub(paired["overall_symptom_severity"])

# Label: 1 when the next day's severity is strictly higher, otherwise 0.
got_worse = paired["next_severity"].gt(paired["overall_symptom_severity"])
paired["worse_tomorrow"] = got_worse.astype(int)

print("Binary label distribution (worse_tomorrow):")
label_share = paired["worse_tomorrow"].value_counts(normalize=True)
print(label_share.round(3))


# ==============================
# 12. FINAL FEATURE MATRIX
# ==============================
print("Building final X and y...")


def _fill_missing_with_mean(column):
    """Return the column with its missing entries replaced by its own mean."""
    return column.fillna(column.mean())


# Identifiers, bookkeeping columns and anything derived from the next day
# must not be used as features.
exclude_cols = {
    "user_id",
    "checkin_date",
    "checkin_date_next",
    "gap_days",
    "next_severity",
    "delta",
    "worse_tomorrow",
}

is_feature = ~paired.columns.isin(exclude_cols)
feature_cols = paired.columns[is_feature].tolist()

y = paired["worse_tomorrow"]

# Remaining gaps are filled column by column with that column's mean.
X = paired[feature_cols].apply(_fill_missing_with_mean)

print("X shape:", X.shape)
print("y length:", len(y))


# ==============================
# 13. SAVE
# ==============================
print("Saving preprocessed data...")

# Create the output folder first: to_parquet/to_csv do not create missing
# parent directories and would fail on a fresh checkout.
results_dir = Path("../results")
results_dir.mkdir(parents=True, exist_ok=True)

# Both files are written without the index; rows of X and y line up by position.
X.to_parquet(results_dir / "X.parquet", index=False)
y.to_csv(results_dir / "y.csv", index=False)

print("Done! Binary data saved for predicting: worse tomorrow (1) vs same/better (0).")

Loading data...
Computing daily overall severity...
Building symptom matrix (top 100)...
Building condition matrix (top 100)...
Building tag matrix (top 100)...
Building food matrix (top 100)...
Building treatment matrix (top 100)...
Extracting weather features (if present)...
Combining all daily-level features...
Adding demographics...
Creating next-day binary label: worse vs not-worse...
Number of consecutive-day pairs: 223865
Binary label distribution (worse_tomorrow):
worse_tomorrow
0    0.561
1    0.439
Name: proportion, dtype: float64
Building final X and y...
X shape: (223865, 672)
y length: 223865
Saving preprocessed data...
Done! Binary data saved for predicting: worse tomorrow (1) vs same/better (0).
