# PERSONAL TRIGGERS

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
import joblib

In [2]:
CSV_PATH = "allergydf.csv"
df = pd.read_csv(CSV_PATH, parse_dates=["date"])
print("Loaded", df.shape)

Loaded (2272, 11)


In [3]:
df.head()

Unnamed: 0,user_id,date,season,temperature,humidity,dust_level,smoke_level,outdoor_time_min,medication_used,symptom_score,flare_up
0,user_2,2023-11-09,rainy_late,23.698531,77.22645,22.654872,13.828442,60.500841,1,1,0
1,user_12,2023-11-21,rainy_late,33.646917,73.180224,12.559737,35.16651,83.940761,0,1,1
2,user_2,2023-12-08,dry,27.908462,72.976011,27.472139,22.347591,0.0,0,1,1
3,user_19,2023-06-29,rainy_peak,23.155849,78.412614,9.097156,19.302795,49.542227,1,1,0
4,user_8,2023-06-23,rainy_peak,15.914323,75.607445,0.609801,1.236706,58.015698,0,1,1


In [4]:
exposures = ["dust_level","smoke_level","outdoor_time_min"]
user_scores = []
for uid, g in df.groupby('user_id'):
    if len(g) < 10:
        continue
    corrs = {}
    for e in exposures:
        try:
            corr = np.corrcoef(g[e], g['symptom_score'])[0,1]
        except:
            corr = 0
        corrs[f"{e}_corr"] = corr
    corrs['user_id'] = uid
    user_scores.append(corrs)
user_scores_df = pd.DataFrame(user_scores).set_index('user_id')
print("Per-user correlation scores:")
print(user_scores_df.head())

Per-user correlation scores:
         dust_level_corr  smoke_level_corr  outdoor_time_min_corr
user_id                                                          
user_1          0.073551          0.102108               0.242754
user_10         0.153254          0.363367              -0.081761
user_11              NaN               NaN                    NaN
user_12         0.363678          0.017298               0.360534
user_13         0.170451         -0.016559               0.196668


  c /= stddev[:, None]
  c /= stddev[None, :]


In [6]:
# Random Forest global model for feature importance
feature_cols = ["temperature","humidity","dust_level","smoke_level","medication_used","outdoor_time_min"]
X = df[feature_cols].fillna(0)
y = df['flare_up']
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X, y)
importances = pd.Series(rf.feature_importances_, index=feature_cols).sort_values(ascending=False)
print("Global feature importances:")
print(importances)


Global feature importances:
outdoor_time_min    0.205979
humidity            0.197594
smoke_level         0.197427
temperature         0.188633
dust_level          0.181825
medication_used     0.028543
dtype: float64


In [8]:
# Per-user feature importance approximation: train small RF per user (if enough data)
per_user_importances = {}
for uid, g in df.groupby('user_id'):
    if len(g) < 30:
        continue
    X_u = g[feature_cols].fillna(0)
    y_u = (g['symptom_score']>=3.5).astype(int)
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_u, y_u)
    imp = pd.Series(clf.feature_importances_, index=feature_cols)
    per_user_importances[uid] = imp.sort_values(ascending=False)
# Example output for a single user
sample_uid = list(per_user_importances.keys())[0]
print("Top features for", sample_uid)
print(per_user_importances[sample_uid].head())


Top features for user_1
humidity            0.339440
dust_level          0.220502
outdoor_time_min    0.156871
smoke_level         0.155052
temperature         0.096417
dtype: float64


In [10]:
# Unsupervised clustering on exposure patterns
clust_features = ["dust_level","smoke_level","outdoor_time_min"]
scaler = StandardScaler()
Xc = scaler.fit_transform(df[clust_features].fillna(0))
kmeans = KMeans(n_clusters=3, random_state=42)
labels = kmeans.fit_predict(Xc)
df['cluster_label'] = labels
print("Cluster counts:", df['cluster_label'].value_counts())

Cluster counts: cluster_label
1    875
0    788
2    609
Name: count, dtype: int64


In [11]:
# Save outputs
user_scores_df.to_csv("per_user_trigger_correlations.csv")
joblib.dump(rf, "models/trigger_global_rf.joblib")

['trigger_global_rf.joblib']