In [None]:
import os
import requests
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Let pandas render up to 100 rows of a frame before it truncates the display.
pd.set_option("display.max_rows", 100)

In [None]:
# Fitbit/SEMA exports, read in the order (daily, hourly).
daily, hourly = (
    pd.read_csv(csv_path)
    for csv_path in (
        "~/class/f24/cbm/evo-life/data/rais_anonymized/csv_rais_anonymized/daily_fitbit_sema_df_unprocessed.csv",
        "~/class/f24/cbm/evo-life/data/rais_anonymized/csv_rais_anonymized/hourly_fitbit_sema_df_unprocessed.csv",
    )
)

# Scored survey tables, one CSV per questionnaire, read in the order listed.
breq, panas, personality, stai, ttm = (
    pd.read_csv(csv_path)
    for csv_path in (
        "~/class/f24/cbm/evo-life/data/rais_anonymized/scored_surveys/breq.csv",
        "~/class/f24/cbm/evo-life/data/rais_anonymized/scored_surveys/panas.csv",
        "~/class/f24/cbm/evo-life/data/rais_anonymized/scored_surveys/personality.csv",
        "~/class/f24/cbm/evo-life/data/rais_anonymized/scored_surveys/stai.csv",
        "~/class/f24/cbm/evo-life/data/rais_anonymized/scored_surveys/ttm.csv",
    )
)

In [None]:
# preprocessing the daily data
# NOTE: this cell used to call daily.drop("Unnamed: 0", axis=1) without keeping
# the result, which had no effect on `daily`. The call is removed here; the
# column is actually dropped further down, where the result is assigned back.

# all values will be associated with the ID, let's count how many there are for each
num_ids = daily['id'].nunique() # 71

# is there an even distribution of dates for the number of records we have
# (does each ID have the same set of dates?)
# `subset` holds, for every id, the number of distinct values in each column.
subset = daily.groupby("id").nunique().reset_index()
# keep only the distinct-date count per id
date_per_id_counts = subset[['id', 'date']]

In [None]:
# Boolean mask of missing cells in the daily table.
# The mask is summed as-is: calling reset_index() first would add an integer
# "index" column whose sum would show up as a bogus "NaN count" row.
nan_values = daily.isna()
# Number of missing values per column.
true_nan_values = nan_values.sum()
true_nan_values

In [None]:
# Distinct ids present in the hourly table, in order of first appearance.
ids = [*hourly['id'].unique()]

In [None]:
# Drop the leftover CSV index column. errors="ignore" keeps the cell re-runnable:
# without it a second execution raises KeyError because the column is already gone.
hourly = hourly.drop(columns='Unnamed: 0', errors="ignore")

In [None]:
# Columns of the hourly table carried forward into the daily roll-up.
all_cols = ["id", "date", "hour","distance", "steps", "calories", "age", "bpm", "bmi", "gender",
             "mindfulness_session", "SAD", "TIRED", "TENSE/ANXIOUS", "ENTERTAINMENT", "GYM", "HOME",
             "OUTDOORS"]
# Columns forced to integers: unparseable entries and NaN become 0.
numeric_coerce = ["distance", "steps", "calories","mindfulness_session", "SAD", "TIRED", "TENSE/ANXIOUS", "ENTERTAINMENT", "GYM", "HOME",
             "OUTDOORS"]
# .copy() gives an independent frame, so the assignment below does not write
# into a slice of `hourly` (which would raise SettingWithCopyWarning).
tmp = hourly[all_cols].copy()
# NOTE(review): astype(np.int64) truncates any fractional values (e.g. in
# distance or calories) before they are summed per day - confirm this is intended.
tmp[numeric_coerce] = (
    tmp[numeric_coerce]
    .apply(pd.to_numeric, errors="coerce")
    .fillna(0)
    .astype(np.int64)
)
# author's note: first 35 ids are good, fails on 36 and after that for some reason
# (to investigate, restrict tmp to the rows where hourly['id'] == ids[36])

In [None]:
# Collapse the hourly rows to one row per (id, date):
# totals for activity measures, first value for per-person attributes,
# and max for the session / mood / location columns.
roll_up = tmp.groupby(["id", "date"]).agg({
    "distance": ['sum'],
    "steps": ['sum'],
    "calories": ['sum'],
    "gender": ['first'],
    'age': ['first'],
    # 'bmi': ['max'],
    "mindfulness_session": ['max'],
    "SAD": ['max'],
    "TIRED": ["max"],
    "TENSE/ANXIOUS": ['max'],
    "ENTERTAINMENT": ['max'],
    "GYM": ["max"],
    "HOME": ['max'],
    "OUTDOORS": ['max']
    })
# reset_index() returns a new frame; the result must be assigned, otherwise
# id/date stay in the index and the call has no effect.
roll_up = roll_up.reset_index()
roll_up


In [None]:
# Distinct values found in the hourly age column.
pd.unique(tmp["age"])

In [None]:
# Drop the leftover CSV index column. errors="ignore" keeps the cell re-runnable:
# without it a second execution raises KeyError because the column is already gone.
daily = daily.drop(columns='Unnamed: 0', errors="ignore")
daily

In [None]:
# TODO: need to join the aggregated hourly data to the daily data here to get a full dataset for us to use.
# Columns of the daily table kept for the prepared dataset.
to_use = ['id', 'date', 'nremhr', 'rmssd', 'spo2', 'stress_score', 'sleep_points_percentage',
          'exertion_points_percentage', 'responsiveness_points_percentage', 'distance', 'activityType',
          'bpm', 'lightly_active_minutes', 'moderately_active_minutes', 'very_active_minutes', 'sedentary_minutes',
          'mindfulness_session', 'sleep_duration', 'minutesAsleep', 'minutesAwake', 'sleep_efficiency', 'gender',
          'bmi', 'TENSE/ANXIOUS', 'TIRED', "GYM", "HOME", "OUTDOORS"]
# .copy() makes `d` independent of `daily`, so later column assignments on `d`
# do not raise SettingWithCopyWarning.
d = daily[to_use].copy()
d

In [None]:
# Boolean mask of missing cells in the selected daily columns.
# The mask is summed as-is: calling reset_index() first would add an integer
# "index" column whose sum would show up as a bogus "NaN count" row.
nan_values = d.isna()
# Number of missing values per column.
true_nan_values = nan_values.sum()
true_nan_values

In [None]:
# Columns whose gaps are filled with the median of the same id.
median_values_impute = ["nremhr", "rmssd", "spo2", "stress_score", "sleep_points_percentage", 
                        "exertion_points_percentage", "responsiveness_points_percentage", "distance", 
                        "minutesAsleep", "minutesAwake", "sleep_efficiency", "bpm", 
                        "lightly_active_minutes", "moderately_active_minutes", "very_active_minutes"]
# astype returns a new frame, so the assignment below does not write into a
# slice of another frame.
d = d.astype({column: np.float64 for column in median_values_impute})

# One row per id with that id's median of every imputed column (kept for inspection).
median_values = d.groupby('id')[median_values_impute].median()

# fillna(<DataFrame>) aligns on the row index. `median_values` is indexed by id
# while `d` is indexed by row label, so passing it directly fills nothing and
# the fillna(0) below would then zero every gap. transform('median') broadcasts
# each id's median back onto that id's rows, giving a frame that shares d's
# row index and therefore aligns.
row_medians = d.groupby('id')[median_values_impute].transform('median')
d[median_values_impute] = d[median_values_impute].fillna(row_medians)

# Whatever is still missing (an id with no observed value for a column, or a
# column outside the imputed list) is set to 0.
d = d.fillna(0)
d

In [None]:
# Persist the prepared daily table. index=True is the pandas default, spelled
# out here: the row index is written as the first, unnamed CSV column.
d.to_csv("~/class/f24/cbm/evo-life/data/prepped/daily.csv", index=True)

In [None]:
# How many dated rows does each id contribute?
days_per_id = d.groupby('id').agg({"date": ['count']})
# One row per id, so the frame's length is the number of ids.
print("Total number of IDs", len(days_per_id))
days_per_id

In [None]:
# Replace dates with a per-id running day number.
# Rows are ordered by the date column first; `day` is then the 0-based position
# of each row within its id (a record counter, not a calendar offset).
d = d.sort_values("date")
d = d.assign(day=d.groupby("id").cumcount())
# Spot-check a single participant.
d[d['id'] == "621e346f67b776a24081744f"].sort_values("day")


In [None]:
# Non-null introjected-regulation scores per user in the BREQ survey.
number_records_per_id_breq = (
    breq.groupby("user_id")[["breq_introjected_regulation"]].count()
)
# Number of users that have a count entry.
number_records_per_id_breq.count()

In [None]:
# Number of BREQ rows that carry a user_id.
breq.loc[:, 'user_id'].count()

In [None]:
print("Number of records in panas:", panas['user_id'].count())
# Non-null positive-affect scores per user.
number_records_per_id_panas = panas.groupby("user_id").agg(
    {
        "positive_affect_score": "count"
    }
)
# The grouped frame has one row per user id, so len() is the number of ids.
# (DataFrame.count() returns a Series, which printed as a multi-line block.)
print("Number of individual ids represented:", len(number_records_per_id_panas))
number_records_per_id_panas

In [None]:
# Show the column names of the personality survey table.
personality.columns

In [None]:
print("Number of records in personality:", personality['user_id'].count())
# Non-null extraversion scores per user.
number_records_per_id_personality = personality.groupby("user_id").agg(
    {
        "extraversion": "count"
    }
)
# The grouped frame has one row per user id, so len() is the number of ids.
# (DataFrame.count() returns a Series, which printed as a multi-line block.)
print("Number of individual ids represented:", len(number_records_per_id_personality))
number_records_per_id_personality

In [None]:
# Show the column names of the STAI survey table.
stai.columns

In [None]:
print("Number of records in stai:", stai['user_id'].count())
# Non-null stress scores per user.
number_records_per_id_stai = stai.groupby("user_id").agg(
    {
        "stai_stress": "count"
    }
)
# The grouped frame has one row per user id, so len() is the number of ids.
# (DataFrame.count() returns a Series, which printed as a multi-line block.)
print("Number of individual ids represented:", len(number_records_per_id_stai))
number_records_per_id_stai

In [None]:
# Show the column names of the TTM survey table.
ttm.columns

In [None]:

print("Number of records in ttm:", ttm['user_id'].count())
# Non-null consciousness-raising scores per user.
number_records_per_id_ttm = ttm.groupby("user_id").agg(
    {
        "ttm_consciousness_raising": "count"
    }
)
# The grouped frame has one row per user id, so len() is the number of ids.
# (DataFrame.count() returns a Series, which printed as a multi-line block.)
print("Number of individual ids represented:", len(number_records_per_id_ttm))
number_records_per_id_ttm

In [79]:
# Build one row per respondent for every survey so the tables can later be
# joined into a single behavioral profile per person.
#
# Some respondents answered a survey more than once. Decision: numeric scores
# are averaged across a person's records; categorical labels take the first
# record's value.


def _collapse_per_user(survey, aggregations, count_label):
    """Aggregate a survey table to one row per ``user_id``.

    Groups ``survey`` by ``user_id``, applies the column -> function mapping
    in ``aggregations``, turns ``user_id`` back into a column, prints
    ``count_label`` followed by the number of ids, and returns the frame.
    """
    collapsed = survey.groupby("user_id").agg(aggregations).reset_index()
    print(count_label, collapsed['user_id'].count())
    return collapsed


breq_averaged = _collapse_per_user(
    breq,
    {
        "breq_amotivation": "mean",
        "breq_external_regulation": "mean",
        "breq_introjected_regulation": "mean",
        "breq_identified_regulation": "mean",
        "breq_intrinsic_regulation": "mean",
        # categorical label: keep the first record's value
        "breq_self_determination": "first",
    },
    "BREQ ID COUNT:",
)

panas_averaged = _collapse_per_user(
    panas,
    {
        "positive_affect_score": "mean",
        "negative_affect_score": "mean",
    },
    "PANAS ID COUNT:",
)

# NOTE(review): an earlier cell counts an `extraversion` column in this table,
# but it is not averaged here - confirm the omission is intended.
personality_averaged = _collapse_per_user(
    personality,
    {
        "agreeableness": "mean",
        "conscientiousness": "mean",
        "stability": "mean",
        "intellect": "mean",
    },
    "PERSONALITY ID COUNT:",
)

stai_averaged = _collapse_per_user(
    stai,
    {
        "stai_stress": "mean",
        # categorical label: keep the first record's value
        "stai_stress_category": "first",
    },
    "STAI ID COUNT:",
)

ttm_averaged = _collapse_per_user(
    ttm,
    {
        "ttm_consciousness_raising": "mean",
        "ttm_dramatic_relief": "mean",
        "ttm_environmental_reevaluation": "mean",
        "ttm_self_reevaluation": "mean",
        "ttm_social_liberation": "mean",
        "ttm_counterconditioning": "mean",
        "ttm_helping_relationships": "mean",
        "ttm_reinforcement_management": "mean",
        "ttm_self_liberation": "mean",
        "ttm_stimulus_control": "mean",
    },
    "TTM ID COUNT:",
)




BREQ ID COUNT: 52
PANAS ID COUNT: 51
PERSONALITY ID COUNT: 50
STAI ID COUNT: 53
TTM ID COUNT: 53


In [89]:
# Chain outer joins on user_id so that anyone who answered at least one survey
# gets a row; scores from surveys they skipped are left missing.
step_one = pd.merge(breq_averaged, panas_averaged, on='user_id', how='outer')
step_two = pd.merge(step_one, personality_averaged, on='user_id', how='outer')
step_three = pd.merge(step_two, stai_averaged, on='user_id', how='outer')
individuals = pd.merge(step_three, ttm_averaged, on='user_id', how='outer')
individuals

Unnamed: 0,user_id,breq_amotivation,breq_external_regulation,breq_introjected_regulation,breq_identified_regulation,breq_intrinsic_regulation,breq_self_determination,positive_affect_score,negative_affect_score,agreeableness,...,ttm_consciousness_raising,ttm_dramatic_relief,ttm_environmental_reevaluation,ttm_self_reevaluation,ttm_social_liberation,ttm_counterconditioning,ttm_helping_relationships,ttm_reinforcement_management,ttm_self_liberation,ttm_stimulus_control
0,621e2e8e67b776a24055b564,1.0,1.0,1.333333,3.5,4.125,intrinsic_regulation,36.0,12.4,33.0,...,1.333333,2.666667,2.333333,4.333333,4.0,4.0,4.0,4.0,4.0,1.666667
1,621e2eaf67b776a2406b14ac,1.125,1.125,2.0,4.25,3.625,identified_regulation,28.666667,20.333333,45.0,...,3.5,2.5,4.166667,4.5,3.666667,2.0,3.0,4.333333,3.333333,2.0
2,621e2ed667b776a24085d8d1,1.875,1.75,2.666667,2.5,1.75,introjected_regulation,29.2,24.4,43.0,...,1.333333,2.166667,3.5,3.333333,3.5,1.166667,1.0,2.833333,2.0,1.333333
3,621e2f1b67b776a240b3d87c,,,,,,,,,,...,,,,,,,,,,
4,621e2f3967b776a240c654db,1.0,2.0,1.0,2.5,3.0,intrinsic_regulation,29.0,19.666667,34.0,...,2.0,2.0,2.666667,4.0,5.0,2.333333,2.0,3.0,2.333333,2.0
5,621e2f6167b776a240e082a9,1.0,1.375,3.166667,4.875,4.75,identified_regulation,24.428571,24.571429,41.0,...,2.5,4.166667,2.833333,5.0,2.5,2.166667,2.333333,4.5,4.666667,3.833333
6,621e2f7a67b776a240f14425,1.0,1.166667,1.111111,3.416667,3.666667,intrinsic_regulation,35.333333,17.888889,38.0,...,2.0,1.5,2.0,4.666667,2.333333,3.333333,1.333333,4.5,3.166667,2.166667
7,621e2f9167b776a240011ccb,1.0,1.0,4.166667,4.0,5.0,intrinsic_regulation,39.0,33.4,45.0,...,3.333333,2.833333,1.833333,4.833333,2.166667,1.666667,1.833333,4.833333,3.5,4.166667
8,621e2fb367b776a24015accd,1.125,1.625,3.166667,3.25,3.375,intrinsic_regulation,31.0,19.125,36.0,...,1.833333,2.333333,2.333333,3.833333,3.5,3.333333,4.0,3.833333,3.833333,2.666667
9,621e2fce67b776a240279baa,1.0,2.5,3.0,2.5,4.5,intrinsic_regulation,26.4,17.6,42.0,...,2.666667,2.666667,2.833333,4.5,4.5,1.333333,3.166667,3.833333,2.333333,2.666667


In [91]:
# Missing numeric scores become 0.
float_columns = individuals.select_dtypes(include=['float']).columns
individuals[float_columns] = individuals[float_columns].fillna(0)

# Missing text/categorical values become the literal 'undefined'.
object_columns = individuals.select_dtypes(include=['object']).columns
individuals[object_columns] = individuals[object_columns].fillna('undefined')

individuals

Unnamed: 0,user_id,breq_amotivation,breq_external_regulation,breq_introjected_regulation,breq_identified_regulation,breq_intrinsic_regulation,breq_self_determination,positive_affect_score,negative_affect_score,agreeableness,...,ttm_consciousness_raising,ttm_dramatic_relief,ttm_environmental_reevaluation,ttm_self_reevaluation,ttm_social_liberation,ttm_counterconditioning,ttm_helping_relationships,ttm_reinforcement_management,ttm_self_liberation,ttm_stimulus_control
0,621e2e8e67b776a24055b564,1.0,1.0,1.333333,3.5,4.125,intrinsic_regulation,36.0,12.4,33.0,...,1.333333,2.666667,2.333333,4.333333,4.0,4.0,4.0,4.0,4.0,1.666667
1,621e2eaf67b776a2406b14ac,1.125,1.125,2.0,4.25,3.625,identified_regulation,28.666667,20.333333,45.0,...,3.5,2.5,4.166667,4.5,3.666667,2.0,3.0,4.333333,3.333333,2.0
2,621e2ed667b776a24085d8d1,1.875,1.75,2.666667,2.5,1.75,introjected_regulation,29.2,24.4,43.0,...,1.333333,2.166667,3.5,3.333333,3.5,1.166667,1.0,2.833333,2.0,1.333333
3,621e2f1b67b776a240b3d87c,0.0,0.0,0.0,0.0,0.0,undefined,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,621e2f3967b776a240c654db,1.0,2.0,1.0,2.5,3.0,intrinsic_regulation,29.0,19.666667,34.0,...,2.0,2.0,2.666667,4.0,5.0,2.333333,2.0,3.0,2.333333,2.0
5,621e2f6167b776a240e082a9,1.0,1.375,3.166667,4.875,4.75,identified_regulation,24.428571,24.571429,41.0,...,2.5,4.166667,2.833333,5.0,2.5,2.166667,2.333333,4.5,4.666667,3.833333
6,621e2f7a67b776a240f14425,1.0,1.166667,1.111111,3.416667,3.666667,intrinsic_regulation,35.333333,17.888889,38.0,...,2.0,1.5,2.0,4.666667,2.333333,3.333333,1.333333,4.5,3.166667,2.166667
7,621e2f9167b776a240011ccb,1.0,1.0,4.166667,4.0,5.0,intrinsic_regulation,39.0,33.4,45.0,...,3.333333,2.833333,1.833333,4.833333,2.166667,1.666667,1.833333,4.833333,3.5,4.166667
8,621e2fb367b776a24015accd,1.125,1.625,3.166667,3.25,3.375,intrinsic_regulation,31.0,19.125,36.0,...,1.833333,2.333333,2.333333,3.833333,3.5,3.333333,4.0,3.833333,3.833333,2.666667
9,621e2fce67b776a240279baa,1.0,2.5,3.0,2.5,4.5,intrinsic_regulation,26.4,17.6,42.0,...,2.666667,2.666667,2.833333,4.5,4.5,1.333333,3.166667,3.833333,2.333333,2.666667


In [92]:
# Persist the per-person survey table. index=True is the pandas default,
# spelled out here: the row index is written as the first, unnamed CSV column.
individuals.to_csv("~/class/f24/cbm/evo-life/data/prepped/individuals.csv", index=True)