In [1]:
import pandas as pd

In [7]:
df = pd.read_csv("HealthAutoExport-2023-01-02-2023-06-20 Data.csv")
cols = [
        "Date",
        "Active Energy (kcal)",
        "Apple Exercise Time (min)",
        "Apple Stand Time (min)",
        "Basal Energy Burned (kcal)",
        "Flights Climbed (count)",
        "Heart Rate [Min] (count/min)",
        "Heart Rate [Max] (count/min)",
        "Heart Rate [Avg] (count/min)",
        "Resting Heart Rate (count/min)",
        "Sleep Analysis [Asleep] (hr)",
        "Sleep Analysis [In Bed] (hr)",
        "Sleep Analysis [Core] (hr)",
        "Sleep Analysis [Deep] (hr)",
        "Sleep Analysis [REM] (hr)",
        "Sleep Analysis [Awake] (hr)",
        "Stair Speed: Down (ft/s)",
        "Stair Speed: Up (ft/s)",
        "Step Count (count)",
    ]

df1 = df.loc[:,cols]
#print(df1.head())
#print(df1.info())
df2 = df1.set_index("Date")
df3 = df2.stack().reset_index()
df3.rename(columns = {"level_1":"health_data_type",0:"value"}, inplace = True)
df3.loc[:,"type_unit_of_measurement"] = df3.health_data_type.apply(lambda x: x.split(" (")[1][:-1] if " (" in x else None)
df3.loc[:,"type"] = df3.health_data_type.apply(lambda x: x.split(" (")[0].replace(":","").replace("Analysis ","").replace("Apple ","").replace(" ", "_").replace("[","").replace("]","").lower() if " (" in x else None)
df3.loc[:,"day_of_month"] = df3.Date.apply(lambda x: x.split(" ")[0].split("-")[2])
df3.loc[:,"month"] = df3.Date.apply(lambda x: x.split(" ")[0].split("-")[1])
df3.loc[:,"year"] = df3.Date.apply(lambda x: x.split(" ")[0].split("-")[0])
#print(df3.head())
#print(df3["type"].unique())

types_index = {x:ind for ind,x in enumerate(df3["type"].unique())}
date_index = {x:ind for ind,x in enumerate(df3["Date"].unique())}

df3.loc[:,"date_index"] = df3["Date"].apply(lambda x: date_index[x])
df3.loc[:,"type_index"] = df3["type"].apply(lambda x: types_index[x])

healthDataFact = df3.loc[:,["date_index","type_index","value"]]
healthDataDim = df3.loc[:,["type_index","type","type_unit_of_measurement"]].drop_duplicates()
dateDim = df3.loc[:,["date_index","day_of_month","month","year"]].drop_duplicates()

In [8]:
print(healthDataFact)

      date_index  type_index        value
0              0           0  1521.000000
1              0           1     0.000000
2              0           2    11.104746
3              0           3     0.000000
4              0           4     0.000000
...          ...         ...          ...
2345         169           5     0.708333
2346         169           6     0.416667
2347         169          16     1.110000
2348         169          17     0.983000
2349         169           7  6069.000000

[2350 rows x 3 columns]


In [9]:
print(healthDataDim)

    type_index                 type type_unit_of_measurement
0            0  basal_energy_burned                     kcal
1            1         sleep_asleep                       hr
2            2         sleep_in_bed                       hr
3            3           sleep_core                       hr
4            4           sleep_deep                       hr
5            5            sleep_rem                       hr
6            6          sleep_awake                       hr
7            7           step_count                    count
8            8        active_energy                     kcal
9            9        exercise_time                      min
10          10           stand_time                      min
12          11      flights_climbed                    count
13          12       heart_rate_min                count/min
14          13       heart_rate_max                count/min
15          14       heart_rate_avg                count/min
16          15   resting

In [11]:
print(dateDim)

      date_index day_of_month month  year
0              0           02    01  2023
8              1           03    01  2023
26             2           04    01  2023
43             3           05    01  2023
52             4           06    01  2023
...          ...          ...   ...   ...
2285         165           16    06  2023
2293         166           17    06  2023
2311         167           18    06  2023
2320         168           19    06  2023
2332         169           20    06  2023

[170 rows x 4 columns]


In [12]:
import sqlite3

In [13]:
conn = sqlite3.connect('FinalProject.db')
c = conn.cursor()

In [31]:
c.execute("DROP TABLE IF EXISTS HealthFact;")

<sqlite3.Cursor at 0x13d55609880>

In [14]:
c.execute('''CREATE TABLE HealthFact(
    id INTEGER PRIMARY KEY,
    user_id INTEGER,
    date_id INTEGER,
    healthData_id INTEGER,
    value NUMERIC (10,2)
);''')

<sqlite3.Cursor at 0x24d5b2013b0>

In [15]:
c.execute('''CREATE TABLE UserDim(
    id INTEGER PRIMARY KEY,
    name VARCHAR (100),
    email VARCHAR (100)
);''')

<sqlite3.Cursor at 0x24d5b2013b0>

In [16]:
c.execute('''CREATE TABLE DateDim(
    id INTEGER PRIMARY KEY,
    day INTEGER (2),
    month INTEGER (2),
    year INTEGER (4)
);''')

<sqlite3.Cursor at 0x24d5b2013b0>

In [25]:
c.execute("DROP TABLE IF EXISTS HealthDim;")

<sqlite3.Cursor at 0x24d5b2013b0>

In [26]:
c.execute('''CREATE TABLE HealthDim(
    id INTEGER PRIMARY KEY,
    type VARCHAR (25),
    desc VARCHAR (255),
    unit_of_measurement VARCHAR (20)
);''')

<sqlite3.Cursor at 0x24d5b2013b0>

In [18]:
c.execute('''CREATE TABLE HealthClassifier(
    id INTEGER PRIMARY KEY,
    healthData_id INTEGER,
    classifier VARCHAR (25),
    lower_bound NUMERIC,
    upper_bound NUMERIC
);''')

<sqlite3.Cursor at 0x24d5b2013b0>

In [19]:
for _, row in dateDim.iterrows():
    day = row["day_of_month"]
    month = row["month"]
    year = row["year"]

    c.execute("INSERT INTO DateDim (day, month, year) VALUES (?, ?, ?)", (day, month, year))
    
conn.commit()

In [20]:
c.execute("SELECT * FROM DateDim")
colnames = [row[0] for row in c.description]
df = pd.DataFrame(c.fetchall(), columns=colnames)
df.head()

Unnamed: 0,id,day,month,year
0,1,2,1,2023
1,2,3,1,2023
2,3,4,1,2023
3,4,5,1,2023
4,5,6,1,2023


In [21]:
user_id = 1

for _, row in healthDataFact.iterrows():
    date_id = row["date_index"]
    healthData_id = row["type_index"]
    value = row["value"]

    c.execute("INSERT INTO HealthFact (user_id, date_id, healthData_id, value) VALUES (?, ?, ?, ?)", (user_id, date_id, healthData_id, value))

conn.commit()

In [22]:
c.execute("SELECT * FROM HealthFact")
colnames = [row[0] for row in c.description]
df = pd.DataFrame(c.fetchall(), columns=colnames)
df.head()

Unnamed: 0,id,user_id,date_id,healthData_id,value
0,1,1,0,0,1521.0
1,2,1,0,1,0.0
2,3,1,0,2,11.104746
3,4,1,0,3,0.0
4,5,1,0,4,0.0


In [27]:
descriptions = {
    "basal_energy_burned": "Calories burned during rest",
    "sleep_asleep": "Duration of deep sleep",
    "sleep_in_bed": "Time spent in bed",
    "sleep_core": "Duration of sleep cycle",
    "sleep_deep": "Duration of deep sleep stage",
    "sleep_rem": "Duration of REM sleep stage",
    "sleep_awake": "Duration of awake time during sleep",
    "step_count": "Number of steps taken",
    "active_energy": "Calories burned during activity",
    "exercise_time": "Duration of exercise",
    "stand_time": "Duration of standing",
    "flights_climbed": "Number of flights of stairs climbed",
    "heart_rate_min": "Minimum heart rate",
    "heart_rate_max": "Maximum heart rate",
    "heart_rate_avg": "Average heart rate",
    "resting_heart_rate": "Resting heart rate",
    "stair_speed_down": "Speed while descending stairs",
    "stair_speed_up": "Speed while ascending stairs"
}

for _, row in healthDataDim.iterrows():
    health_type = row["type"]
    unit_of_measurement = row["type_unit_of_measurement"]

    if health_type in descriptions:
        desc = descriptions[health_type]
    else:
        desc = None
    
    c.execute("INSERT INTO HealthDim (type, desc, unit_of_measurement) VALUES (?, ?, ?)",
              ( health_type, desc, unit_of_measurement))
    
conn.commit()

In [28]:
c.execute("SELECT * FROM HealthDim")
colnames = [row[0] for row in c.description]
df = pd.DataFrame(c.fetchall(), columns=colnames)
df

Unnamed: 0,id,type,desc,unit_of_measurement
0,1,basal_energy_burned,Calories burned during rest,kcal
1,2,sleep_asleep,Duration of deep sleep,hr
2,3,sleep_in_bed,Time spent in bed,hr
3,4,sleep_core,Duration of sleep cycle,hr
4,5,sleep_deep,Duration of deep sleep stage,hr
5,6,sleep_rem,Duration of REM sleep stage,hr
6,7,sleep_awake,Duration of awake time during sleep,hr
7,8,step_count,Number of steps taken,count
8,9,active_energy,Calories burned during activity,kcal
9,10,exercise_time,Duration of exercise,min
