In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("HealthAutoExport-2021-06-01-2022-01-01 Data.csv")
cols = [
        "Date",
        "Active Energy (kcal)",
        "Apple Exercise Time (min)",
        "Apple Stand Time (min)",
        "Basal Energy Burned (kcal)",
        "Flights Climbed (count)",
        "Heart Rate [Min] (count/min)",
        "Heart Rate [Max] (count/min)",
        "Heart Rate [Avg] (count/min)",
        "Resting Heart Rate (count/min)",
        "Sleep Analysis [Asleep] (hr)",
        "Sleep Analysis [In Bed] (hr)",
        "Sleep Analysis [Core] (hr)",
        "Sleep Analysis [Deep] (hr)",
        "Sleep Analysis [REM] (hr)",
        "Sleep Analysis [Awake] (hr)",
        "Stair Speed: Down (ft/s)",
        "Stair Speed: Up (ft/s)",
        "Step Count (count)",
    ]

df1 = df.loc[:,cols]
#print(df1.head())
#print(df1.info())
df2 = df1.set_index("Date")
df3 = df2.stack().reset_index()
df3.rename(columns = {"level_1":"health_data_type",0:"value"}, inplace = True)
df3.loc[:,"type_unit_of_measurement"] = df3.health_data_type.apply(lambda x: x.split(" (")[1][:-1] if " (" in x else None)
df3.loc[:,"type"] = df3.health_data_type.apply(lambda x: x.split(" (")[0].replace(":","").replace("Analysis ","").replace("Apple ","").replace(" ", "_").replace("[","").replace("]","").lower() if " (" in x else None)
df3.loc[:,"day_of_month"] = df3.Date.apply(lambda x: x.split(" ")[0].split("-")[2])
df3.loc[:,"month"] = df3.Date.apply(lambda x: x.split(" ")[0].split("-")[1])
df3.loc[:,"year"] = df3.Date.apply(lambda x: x.split(" ")[0].split("-")[0])
#print(df3.head())
#print(df3["type"].unique())

types_index = {x:ind for ind,x in enumerate(df3["type"].unique())}
date_index = {x:ind for ind,x in enumerate(df3["Date"].unique())}

df3.loc[:,"date_index"] = df3["Date"].apply(lambda x: date_index[x])
df3.loc[:,"type_index"] = df3["type"].apply(lambda x: types_index[x])

healthDataFact = df3.loc[:,["date_index","type_index","value"]]
healthDataDim = df3.loc[:,["type_index","type","type_unit_of_measurement"]].drop_duplicates()
dateDim = df3.loc[:,["date_index","day_of_month","month","year"]].drop_duplicates()

In [3]:
print(healthDataFact)

      date_index  type_index       value
0              0           0    2.000000
1              0           1    0.000000
2              0           2    6.824425
3              0           3    0.000000
4              0           4    0.000000
...          ...         ...         ...
3291         214           3    0.000000
3292         214           4    0.000000
3293         214           5    0.000000
3294         214           6    0.000000
3295         214           7  485.000000

[3296 rows x 3 columns]


In [4]:
print(healthDataDim)

     type_index                 type type_unit_of_measurement
0             0      flights_climbed                    count
1             1         sleep_asleep                       hr
2             2         sleep_in_bed                       hr
3             3           sleep_core                       hr
4             4           sleep_deep                       hr
5             5            sleep_rem                       hr
6             6          sleep_awake                       hr
7             7           step_count                    count
146           8        active_energy                     kcal
147           9        exercise_time                      min
148          10           stand_time                      min
149          11  basal_energy_burned                     kcal
151          12       heart_rate_min                count/min
152          13       heart_rate_max                count/min
153          14       heart_rate_avg                count/min
154     

In [5]:
print(dateDim)

      date_index day_of_month month  year
0              0           01    06  2021
8              1           02    06  2021
16             2           03    06  2021
24             3           04    06  2021
32             4           05    06  2021
...          ...          ...   ...   ...
3217         210           28    12  2021
3233         211           29    12  2021
3251         212           30    12  2021
3263         213           31    12  2021
3281         214           01    01  2022

[215 rows x 4 columns]


In [6]:
import sqlite3

In [7]:
conn = sqlite3.connect('FinalProject.db')
c = conn.cursor()

In [31]:
c.execute("DROP TABLE IF EXISTS HealthFact;")

<sqlite3.Cursor at 0x13d55609880>

In [32]:
c.execute('''CREATE TABLE HealthFact(
    id INTEGER PRIMARY KEY,
    user_id INTEGER,
    date_id INTEGER,
    healthData_id INTEGER,
    value NUMERIC (10,2)
);''')

<sqlite3.Cursor at 0x13d55609880>

In [10]:
c.execute('''CREATE TABLE UserDim(
    id INTEGER PRIMARY KEY,
    name VARCHAR (100),
    email VARCHAR (100)
);''')

<sqlite3.Cursor at 0x13d55609880>

In [11]:
c.execute('''CREATE TABLE DateDim(
    id INTEGER PRIMARY KEY,
    day INTEGER (2),
    month INTEGER (2),
    year INTEGER (4)
);''')

<sqlite3.Cursor at 0x13d55609880>

In [12]:
c.execute('''CREATE TABLE HealthDim(
    id INTEGER PRIMARY KEY,
    type VARCHAR (25),
    desc VARCHAR (255),
    unit_of_measurement VARCHAR (20)
);''')

<sqlite3.Cursor at 0x13d55609880>

In [38]:
c.execute('''CREATE TABLE HealthClassifier(
    id INTEGER PRIMARY KEY,
    healthData_id INTEGER,
    classifier VARCHAR (25),
    min NUMERIC,
    max NUMERIC
);''')

<sqlite3.Cursor at 0x13d55609880>

In [13]:
for _, row in dateDim.iterrows():
    day = row["day_of_month"]
    month = row["month"]
    year = row["year"]

    c.execute("INSERT INTO DateDim (day, month, year) VALUES (?, ?, ?)", (day, month, year))

In [29]:
c.execute("SELECT * FROM DateDim")
colnames = [row[0] for row in c.description]
df = pd.DataFrame(c.fetchall(), columns=colnames)
df.head()

Unnamed: 0,id,day,month,year
0,1,1,6,2021
1,2,2,6,2021
2,3,3,6,2021
3,4,4,6,2021
4,5,5,6,2021


In [34]:
user_id = 1

for _, row in healthDataFact.iterrows():
    date_id = row["date_index"]
    healthData_id = row["type_index"]
    value = row["value"]

    c.execute("INSERT INTO HealthFact (user_id, date_id, healthData_id, value) VALUES (?, ?, ?, ?)", (user_id, date_id, healthData_id, value))

In [37]:
c.execute("SELECT * FROM HealthFact")
colnames = [row[0] for row in c.description]
df = pd.DataFrame(c.fetchall(), columns=colnames)
df.head()

Unnamed: 0,id,user_id,date_id,healthData_id,value
0,1,1,0,0,2.0
1,2,1,0,1,0.0
2,3,1,0,2,6.824425
3,4,1,0,3,0.0
4,5,1,0,4,0.0


In [19]:
desc = "Change this"

for _, row in healthDataDim.iterrows():
    health_type = row["type"]
    unit_of_measurement = row["type_unit_of_measurement"]

    c.execute("INSERT INTO HealthDim (type, desc, unit_of_measurement) VALUES (?, ?, ?)",
              ( health_type, desc, unit_of_measurement))

In [30]:
c.execute("SELECT * FROM HealthDim")
colnames = [row[0] for row in c.description]
df = pd.DataFrame(c.fetchall(), columns=colnames)
df

Unnamed: 0,id,type,desc,unit_of_measurement
0,1,flights_climbed,Change this,count
1,2,sleep_asleep,Change this,hr
2,3,sleep_in_bed,Change this,hr
3,4,sleep_core,Change this,hr
4,5,sleep_deep,Change this,hr
5,6,sleep_rem,Change this,hr
6,7,sleep_awake,Change this,hr
7,8,step_count,Change this,count
8,9,active_energy,Change this,kcal
9,10,exercise_time,Change this,min
