# **Dexcom Clarity Readings - SQLite**

This Python notebook automates the integration of new glucose readings from a Dexcom Clarity CSV export into an SQLite database. It cleans the downloaded data, appends the new entries to the existing table, and then verifies integrity by removing duplicate rows and checking that the stored date range has no gaps. All historical readings are retained in the SQL table, while only the most recent 90 days of data are exported to a CSV file for use in Tableau visualizations.

Run on Python 3.12 | No errors | No warnings

In [1]:
# Import packages

# For data manipulation
import numpy as np
import pandas as pd

# for displaying and modifying the working directory
import os as os

# For working with datetime objects
from datetime import datetime

# For working with SQLite databases
import sqlite3

In [3]:
# Load the SQL magic extension so that the %sql line magic and the %%sql
# cell magic used by the query cells in this notebook are available
%load_ext sql

In [4]:
# Connect the SQL magic to the SQLite database ClarityHistory.db;
# the %sql / %%sql cells that follow run against this connection
%sql sqlite:///ClarityHistory.db

In [5]:
# Count the rows currently stored in CLARITY_DATA
# (the baseline before the new readings are appended)
%sql SELECT COUNT(*) FROM CLARITY_DATA

 * sqlite:///ClarityHistory.db
Done.


COUNT(*)
9230


In [6]:
# Preview 5 rows of CLARITY_DATA to confirm the table layout
# (no ORDER BY is given, so the query itself does not fix which rows are returned)
%sql SELECT * FROM CLARITY_DATA LIMIT 5

 * sqlite:///ClarityHistory.db
Done.


Date,Time,DateTime,Value,Treatment,Source
2024-08-29,04:53 PM,2024-08-29 16:53:35,100,Mounjaro 12.5,CGM
2024-08-29,04:58 PM,2024-08-29 16:58:35,112,Mounjaro 12.5,CGM
2024-08-29,05:03 PM,2024-08-29 17:03:36,93,Mounjaro 12.5,CGM
2024-08-29,05:08 PM,2024-08-29 17:08:35,83,Mounjaro 12.5,CGM
2024-08-29,05:13 PM,2024-08-29 17:13:36,78,Mounjaro 12.5,CGM


In [7]:
# List every CLARITY_DATA column with its declared type, NOT NULL flag,
# default value, and primary-key flag
%sql PRAGMA table_info(CLARITY_DATA)

 * sqlite:///ClarityHistory.db
Done.


cid,name,type,notnull,dflt_value,pk
0,Date,TEXT,0,,0
1,Time,TEXT,0,,0
2,DateTime,TEXT,0,,0
3,Value,INTEGER,0,,0
4,Treatment,TEXT,0,,0
5,Source,TEXT,0,,0


In [8]:
%%sql
-- Report the earliest and latest calendar dates stored in CLARITY_DATA.
-- DATE() is applied to the TEXT Date column before MIN/MAX are taken.
SELECT MIN(DATE(Date)) AS Min_Date, MAX(DATE(Date)) AS Max_Date
FROM CLARITY_DATA;

 * sqlite:///ClarityHistory.db
Done.


Min_Date,Max_Date
2024-08-29,2024-09-30


In [9]:
# Read the Clarity export, keeping only the four columns this notebook uses
df0 = pd.read_csv(
    "Clarity_Export_Smith_Jeffrey.csv",
    usecols=[
        'Index',
        'Timestamp (YYYY-MM-DDThh:mm:ss)',
        'Event Type',
        'Glucose Value (mg/dL)',
    ],
)

In [10]:
# Show the top 10 rows of the freshly loaded export (df0)
df0.head(n=10)

Unnamed: 0,Index,Timestamp (YYYY-MM-DDThh:mm:ss),Event Type,Glucose Value (mg/dL)
0,1,,FirstName,
1,2,,LastName,
2,3,,Device,
3,4,2024-10-01T00:04:42,EGV,90.0
4,5,2024-10-01T00:09:42,EGV,90.0
5,6,2024-10-01T00:14:42,EGV,91.0
6,7,2024-10-01T00:19:42,EGV,89.0
7,8,2024-10-01T00:24:42,EGV,89.0
8,9,2024-10-01T00:29:42,EGV,90.0
9,10,2024-10-01T00:34:42,EGV,91.0


In [11]:
# Summarize df0: number of rows, column dtypes, and non-null counts per column
df0.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 291 entries, 0 to 290
Data columns (total 4 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Index                            291 non-null    int64  
 1   Timestamp (YYYY-MM-DDThh:mm:ss)  288 non-null    object 
 2   Event Type                       291 non-null    object 
 3   Glucose Value (mg/dL)            288 non-null    float64
dtypes: float64(1), int64(1), object(2)
memory usage: 9.2+ KB


In [12]:
# Keep only the rows where Event Type is EGV (estimated glucose value).
# .copy() makes df0 an independent frame rather than a slice of the loaded
# data, so the column assignments in the following cells do not raise
# pandas' SettingWithCopyWarning.
df0 = df0[df0['Event Type'] == 'EGV'].copy()

In [13]:
# Replace text markers in Glucose Value (mg/dL) with numeric stand-ins so the
# column can be converted to integers: 'Low' becomes 40 and 'High' becomes 400.
# NOTE(review): the 'High' marker and its 400 stand-in are assumed from the
# sensor's upper reporting limit — confirm against an export that contains it.
df0['Glucose Value (mg/dL)'] = df0['Glucose Value (mg/dL)'].replace({'Low': 40, 'High': 400})

In [14]:
# Cast the Glucose Value (mg/dL) column to integers
df0 = df0.astype({'Glucose Value (mg/dL)': int})

In [15]:
# Parse the export's timestamp text into a new datetime column named Timestamp
df0 = df0.assign(
    Timestamp=pd.to_datetime(
        df0['Timestamp (YYYY-MM-DDThh:mm:ss)'],
        format='%Y-%m-%dT%H:%M:%S',
    )
)

In [16]:
# Remove the raw timestamp text and the Event Type column, which are no longer needed
df0 = df0.drop(columns=['Timestamp (YYYY-MM-DDThh:mm:ss)', 'Event Type'])

In [17]:
# Remove the export's Index column
df0 = df0.drop(columns='Index')

In [18]:
# Rename Glucose Value (mg/dL) to Value
df0 = df0.rename({'Glucose Value (mg/dL)': 'Value'}, axis='columns')

In [19]:
# Add a Date column holding the calendar date of each Timestamp
df0 = df0.assign(Date=df0['Timestamp'].dt.date)

In [20]:
# Label every reading with a Treatment value based on its date

# Date must be datetime64 for the comparisons below
df0['Date'] = pd.to_datetime(df0['Date'])

# (boundary date, label) pairs, newest first. A reading receives the label of
# the first pair whose boundary is strictly earlier than its date; readings
# on or before the oldest boundary fall through to 'Untreated'.
treatment_periods = [
    ('2024-06-18', 'Mounjaro 12.5'),
    ('2024-01-30', 'Mounjaro 10'),
    ('2023-07-18', 'Mounjaro 7.5'),
    ('2023-02-28', 'Mounjaro 5'),
    ('2023-01-31', 'Mounjaro 2.5'),
]

conditions = [df0['Date'] > boundary for boundary, _ in treatment_periods]
choices = [label for _, label in treatment_periods]

df0['Treatment'] = np.select(conditions, choices, default='Untreated')

In [21]:
# Create a column named Time holding the time-of-day part of Timestamp
df0 = df0.assign(Time=df0['Timestamp'].dt.time)

In [22]:
# Give the Timestamp column the name DateTime
df0 = df0.rename({'Timestamp': 'DateTime'}, axis='columns')

In [23]:
# Turn each time object into 12-hour text with an AM/PM suffix (for example '04:53 PM')
df0['Time'] = df0['Time'].map(lambda clock_time: clock_time.strftime('%I:%M %p'))

In [24]:
# Convert Date from datetime64 back to plain date objects (no time component)
df0 = df0.assign(Date=df0['Date'].dt.date)

In [25]:
# Keep only readings dated strictly before today, which drops the current date
today = datetime.today().date()
df0 = df0.loc[df0['Date'] < today]

In [26]:
# Arrange the columns in the order Date, Time, DateTime, Value, Treatment
df0 = df0.loc[:, ['Date', 'Time', 'DateTime', 'Value', 'Treatment']]

In [27]:
# Add a Source column marking every row as a CGM reading.
# assign() returns a new frame instead of writing into df0 in place; df0 is a
# column selection of an earlier frame at this point, and an in-place column
# write on it raises pandas' SettingWithCopyWarning.
df0 = df0.assign(Source='CGM')

In [28]:
# Report the first and last dates covered by the new readings
min_date0, max_date0 = df0['Date'].min(), df0['Date'].max()

print("New Data")
print("Min Date:", min_date0)
print("Max Date:", max_date0)

New Data
Min Date: 2024-10-01
Max Date: 2024-10-01


In [29]:
# Show the top 5 rows of the transformed new readings (df0)
df0.head(n=5)

Unnamed: 0,Date,Time,DateTime,Value,Treatment,Source
3,2024-10-01,12:04 AM,2024-10-01 00:04:42,90,Mounjaro 12.5,CGM
4,2024-10-01,12:09 AM,2024-10-01 00:09:42,90,Mounjaro 12.5,CGM
5,2024-10-01,12:14 AM,2024-10-01 00:14:42,91,Mounjaro 12.5,CGM
6,2024-10-01,12:19 AM,2024-10-01 00:19:42,89,Mounjaro 12.5,CGM
7,2024-10-01,12:24 AM,2024-10-01 00:24:42,89,Mounjaro 12.5,CGM


In [30]:
# Summarize the transformed df0: row count, column dtypes, and non-null counts
df0.info()

<class 'pandas.core.frame.DataFrame'>
Index: 288 entries, 3 to 290
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Date       288 non-null    object        
 1   Time       288 non-null    object        
 2   DateTime   288 non-null    datetime64[ns]
 3   Value      288 non-null    int64         
 4   Treatment  288 non-null    object        
 5   Source     288 non-null    object        
dtypes: datetime64[ns](1), int64(1), object(4)
memory usage: 15.8+ KB


In [31]:
# Show summary statistics (count, mean, min, quartiles, max, std)
# for the DateTime and Value columns of df0
df0.describe()

Unnamed: 0,DateTime,Value
count,288,288.0
mean,2024-10-01 12:02:13.177083648,107.809028
min,2024-10-01 00:04:42,79.0
25%,2024-10-01 06:03:27.750000128,93.0
50%,2024-10-01 12:02:13,102.0
75%,2024-10-01 18:00:59,115.25
max,2024-10-01 23:59:44,184.0
std,,20.48408


In [32]:
# Replace df0's row labels with a fresh 0..n-1 range, discarding the old labels
df0 = df0.set_axis(pd.RangeIndex(len(df0)), axis='index')

In [33]:
# Append the new readings to the existing CLARITY_DATA table.
# The connection is held in a variable and closed in `finally`, so the
# database handle is released even if to_sql raises.
clarity_db = sqlite3.connect('ClarityHistory.db')
try:
    rows_added = df0.to_sql('CLARITY_DATA', con=clarity_db, if_exists='append', index=False)
finally:
    clarity_db.close()

# Display the value returned by to_sql as the cell output
rows_added

288

In [34]:
%%sql
-- Count the surplus copies of fully identical rows in CLARITY_DATA.
-- Each group of identical rows contributes (group size - 1) to the total.
-- COALESCE turns the NULL that SUM returns when no duplicate groups exist
-- into an explicit 0.
SELECT COALESCE(SUM(duplicate_count - 1), 0) AS total_duplicates
FROM (
    SELECT COUNT(*) AS duplicate_count
    FROM CLARITY_DATA
    GROUP BY Date, Time, DateTime, Value, Treatment, Source
    HAVING COUNT(*) > 1
) AS duplicates;

 * sqlite:///ClarityHistory.db
Done.


total_duplicates
""


In [35]:
%%sql
-- Delete duplicate rows in the CLARITY_DATA table.
-- Rows are grouped on all six columns; the copy with the smallest rowid in
-- each group is kept and every other copy is deleted.
DELETE FROM CLARITY_DATA
WHERE rowid NOT IN (
    SELECT MIN(rowid)
    FROM CLARITY_DATA
    GROUP BY Date, Time, DateTime, Value, Treatment, Source
);

 * sqlite:///ClarityHistory.db
0 rows affected.


[]

In [36]:
%%sql
-- Find calendar dates between the first and last stored dates that have no readings
WITH RECURSIVE DateRange AS (
    -- First and last calendar dates present in the table
    SELECT date(min(Date)) AS StartDate, date(max(Date)) AS EndDate
    FROM CLARITY_DATA
),
AllDates AS (
    -- Seed with StartDate (skipped when the table holds no dates), then add
    -- one day at a time until EndDate is reached
    SELECT StartDate AS Date
    FROM DateRange
    WHERE StartDate IS NOT NULL
    UNION ALL
    SELECT date(Date, '+1 day')
    FROM AllDates, DateRange
    WHERE Date < EndDate
)
-- Report generated dates that have no matching row in the table.
-- NULL dates are filtered out of the subquery: a single NULL inside NOT IN
-- makes the comparison unknown for every candidate, which would hide all
-- missing dates.
SELECT Date
FROM AllDates
WHERE Date NOT IN (
    SELECT date(c.Date)
    FROM CLARITY_DATA AS c
    WHERE date(c.Date) IS NOT NULL
);

 * sqlite:///ClarityHistory.db
Done.


Date


In [37]:
# Query every row of the CLARITY_DATA table; the %sql magic's result is kept in df
df = %sql SELECT * FROM CLARITY_DATA

# Convert that result to a pandas DataFrame (df1) for the remaining steps
df1 = df.DataFrame()

 * sqlite:///ClarityHistory.db
Done.


In [38]:
# Show the top 5 rows of the combined data (df1)
df1.head(n=5)

Unnamed: 0,Date,Time,DateTime,Value,Treatment,Source
0,2024-08-29,04:53 PM,2024-08-29 16:53:35,100,Mounjaro 12.5,CGM
1,2024-08-29,04:58 PM,2024-08-29 16:58:35,112,Mounjaro 12.5,CGM
2,2024-08-29,05:03 PM,2024-08-29 17:03:36,93,Mounjaro 12.5,CGM
3,2024-08-29,05:08 PM,2024-08-29 17:08:35,83,Mounjaro 12.5,CGM
4,2024-08-29,05:13 PM,2024-08-29 17:13:36,78,Mounjaro 12.5,CGM


In [39]:
# Summarize df1: number of rows, column dtypes, and non-null counts per column
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9518 entries, 0 to 9517
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Date       9518 non-null   object
 1   Time       9518 non-null   object
 2   DateTime   9518 non-null   object
 3   Value      9518 non-null   int64 
 4   Treatment  9518 non-null   object
 5   Source     9518 non-null   object
dtypes: int64(1), object(5)
memory usage: 446.3+ KB


In [40]:
# Parse Date as datetime64, then compute the cutoff lying 90 days before the latest date in df1
df1 = df1.assign(Date=pd.to_datetime(df1['Date']))
most_recent_date = df1['Date'].max()
ninety_days_ago = most_recent_date - pd.Timedelta(days=90)

In [41]:
# Keep only rows dated after ninety_days_ago (rows on or before the cutoff are
# dropped), then convert Date back to plain date objects.
# .copy() detaches the filtered rows from the original frame so the column
# assignment below does not raise pandas' SettingWithCopyWarning.
df1 = df1[df1['Date'] > ninety_days_ago].copy()
df1['Date'] = df1['Date'].dt.date

In [42]:
# Report the date span of the combined data and check that no day is absent
min_date, max_date = df1['Date'].min(), df1['Date'].max()

print ("Combined Data")
print("Min Date:", min_date)
print("Max Date:", max_date)

# Every calendar day from the first to the last date, inclusive
complete_date_range = pd.date_range(min_date, max_date, freq='D')

# The distinct days that actually occur in df1, in ascending order
unique_dates = pd.to_datetime(df1['Date']).sort_values().unique()

# Days of the full range that never occur in the data
missing_dates = np.setdiff1d(complete_date_range, unique_dates)

# Report the outcome
if len(missing_dates) == 0:
    print("No missing dates, all dates are consecutive.")
else:
    print("Missing dates:")
    print(missing_dates)

Combined Data
Min Date: 2024-08-29
Max Date: 2024-10-01
No missing dates, all dates are consecutive.


In [43]:
# Verify the changes: show df1's row count, column dtypes, and non-null counts
# after the 90-day filter
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9518 entries, 0 to 9517
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Date       9518 non-null   object
 1   Time       9518 non-null   object
 2   DateTime   9518 non-null   object
 3   Value      9518 non-null   int64 
 4   Treatment  9518 non-null   object
 5   Source     9518 non-null   object
dtypes: int64(1), object(5)
memory usage: 446.3+ KB


In [44]:
# Write df1 to a CSV file, leaving out the row index
df1.to_csv(path_or_buf='Clarity Readings for Analysis.csv', index=False)