In [154]:
# Set preprocessing parameters

b = 0.5 # Bin size for insulin and food history vectors, in hours
d = 3   # Total duration of insulin and food history vectors. 
        # Impact of data longer than d hours ago should not continue to change.
    
segmentDays = 30 # how long a segment of data to use for inference at all, in days
segment = 5      # which segment of to use, starting at 1 for first segment.

sequenceLength = 10 # how many glucose measurements to bin into one sequence
maxGapHours = 16 # how long a single delT (time between glucose measurements) to accept in a sequence


In [200]:
# Read in the data

import pandas
import datetime

dfFull = pandas.read_csv('./data/mdi.csv')
dfFull = dfFull.reindex(index=dfFull.index[::-1]) # order earliest - latest
dfFull.tail(10)

Unnamed: 0,CorrectTimestamp,Glucose,Meal,MealCarbs,Humalog,Lantus,WhatEating,Notes
9,8/8/17 3:44,312.0,,,1.5,,,
8,8/8/17 7:42,221.0,Breakfast,20.0,2.0,,Chia pudding yogurt eggs blueberries,
7,8/8/17 12:01,105.0,,,,,,
6,8/8/17 14:05,76.0,Lunch,46.0,1.5,,"Wwpretzel,tortilla,carrot,greenbean,strawberry...",
5,8/8/17 18:53,,Exercise snack,6.0,,,2 triscuits,
4,8/8/17 19:22,105.0,Exercise snack,6.0,,,2 triscuits,
3,8/8/17 20:00,137.0,Dinner,72.0,2.0,8.0,"Peanut chickpea dish, green beans, blueberries...",
2,8/8/17 22:07,111.0,Other,11.0,,,Bread and cc,
1,8/9/17 6:30,162.0,Breakfast,10.0,1.0,,Chia eggs blueberries,
0,8/9/17 9:33,224.0,,,0.99,,,First pump insulin bolus!


In [162]:
# Pick out data from a particular segment to use, and get time in hours

timestamps = dfFull['CorrectTimestamp']
dfFull['Time'] = [datetime.datetime.strptime(t, "%m/%d/%y %H:%M") for t in timestamps] # datetime object times
df = dfFull.copy()


startDate = min(df['Time']) + datetime.timedelta(segmentDays * (segment - 1))
endDate   = min(df['Time']) + datetime.timedelta(segmentDays * segment)

df = df[df['Time'].between(startDate, endDate)]

df['Hour'] = [t.timestamp()/3600 for t in df['Time']]
df['Hour'] = df['Hour'] - min(df['Hour'])

'Using {} data points between {} and {}'.format(df.shape[0], startDate.strftime('%Y-%m-%d'), endDate.strftime('%Y-%m-%d'))

'Using 195 data points between 2017-02-16 and 2017-03-18'

For each GLUCOSE measurement we need to obtain:
* Humalog vector: "New injection info" - include only since last glucose measurement (including this timepoint). Binned: [0-b, 1-2b, 2-3b, ... (D-b)-D, D+]. 
* Carbs vector: "New carbs info" - include only since last glucose measurement (including this timepoint), as for Humalog.
* delT: time since last glucose measurement, in hours.
* Current basal rate: approximate as most recent Lantus dose.

In [198]:
import math
nBins = math.ceil(d/b)

newInsulinEntries = [] # List of insulin since last BG measurement, each in form (timeInHours, insulinUnits)
newCarbEntries = [] # List of carbs since last BG measurement, each in form (timeInHours, carbGrams)
lastTime = 0

# Get Lantus doses from before this time period, if available. Otherwise don't have info until first dose.
prevLantus = dfFull.loc[(pandas.notna(dfFull["Lantus"])) & (dfFull["Time"] <= startDate), "Lantus"]
if prevLantus.empty:
    lastLantus = 0
else:
    lastLantus = prevLantus.values[-1]

singleEntries = []

for (index, row) in df.iterrows(): # Loop through data entries. For each entry...
    if pandas.notna(row['Lantus']): # Store most recent Lantus dose
        lastLantus = row['Lantus']
    if pandas.notna(row['Glucose']): # If glucose is recorded, we'll make a new row   
        now = row['Hour']
        # Use list of insulin (and current value) to construct Humalog vector
        humalog = [sum([h for (t,h) in newInsulinEntries 
                        if now-t >= iBin * b and now-t < (iBin+1) * b]) 
                   for iBin in range(nBins)]
        humalog[0] += row['Humalog'] if pandas.notna(row['Humalog']) else 0
        
        # Use list of carbs (and current value) to construct carb vector
        carbs = [sum([h for (t,h) in newCarbEntries 
                        if now-t >= iBin * b and now-t < (iBin+1) * b]) 
                 for iBin in range(nBins)]
        carbs[0] += row['MealCarbs'] if pandas.notna(row['MealCarbs']) else 0
        
        # Get time since last glucose measurement, in hours
        delT = row['Hour'] - lastTime
        
        # Create a row with delT, glucose, insulin, carbs, basal=lastLantus
        singleEntries.append({
            'MeasBG': row['Glucose'],
            'delT': delT,
            'NewInsulin': humalog,
            'NewCarbs': carbs,
            'Basal': lastLantus
        })
        
        # Clear the lists of new insulin & carbs, and set last T to this T
        newInsulinEntries = []
        newCarbEntries = []
        lastTime = now
    else: # Otherwise, add (T, H) and/or (T, C) to list of current insulin and/or carbs.
        if pandas.notna(row['Humalog']):
            newInsulinEntries.append((row['Hour'], row['Humalog']))
        if pandas.notna(row['MealCarbs']):
            newCarbEntries.append((row['Hour'], row['MealCarbs']))
        
singleEntries = pandas.DataFrame(singleEntries)
singleEntries.tail(10)



Unnamed: 0,Basal,MeasBG,NewCarbs,NewInsulin,delT
172,3.0,106.0,"[0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0]",0.783333
173,3.0,143.0,"[0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0]",1.716667
174,3.0,117.0,"[0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0]",1.0
175,3.0,69.0,"[14.0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0]",1.5
176,3.0,111.0,"[57.0, 0, 0, 0, 0, 0]","[1.5, 0, 0, 0, 0, 0]",0.416667
177,3.0,280.0,"[0, 0, 0, 0, 0, 0]","[1.0, 0, 0, 0, 0, 0]",4.25
178,3.0,172.0,"[53.0, 0, 0, 0, 0, 0]","[2.0, 0, 0, 0, 0, 0]",9.166667
179,3.0,215.0,"[64.0, 0, 0, 0, 0, 0]","[2.5, 0, 0, 0, 0, 0]",4.083333
180,3.0,44.0,"[12.0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0]",1.9
181,3.0,123.0,"[0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0]",0.3


Then we need to group the data together into sequences. 

Collect rows 1 thru s, 2 thru (s+1), etc. and gather each set into a new row, with appropriate labels.
Truncate sequences where there's longer than a 12-hour gap in measurement.
Output a spreadsheet.

In [201]:
sequences = []

for iSeq in range(len(singleEntries) - sequenceLength):
    s = singleEntries[iSeq:(iSeq+sequenceLength)].reset_index(drop=True)
    dels = s['delT'].tolist()
    basals = s['Basal'].tolist()
    if all(d <= maxGapHours for d in dels) and all(b > 0 for b in basals):
        sequences.append({
            'MeasBG': s['MeasBG'].tolist(),
            'delT': s['delT'].tolist(),
            'Basal': s['Basal'].tolist(),
            'NewCarbs': s['NewCarbs'].tolist(),
            'NewInsulin': s['NewInsulin'].tolist()
        })
    
sequences = pandas.DataFrame(sequences)
sequences.to_csv('segment_' + str(segment) + '.csv', index=False)
sequences.tail(10)

Unnamed: 0,Basal,MeasBG,NewCarbs,NewInsulin,delT
152,"[2.5, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, ...","[465.0, 187.0, 243.0, 280.0, 346.0, 70.0, 107....","[[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [60.0...","[[2.0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [2....","[1.5, 5.116666666639503, 8.933333333348855, 4...."
153,"[3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, ...","[187.0, 243.0, 280.0, 346.0, 70.0, 107.0, 272....","[[0, 0, 0, 0, 0, 0], [60.0, 0, 0, 0, 0, 0], [7...","[[0, 0, 0, 0, 0, 0], [2.5, 0, 0, 0, 0, 0], [3....","[5.116666666639503, 8.933333333348855, 4.25, 4..."
154,"[3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, ...","[243.0, 280.0, 346.0, 70.0, 107.0, 272.0, 432....","[[60.0, 0, 0, 0, 0, 0], [75.0, 0, 0, 0, 0, 0],...","[[2.5, 0, 0, 0, 0, 0], [3.0, 0, 0, 0, 0, 0], [...","[8.933333333348855, 4.25, 4.5, 2.4833333333372..."
155,"[3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, ...","[280.0, 346.0, 70.0, 107.0, 272.0, 432.0, 135....","[[75.0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [7...","[[3.0, 0, 0, 0, 0, 0], [1.5, 0, 0, 0, 0, 0], [...","[4.25, 4.5, 2.483333333337214, 3.6666666666860..."
156,"[3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, ...","[346.0, 70.0, 107.0, 272.0, 432.0, 135.0, 106....","[[0, 0, 0, 0, 0, 0], [70.0, 0, 0, 0, 0, 0], [0...","[[1.5, 0, 0, 0, 0, 0], [2.0, 0, 0, 0, 0, 0], [...","[4.5, 2.483333333337214, 3.666666666686069, 8...."
157,"[3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, ...","[70.0, 107.0, 272.0, 432.0, 135.0, 106.0, 143....","[[70.0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [5...","[[2.0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [2....","[2.483333333337214, 3.666666666686069, 8.98333..."
158,"[3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, ...","[107.0, 272.0, 432.0, 135.0, 106.0, 143.0, 117...","[[0, 0, 0, 0, 0, 0], [51.0, 0, 0, 0, 0, 0], [5...","[[0, 0, 0, 0, 0, 0], [2.5, 0, 0, 0, 0, 0], [3....","[3.666666666686069, 8.983333333337214, 3.86666..."
159,"[3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, ...","[272.0, 432.0, 135.0, 106.0, 143.0, 117.0, 69....","[[51.0, 0, 0, 0, 0, 0], [53.0, 0, 0, 0, 0, 0],...","[[2.5, 0, 0, 0, 0, 0], [3.5, 0, 0, 0, 0, 0], [...","[8.983333333337214, 3.866666666639503, 1.5, 0...."
160,"[3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, ...","[432.0, 135.0, 106.0, 143.0, 117.0, 69.0, 111....","[[53.0, 0, 0, 0, 0, 0], [15.0, 0, 0, 0, 0, 0],...","[[3.5, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0,...","[3.866666666639503, 1.5, 0.7833333333255723, 1..."
161,"[3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, ...","[135.0, 106.0, 143.0, 117.0, 69.0, 111.0, 280....","[[15.0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0...","[[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0...","[1.5, 0.7833333333255723, 1.7166666666744277, ..."
