In [227]:
# Set preprocessing parameters

b = 0.5 # Bin size for insulin and food history vectors, in hours
d = 3   # Total duration of insulin and food history vectors. 
        # Impact of data longer than d hours ago should not continue to change.
    
segmentDays = 30 # how long a segment of data to use for inference at all, in days
segment = 8      # which segment of to use, starting at 1 for first segment.

sequenceLength = 10 # how many glucose measurements to bin into one sequence
maxGapHours = 16 # how long a single delT (time between glucose measurements) to accept in a sequence

# Read in the data

import pandas
import datetime

dfFull = pandas.read_csv('./data/mdi.csv')
dfFull = dfFull.reindex(index=dfFull.index[::-1]) # order earliest - latest

# Pick out data from a particular segment to use, and get time in hours

timestamps = dfFull['CorrectTimestamp']
dfFull['Time'] = [datetime.datetime.strptime(t, "%m/%d/%y %H:%M") for t in timestamps] # datetime object times
df = dfFull.copy()

startDate = min(df['Time']) + datetime.timedelta(segmentDays * (segment - 1))
endDate   = min(df['Time']) + datetime.timedelta(segmentDays * segment)

df = df[df['Time'].between(startDate, endDate)]

df['Hour'] = [t.timestamp()/3600 for t in df['Time']]
df['Hour'] = df['Hour'] - min(df['Hour'])

print('Using {} data points between {} and {}'.format(df.shape[0], startDate.strftime('%Y-%m-%d'), endDate.strftime('%Y-%m-%d')))

Using 233 data points between 2017-05-17 and 2017-06-16


Unnamed: 0,CorrectTimestamp,Glucose,Meal,MealCarbs,Humalog,Lantus,WhatEating,Notes,Time,Hour
327,6/15/17 18:00,,Dinner,67.0,1.5,,"Rice, tofu, blueberries, milk","Went low during dinner just to 68, so accelera...",2017-06-15 18:00:00,693.316667
326,6/15/17 18:30,68.0,,,,,,During dinner again--just accelerated rest of ...,2017-06-15 18:30:00,693.816667
325,6/15/17 19:32,,,,,7.5,,,2017-06-15 19:32:00,694.85
324,6/16/17 7:54,207.0,Breakfast,19.0,1.5,,"Plain Greek yogurt, stevia, carrots, Caesar dr...","CF 135, CR 30",2017-06-16 07:54:00,707.216667
323,6/16/17 11:50,76.0,Lunch,41.0,1.5,,"Rice, blueberries, carrots, peas, tofu, 1/3 po...",At field trip. Gave popsicle right away to avo...,2017-06-16 11:50:00,711.15
322,6/16/17 13:09,58.0,Sugar to treat,10.0,,,Glucose,Walking back,2017-06-16 13:09:00,712.466667
321,6/16/17 13:19,75.0,,,,,,,2017-06-16 13:19:00,712.633333
320,6/16/17 16:44,81.0,Afternoon snack,18.0,0.5,,Pirates booty,,2017-06-16 16:44:00,716.05
319,6/16/17 18:17,180.0,Dinner,58.0,1.5,,Grilled cheese sandwich broccoli carrots straw...,,2017-06-16 18:17:00,717.6
318,6/16/17 19:32,,,,,7.5,,,2017-06-16 19:32:00,718.85


For each GLUCOSE measurement we need to obtain:
* Humalog vector: "New injection info" - include only since last glucose measurement (including this timepoint). Binned: [0-b, 1-2b, 2-3b, ... (D-b)-D, D+]. 
* Carbs vector: "New carbs info" - include only since last glucose measurement (including this timepoint), as for Humalog.
* delT: time since last glucose measurement, in hours.
* Current basal rate: approximate as most recent Lantus dose.

In [228]:
import math
nBins = math.ceil(d/b)

newInsulinEntries = [] # List of insulin since last BG measurement, each in form (timeInHours, insulinUnits)
newCarbEntries = [] # List of carbs since last BG measurement, each in form (timeInHours, carbGrams)
lastTime = 0

# Get Lantus doses from before this time period, if available. Otherwise don't have info until first dose.
prevLantus = dfFull.loc[(pandas.notna(dfFull["Lantus"])) & (dfFull["Time"] <= startDate), "Lantus"]
if prevLantus.empty:
    lastLantus = 0
else:
    lastLantus = prevLantus.values[-1]

singleEntries = []

for (index, row) in df.iterrows(): # Loop through data entries. For each entry...
    if pandas.notna(row['Lantus']): # Store most recent Lantus dose
        lastLantus = row['Lantus']
    if pandas.notna(row['Glucose']): # If glucose is recorded, we'll make a new row   
        now = row['Hour']
        # Use list of insulin (and current value) to construct Humalog vector
        humalog = [sum([h for (t,h) in newInsulinEntries 
                        if now-t >= iBin * b and now-t < (iBin+1) * b]) 
                   for iBin in range(nBins+1)]
        # Make sure last bin contains ALL insulin from > nBins * b hours ago
        humalog[-1] = sum([h for (t,h) in newInsulinEntries if now-t >= nBins * b]) 
        humalog[0] += row['Humalog'] if pandas.notna(row['Humalog']) else 0
        
        # Use list of carbs (and current value) to construct carb vector
        carbs = [sum([h for (t,h) in newCarbEntries 
                        if now-t >= iBin * b and now-t < (iBin+1) * b]) 
                 for iBin in range(nBins + 1)]
        # Make sure last bin contains ALL carbs from > nBins * b hours ago
        carbs[-1] = sum([h for (t,h) in newCarbEntries if now-t >= nBins * b]) 
        carbs[0] += row['MealCarbs'] if pandas.notna(row['MealCarbs']) else 0
        
        # Get time since last glucose measurement, in hours
        delT = row['Hour'] - lastTime
        
        # Create a row with delT, glucose, insulin, carbs, basal=lastLantus
        singleEntries.append({
            'MeasBG': row['Glucose'],
            'delT': delT,
            'NewInsulin': humalog,
            'NewCarbs': carbs,
            'Basal': lastLantus
        })
        
        # Clear the lists of new insulin & carbs, and set last T to this T
        newInsulinEntries = []
        newCarbEntries = []
        lastTime = now
    else: # Otherwise, add (T, H) and/or (T, C) to list of current insulin and/or carbs.
        if pandas.notna(row['Humalog']):
            newInsulinEntries.append((row['Hour'], row['Humalog']))
        if pandas.notna(row['MealCarbs']):
            newCarbEntries.append((row['Hour'], row['MealCarbs']))
        
singleEntries = pandas.DataFrame(singleEntries)
singleEntries.tail(10)


Unnamed: 0,Basal,MeasBG,NewCarbs,NewInsulin,delT
187,7.5,130.0,"[28.0, 0, 0, 0, 0, 0, 0]","[1.0, 0, 0, 0, 0, 0, 0]",2.2
188,7.5,194.0,"[39.0, 0, 0, 0, 0, 0, 0]","[2.0, 0, 0, 0, 0, 0, 0]",3.366667
189,7.5,75.0,"[3.0, 0, 0, 0, 0, 23.0, 0]","[0, 0, 0, 0, 0, 1.0, 0]",5.333333
190,7.5,68.0,"[0, 67.0, 0, 0, 0, 0, 0]","[0, 1.5, 0, 0, 0, 0, 0]",1.75
191,7.5,207.0,"[19.0, 0, 0, 0, 0, 0, 0]","[1.5, 0, 0, 0, 0, 0, 0]",13.4
192,7.5,76.0,"[41.0, 0, 0, 0, 0, 0, 0]","[1.5, 0, 0, 0, 0, 0, 0]",3.933333
193,7.5,58.0,"[10.0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0]",1.316667
194,7.5,75.0,"[0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0]",0.166667
195,7.5,81.0,"[18.0, 0, 0, 0, 0, 0, 0]","[0.5, 0, 0, 0, 0, 0, 0]",3.416667
196,7.5,180.0,"[58.0, 0, 0, 0, 0, 0, 0]","[1.5, 0, 0, 0, 0, 0, 0]",1.55


Then we need to group the data together into sequences. 

Collect rows 1 thru s, 2 thru (s+1), etc. and gather each set into a new row, with appropriate labels.
Truncate sequences where there's longer than a 12-hour gap in measurement.
Output a spreadsheet.

In [229]:
sequences = []

for iSeq in range(len(singleEntries) - sequenceLength):
    s = singleEntries[iSeq:(iSeq+sequenceLength)].reset_index(drop=True)
    dels = s['delT'].tolist()
    basals = s['Basal'].tolist()
    if all(d <= maxGapHours for d in dels) and all(b > 0 for b in basals):
        sequences.append({
            'MeasBG': s['MeasBG'].tolist(),
            'delT': s['delT'].tolist(),
            'Basal': s['Basal'].tolist(),
            'NewCarbs': s['NewCarbs'].tolist(),
            'NewInsulin': s['NewInsulin'].tolist()
        })
    
sequences = pandas.DataFrame(sequences)
sequences.to_csv('./processed/segment_' + str(segment) + '.csv', index=False)
sequences.tail(10)

Unnamed: 0,Basal,MeasBG,NewCarbs,NewInsulin,delT
177,"[7.0, 7.0, 7.0, 7.5, 7.5, 7.5, 7.5, 7.5, 7.5, ...","[243.0, 100.0, 133.0, 231.0, 127.0, 108.0, 257...","[[28.0, 0, 0, 0, 0, 0, 0], [49.0, 0, 0, 0, 0, ...","[[2.0, 0, 0, 0, 0, 0, 0], [2.0, 0, 0, 0, 0, 0,...","[3.3833333333022892, 3.2000000000116415, 7.583..."
178,"[7.0, 7.0, 7.5, 7.5, 7.5, 7.5, 7.5, 7.5, 7.5, ...","[100.0, 133.0, 231.0, 127.0, 108.0, 257.0, 81....","[[49.0, 0, 0, 0, 0, 0, 0], [44.0, 0, 0, 0, 0, ...","[[2.0, 0, 0, 0, 0, 0, 0], [1.0, 0, 0, 0, 0, 0,...","[3.2000000000116415, 7.583333333313931, 6.4166..."
179,"[7.0, 7.5, 7.5, 7.5, 7.5, 7.5, 7.5, 7.5, 7.5, ...","[133.0, 231.0, 127.0, 108.0, 257.0, 81.0, 363....","[[44.0, 0, 0, 0, 0, 3.0, 0], [15.0, 0, 0, 45.0...","[[1.0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1.0, 0, 0,...","[7.583333333313931, 6.416666666686069, 1.0, 5...."
180,"[7.5, 7.5, 7.5, 7.5, 7.5, 7.5, 7.5, 7.5, 7.5, ...","[231.0, 127.0, 108.0, 257.0, 81.0, 363.0, 384....","[[15.0, 0, 0, 45.0, 0, 0, 0], [0, 0, 0, 0, 0, ...","[[0, 0, 0, 1.0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0...","[6.416666666686069, 1.0, 5.150000000023283, 3...."
181,"[7.5, 7.5, 7.5, 7.5, 7.5, 7.5, 7.5, 7.5, 7.5, ...","[127.0, 108.0, 257.0, 81.0, 363.0, 384.0, 130....","[[0, 0, 0, 0, 0, 0, 0], [30.0, 0, 0, 0, 0, 0, ...","[[0, 0, 0, 0, 0, 0, 0], [1.0, 0, 0, 0, 0, 0, 0...","[1.0, 5.150000000023283, 3.849999999976717, 4...."
182,"[7.5, 7.5, 7.5, 7.5, 7.5, 7.5, 7.5, 7.5, 7.5, ...","[108.0, 257.0, 81.0, 363.0, 384.0, 130.0, 194....","[[30.0, 0, 0, 0, 0, 0, 0], [44.0, 0, 0, 0, 0, ...","[[1.0, 0, 0, 0, 0, 0, 0], [2.5, 0, 0, 0, 0, 0,...","[5.150000000023283, 3.849999999976717, 4.73333..."
183,"[7.5, 7.5, 7.5, 7.5, 7.5, 7.5, 7.5, 7.5, 7.5, ...","[257.0, 81.0, 363.0, 384.0, 130.0, 194.0, 75.0...","[[44.0, 0, 0, 0, 0, 0, 0], [9.0, 0, 0, 0, 0, 0...","[[2.5, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0...","[3.849999999976717, 4.733333333337214, 8.75, 4..."
184,"[7.5, 7.5, 7.5, 7.5, 7.5, 7.5, 7.5, 7.5, 7.5, ...","[81.0, 363.0, 384.0, 130.0, 194.0, 75.0, 68.0,...","[[9.0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 4...","[[0, 0, 0, 0, 0, 0, 0], [1.5, 0, 0, 0, 0, 0, 1...","[4.733333333337214, 8.75, 4.866666666639503, 2..."
185,"[7.5, 7.5, 7.5, 7.5, 7.5, 7.5, 7.5, 7.5, 7.5, ...","[363.0, 384.0, 130.0, 194.0, 75.0, 68.0, 207.0...","[[0, 0, 0, 0, 0, 0, 48.0], [0, 0, 0, 0, 0, 0, ...","[[1.5, 0, 0, 0, 0, 0, 1.0], [2.0, 0, 0, 0, 0, ...","[8.75, 4.866666666639503, 2.2000000000116415, ..."
186,"[7.5, 7.5, 7.5, 7.5, 7.5, 7.5, 7.5, 7.5, 7.5, ...","[384.0, 130.0, 194.0, 75.0, 68.0, 207.0, 76.0,...","[[0, 0, 0, 0, 0, 0, 0], [28.0, 0, 0, 0, 0, 0, ...","[[2.0, 0, 0, 0, 0, 0, 0], [1.0, 0, 0, 0, 0, 0,...","[4.866666666639503, 2.2000000000116415, 3.3666..."
