In [None]:
import datetime

import pandas

# Set preprocessing parameters

b = 0.5 # Bin size for insulin and food history vectors, in hours
d = 3   # Total duration of insulin and food history vectors, in hours.
        # Impact of data from more than d hours ago is assumed to no longer change.

segmentDays = 30 # how long a segment of data to use for inference at all, in days
segment = 8      # which segment to use, starting at 1 for the first segment.

sequenceLength = 10 # how many glucose measurements to bin into one sequence
maxGapHours = 16 # how long a single delT (time between glucose measurements) to accept in a sequence

# Read in the data

dfFull = pandas.read_csv('./data/mdi.csv')
dfFull = dfFull.reindex(index=dfFull.index[::-1]) # reverse row order: earliest - latest

# Parse the timestamps (vectorised; yields a datetime64 column)

timestamps = dfFull['CorrectTimestamp']
dfFull['Time'] = pandas.to_datetime(timestamps, format="%m/%d/%y %H:%M")

# Pick out data from a particular segment to use, and get time in hours

firstTime = dfFull['Time'].min()
startDate = firstTime + datetime.timedelta(days=segmentDays * (segment - 1))
endDate   = firstTime + datetime.timedelta(days=segmentDays * segment)

# Half-open window [startDate, endDate): a row falling exactly on a segment
# boundary belongs to one segment only, instead of to both neighbours.
inSegment = (dfFull['Time'] >= startDate) & (dfFull['Time'] < endDate)
# Explicit copy so adding the 'Hour' column below never writes through to
# (or warns about) a slice of dfFull.
df = dfFull[inSegment].copy()

# Hours elapsed since the first row of the segment. Computed from timedeltas
# rather than by subtracting two large epoch-hour floats, which loses enough
# precision to push an event across a bin boundary later on.
df['Hour'] = (df['Time'] - df['Time'].min()).dt.total_seconds() / 3600

print('Using {} data points between {} and {}'.format(df.shape[0], startDate.strftime('%Y-%m-%d'), endDate.strftime('%Y-%m-%d')))

For each GLUCOSE measurement we need to obtain:
* Humalog vector: "New injection info" - include only injections since the last glucose measurement (including this timepoint). Binned by age in hours: [0-b, b-2b, 2b-3b, ... (d-b)-d, d+], where b is the bin size and d is the total history duration.
* Carbs vector: "New carbs info" - include only since last glucose measurement (including this timepoint), as for Humalog.
* delT: time since last glucose measurement, in hours.
* Current basal rate: approximate as most recent Lantus dose.

In [None]:
import math

nBins = math.ceil(d/b)


def _binHistory(entries, now, binWidth, binCount):
    """Total up (timeInHours, amount) entries by their age relative to `now`.

    Returns a list of binCount + 1 totals. Element k (k < binCount) holds the
    amounts whose age lies in [k*binWidth, (k+1)*binWidth) hours; the final
    element holds everything at least binCount*binWidth hours old.
    """
    totals = []
    for k in range(binCount):
        youngest = k * binWidth
        oldest = (k + 1) * binWidth
        totals.append(sum([amount for (t, amount) in entries
                           if youngest <= now - t < oldest]))
    # Overflow bin: ALL entries from at least binCount * binWidth hours ago
    totals.append(sum([amount for (t, amount) in entries
                       if now - t >= binCount * binWidth]))
    return totals


# Events seen since the last BG measurement, as (timeInHours, amount) pairs
newInsulinEntries = [] # (timeInHours, insulinUnits)
newCarbEntries = []    # (timeInHours, carbGrams)
lastTime = 0

# Seed the basal dose with the latest Lantus entry at or before the segment
# start, if there is one; otherwise it stays 0 until the first dose appears.
prevLantus = dfFull.loc[dfFull["Lantus"].notna() & (dfFull["Time"] <= startDate), "Lantus"]
lastLantus = 0 if prevLantus.empty else prevLantus.values[-1]

singleEntries = []

for _, row in df.iterrows():
    hasInsulin = pandas.notna(row['Humalog'])
    hasCarbs = pandas.notna(row['MealCarbs'])

    # Always track the most recent Lantus dose
    if pandas.notna(row['Lantus']):
        lastLantus = row['Lantus']

    if pandas.isna(row['Glucose']):
        # No glucose reading on this row: just remember its insulin / carbs
        # so they can be binned at the next reading.
        if hasInsulin:
            newInsulinEntries.append((row['Hour'], row['Humalog']))
        if hasCarbs:
            newCarbEntries.append((row['Hour'], row['MealCarbs']))
        continue

    # Glucose reading: emit one output row.
    now = row['Hour']

    # Insulin history, with any dose logged on this very row going in bin 0
    humalog = _binHistory(newInsulinEntries, now, b, nBins)
    if hasInsulin:
        humalog[0] += row['Humalog']

    # Carb history, handled the same way
    carbs = _binHistory(newCarbEntries, now, b, nBins)
    if hasCarbs:
        carbs[0] += row['MealCarbs']

    singleEntries.append({
        'MeasBG': row['Glucose'],
        'delT': now - lastTime, # hours since the previous glucose reading
        'NewInsulin': humalog,
        'NewCarbs': carbs,
        'Basal': lastLantus
    })

    # Start collecting afresh for the next reading
    newInsulinEntries = []
    newCarbEntries = []
    lastTime = now

singleEntries = pandas.DataFrame(singleEntries)
singleEntries.tail(10)


Then we need to group the data together into sequences. 

Collect rows 1 thru s, 2 thru (s+1), etc. and gather each set into a new row, with appropriate labels.
Discard any sequence that contains a gap between glucose measurements longer than `maxGapHours` (16 hours as configured above), or in which the basal dose is not yet known.
Output a spreadsheet.

In [None]:
import os

# Slide a window of sequenceLength consecutive glucose rows over singleEntries
# and keep every window that has no over-long gap and a known basal dose.
sequences = []

# N rows contain N - sequenceLength + 1 full windows; the "+ 1" keeps the final
# window, which would otherwise be dropped. A non-positive count simply
# yields no windows.
nWindows = len(singleEntries) - sequenceLength + 1

for iSeq in range(nWindows):
    window = singleEntries.iloc[iSeq:iSeq + sequenceLength]
    dels = window['delT'].tolist()
    basals = window['Basal'].tolist()

    gapsOk = all(gap <= maxGapHours for gap in dels)
    basalKnown = all(basal > 0 for basal in basals) # 0 means no Lantus dose seen yet
    if gapsOk and basalKnown:
        sequences.append({
            'MeasBG': window['MeasBG'].tolist(),
            'delT': dels,
            'Basal': basals,
            'NewCarbs': window['NewCarbs'].tolist(),
            'NewInsulin': window['NewInsulin'].tolist()
        })

sequences = pandas.DataFrame(sequences)

# Make sure the output folder exists so the export works on a fresh checkout
outputDir = './processed'
os.makedirs(outputDir, exist_ok=True)
sequences.to_csv(outputDir + '/segment_' + str(segment) + '.csv', index=False)
sequences.tail(10)