In [46]:
# Preprocessing parameters (everything a reader may want to tune)

# --- Insulin / food history vectors ---
# Width of one history bin, in hours.
b = 0.5
# Total span of the history vectors, in hours. Data from more than d hours
# ago is treated as having an impact that no longer changes.
d = 3

# --- Which part of the log to use ---
# Length of one segment of data used for inference, in days.
segmentDays = 30
# Which segment to use; numbering starts at 1 for the first segment.
segment = 5

# --- Sequence construction ---
# Number of glucose measurements grouped into one sequence.
sequenceLength = 10


In [64]:
# Read in the data

import pandas
import datetime

# Load the full log and flip the row order so that it runs earliest -> latest.
# NOTE(review): this presumes the CSV is stored newest-first; confirm against the file.
dfFull = pandas.read_csv('./data/mdi.csv').iloc[::-1]
dfFull

Unnamed: 0,CorrectTimestamp,Glucose,Meal,MealCarbs,Humalog,Lantus,WhatEating,Notes
1971,10/19/16 19:50,385.0,,,1.00,2.0,,
1970,10/20/16 2:00,366.0,,,,,,
1969,10/20/16 6:46,338.0,,,,,,
1968,10/20/16 8:40,263.0,Breakfast,,1.00,,,
1967,10/20/16 12:50,295.0,Lunch,,1.50,,,
1966,10/20/16 15:40,534.0,Snack,,2.00,,,
1965,10/20/16 18:40,344.0,Dinner,48.0,1.50,,,
1964,10/20/16 20:00,,,,,2.0,,
1963,10/21/16 2:00,385.0,,,1.00,,,
1962,10/21/16 8:10,299.0,Breakfast,51.0,1.00,,,


In [82]:
# Pick out data from a particular segment to use, and get time in hours

# Parse the timestamp strings (month/day/2-digit-year hour:minute) into datetime objects.
timestamps = dfFull['CorrectTimestamp']
df = dfFull.copy()
df['Time'] = [datetime.datetime.strptime(t, "%m/%d/%y %H:%M") for t in timestamps] # datetime object times

# Segment k (1-based, chosen by `segment`) covers the window
# [firstTime + (k - 1) * segmentDays, firstTime + k * segmentDays], both ends inclusive.
# Uses the parsed 'Time' column and the `segment` parameter, so the cell runs on a
# fresh kernel without relying on variables left over from earlier sessions.
firstTime = df['Time'].min()
startDate = firstTime + datetime.timedelta(days=segmentDays * (segment - 1))
endDate   = firstTime + datetime.timedelta(days=segmentDays * segment)

# .copy() gives an independent frame, so the column assignment below does not
# write into a slice of the unfiltered data (avoids SettingWithCopyWarning).
df = df[df['Time'].between(startDate, endDate)].copy()

# Hours elapsed since the first entry of the selected segment. Computed as a time
# difference so it does not depend on how naive timestamps map to epoch seconds.
df['Hour'] = (df['Time'] - df['Time'].min()).dt.total_seconds() / 3600

'Using {} data points between {} and {}'.format(df.shape[0], startDate.strftime('%Y-%m-%d'), endDate.strftime('%Y-%m-%d'))

'Using 195 data points between 2017-02-16 and 2017-03-18'

For each GLUCOSE measurement we need to obtain:
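* Humalog vector: "New injection info" - include only injections since the last glucose measurement (including this timepoint). Binned by how long ago the injection was: [0-b, b-2b, 2b-3b, ..., (d-b)-d, d+], where b is the bin size and d is the total duration set above.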
* Carbs vector: "New carbs info" - include only since last glucose measurement (including this timepoint), as for Humalog.
* delT: time since last glucose measurement, in hours.
* Current basal rate: approximate as most recent Lantus dose.

In [79]:
newInsulinEntries = [] # List of insulin since last BG measurement, each in form (timeInHours, insulinUnits)
newCarbEntries = [] # List of carbs since last BG measurement, each in form (timeInHours, carbGrams)
glucoseRows = [] # One record (dict) per glucose measurement, collected by the loop below
# NOTE(review): with lastTime starting at 0, the first delT is measured from the
# start of the segment rather than from a real previous measurement.
lastTime = 0
lastLantus = 0

for (index, row) in df.iterrows(): # Loop through data entries. For each entry...
    if pandas.notna(row['Lantus']): # Store most recent Lantus dose
        lastLantus = row['Lantus']

    # Record insulin / carbs BEFORE looking at glucose, so that a dose or meal logged
    # on the same row as a glucose measurement is included with that measurement
    # ("including this timepoint") instead of being dropped.
    if pandas.notna(row['Humalog']):
        newInsulinEntries.append((row['Hour'], row['Humalog']))
    if pandas.notna(row['MealCarbs']):
        newCarbEntries.append((row['Hour'], row['MealCarbs']))

    if pandas.isna(row['Glucose']): # Nothing more to do until the next glucose measurement
        print('No glucose measurement')
        continue

    print(row['Glucose'])
    # TODO: Use newInsulinEntries (which already holds this row's dose, if any) to construct insulin vector

    # TODO: Use newCarbEntries (which already holds this row's carbs, if any) to construct carb vector

    # Get time since last glucose measurement, in hours
    delT = row['Hour'] - lastTime

    # Keep the per-measurement values; the raw entry lists stand in for the
    # binned insulin / carb vectors until the TODOs above are implemented.
    glucoseRows.append({
        'Hour': row['Hour'],
        'delT': delT,
        'Glucose': row['Glucose'],
        'Basal': lastLantus,
        'NewInsulin': newInsulinEntries,
        'NewCarbs': newCarbEntries,
    })

    # Start fresh lists of new insulin & carbs (rebinding, so the lists stored in
    # the record above are left intact), and set last T to this T
    newInsulinEntries = []
    newCarbEntries = []
    lastTime = row['Hour']


    

94.0
76.0
89.0
199.0
166.0
209.0
87.0
93.0
123.0
2.5
NA
61.0
105.0
162.0
162.0
185.0
158.0
131.0
152.0
144.0
NA
245.0
355.0
232.0
239.0
313.0
168.0
171.0
139.0
95.0
104.0
321.0
122.0
114.0
297.0
343.0
80.0
81.0
83.0
115.0
151.0
160.0
206.0
211.0
201.0
NA
188.0
167.0
103.0
169.0
357.0
77.0
2.5
NA
129.0
155.0
269.0
250.0
257.0
96.0
164.0
195.0
151.0
169.0
179.0
175.0
66.0
2.5
107.0
107.0
68.0
100.0
147.0
121.0
183.0
86.0
2.5
97.0
115.0
101.0
157.0
167.0
2.5
NA
232.0
237.0
335.0
2.5
167.0
182.0
184.0
334.0
170.0
2.5
237.0
258.0
244.0
207.0
316.0
2.5
162.0
67.0
100.0
186.0
201.0
291.0
145.0
2.5
NA
220.0
71.0
95.0
295.0
2.5
NA
72.0
82.0
74.0
105.0
127.0
376.0
246.0
2.5
NA
56.0
102.0
115.0
217.0
227.0
55.0
105.0
92.0
2.5
NA
125.0
165.0
175.0
59.0
101.0
2.5
83.0
119.0
202.0
2.5
219.0
272.0
165.0
206.0
56.0
114.0
64.0
90.0
155.0
295.0
2.5
NA
116.0
301.0
116.0
176.0
2.5
192.0
NA
60.0
302.0
139.0
127.0
2.5
263.0
365.0
307.0
72.0
2.5
252.0
302.0
182.0
2.5
458.0
141.0
289.0
177.0
2.5
219.0
281.0
1

Then we need to group the data together into sequences. 

Collect rows 1-s, 2-(s+1), ... (where s is `sequenceLength`) and gather each set into a new row, with appropriate labels.
Truncate sequences that contain a gap of more than 12 hours between measurements.
Output a spreadsheet.