In [2]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import os

In [3]:
# Kept in this cell because later cells rely on json.loads / json.dump.
import json

filepath = 'data/mfp-diaries.tsv'

# Load the tab-separated diary dump as a pandas.DataFrame.
# The file has no header row, so the four positional columns are renamed
# right after reading. The path is handed to pandas with an explicit encoding
# (instead of a handle from a bare open(), which decodes with the platform's
# default encoding) so non-ASCII food names are read the same way on every OS.
data = pd.read_csv(filepath, sep='\t', header=None, encoding='utf-8').rename(
    columns={0: 'user_id', 1: 'date', 2: 'food_entries', 3: 'intakes_goals'}
)

In [4]:
# Pick one diary row by position and unpack its four fields.
i = 40000
user_id = data['user_id'].iloc[i]
date = data['date'].iloc[i]
# The last two columns hold JSON text; decode both into Python objects.
food_entries, aggregate_intake_goal = (
    json.loads(data[column].iloc[i])
    for column in ('food_entries', 'intakes_goals')
)

In [5]:
# Show the user id taken from row i.
user_id

674

In [6]:
# Show the diary date taken from row i.
date

'2015-02-25'

In [7]:
# Show the decoded food entries of row i: a list of meals, each carrying its
# dishes and their per-nutrient values.
food_entries

[{'meal': 'Breakfast',
  'dishes': [{'nutritions': [{'name': 'Calories', 'value': '2'},
     {'name': 'Carbs', 'value': '0'},
     {'name': 'Sat Fat', 'value': '0'},
     {'name': 'Protein', 'value': '0'},
     {'name': 'Chol', 'value': '0'},
     {'name': 'Vit C', 'value': '0'}],
    'name': 'Coffee - Brewed from grounds, 1 cup (8 fl oz)'},
   {'nutritions': [{'name': 'Calories', 'value': '27'},
     {'name': 'Carbs', 'value': '0'},
     {'name': 'Sat Fat', 'value': '1'},
     {'name': 'Protein', 'value': '0'},
     {'name': 'Chol', 'value': '0'},
     {'name': 'Vit C', 'value': '0'}],
    'name': 'Deli Continental - Smooth Reduced Fat Brussels Pâté, 13 g'},
   {'nutritions': [{'name': 'Calories', 'value': '58'},
     {'name': 'Carbs', 'value': '11'},
     {'name': 'Sat Fat', 'value': '0'},
     {'name': 'Protein', 'value': '2'},
     {'name': 'Chol', 'value': '0'},
     {'name': 'Vit C', 'value': '0'}],
    'name': 'Greenhalghs - Honey and Sunflower Bread, 1 slice from small 400g loa

In [8]:
# Show the decoded daily summary of row i: the 'total' intake and the 'goal'
# values, each as a list of {'name', 'value'} records.
aggregate_intake_goal

{'total': [{'name': 'Calories', 'value': 1278},
  {'name': 'Carbs', 'value': 188},
  {'name': 'Sat Fat', 'value': 12},
  {'name': 'Protein', 'value': 43},
  {'name': 'Chol', 'value': 6},
  {'name': 'Vit C', 'value': 4}],
 'goal': [{'name': 'Calories', 'value': 1310},
  {'name': 'Carbs', 'value': 164},
  {'name': 'Sat Fat', 'value': 15},
  {'name': 'Protein', 'value': 66},
  {'name': 'Chol', 'value': 300},
  {'name': 'Vit C', 'value': 100}]}

In [42]:
import pathlib

def create_folder(p):
    """Create directory ``p`` and place an empty ``.gitkeep`` file inside it.

    Parameters
    ----------
    p : str or os.PathLike
        Directory to create. Missing parent directories are created as well,
        and an already existing directory is not an error.

    Returns
    -------
    None
    """
    folder = pathlib.Path(p)
    folder.mkdir(parents=True, exist_ok=True)
    # Join path components instead of concatenating strings: this accepts
    # Path objects too and is unaffected by a trailing separator on `p`.
    # The marker file presumably exists so git can track the otherwise
    # empty directory.
    (folder / '.gitkeep').touch()

# root folder to put the many folders
subpath = 'data'


def _write_json_text(path, payload):
    """Write ``payload`` to ``path`` as a JSON document.

    The DataFrame cells already contain JSON text, so a string is written
    verbatim; passing it through json.dump would wrap it in quotes and escape
    it, leaving a JSON *string* in the file instead of the array/object.
    Anything that is not a string is serialised with json.dump.
    """
    # Raw JSON text may contain non-ASCII characters, so fix the encoding.
    with open(path, 'w', encoding='utf-8') as f:
        if isinstance(payload, str):
            f.write(payload)
        else:
            json.dump(payload, f)


# Loop over all rows and create one folder per (user, date) pair.
# itertuples walks the rows once instead of doing four label lookups per row.
export_columns = ['user_id', 'date', 'food_entries', 'intakes_goals']
for row_user, row_date, row_food, row_goals in data[export_columns].itertuples(index=False, name=None):
    # `date` may already have been converted to Timestamps by another cell;
    # format it explicitly so the folder is always named YYYY-MM-DD rather
    # than e.g. '2014-09-14 00:00:00'.
    day = row_date.strftime('%Y-%m-%d') if hasattr(row_date, 'strftime') else str(row_date)
    this_path = os.path.join(subpath, str(row_user), day)
    create_folder(this_path)

    _write_json_text(os.path.join(this_path, 'food_entries.json'), row_food)
    _write_json_text(os.path.join(this_path, 'intakes_goals.json'), row_goals)

In [9]:
# Show the loaded frame; the rendered table is abbreviated to its first and
# last rows.
data

Unnamed: 0,user_id,date,food_entries,intakes_goals
0,1,2014-09-14,"[{""meal"": ""MY food"", ""dishes"": [{""nutritions"":...","{""total"": [{""name"": ""Calories"", ""value"": 2924}..."
1,1,2014-09-15,"[{""meal"": ""MY food"", ""dishes"": [{""nutritions"":...","{""total"": [{""name"": ""Calories"", ""value"": 2430}..."
2,1,2014-09-16,"[{""meal"": ""MY food"", ""dishes"": [{""nutritions"":...","{""total"": [{""name"": ""Calories"", ""value"": 1862}..."
3,1,2014-09-17,"[{""meal"": ""MY food"", ""dishes"": [{""nutritions"":...","{""total"": [{""name"": ""Calories"", ""value"": 2251}..."
4,1,2014-09-18,"[{""meal"": ""MY food"", ""dishes"": [{""nutritions"":...","{""total"": [{""name"": ""Calories"", ""value"": 2001}..."
...,...,...,...,...
587182,9897,2015-03-02,"[{""meal"": ""Breakfast"", ""dishes"": [{""nutritions...","{""total"": [{""name"": ""Calories"", ""value"": 1979}..."
587183,9897,2015-03-03,"[{""meal"": ""Breakfast"", ""dishes"": [{""nutritions...","{""total"": [{""name"": ""Calories"", ""value"": 2141}..."
587184,9897,2015-03-04,"[{""meal"": ""Breakfast"", ""dishes"": [{""nutritions...","{""total"": [{""name"": ""Calories"", ""value"": 543},..."
587185,9897,2015-03-14,"[{""meal"": ""Breakfast"", ""dishes"": [{""nutritions...","{""total"": [{""name"": ""Calories"", ""value"": 2024}..."


In [10]:
# Turn the date strings into datetime values so that later cells can do date
# arithmetic (differences, min/max spans) on the column.
data['date'] = pd.to_datetime(data['date'])

In [28]:
# Time between the earliest and the latest diary date in the whole data set.
data['date'].max() - data['date'].min()

Timedelta('207 days 00:00:00')

In [24]:
def _inclusive_span_days(dates):
    """Days from the earliest to the latest value in ``dates``, both ends counted."""
    return (dates.max() - dates.min()).days + 1


# One row per user. Note that this is the calendar span between a user's first
# and last entry, not the count of distinct days that have an entry.
num_days_logged = data.groupby('user_id').agg({'date': _inclusive_span_days})

In [25]:
# Show the per-user span (in days, first to last entry inclusive).
num_days_logged

Unnamed: 0_level_0,date
user_id,Unnamed: 1_level_1
1,180
2,60
3,10
4,133
5,179
...,...
9893,28
9894,166
9895,179
9896,35


In [26]:
# Summary statistics of the per-user span in days across all users.
num_days_logged['date'].describe()

count    9896.000000
mean      115.121968
std        62.997824
min         1.000000
25%        61.000000
50%       137.000000
75%       175.000000
max       193.000000
Name: date, dtype: float64

In [32]:
# Force the goals column to plain strings before it is fed to json.loads in
# the extraction cell.
data['intakes_goals'] = data['intakes_goals'].astype(str)

In [33]:
def extract(ser, target):
    """Return the ``'value'`` of the first record in ``ser`` named ``target``.

    Parameters
    ----------
    ser : iterable of dict, or None
        Records shaped like ``{'name': ..., 'value': ...}``. Any iterable is
        accepted (list, tuple, generator); ``None`` is treated as empty.
    target : str
        The ``'name'`` to look for.

    Returns
    -------
    The matching record's ``'value'``, or ``None`` when no record matches.
    """
    if ser is None:
        return None

    # Iterate over the records directly instead of indexing by position, so
    # the input does not have to support len() or integer indexing.
    for record in ser:
        if record['name'] == target:
            return record['value']

    return None

# New column name -> nutrient name as it appears in the 'goal' records.
# Kept as an ordered list so the columns are added in the same order as before.
goal_column_sources = [
    ('calorie_goal', 'Calories'),
    ('carb_goal', 'Carbs'),
    ('fiber_goal', 'Fiber'),
    ('sugar_goal', 'Sugar'),
    ('protein_goal', 'Protein'),
    ('fat_goal', 'Fat'),
    ('sodium_goal', 'Sodium'),
]

# Decode each row's JSON once and reuse the 'goal' list for every nutrient,
# rather than re-parsing the same text once per column.
parsed_goals = data.intakes_goals.apply(lambda raw: json.loads(raw)['goal'])

for goal_column, nutrient_name in goal_column_sources:
    # Bind the nutrient as a default argument so each lambda keeps its own
    # value. Rows without that nutrient get None from extract().
    data[goal_column] = parsed_goals.apply(
        lambda goals, wanted=nutrient_name: extract(goals, wanted)
    )

In [34]:
# Preview the first rows, including the newly added *_goal columns.
data.head()

Unnamed: 0,user_id,date,food_entries,intakes_goals,calorie_goal,carb_goal,fiber_goal,sugar_goal,protein_goal,fat_goal,sodium_goal
0,1,2014-09-14,"[{""meal"": ""MY food"", ""dishes"": [{""nutritions"":...","{""total"": [{""name"": ""Calories"", ""value"": 2924}...",3173.0,396.0,,119.0,160.0,105.0,2300.0
1,1,2014-09-15,"[{""meal"": ""MY food"", ""dishes"": [{""nutritions"":...","{""total"": [{""name"": ""Calories"", ""value"": 2430}...",1572.0,196.0,,59.0,79.0,52.0,2300.0
2,1,2014-09-16,"[{""meal"": ""MY food"", ""dishes"": [{""nutritions"":...","{""total"": [{""name"": ""Calories"", ""value"": 1862}...",1832.0,229.0,,69.0,92.0,61.0,2300.0
3,1,2014-09-17,"[{""meal"": ""MY food"", ""dishes"": [{""nutritions"":...","{""total"": [{""name"": ""Calories"", ""value"": 2251}...",1685.0,210.0,,63.0,85.0,56.0,2300.0
4,1,2014-09-18,"[{""meal"": ""MY food"", ""dishes"": [{""nutritions"":...","{""total"": [{""name"": ""Calories"", ""value"": 2001}...",1597.0,199.0,,60.0,80.0,53.0,2300.0


In [35]:
# For every user, count how many distinct values each goal column takes.
goal_columns = [
    'calorie_goal', 'carb_goal', 'fiber_goal', 'sugar_goal',
    'protein_goal', 'fat_goal', 'sodium_goal',
]
t = data.groupby('user_id')[goal_columns].nunique()

In [36]:
# Show the per-user counts of distinct goal values, one column per goal type.
t

Unnamed: 0_level_0,calorie_goal,carb_goal,fiber_goal,sugar_goal,protein_goal,fat_goal,sodium_goal
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,168,120,0,65,76,61,1
2,16,17,0,14,16,15,1
3,4,4,3,0,3,3,1
4,25,24,0,0,18,17,1
5,34,0,16,25,31,27,1
...,...,...,...,...,...,...,...
9893,22,21,20,20,20,0,1
9894,1,1,0,1,1,1,0
9895,152,108,28,62,99,46,0
9896,3,4,3,3,4,3,0


In [None]:
# Per-user count of distinct calorie-goal values.
data.groupby('user_id')['calorie_goal'].nunique()

In [120]:
# Distribution, across users, of how many distinct calorie-goal values each
# user had. This counts distinct values rather than changes: a user who
# returns to an earlier goal is not counted again.
# `t` holds one column per goal type, so select the calorie column explicitly
# to get the single-column summary this cell is about.
t['calorie_goal'].describe()

count    9896.000000
mean       31.109539
std        37.420116
min         1.000000
25%         4.000000
50%        16.000000
75%        44.000000
max       174.000000
Name: calorie_goal, dtype: float64