In [1]:
%matplotlib inline

! pip install plotly

import numpy as np
import pandas as pd
from pandas.stats.api import ols
import plotly.plotly as py
import matplotlib.pyplot as plt
import statsmodels.api as sm
from time import localtime



In [2]:
NOT_NAN_PCT = 90

In [4]:
# read the data set
csv = pd.read_csv('FoodFacts.csv')
frame = pd.DataFrame(csv)
print(frame.shape) # 65503 x 159

# explicitly set type
print(frame.columns.get_loc('no_nutriments')) #41, 42
print(frame.columns.get_loc('ingredients_from_palm_oil_n')) #46, 47
print(frame.columns.get_loc('ingredients_that_may_be_from_palm_oil_n')) #49, 50
print(frame.columns.get_loc('nutrition_grade_uk')) #52
print(frame.columns.get_loc('energy_100g')) #63~
frame[frame.columns[0 :41]] = frame[frame.columns[0 :41]].astype('str', raise_on_error=False)
frame[frame.columns[41:43]] = frame[frame.columns[41:43]].astype('int', raise_on_error=False)
frame[frame.columns[43:46]] = frame[frame.columns[43:46]].astype('str', raise_on_error=False)
frame[frame.columns[46:48]] = frame[frame.columns[46:48]].astype('int', raise_on_error=False)
frame[frame.columns[48:49]] = frame[frame.columns[48:49]].astype('str', raise_on_error=False)
frame[frame.columns[49:51]] = frame[frame.columns[49:51]].astype('int', raise_on_error=False)
frame[frame.columns[51:52]] = frame[frame.columns[51:52]].astype('str', raise_on_error=False)
frame[frame.columns[52:53]] = frame[frame.columns[52:53]].astype('int', raise_on_error=False)
frame[frame.columns[53:63]] = frame[frame.columns[53:63]].astype('str', raise_on_error=False)
frame[frame.columns[63:  ]] = frame[frame.columns[63:  ]].astype('int', raise_on_error=False)

# extract useful attributes
frame = frame.loc[:, frame.isnull().sum()/len(frame) < NOT_NAN_PCT/100]
cols = list(frame.columns)
i = 0
while i < len(cols):
    if cols[i] == cols[i-1]+'_tags':
        cols.remove(cols[i])
    elif cols[i] == cols[i-1]+'_en':
        cols.remove(cols[i-1])
    elif '_datetime' in cols[i] or cols[i] == 'url' or cols[i] == 'creator':
        cols.remove(cols[i])
    else:
        i = i+1
cols.pop()
frame = frame[cols]
frame.columns = frame.columns.str.replace('_en', '')
frame.columns = frame.columns.str.replace('_fr', '')
frame.columns = frame.columns.str.replace('_uk', '')
print(frame.columns)
print(frame.shape) # 65503 x 44

# food list
food_for_fat = frame[frame.fat_100g.notnull()]
food_for_sat = frame[frame.saturated_fat_100g.notnull()]
food_for_carbs = frame[frame.carbohydrates_100g.notnull()]
food_for_sugars = frame[frame.sugars_100g.notnull()]
food_for_fiber = frame[frame.fiber_100g.notnull()]
food_for_proteins = frame[frame.proteins_100g.notnull()]
food_for_salt = frame[frame.salt_100g.notnull()]
food_for_cal = frame[frame.energy_100g.notnull()]
print(food_for_fat.shape) # 36362 x 44
print(food_for_sat.shape) # 32429 x 44
print(food_for_carbs.shape) # 36065 x 44
print(food_for_sugars.shape) # 32639 x 44
print(food_for_fiber.shape) # 22546 x 44
print(food_for_proteins.shape) # 35930 x 44
print(food_for_salt.shape) # 32908 x 44
print(food_for_cal.shape) # 32898 x 44

# select all the unique countries:
countries = []
countries_raw = frame.countries.unique()
for country in countries_raw:
    cc = country.split(',')
    for c in cc:
        if ':' not in c:
            if ('Other-' in c):
                c = c.replace('Other-', '')
            if ('other-' in c):
                c = c.replace('other-', '')
            countries.append(c)
countries = list(set(countries))
print(countries)

# select countries with top 10 frenquencies:
occurences = frame.countries.value_counts().index.tolist()
occurences = occurences[0:10]
country_files = []
count = 0
for occurence in occurences:
    if occurence == 'United Kingdom':
        country_files.append(open('UK.txt', 'a'))
    elif occurence == 'United States':
        country_files.append(open('US.txt', 'a'))
    else:
        country_files.append(open(occurence+'.txt', 'a'))
    count = count+1
# 'France', 'United Kingdom', 'Spain', 'Germany', 'United States', 'Belgium', 
# 'Switzerland', 'Australia', 'Italy', and 'Portugal' are the countries of top 10 occurences.

"""
cal = [[] for x in range(10)]
fat = [[] for x in range(10)]
sat = [[] for x in range(10)]
carbs = [[] for x in range(10)]
sugars = [[] for x in range(10)]
fiber = [[] for x in range(10)]
proteins = [[] for x in range(10)]
na = [[] for x in range(10)]

# countries and their calorie (energy) intake:
for idx in range(len(food_for_cal.index)):
    country = food_for_cal.iloc[idx][food_for_cal.columns.get_loc('countries')]
    energy_100g = food_for_cal.iloc[idx][food_for_cal.columns.get_loc('energy_100g')]
    for i in range(len(occurences)):
        if occurences[i] in country:
            cal[i].append(energy_100g)
mean_cal = [np.mean(cal[x]) for x in range(len(cal))]
for idx in range(len(mean_cal)):
    country_files[idx].write(occurences[idx])
    country_files[idx].write(' ')
    country_files[idx].write(str(mean_cal[idx]))
    country_files[idx].write('\n')

# countries and their fat intake:
for idx in range(len(food_for_fat.index)):
    country = food_for_fat.iloc[idx][food_for_fat.columns.get_loc('countries')]
    fat_100g = food_for_fat.iloc[idx][food_for_fat.columns.get_loc('fat_100g')]
    for i in range(len(occurences)):
        if occurences[i] in country:
            fat[i].append(fat_100g)
mean_fat = [np.mean(fat[x]) for x in range(len(fat))]
for idx in range(len(mean_fat)):
    country_files[idx].write(occurences[idx])
    country_files[idx].write(' ')
    country_files[idx].write(str(mean_cal[idx]))
    country_files[idx].write('\n')

# countries and their saturated fat intake:
for idx in range(len(food_for_sat.index)):
    country = food_for_sat.iloc[idx][food_for_sat.columns.get_loc('countries')]
    saturated_fat_100g = food_for_sat.iloc[idx][food_for_sat.columns.get_loc('saturated_fat_100g')]
    for i in range(len(occurences)):
        if occurences[i] in country:
            sat[i].append(saturated_fat_100g)
mean_sat = [np.mean(sat[x]) for x in range(len(sat))]
for idx in range(len(mean_sat)):
    country_files[idx].write(occurences[idx])
    country_files[idx].write(' ')
    country_files[idx].write(str(mean_cal[idx]))
    country_files[idx].write('\n')
    
# countries and their carb intake:
for idx in range(len(food_for_carbs.index)):
    country = food_for_carbs.iloc[idx][food_for_carbs.columns.get_loc('countries')]
    carbohydrates_100g = food_for_carbs.iloc[idx][food_for_carbs.columns.get_loc('carbohydrates_100g')]
    for i in range(len(occurences)):
        if occurences[i] in country:
            carbs[i].append(carbohydrates_100g)
mean_carbs = [np.mean(carbs[x]) for x in range(len(carbs))]
for idx in range(len(mean_carbs)):
    country_files[idx].write(occurences[idx])
    country_files[idx].write(' ')
    country_files[idx].write(str(mean_cal[idx]))
    country_files[idx].write('\n')
    
# countries and their sugar intake:
for idx in range(len(food_for_sugars.index)):
    country = food_for_sugars.iloc[idx][food_for_sugars.columns.get_loc('countries')]
    sugars_100g = food_for_sugars.iloc[idx][food_for_sugars.columns.get_loc('sugars_100g')]
    for i in range(len(occurences)):
        if occurences[i] in country:
            sugars[i].append(sugars_100g)
mean_sugars = [np.mean(sugars[x]) for x in range(len(sugars))]
for idx in range(len(mean_sugars)):
    country_files[idx].write(occurences[idx])
    country_files[idx].write(' ')
    country_files[idx].write(str(mean_cal[idx]))
    country_files[idx].write('\n')

# countries and their fiber intake: 
for idx in range(len(food_for_fiber.index)):
    country = food_for_fiber.iloc[idx][food_for_fiber.columns.get_loc('countries')]
    fiber_100g = food_for_fiber.iloc[idx][food_for_fiber.columns.get_loc('fiber_100g')]
    for i in range(len(occurences)):
        if occurences[i] in country:
            fiber[i].append(fiber_100g)
mean_fiber = [np.mean(fiber[x]) for x in range(len(fiber))]
for idx in range(len(mean_fiber)):
    country_files[idx].write(occurences[idx])
    country_files[idx].write(' ')
    country_files[idx].write(str(mean_cal[idx]))
    country_files[idx].write('\n')

# countries and their protein intake:
for idx in range(len(food_for_proteins.index)):
    country = food_for_proteins.iloc[idx][food_for_proteins.columns.get_loc('countries')]
    proteins_100g = food_for_proteins.iloc[idx][food_for_proteins.columns.get_loc('proteins_100g')]
    for i in range(len(occurences)):
        if occurences[i] in country:
            proteins[i].append(proteins_100g)
mean_proteins = [np.mean(proteins[x]) for x in range(len(proteins))]
for idx in range(len(mean_proteins)):
    country_files[idx].write(occurences[idx])
    country_files[idx].write(' ')
    country_files[idx].write(str(mean_cal[idx]))
    country_files[idx].write('\n')
    
# countries and their salt intake: 
for idx in range(len(food_for_na.index)):
    country = food_for_na.iloc[idx][food_for_na.columns.get_loc('countries')]
    sodium_100g = food_for_na.iloc[idx][food_for_na.columns.get_loc('sodium_100g')]
    for i in range(len(occurences)):
        if occurences[i] in country:
            na[i].append(sodium_100g)
mean_na = [np.mean(na[x]) for x in range(len(na))]
for idx in range(len(mean_na)):
    country_files[idx].write(occurences[idx])
    country_files[idx].write(' ')
    country_files[idx].write(str(mean_cal[idx]))
    country_files[idx].write('\n')
"""  

food_for_sugars.head()


Columns (0,3,5,27,36) have mixed types. Specify dtype option on import or set low_memory=False.



(65503, 159)
41
46
49
52
63
Index(['code', 'created_t', 'last_modified_t', 'product_name', 'generic_name',
       'quantity', 'packaging', 'brands', 'categories', 'origins',
       'manufacturing_places', 'labels', 'emb_codes',
       'first_packaging_code_geo', 'cities', 'purchase_places', 'stores',
       'countries', 'ingredients_text', 'allergens', 'traces', 'serving_size',
       'additives_n', 'additives', 'ingredientsom_palm_oil_n',
       'ingredientsom_palm_oil_tags', 'ingredients_that_may_beom_palm_oil_n',
       'ingredients_that_may_beom_palm_oil_tags', 'nutrition_grade',
       'pnns_groups_1', 'pnns_groups_2', 'states', 'main_category',
       'image_url', 'image_small_url', 'energy_100g', 'fat_100g',
       'saturated_fat_100g', 'carbohydrates_100g', 'sugars_100g', 'fiber_100g',
       'proteins_100g', 'salt_100g', 'sodium_100g', 'nutrition_score_100g'],
      dtype='object')
(65503, 45)
(36362, 45)
(32429, 45)
(36065, 45)
(32639, 45)
(22546, 45)
(35930, 45)
(32908, 45)


Unnamed: 0,code,created_t,last_modified_t,product_name,generic_name,quantity,packaging,brands,categories,origins,...,energy_100g,fat_100g,saturated_fat_100g,carbohydrates_100g,sugars_100g,fiber_100g,proteins_100g,salt_100g,sodium_100g,nutrition_score_100g
10,27533024,1418732959,1442914346,Luxury Christmas Pudding,,454g,"plastic,bowl","Asda,Asda Extra Special","Desserts,Puddings,Biscuits-et-gateaux,Gateaux,...",,...,1284.0,7.0,3.6,54.2,44.0,4.7,3.9,0.2,0.07874,10.0
11,27533048,1418732915,1439141741,Luxury Christmas Pudding,,907g,"plastic,bowl","Asda,Asda Extra Special","Sugary snacks,Desserts,Biscuits and cakes,Cake...",,...,1284.0,7.0,3.6,54.2,44.0,4.7,3.9,0.199898,0.0787,10.0
13,40608754,1345024108,1439141731,"Pepsi, Nouveau goût !",Boisson gazeuse rafraîchissante aux extraits n...,15 cl,Canette aluminium,Pepsi,"Beverages,Carbonated drinks,Sodas,Colas,Sugare...",,...,177.0,0.0,0.0,10.4,10.4,0.0,0.0,0.0254,0.01,13.0
14,758,1409582884,1424687936,Cauliflower,Cauliflower,1,"Plastic,Bag","Sainsbury's,by sainsbury's","Plant-based foods and beverages,Plant-based fo...",United Kingdom,...,144.0,0.9,,2.9,2.5,1.8,3.6,0.0,0.0,
16,87177756,1433586486,1433589206,7Up,Boisson gazeuse aux extraits naturels de citro...,33 cl,Canette,"7Up,Pepsico","Beverages,Plant-based beverages,Carbonated dri...",,...,177.0,0.0,0.0,10.4,10.4,,0.0,0.1,0.03937,13.0


In [None]:
# having the means of each nutrient for each country (in corresponding files), let's do some regression.
# sugars
subframe = frame[frame.sugars_100g.notnull()]
subframe = subframe[subframe.nutrition_score_100g.notnull()]
rows = len(subframe.index)
print(rows)
Y = subframe.nutrition_score_100g
X = subframe.sugars_100g
X = sm.add_constant(X)
model = sm.OLS(Y, X)
model = model.fit()

'''
print(X.iloc[:,1].shape)
print(X.iloc[:,1].min())
print(X.iloc[:,1].max())
points = np.linspace(X.iloc[:,1].min(), X.iloc[:,1].max(), rows)
plt.plot(points, model.params[1]*points + model.params[0], '-')
plt.plot(X.iloc[:,1], Y, '.')
'''

model.summary()

In [None]:
# sodium

subframe = frame[frame.sodium_100g.notnull()]
subframe = subframe[subframe.nutrition_score_100g.notnull()]
rows = len(subframe.index)
print(rows)
Y = subframe.nutrition_score_100g
X = subframe.sodium_100g
X = sm.add_constant(X)
model = sm.OLS(Y, X)
model = model.fit()

'''
print(X.iloc[:,1].shape)
print(X.iloc[:,1].min())
print(X.iloc[:,1].max())
points = np.linspace(X.iloc[:,1].min(), X.iloc[:,1].max(), rows)
plt.plot(points, model.params[1]*points + model.params[0], '-')
plt.plot(X.iloc[:,1], Y, '.')
'''

model.summary()

In [None]:
# fat

subframe = frame[frame.fat_100g.notnull()]
subframe = subframe[subframe.nutrition_score_100g.notnull()]
rows = len(subframe.index)
print(rows)
Y = subframe.nutrition_score_100g
X = subframe.fat_100g
X = sm.add_constant(X)
model = sm.OLS(Y, X)
model = model.fit()

print(X.iloc[:,1].shape)
print(X.iloc[:,1].min())
print(X.iloc[:,1].max())
points = np.linspace(X.iloc[:,1].min(), X.iloc[:,1].max(), rows)
plt.plot(points, model.params[1]*points + model.params[0], '-')
plt.plot(X.iloc[:,1], Y, '.')

model.summary()

In [None]:
# saturated fat

subframe = frame[frame.saturated_fat_100g.notnull()]
subframe = subframe[subframe.nutrition_score_100g.notnull()]
rows = len(subframe.index)
print(rows)
Y = subframe.nutrition_score_100g
X = subframe.saturated_fat_100g
X = sm.add_constant(X)
model = sm.OLS(Y, X)
model = model.fit()

print(X.iloc[:,1].shape)
print(X.iloc[:,1].min())
print(X.iloc[:,1].max())
points = np.linspace(X.iloc[:,1].min(), X.iloc[:,1].max(), rows)
plt.plot(points, model.params[1]*points + model.params[0], '-')
plt.plot(X.iloc[:,1], Y, '.')

model.summary()

In [None]:
# carbohydrates

subframe = frame[frame.carbohydrates_100g.notnull()]
subframe = subframe[subframe.nutrition_score_100g.notnull()]
rows = len(subframe.index)
print(rows)
Y = subframe.nutrition_score_100g
X = subframe.carbohydrates_100g
X = sm.add_constant(X)
model = sm.OLS(Y, X)
model = model.fit()

print(X.iloc[:,1].shape)
print(X.iloc[:,1].min())
print(X.iloc[:,1].max())
points = np.linspace(X.iloc[:,1].min(), X.iloc[:,1].max(), rows)
plt.plot(points, model.params[1]*points + model.params[0], '-')
plt.plot(X.iloc[:,1], Y, '.')

model.summary()

In [None]:
# proteins

subframe = frame[frame.proteins_100g.notnull()]
subframe = subframe[subframe.nutrition_score_100g.notnull()]
rows = len(subframe.index)
print(rows)
Y = subframe.nutrition_score_100g
X = subframe.proteins_100g
X = sm.add_constant(X)
model = sm.OLS(Y, X)
model = model.fit()

print(X.iloc[:,1].shape)
print(X.iloc[:,1].min())
print(X.iloc[:,1].max())
points = np.linspace(X.iloc[:,1].min(), X.iloc[:,1].max(), rows)
plt.plot(points, model.params[1]*points + model.params[0], '-')
plt.plot(X.iloc[:,1], Y, '.')

model.summary()

In [None]:
# energy

subframe = frame[frame.energy_100g.notnull()]
subframe = subframe[subframe.nutrition_score_100g.notnull()]
rows = len(subframe.index)
print(rows)
Y = subframe.nutrition_score_100g
X = subframe.energy_100g
X = sm.add_constant(X)
model = sm.OLS(Y, X)
model = model.fit()

print(X.iloc[:,1].shape)
print(X.iloc[:,1].min())
print(X.iloc[:,1].max())
points = np.linspace(X.iloc[:,1].min(), X.iloc[:,1].max(), rows)
plt.plot(points, model.params[1]*points + model.params[0], '-')
plt.plot(X.iloc[:,1], Y, '.')

model.summary()

In [None]:
# fiber

subframe = frame[frame.fiber_100g.notnull()]
subframe = subframe[subframe.nutrition_score_100g.notnull()]
rows = len(subframe.index)
print(rows)
Y = subframe.nutrition_score_100g
X = subframe.fiber_100g
X = sm.add_constant(X)
model = sm.OLS(Y, X)
model = model.fit()

print(X.iloc[:,1].shape)
print(X.iloc[:,1].min())
print(X.iloc[:,1].max())
points = np.linspace(X.iloc[:,1].min(), X.iloc[:,1].max(), rows)
plt.plot(points, model.params[1]*points + model.params[0], '-')
plt.plot(X.iloc[:,1], Y, '.')

model.summary()