# Should My Child Eat This?

### One of the biggest challenges I’ve experienced as a mother is getting my son to eat healthy food. He prefers processed food such as crackers or cookies. I'm always hesitant to given him what he wants in fear of the food being unhealthy. For that reason, I've decided to build a model that can determine if a food is healthy or not. 

In [1]:
# Imports
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
import math

from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
import math
from sklearn.linear_model import Lasso

%matplotlib inline
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
data = pd.read_csv('en.openfoodfacts.org.products.tsv', sep='\t', low_memory=False)

In [3]:
data.columns

Index(['code', 'url', 'creator', 'created_t', 'created_datetime',
       'last_modified_t', 'last_modified_datetime', 'product_name',
       'generic_name', 'quantity',
       ...
       'fruits-vegetables-nuts_100g', 'fruits-vegetables-nuts-estimate_100g',
       'collagen-meat-protein-ratio_100g', 'cocoa_100g', 'chlorophyl_100g',
       'carbon-footprint_100g', 'nutrition-score-fr_100g',
       'nutrition-score-uk_100g', 'glycemic-index_100g',
       'water-hardness_100g'],
      dtype='object', length=163)

In [4]:
for column in data.columns:
    if len(data[str(column)].value_counts()) == 1:
        data.drop([str(column)],axis=1,inplace=True)

In [5]:
data.drop(['url','creator','created_t', 'created_datetime', 'last_modified_datetime', 'generic_name',
          'no_nutriments', 'ingredients_from_palm_oil','ingredients_that_may_be_from_palm_oil', 
           'nutrition_grade_uk', 'chlorophyl_100g','glycemic-index_100g','water-hardness_100g', 
           '-butyric-acid_100g', '-caproic-acid_100g','image_small_url','image_url', 'main_category', 
           'states_tags', 'states', 'additives_tags', 'traces_tags', 'last_modified_t', 'packaging_tags', 
           'brands_tags', 'categories_tags', 'categories', 'origins_tags', 'manufacturing_places_tags', 
           'labels', 'labels_tags', 'emb_codes_tags', 'countries_tags','countries','traces'], axis=1, inplace=True)

In [6]:
data.shape

(356027, 123)

In [7]:
def describe_categorical(X):
    from IPython.display import display, HTML
    display(HTML(X[X.columns[X.dtypes == "object"]].describe().to_html()))

In [8]:
describe_categorical(data)

Unnamed: 0,code,product_name,quantity,packaging,brands,categories_en,origins,manufacturing_places,labels_en,emb_codes,first_packaging_code_geo,cities,cities_tags,purchase_places,stores,countries_en,ingredients_text,allergens,allergens_en,traces_en,serving_size,additives,additives_en,ingredients_from_palm_oil_tags,ingredients_that_may_be_from_palm_oil_tags,nutrition_grade_fr,pnns_groups_1,pnns_groups_2,states_en,main_category_en
count,356001,338515,119285,89960,326977,103301,25050,42008,59204,32490,20872,26,22502,66565,57701,355752,283893,37176,20,28420,216621,283825,170189,6572,14328,254856,129006,132689,355975,103249
unique,356001,249245,15563,16466,66961,24112,5244,8076,16004,9210,1696,3,2733,5741,3646,779,238920,16575,20,3926,25928,228280,39941,14,180,5,14,42,1068,3639
top,4311501472972,Ice Cream,500 g,"Sachet,Plastique",Carrefour,"Beverages,Non-sugared beverages",France,France,"Organic,EU Organic,fr:AB Agriculture Biologique",EMB 56251E,"47.633333,-2.666667",a,theix-morbihan-france,France,Carrefour,United States,"Carbonated water, natural flavor.",blé,http://en.openfoodfacts.org/images/products/30...,Nuts,240 ml (8 fl oz),[ extra-virgin-olive-oil -> en:extra-virgin-o...,E330 - Citric acid,huile-de-palme,e471-mono-et-diglycerides-d-acides-gras-alimen...,d,unknown,unknown,"To be completed,Nutrition facts completed,Ingr...",Plant-based foods and beverages
freq,1,411,5285,2633,3670,2345,6159,11225,5080,229,297,11,286,14543,6878,173159,222,1495,1,2491,5496,397,8656,6239,3584,72436,43603,43603,167862,22950


In [9]:
data.drop(['code', 'product_name', 'quantity', 'packaging', 'brands', 'categories_en', 'origins', 
           'manufacturing_places', 'labels_en', 'emb_codes', 'first_packaging_code_geo', 'cities', 
           'cities_tags', 'purchase_places', 'stores', 'countries_en', 'ingredients_text', 
           'allergens', 'allergens_en', 'traces_en', 'serving_size', 'additives', 'additives_en', 
           'ingredients_from_palm_oil_tags', 'ingredients_that_may_be_from_palm_oil_tags', 
           'nutrition_grade_fr', 'pnns_groups_1', 'pnns_groups_2', 'states_en', 'main_category_en'], axis=1, inplace=True)


data.describe()

Unnamed: 0,additives_n,ingredients_from_palm_oil_n,ingredients_that_may_be_from_palm_oil_n,energy_100g,energy-from-fat_100g,fat_100g,saturated-fat_100g,-capric-acid_100g,-lauric-acid_100g,-arachidic-acid_100g,...,caffeine_100g,taurine_100g,ph_100g,fruits-vegetables-nuts_100g,fruits-vegetables-nuts-estimate_100g,collagen-meat-protein-ratio_100g,cocoa_100g,carbon-footprint_100g,nutrition-score-fr_100g,nutrition-score-uk_100g
count,283867.0,283867.0,283867.0,295367.0,869.0,279497.0,263823.0,2.0,4.0,27.0,...,91.0,33.0,52.0,3228.0,404.0,182.0,1383.0,278.0,254856.0,254856.0
mean,1.876851,0.02343,0.059736,1125.45332,587.216617,56065.87,5.09246,6.04,36.136182,10.383852,...,2.075503,12.262055,6.476138,33.39268,60.360124,15.362637,52.102675,335.790664,9.166137,8.980656
std,2.501022,0.153094,0.28066,936.825952,713.255708,29633850.0,7.965148,0.226274,24.101433,3.939718,...,7.725321,69.605988,2.01349,32.906834,29.26235,3.692658,19.028361,423.244817,8.99987,9.151757
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.88,0.04473,0.064,...,0.0,0.00072,0.0,0.0,0.0,8.0,6.0,0.0,-15.0,-15.0
25%,0.0,0.0,0.0,382.0,49.4,0.1,0.0,5.96,34.661183,7.1,...,0.0165,0.035,6.275,0.0,45.0,12.0,33.0,82.65,1.0,1.0
50%,1.0,0.0,0.0,1092.0,300.0,5.29,1.79,6.04,47.6,12.8,...,0.021,0.039,7.2,25.0,58.0,15.0,52.0,190.95,10.0,9.0
75%,3.0,0.0,0.0,1674.0,900.0,20.0,7.14,6.12,49.075,13.2,...,0.0425,0.4,7.5,55.0,93.0,15.0,70.0,378.7,16.0,16.0
max,30.0,2.0,6.0,231199.0,3830.0,15666670000.0,550.0,6.2,49.3,15.4,...,44.0,400.0,8.4,100.0,100.0,25.0,100.0,2842.0,40.0,40.0


In [10]:
#Impute all variables with mean
for v in data:
    data[str(v)].fillna(data[str(v)].mean(), inplace=True)

In [11]:
data.describe()

Unnamed: 0,additives_n,ingredients_from_palm_oil_n,ingredients_that_may_be_from_palm_oil_n,energy_100g,energy-from-fat_100g,fat_100g,saturated-fat_100g,-capric-acid_100g,-lauric-acid_100g,-arachidic-acid_100g,...,caffeine_100g,taurine_100g,ph_100g,fruits-vegetables-nuts_100g,fruits-vegetables-nuts-estimate_100g,collagen-meat-protein-ratio_100g,cocoa_100g,carbon-footprint_100g,nutrition-score-fr_100g,nutrition-score-uk_100g
count,356027.0,356027.0,356027.0,356027.0,356027.0,356027.0,356027.0,356027.0,356027.0,356027.0,...,356027.0,356027.0,356027.0,356027.0,356027.0,356027.0,356027.0,356027.0,356027.0,356027.0
mean,1.876851,0.02343,0.059736,1125.45332,587.216617,56065.87,5.09246,6.04,36.136183,10.383852,...,2.075503,12.262055,6.476138,33.39268,60.360124,15.362637,52.102675,335.790664,9.166137,8.980656
std,2.233229,0.136702,0.250608,853.293243,35.217965,26256390.0,6.856595,0.000379,0.069962,0.033668,...,0.122828,0.659904,0.024099,3.132887,0.984512,0.08326,1.185535,11.805671,7.61451,7.743016
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.88,0.04473,0.064,...,0.0,0.00072,0.0,0.0,0.0,8.0,6.0,0.0,-15.0,-15.0
25%,0.0,0.0,0.0,494.0,587.216617,0.9,0.3,6.04,36.136182,10.383852,...,2.075503,12.262055,6.476138,33.39268,60.360124,15.362637,52.102675,335.790664,3.0,3.0
50%,1.876851,0.0,0.0,1125.45332,587.216617,12.24,5.0,6.04,36.136182,10.383852,...,2.075503,12.262055,6.476138,33.39268,60.360124,15.362637,52.102675,335.790664,9.166137,8.980656
75%,2.0,0.0,0.0,1569.0,587.216617,47.5,5.09246,6.04,36.136182,10.383852,...,2.075503,12.262055,6.476138,33.39268,60.360124,15.362637,52.102675,335.790664,14.0,14.0
max,30.0,2.0,6.0,231199.0,3830.0,15666670000.0,550.0,6.2,49.3,15.4,...,44.0,400.0,8.4,100.0,100.0,25.0,100.0,2842.0,40.0,40.0


In [12]:
for column in data.columns:
    if data[column].isnull().any():
            print (column)

-lignoceric-acid_100g
-cerotic-acid_100g
-melissic-acid_100g
-elaidic-acid_100g
-mead-acid_100g
-erucic-acid_100g
-nervonic-acid_100g


In [13]:
data.drop(['-lignoceric-acid_100g', '-cerotic-acid_100g', '-melissic-acid_100g', '-elaidic-acid_100g', 
           '-mead-acid_100g', '-erucic-acid_100g', '-nervonic-acid_100g'], axis=1, inplace=True)

In [14]:
y1=data.pop('nutrition-score-fr_100g')
y2=data.pop('nutrition-score-uk_100g')

In [15]:
# want to compute the average of our ys
# y = []

In [16]:
scaler = StandardScaler()
X = scaler.fit_transform(data)

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y2, test_size=.2, random_state=42)

In [18]:
model = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=42)
model.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [19]:
list(zip (y_test, model.predict(X_test)))

[(8.980655742850864, 8.9737441030328604),
 (20.0, 20.0),
 (18.0, 16.82),
 (8.980655742850864, 8.9737441030328604),
 (12.0, 11.69),
 (8.980655742850864, 8.9806557428510843),
 (15.0, 14.99),
 (20.0, 23.800000000000001),
 (8.980655742850864, 8.9806557428508409),
 (8.980655742850864, 8.9806557428507858),
 (11.0, 10.56),
 (18.0, 20.66),
 (-3.0, -3.0299999999999998),
 (23.0, 22.920000000000002),
 (8.980655742850864, 8.9737441030328604),
 (5.0, 14.68),
 (9.0, 9.0),
 (-1.0, -2.1000000000000001),
 (0.0, -0.02),
 (16.0, 15.56),
 (18.0, 18.0),
 (19.0, 19.030000000000001),
 (2.0, 2.2296131148570173),
 (-2.0, -0.77000000000000002),
 (8.980655742850864, 8.9806557428510843),
 (23.0, 23.0),
 (21.0, 21.280000000000001),
 (0.0, 0.0),
 (9.0, 9.3200000000000003),
 (10.0, 10.869806557428507),
 (12.0, 11.789999999999999),
 (-3.0, -2.25),
 (-6.0, -5.8949999999999996),
 (8.980655742850864, 8.9806557428507023),
 (14.0, 14.0),
 (9.0, 9.0199999999999996),
 (0.0, 0.40000000000000002),
 (-6.0, -6.0),
 (6.0, 4.6066

In [20]:
y_hat = model.predict(X_test)
mse = mean_squared_error(y_test, y_hat)
print(mse)

1.74166565433


In [21]:
rmse = math.sqrt(mse)
print(rmse)

1.3197218094486232


In [22]:
r2 = r2_score(y_test, y_hat)
print(r2)

0.971024410667
