In [1]:
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
import boto3
import io

The dataset comes from *Open Food Facts* and was downloaded from: https://www.kaggle.com/openfoodfacts/world-food-facts/home.
'Open Food Facts is a free, open, collbarative database of food products from around the world, with ingredients, allergens, nutrition facts and all the tidbits of information we can find on product labels. Open Food Facts is a non-profit association of volunteers.' Over 5,000 contributors 'have added 100 000+ products from 150 countries using our Android, iPhone or Windows Phone app or their camera to scan barcodes and upload pictures of products and their labels.'

# DATA UPLOAD

In [2]:
#food = pd.read_csv(r'/Users/admin/Documents/food.tsv', delimiter = '\t', encoding='ISO-8859-1')

In [3]:
#s3 data

s3 = boto3.client('s3') 
obj = s3.get_object(Bucket='innawendell', Key='food.tsv') 
food = pd.read_csv(io.BytesIO(obj['Body'].read()), delimiter = '\t', encoding='ISO-8859-1')

In [4]:
food.shape

(356027, 163)

In [5]:
food_sample = food.sample(frac=0.2, replace=True, random_state=108)

In [6]:
food_sample.columns

Index(['code', 'url', 'creator', 'created_t', 'created_datetime',
       'last_modified_t', 'last_modified_datetime', 'product_name',
       'generic_name', 'quantity',
       ...
       'fruits-vegetables-nuts_100g', 'fruits-vegetables-nuts-estimate_100g',
       'collagen-meat-protein-ratio_100g', 'cocoa_100g', 'chlorophyl_100g',
       'carbon-footprint_100g', 'nutrition-score-fr_100g',
       'nutrition-score-uk_100g', 'glycemic-index_100g',
       'water-hardness_100g'],
      dtype='object', length=163)

In [7]:
pd.set_option('display.max_columns', 500)
food_sample.head()

Unnamed: 0,code,url,creator,created_t,created_datetime,last_modified_t,last_modified_datetime,product_name,generic_name,quantity,packaging,packaging_tags,brands,brands_tags,categories,categories_tags,categories_en,origins,origins_tags,manufacturing_places,manufacturing_places_tags,labels,labels_tags,labels_en,emb_codes,emb_codes_tags,first_packaging_code_geo,cities,cities_tags,purchase_places,stores,countries,countries_tags,countries_en,ingredients_text,allergens,allergens_en,traces,traces_tags,traces_en,serving_size,no_nutriments,additives_n,additives,additives_tags,additives_en,ingredients_from_palm_oil_n,ingredients_from_palm_oil,ingredients_from_palm_oil_tags,ingredients_that_may_be_from_palm_oil_n,ingredients_that_may_be_from_palm_oil,ingredients_that_may_be_from_palm_oil_tags,nutrition_grade_uk,nutrition_grade_fr,pnns_groups_1,pnns_groups_2,states,states_tags,states_en,main_category,main_category_en,image_url,image_small_url,energy_100g,energy-from-fat_100g,fat_100g,saturated-fat_100g,-butyric-acid_100g,-caproic-acid_100g,-caprylic-acid_100g,-capric-acid_100g,-lauric-acid_100g,-myristic-acid_100g,-palmitic-acid_100g,-stearic-acid_100g,-arachidic-acid_100g,-behenic-acid_100g,-lignoceric-acid_100g,-cerotic-acid_100g,-montanic-acid_100g,-melissic-acid_100g,monounsaturated-fat_100g,polyunsaturated-fat_100g,omega-3-fat_100g,-alpha-linolenic-acid_100g,-eicosapentaenoic-acid_100g,-docosahexaenoic-acid_100g,omega-6-fat_100g,-linoleic-acid_100g,-arachidonic-acid_100g,-gamma-linolenic-acid_100g,-dihomo-gamma-linolenic-acid_100g,omega-9-fat_100g,-oleic-acid_100g,-elaidic-acid_100g,-gondoic-acid_100g,-mead-acid_100g,-erucic-acid_100g,-nervonic-acid_100g,trans-fat_100g,cholesterol_100g,carbohydrates_100g,sugars_100g,-sucrose_100g,-glucose_100g,-fructose_100g,-lactose_100g,-maltose_100g,-maltodextrins_100g,starch_100g,polyols_100g,fiber_100g,proteins_100g,casein_100g,serum-proteins_100g,nucleotides_100g,salt_100g,sodium_100g,alcohol_100g,vitamin-a_100g,beta-carotene_100g,vitamin-d_100g,vitamin-e_100g,vitamin-k_100g,vitamin-c_100g,vitamin-b1_100g,vitamin-b2_100g,vitamin-pp_100g,vitamin-b6_100g,vitamin-b9_100g,folates_100g,vitamin-b12_100g,biotin_100g,pantothenic-acid_100g,silica_100g,bicarbonate_100g,potassium_100g,chloride_100g,calcium_100g,phosphorus_100g,iron_100g,magnesium_100g,zinc_100g,copper_100g,manganese_100g,fluoride_100g,selenium_100g,chromium_100g,molybdenum_100g,iodine_100g,caffeine_100g,taurine_100g,ph_100g,fruits-vegetables-nuts_100g,fruits-vegetables-nuts-estimate_100g,collagen-meat-protein-ratio_100g,cocoa_100g,chlorophyl_100g,carbon-footprint_100g,nutrition-score-fr_100g,nutrition-score-uk_100g,glycemic-index_100g,water-hardness_100g
43467,41290381257,http://world-en.openfoodfacts.org/product/0041...,usda-ndb-import,1489070248,2017-03-09T14:37:28Z,1489070249,2017-03-09T14:37:29Z,"Mr. Pig, Soda",,,,,"Piggly Wiggly, Piggly Wiggly Company","piggly-wiggly,piggly-wiggly-company",,,,,,,,,,,,,,,,,,US,en:united-states,United States,"Carbonated water, high fructose corn syrup, ca...",,,,,,355 ml (1 CAN),,5.0,[ carbonated-water -> en:carbonated-water ] ...,"en:e150a,en:e212,en:e270,en:e338,en:e340,en:e340i","E150a - Plain caramel,E212 - Potassium benzoat...",0.0,,,0.0,,,,,,,"en:to-be-completed, en:nutrition-facts-complet...","en:to-be-completed,en:nutrition-facts-complete...","To be completed,Nutrition facts completed,Ingr...",,,,,176.0,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,11.55,11.55,,,,,,,,,,0.0,,,,0.0254,0.01,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
46446,41345565120,http://world-en.openfoodfacts.org/product/0041...,usda-ndb-import,1489065648,2017-03-09T13:20:48Z,1489065648,2017-03-09T13:20:48Z,"Thompson Grapes, Seedless Grapes In Light Syrup",,,,,Oregon,oregon,,,,,,,,,,,,,,,,,,US,en:united-states,United States,"Grapes, water and cane sugar.",,,,,,140 g (0.5 cup),,0.0,[ grapes -> en:grapes ] [ water-and-cane-su...,,,0.0,,,0.0,,,,c,,,"en:to-be-completed, en:nutrition-facts-complet...","en:to-be-completed,en:nutrition-facts-complete...","To be completed,Nutrition facts completed,Ingr...",,,,,297.0,,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,16.43,15.0,,,,,,,,,0.7,0.71,,,,0.01778,0.007,,0.0,,,,,0.0,,,,,,,,,,,,0.104,,0.0,,0.00051,,,,,,,,,,,,,,,,,,,3.0,3.0,,
127830,688267165153,http://world-en.openfoodfacts.org/product/0688...,usda-ndb-import,1489079860,2017-03-09T17:17:40Z,1489079860,2017-03-09T17:17:40Z,Oat & Ancient Grain Granola,,,,,Ahold,ahold,,,,,,,,,,,,,,,,,,US,en:united-states,United States,"Whole rolled oats, milled cane sugar, oat syru...",,,,,,55 g (0.67 cup),,0.0,[ whole-rolled-oats -> en:whole-rolled-oats ...,,,0.0,,,0.0,,,,a,,,"en:to-be-completed, en:nutrition-facts-complet...","en:to-be-completed,en:nutrition-facts-complete...","To be completed,Nutrition facts completed,Ingr...",,,,,1598.0,,9.09,0.91,,,,,,,,,,,,,,,3.64,3.64,,,,,,,,,,,,,,,,,0.0,0.0,74.55,21.82,,,,,,,,,9.1,10.91,,,,0.34544,0.136,,0.0,,,,,0.0,,,,,,,,,,,,,,0.073,,0.00327,,,,,,,,,,,,,,,,,,,-1.0,-1.0,,
223348,3257984450902,http://world-en.openfoodfacts.org/product/3257...,tacite,1448644918,2015-11-27T17:21:58Z,1448741919,2015-11-28T20:18:39Z,Crevettes de Madagascar,,300 g,"Barquette,plastique","barquette,plastique",Cora,cora,"Produits de la mer,CrustacÃ©s,Crevettes,Crevet...","en:seafood,en:crustaceans,en:shrimps,en:prawns","Seafood,Crustaceans,Shrimps,Prawns",Madagascar,madagascar,France,france,Label Rouge,en:label-rouge,Label Rouge,FR 62.474.100 EC,fr-62-474-100-ec,"50.666667,1.65",,isques-pas-de-calais-france,"France,CourriÃ¨res",Cora,France,en:france,France,"_Crevettes_ (Penaeus monodon), sel, sucre, ant...","Crevettes, disulfite",,,,,300 g,,2.0,[ crevettes -> fr:crevettes ] [ penaeus-mon...,"en:e223,en:e330","E223 - Sodium metabisulphite,E330 - Citric acid",0.0,,,0.0,,,,b,Fish Meat Eggs,Fish and seafood,"en:to-be-checked, en:complete, en:nutrition-fa...","en:to-be-checked,en:complete,en:nutrition-fact...","To be checked,Complete,Nutrition facts complet...",en:seafood,Seafood,http://en.openfoodfacts.org/images/products/32...,http://en.openfoodfacts.org/images/products/32...,416.2,,0.5,0.5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.5,0.5,,,,,,,,,0.7,23.5,,,,1.5,0.590551,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,2.0,,
348725,8712566410231,http://world-en.openfoodfacts.org/product/8712...,date-limite-app,1475477889,2016-10-03T06:58:09Z,1486676576,2017-02-09T21:42:56Z,Spargel Cremesuppe mit Schnittlauch verfeinert,,,,,Knorr,knorr,Suppen,"en:meals,en:soups","Meals,Soups",,,,,,,,,,,,,,,Deutschland,en:germany,Germany,,,,,,,,,,,,,,,,,,,,,Composite foods,One-dish meals,"en:to-be-completed, en:nutrition-facts-to-be-c...","en:to-be-completed,en:nutrition-facts-to-be-co...","To be completed,Nutrition facts to be complete...",en:meals,Meals,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


# Selecting Retail Country: US

For our analysis, we will only focus on food that is on the market in the US.

In [8]:
food_sample['countries'].value_counts().head(20)

US                34048
France            21197
en:FR              3112
Suisse             2358
Deutschland        1444
EspaÃ±a             735
United Kingdom      690
United States       503
en:CH               453
en:BE               342
en:GB               339
Australia           304
en:FR,France        277
Ð Ð¾ÑÑÐ¸Ñ        261
en:DE               247
en:ES               210
en:US               176
Belgique            159
en:AU               144
Portugal            139
Name: countries, dtype: int64

# Explanation of the code

**All the '_smaller' and '_copy' instances will be creates for the final model to check its performance with the vocabulary items of the smaller frequency and with all the instances from the US, not just a sample of 0.2**

In [9]:
food_abbr = food_sample[food_sample['countries'].isin(['US', 'United States', 'en:US'])]
food_abbr_smaller = food[food['countries'].isin(['US', 'United States', 'en:US'])]

In [10]:
food_abbr.shape

(34727, 163)

## Selecting Ingredients and The Target - Energy Value

In [11]:
food_abbr.shape

(34727, 163)

In [12]:
features_energy = food_abbr.loc[:, ['ingredients_text', 'energy_100g']]

In [13]:
features_energy = features_energy.dropna(how='any')
features_energy_copy = food_abbr_smaller.loc[:, ['ingredients_text', 'energy_100g']]
features_energy_copy = features_energy_copy.dropna(how='any')

In [14]:
#verify that all the nan values were excluded
features_energy['ingredients_text'].isnull().sum()

0

In [15]:
features_energy['energy_100g'].isnull().sum()

0

In [16]:
features_energy.head()

Unnamed: 0,ingredients_text,energy_100g
43467,"Carbonated water, high fructose corn syrup, ca...",176.0
46446,"Grapes, water and cane sugar.",297.0
127830,"Whole rolled oats, milled cane sugar, oat syru...",1598.0
293254,"Soy bean (gmo) ,calcium sulfate, water, salt, ...",544.0
116086,"Organic concentrated apple puree, organic conc...",1343.0


# Parsing The Ingredients Text

In [17]:
#making all word lower case
lower = []
for entry in features_energy['ingredients_text']:
    lower.append(entry.lower())    

In [18]:
lower_copy = []
for entry in features_energy_copy['ingredients_text']:
    lower_copy.append(entry.lower())

In [19]:
#removing the fullstops from the strings

no_fullstops = []
for entry in lower_copy:
    entry = entry.replace('.', '')
    no_fullstops.append(entry)

no_fullstops_copy = []
for entry in lower_copy:
    entry = entry.replace('.', '')
    no_fullstops_copy.append(entry)

In [20]:
#splitting strings at commas
split = []
for entry in no_fullstops:
    entry = entry.split(',')
    split.append(entry)

split_copy = []
for entry in no_fullstops_copy:
    entry = entry.split(',')
    split_copy.append(entry)

In [21]:
cleaned = []
for entry in split:
    for item in entry:
        item = item.split("(")
        cleaned.append(item)

cleaned_copy = []
for entry in split_copy:
    for item in entry:
        item = item.split("(")
        cleaned_copy.append(item)

In [22]:
cleaned_new = []
for entry in cleaned:
    for item in entry:
        item= item.strip(')')
        cleaned_new.append(item)

cleaned_new_copy = []
for entry in cleaned_copy:
    for item in entry:
        item= item.strip(')')
        cleaned_new_copy.append(item)

In [23]:
cleaned_newer = []
for entry in cleaned_new:
        if entry.count('[') >0:
            entry = entry.split('[') 
            cleaned_newer.append(entry)
        else:
            cleaned_newer.append(entry)

cleaned_newer_copy = []
for entry in cleaned_new_copy:
        if entry.count('[') >0:
            entry = entry.split('[') 
            cleaned_newer_copy.append(entry)
        else:
            cleaned_newer_copy.append(entry)

In [24]:
cleaned_newer_one = []
for entry in cleaned_newer:
    if entry.count('and/or') >0:
        entry = entry.split('and/or')
        cleaned_newer_one.append(entry)
    else:
        cleaned_newer_one.append(entry)

cleaned_newer_one_copy = []
for entry in cleaned_newer_copy:
    if entry.count('and/or') >0:
        entry = entry.split('and/or')
        cleaned_newer_one_copy.append(entry)
    else:
        cleaned_newer_one_copy.append(entry)

In [25]:
unique_words = {}
for entry in cleaned_newer_one:
    if type(entry) == list:
        for word in entry:
            if word.strip() not in unique_words:
                unique_words[word.strip()] = 1
            else:   
                unique_words[word.strip()] += 1
    else:
        if entry.strip() not in unique_words:
            unique_words[entry.strip()] = 1
        else:
            unique_words[entry.strip()] += 1
            
unique_words_copy = {}
for entry in cleaned_newer_one_copy:
    if type(entry) == list:
        for word in entry:
            if word.strip() not in unique_words_copy:
                unique_words_copy[word.strip()] = 1
            else:   
                unique_words_copy[word.strip()] += 1
    else:
        if entry.strip() not in unique_words_copy:
            unique_words_copy[entry.strip()] = 1
        else:
            unique_words_copy[entry.strip()] += 1

In [26]:
frequent = {key:value for (key,value) in unique_words.items() if value >= 500}

frequent_smaller = {key:value for (key,value) in unique_words_copy.items() if value >= 200}

In [27]:
frequent_vocab = frequent.keys()
frequent_vocab_smaller = frequent_smaller.keys()

In [28]:
for word in frequent_vocab:
    features_energy[word] = features_energy['ingredients_text'].str.contains(word, case=False, regex=False)
    
for word in frequent_vocab_smaller:
    features_energy_copy[word] = features_energy_copy['ingredients_text'].str.contains(word, case=False, regex=False)

In [29]:
features_energy.head()

Unnamed: 0,ingredients_text,energy_100g,vegetable oil,coconut oil,corn oil,peanuts,wheat flour,sugar,rice flour,tapioca starch,salt,leavening,ammonium bicarbonate,baking soda,soy sauce,water,soybeans,wheat,potato starch,organic sunflower oil,sea salt,rolled oats,expeller pressed canola oil,sunflower seeds,almonds,sesame seeds,cashews,evaporated cane juice,unbleached wheat flour,sunflower oil,turmeric,rice,corn starch,salt],brown rice syrup,paprika,onion powder,garlic powder,honey,raisins,walnuts,molasses,cinnamon,organic sugar,cranberries,pineapple,cocoa butter,soy lecithin,an emulsifier],safflower,peanut,Unnamed: 52,canola oil,dried cranberries,blueberries,onion,natural flavors,garlic,maltodextrin,spices,citric acid,peanut butter,palm oil,brown rice flour,calcium carbonate,barley malt,an emulsifier,locust bean gum,black pepper,torula yeast,yeast extract,spice,coconut,unsweetened chocolate,hazelnuts,organic spices,canola,natural flavor,corn flour,apples,annatto,for color,parmesan cheese,soybean oil,whole milk,egg whites,thiamine,riboflavin,niacin,iron,folic acid,corn,sodium phosphate,cornstarch,thiamin mononitrate,sucrose,wheat starch,xanthan gum,hydrolyzed corn protein,natural smoke flavor,pecans,lactose,whole milk powder,vanilla,onions,carrots,lemon juice,whey,lecithin,chocolate chips,whey powder,milk],glucose syrup,milk,mono and diglycerides,seasoning,malted barley flour,reduced iron,malt,yeast,oats,sulfur dioxide,milk chocolate,chocolate liquor,vanillin,enriched wheat flour,thiamine mononitrate,silicon dioxide,sodium bicarbonate,soybean,apple juice concentrate,butter,cream,eggs,wheat gluten,ascorbic acid,enzymes,sodium acid pyrophosphate,calcium sulfate,sorbic acid,cream cheese,pasteurized milk and cream,cheese culture,carob bean gum,cheddar cheese,enriched flour,bleached wheat flour,beta carotene,brown sugar,unbleached enriched flour,distilled vinegar,sour cream,cultured pasteurized milk,soy flour,vitamin a palmitate,unbleached enriched wheat flour,modified corn starch,dextrose,corn syrup,nonfat dry milk,shortening,apple,chocolate,cocoa mass,natural vanilla flavor,egg,extra virgin olive oil,soya lecithin,colors,sodium citrate,palm kernel oil,potassium sorbate,preservative,vitamin e,artificial flavor,calcium disodium edta,to preserve freshness,vitamin d3,flour,milk fat,emulsifier,thiamin,palm,filtered water,pasteurized milk,cheese cultures,enriched bleached flour,ferrous sulfate,datem,acetic acid,lactic acid,alcohol,high fructose corn syrup,vinegar,calcium chloride,polysorbate 80,sodium bisulfite,sorbitol,glycerine,vitamin c,carnauba wax,soy,skim milk powder,invert sugar,guar gum,cottonseed oil,gum arabic,cucumbers,malic acid,natural and artificial flavors,sucralose,sodium benzoate and potassium sorbate,as a preservative,glycerin,yellow #5,blue #1,spinach,celery,palm kernel,pure cane sugar,pectin,gelatin,caramel color,yellow 5,yellow 6,blue 1,maltitol,inulin,whey protein concentrate,cocoa powder,processed with alkali,red 40,modified food starch,artificial colors,strawberries,color],dried cane syrup,natural and artificial flavor,red 3,grape juice concentrate,natural & artificial flavors,corn syrup solids,vital wheat gluten,cocoa,skim milk,rennet,fruit pectin,durum wheat semolina,tomatoes,ginger,chili powder,caramel,mustard flour,cilantro,tamarind,sodium benzoate,monocalcium phosphate,potatoes,olives,potassium benzoate,red #40,niacinamide,carbonated water,orange juice concentrate,ester gum,to protect flavor,lemon juice concentrate,flavor,pepper,shrimp,lime juice,tomato puree,tomato paste,mustard seed,granulated garlic,organic honey,pyridoxine hydrochloride,zinc oxide,monosodium glutamate,food starch-modified,peanut oil,sunflower lecithin,vanilla extract,sunflower,semolina,folic acid],pasteurized cow's milk,pasteurized cream,hydrogenated vegetable oil,cottonseed,sodium alginate,chicken broth,egg yolk,tricalcium phosphate,gellan gum,black beans,color,tomato,coriander,dipotassium phosphate,red bell pepper,dextrin,disodium guanylate,organic wheat flour,organic dried cane syrup,tapioca syrup,purified water,cellulose gel,fd&c red #40,peppers,jalapeno peppers,parsley,basil,oregano,thyme,cumin,mustard,carrot,disodium inosinate,sodium diacetate,hydrolyzed soy protein,nonfat milk,partially hydrogenated soybean oil,carrageenan,whole wheat flour,mono- and diglycerides,calcium propionate,sodium aluminum phosphate,fumaric acid,enzyme,baking powder,mushrooms,dehydrated onions,cultured milk,safflower oil,enzymes],powdered cellulose,culture,monoglycerides,potato flour,white wine,bread crumbs,cultures,buttermilk,olive oil,artificial flavors,disodium phosphate,autolyzed yeast extract,vegetable shortening,sesame oil,natural flavoring,partially hydrogenated vegetable oil,powdered sugar,organic cane sugar,milk protein concentrate,acesulfame potassium,magnesium oxide,calcium phosphate,sodium ascorbate,ferric orthophosphate,vitamin b12,oat fiber,fructose,cellulose gum,soy protein isolate,potassium citrate,rice starch,whey protein isolate,tomato powder,cocoa processed with alkali,apple cider vinegar,tomato concentrate,red pepper,jalapeno pepper,dark chocolate,milkfat,stabilizer,natural flavorings,emulsifier],partially hydrogenated palm kernel oil,artificial color,preservative],polysorbate 60,confectioner's glaze,popcorn,cane sugar,vitamin b1],vitamin b2],stabilizers,titanium dioxide,dehydrated garlic,propylene glycol,preservatives,potassium chloride,nutmeg,pork,sodium lactate,sodium erythorbate,flavoring,aspartame,sodium caseinate,heavy cream,color added,glucose,tartaric acid,dehydrated onion,red wine vinegar,beef,chili pepper,organic garlic,broccoli,bha,bht,yellow corn flour,enriched bleached wheat flour,mixed tocopherols,tapioca flour,gum acacia,phosphoric acid,xanthan,margarine,modified tapioca starch,mono & diglycerides,egg yolks,peaches,sodium tripolyphosphate,cured with water,sodium nitrite,extractives of paprika,a preservative,yellow #6,calcium lactate,chicken,modified cornstarch,sodium stearoyl lactylate,red 40 lake,yellow 5 lake,vitamin d,wheat bran,corn meal,dough conditioners,onion*,garlic*,mono-and diglycerides,and salt,caffeine,contains 2% or less of: salt,whole eggs,dried onion,dried garlic,romano cheese,corn maltodextrin,for tartness,tapioca dextrin,chicken fat,natamycin,cherries,green beans,mayonnaise,organic onions,pasteurized part-skim milk,flavorings,lactic acid starter culture,potassium lactate,sodium phosphates,spice extractives,soy protein concentrate,a natural mold inhibitor,to maintain freshness,annatto extract,annatto color,tomato juice,whole grain rolled oats,diced tomatoes,peas,contains one or more of the following: corn,durum flour,s thermophilus,l bulgaricus,l acidophilus,mechanically separated chicken,vitamin b6,vitamin b2,vitamin b1,red bell peppers,vitamin b3,fd&c yellow #5,milkfat and nonfat milk
43467,"Carbonated water, high fructose corn syrup, ca...",176.0,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
46446,"Grapes, water and cane sugar.",297.0,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
127830,"Whole rolled oats, milled cane sugar, oat syru...",1598.0,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,True,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
293254,"Soy bean (gmo) ,calcium sulfate, water, salt, ...",544.0,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
116086,"Organic concentrated apple puree, organic conc...",1343.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [30]:
features_energy.shape

(34304, 488)

In [31]:
type(features_energy.iloc[0, 2])

numpy.bool_

In [32]:
X = features_energy.iloc[:, 2:]
y = features_energy['energy_100g']

In [33]:
X_smaller = features_energy_copy.iloc[:, 2:]
y_smaller = features_energy_copy['energy_100g']

# PCA

In [34]:
from sklearn.decomposition import PCA
pca = PCA(n_components = 0.90)
X_pca = pca.fit_transform(X)

In [35]:
print(
    'The percentage of total variance in the dataset explained by each',
    'component from Sklearn PCA.\n',
    len(pca.explained_variance_ratio_)
)

The percentage of total variance in the dataset explained by each component from Sklearn PCA.
 209


In [36]:
X_smaller_pca = pca.fit_transform(X_smaller)

In [37]:
print(
    'The percentage of total variance in the dataset explained by each',
    'component from Sklearn PCA.\n',
    len(pca.explained_variance_ratio_)
)

The percentage of total variance in the dataset explained by each component from Sklearn PCA.
 327


# Ridge

In [38]:
from sklearn.metrics import mean_squared_error

In [39]:
rmse_scores = []
alphas = []

for value in [1e-10,   1e-3,  1, 5, 20]:
    ridge = Ridge(alpha=value)
    rmse = np.sqrt(np.mean(-cross_val_score(ridge, X_pca, y, scoring = 'neg_mean_squared_error')))
    rmse_scores.append(rmse)
    alphas.append(value)

In [40]:
df = pd.DataFrame(rmse_scores, columns = ['rmse'])
df['alphas'] = alphas

In [41]:
#alpha 20 is the winner

df.sort_values(by=['rmse'])

Unnamed: 0,rmse,alphas
4,575.006623,20.0
3,575.065683,5.0
2,575.096283,1.0
1,575.105004,0.001
0,575.105013,1e-10


# Lasso

In [42]:
alphas_l = []
rmses_l = []

for value in [1e-15, 1e-3,  10]:
    lasso = Lasso(alpha=value)
    rmse_l = np.sqrt(np.mean(-cross_val_score(lasso, X_pca, y, scoring = 'neg_mean_squared_error')))
    rmses_l.append(rmse_l)
    alphas_l.append(value)

In [43]:
df_lasso = pd.DataFrame(rmses_l, columns = ['rmse'])
df_lasso['alphas'] = alphas_l
df_lasso.sort_values(by=['rmse'])

Unnamed: 0,rmse,alphas
1,575.103629,0.001
0,575.105013,1e-15
2,629.78877,10.0


# Elastic Net

In [44]:
from sklearn.model_selection import ParameterGrid

param_grid = {'alpha': [1e-4, 1e-2, 1, 6], 
              'l1_ratio': [0.1, 0.5, 0.95]}

grid = ParameterGrid(param_grid)

rmses_net = []
parameters = []

for params in grid:
    net = ElasticNet(**params)
    rmse_net = np.sqrt(np.mean(-cross_val_score(net, X_pca, y, scoring = 'neg_mean_squared_error')))
    rmses_net.append(rmse_net)
    parameters.append(params)

In [45]:
df_net = pd.DataFrame(rmses_net, columns = ['rmse'])
df_net['alphas'] = parameters
df_net.sort_values(by=['rmse'])

Unnamed: 0,rmse,alphas
5,575.024493,"{'alpha': 0.01, 'l1_ratio': 0.95}"
0,575.08751,"{'alpha': 0.0001, 'l1_ratio': 0.1}"
1,575.095001,"{'alpha': 0.0001, 'l1_ratio': 0.5}"
2,575.10386,"{'alpha': 0.0001, 'l1_ratio': 0.95}"
4,575.892832,"{'alpha': 0.01, 'l1_ratio': 0.5}"
3,577.63789,"{'alpha': 0.01, 'l1_ratio': 0.1}"
8,603.39892,"{'alpha': 1, 'l1_ratio': 0.95}"
11,671.559426,"{'alpha': 6, 'l1_ratio': 0.95}"
7,684.025296,"{'alpha': 1, 'l1_ratio': 0.5}"
6,713.800769,"{'alpha': 1, 'l1_ratio': 0.1}"


# Random Forest

### Rough run to get an estimate on the time of one run

In [46]:
rfr_one = RandomForestRegressor(n_estimators = 20, max_depth = 10, n_jobs = -1)
rmse_rfr_one = np.sqrt(np.mean(-cross_val_score(rfr_one, X_pca, y, scoring = 'neg_mean_squared_error')))

In [47]:
print(rmse_rfr_one)

465.9065772313596


In [None]:
n_est = []
rmses_rfr = []

for value in [100, 500,700]:
    rfr = RandomForestRegressor(n_jobs=-1, n_estimators=value)
    rmse_rfr = np.sqrt(np.mean(-cross_val_score(rfr, X_pca, y, scoring = 'neg_mean_squared_error', n_jobs=-1)))
    rmses_rfr.append(rmse_rfr)
    n_est.append(value)

In [None]:
df_rfr_one = pd.DataFrame(rmses_rfr, columns = ['rmse'])
df_rfr_one['n_estimators'] = n_est
df_rfr_one.sort_values(by=['rmse'])

In [None]:
max_depth = []
rmses_rfr = []

for value in [8, 12, 20]:
    rfr = RandomForestRegressor(n_jobs=-1, max_depth = value, n_estimators=700)
    rmse_rfr = np.sqrt(np.mean(-cross_val_score(rfr,X_pca, y, scoring = 'neg_mean_squared_error', n_jobs=-1)))
    rmses_rfr.append(rmse_rfr)
    max_depth.append(value)

In [None]:
df_rfr_two = pd.DataFrame(rmses_rfr, columns = ['rmse'])
df_rfr_two['max_depth'] = max_depth
df_rfr_two.sort_values(by=['rmse'])

# XGBoost

In [None]:
n_est = []
rmses_xgb = []
for value in [100, 500, 700]:
    xgb = XGBRegressor(n_jobs = -1,  n_estimators = value, booster= 'gbtree')
    rmse_xgb = np.sqrt(np.mean(-cross_val_score(xgb, X_pca, y, scoring = 'neg_mean_squared_error', n_jobs=-1)))
    rmses_xgb.append(rmse_xgb)
    n_est.append(value)

In [None]:
df_xgb = pd.DataFrame(rmses_xgb, columns = ['rmse'])
df_xgb['n_est'] = n_est
df_xgb.sort_values(by=['rmse'])

In [None]:
max_depth = []
rmses_xgb = []

for value in [2, 4, 6, 8]:
    xgb = XGBRegressor(n_jobs = -1,  max_depth = value, n_estimators = 700,booster= 'gbtree')
    rmse_xgb = np.sqrt(np.mean(-cross_val_score(xgb, X_pca, y, scoring = 'neg_mean_squared_error', n_jobs=-1)))
    rmses_xgb.append(rmse_xgb)
    max_depth.append(value)

In [None]:
df_xgb_one = pd.DataFrame(rmses_xgb, columns = ['rmse'])
df_xgb_one['max_depth'] = max_depth
df_xgb_one.sort_values(by=['rmse'])

In [None]:
learning_rates = []
rmses_xgb = []

for value in [0.5, 0.1, 0.01]:
    xgb = XGBRegressor(n_jobs = -1,  max_depth = 8, n_estimators = 700,booster= 'gbtree', learning_rate=value)
    rmse_xgb = np.sqrt(np.mean(-cross_val_score(xgb, X_pca, y, scoring = 'neg_mean_squared_error', n_jobs=-1)))
    rmses_xgb.append(rmse_xgb)
    learning_rates.append(value)

In [None]:
df_xgb_two = pd.DataFrame(rmses_xgb, columns = ['rmse'])
df_xgb_two['l_rates'] = learning_rates
df_xgb_two.sort_values(by=['rmse'])

In [None]:
subsample = []
rmses_xgb = []

for value in [0.25, 0.5, 1]:
    xgb = XGBRegressor(n_jobs = -1,  max_depth = 8, n_estimators = 700,booster= 'gbtree', learning_rate= 0.1, subsample = value)
    rmse_xgb = np.sqrt(np.mean(-cross_val_score(xgb, X_pca, y, scoring = 'neg_mean_squared_error', n_jobs=-1)))
    rmses_xgb.append(rmse_xgb)
    subsample.append(value)

In [None]:
df_xgb_three = pd.DataFrame(rmses_xgb, columns = ['rmse'])
df_xgb_three['subsample'] = subsample
df_xgb_three.sort_values(by=['rmse'])

# Final Testing with a Larger Set

In [None]:
#A larger set with the PCA
xgb = XGBRegressor(n_jobs = -1,  max_depth = 8, n_estimators = 700,booster= 'gbtree', learning_rate= 0.1, subsample = 1)
rmse_xgb = np.sqrt(np.mean(-cross_val_score(xgb, X_smaller_pca, y_smaller, scoring = 'neg_mean_squared_error', n_jobs=-1)))

In [None]:
#A larget set without PCA
xgb = XGBRegressor(n_jobs = -1,  max_depth = 8, n_estimators = 700,booster= 'gbtree', learning_rate= 0.1, subsample = 1)
rmse_xgb = np.sqrt(np.mean(-cross_val_score(xgb, X_smaller, y_smaller, scoring = 'neg_mean_squared_error', n_jobs=-1)))