In [1]:
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import ElasticNet
from sklearn import ensemble

  from collections import Sequence


The dataset comes from *Open Food Facts* and was downloaded from: https://www.kaggle.com/openfoodfacts/world-food-facts/home.
'Open Food Facts is a free, open, collbarative database of food products from around the world, with ingredients, allergens, nutrition facts and all the tidbits of information we can find on product labels. Open Food Facts is a non-profit association of volunteers.' Over 5,000 contributors 'have added 100 000+ products from 150 countries using our Android, iPhone or Windows Phone app or their camera to scan barcodes and upload pictures of products and their labels.'

# DATA UPLOAD

In [2]:
food = pd.read_csv(r'/Users/admin/Documents/food.tsv', delimiter = '\t', encoding='ISO-8859-1')

In [3]:
food.shape

(356027, 163)

In [4]:
food.columns

Index(['code', 'url', 'creator', 'created_t', 'created_datetime',
       'last_modified_t', 'last_modified_datetime', 'product_name',
       'generic_name', 'quantity',
       ...
       'fruits-vegetables-nuts_100g', 'fruits-vegetables-nuts-estimate_100g',
       'collagen-meat-protein-ratio_100g', 'cocoa_100g', 'chlorophyl_100g',
       'carbon-footprint_100g', 'nutrition-score-fr_100g',
       'nutrition-score-uk_100g', 'glycemic-index_100g',
       'water-hardness_100g'],
      dtype='object', length=163)

In [5]:
pd.set_option('display.max_columns', 500)
food.head()

Unnamed: 0,code,url,creator,created_t,created_datetime,last_modified_t,last_modified_datetime,product_name,generic_name,quantity,packaging,packaging_tags,brands,brands_tags,categories,categories_tags,categories_en,origins,origins_tags,manufacturing_places,manufacturing_places_tags,labels,labels_tags,labels_en,emb_codes,emb_codes_tags,first_packaging_code_geo,cities,cities_tags,purchase_places,stores,countries,countries_tags,countries_en,ingredients_text,allergens,allergens_en,traces,traces_tags,traces_en,serving_size,no_nutriments,additives_n,additives,additives_tags,additives_en,ingredients_from_palm_oil_n,ingredients_from_palm_oil,ingredients_from_palm_oil_tags,ingredients_that_may_be_from_palm_oil_n,ingredients_that_may_be_from_palm_oil,ingredients_that_may_be_from_palm_oil_tags,nutrition_grade_uk,nutrition_grade_fr,pnns_groups_1,pnns_groups_2,states,states_tags,states_en,main_category,main_category_en,image_url,image_small_url,energy_100g,energy-from-fat_100g,fat_100g,saturated-fat_100g,-butyric-acid_100g,-caproic-acid_100g,-caprylic-acid_100g,-capric-acid_100g,-lauric-acid_100g,-myristic-acid_100g,-palmitic-acid_100g,-stearic-acid_100g,-arachidic-acid_100g,-behenic-acid_100g,-lignoceric-acid_100g,-cerotic-acid_100g,-montanic-acid_100g,-melissic-acid_100g,monounsaturated-fat_100g,polyunsaturated-fat_100g,omega-3-fat_100g,-alpha-linolenic-acid_100g,-eicosapentaenoic-acid_100g,-docosahexaenoic-acid_100g,omega-6-fat_100g,-linoleic-acid_100g,-arachidonic-acid_100g,-gamma-linolenic-acid_100g,-dihomo-gamma-linolenic-acid_100g,omega-9-fat_100g,-oleic-acid_100g,-elaidic-acid_100g,-gondoic-acid_100g,-mead-acid_100g,-erucic-acid_100g,-nervonic-acid_100g,trans-fat_100g,cholesterol_100g,carbohydrates_100g,sugars_100g,-sucrose_100g,-glucose_100g,-fructose_100g,-lactose_100g,-maltose_100g,-maltodextrins_100g,starch_100g,polyols_100g,fiber_100g,proteins_100g,casein_100g,serum-proteins_100g,nucleotides_100g,salt_100g,sodium_100g,alcohol_100g,vitamin-a_100g,beta-carotene_100g,vitamin-d_100g,vitamin-e_100g,vitamin-k_100g,vitamin-c_100g,vitamin-b1_100g,vitamin-b2_100g,vitamin-pp_100g,vitamin-b6_100g,vitamin-b9_100g,folates_100g,vitamin-b12_100g,biotin_100g,pantothenic-acid_100g,silica_100g,bicarbonate_100g,potassium_100g,chloride_100g,calcium_100g,phosphorus_100g,iron_100g,magnesium_100g,zinc_100g,copper_100g,manganese_100g,fluoride_100g,selenium_100g,chromium_100g,molybdenum_100g,iodine_100g,caffeine_100g,taurine_100g,ph_100g,fruits-vegetables-nuts_100g,fruits-vegetables-nuts-estimate_100g,collagen-meat-protein-ratio_100g,cocoa_100g,chlorophyl_100g,carbon-footprint_100g,nutrition-score-fr_100g,nutrition-score-uk_100g,glycemic-index_100g,water-hardness_100g
0,3087,http://world-en.openfoodfacts.org/product/0000...,openfoodfacts-contributors,1474103866,2016-09-17T09:17:46Z,1474103893,2016-09-17T09:18:13Z,Farine de blÃ© noir,,1kg,,,Ferme t'y R'nao,ferme-t-y-r-nao,,,,,,,,,,,,,,,,,,en:FR,en:france,France,,,,,,,,,,,,,,,,,,,,,,,"en:to-be-completed, en:nutrition-facts-to-be-c...","en:to-be-completed,en:nutrition-facts-to-be-co...","To be completed,Nutrition facts to be complete...",,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,4530,http://world-en.openfoodfacts.org/product/0000...,usda-ndb-import,1489069957,2017-03-09T14:32:37Z,1489069957,2017-03-09T14:32:37Z,Banana Chips Sweetened (Whole),,,,,,,,,,,,,,,,,,,,,,,,US,en:united-states,United States,"Bananas, vegetable oil (coconut oil, corn oil ...",,,,,,28 g (1 ONZ),,0.0,[ bananas -> en:bananas ] [ vegetable-oil -...,,,0.0,,,0.0,,,,d,,,"en:to-be-completed, en:nutrition-facts-complet...","en:to-be-completed,en:nutrition-facts-complete...","To be completed,Nutrition facts completed,Ingr...",,,,,2243.0,,28.57,28.57,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.018,64.29,14.29,,,,,,,,,3.6,3.57,,,,0.0,0.0,,0.0,,,,,0.0214,,,,,,,,,,,,,,0.0,,0.00129,,,,,,,,,,,,,,,,,,,14.0,14.0,,
2,4559,http://world-en.openfoodfacts.org/product/0000...,usda-ndb-import,1489069957,2017-03-09T14:32:37Z,1489069957,2017-03-09T14:32:37Z,Peanuts,,,,,Torn & Glasser,torn-glasser,,,,,,,,,,,,,,,,,,US,en:united-states,United States,"Peanuts, wheat flour, sugar, rice flour, tapio...",,,,,,28 g (0.25 cup),,0.0,[ peanuts -> en:peanuts ] [ wheat-flour -> ...,,,0.0,,,0.0,,,,b,,,"en:to-be-completed, en:nutrition-facts-complet...","en:to-be-completed,en:nutrition-facts-complete...","To be completed,Nutrition facts completed,Ingr...",,,,,1941.0,,17.86,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,60.71,17.86,,,,,,,,,7.1,17.86,,,,0.635,0.25,,0.0,,,,,0.0,,,,,,,,,,,,,,0.071,,0.00129,,,,,,,,,,,,,,,,,,,0.0,0.0,,
3,16087,http://world-en.openfoodfacts.org/product/0000...,usda-ndb-import,1489055731,2017-03-09T10:35:31Z,1489055731,2017-03-09T10:35:31Z,Organic Salted Nut Mix,,,,,Grizzlies,grizzlies,,,,,,,,,,,,,,,,,,US,en:united-states,United States,"Organic hazelnuts, organic cashews, organic wa...",,,,,,28 g (0.25 cup),,0.0,[ organic-hazelnuts -> en:organic-hazelnuts ...,,,0.0,,,0.0,,,,d,,,"en:to-be-completed, en:nutrition-facts-complet...","en:to-be-completed,en:nutrition-facts-complete...","To be completed,Nutrition facts completed,Ingr...",,,,,2540.0,,57.14,5.36,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,17.86,3.57,,,,,,,,,7.1,17.86,,,,1.22428,0.482,,,,,,,,,,,,,,,,,,,,,0.143,,0.00514,,,,,,,,,,,,,,,,,,,12.0,12.0,,
4,16094,http://world-en.openfoodfacts.org/product/0000...,usda-ndb-import,1489055653,2017-03-09T10:34:13Z,1489055653,2017-03-09T10:34:13Z,Organic Polenta,,,,,Bob's Red Mill,bob-s-red-mill,,,,,,,,,,,,,,,,,,US,en:united-states,United States,Organic polenta,,,,,,35 g (0.25 cup),,0.0,[ organic-polenta -> en:organic-polenta ] [...,,,0.0,,,0.0,,,,,,,"en:to-be-completed, en:nutrition-facts-complet...","en:to-be-completed,en:nutrition-facts-complete...","To be completed,Nutrition facts completed,Ingr...",,,,,1552.0,,1.43,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,77.14,,,,,,,,,,5.7,8.57,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


# Selecting Retail Country: US

For our analysis, we will only focus on food that is on the market in the US.

In [6]:
food['countries'].value_counts().head(20)

US                169789
France            106296
en:FR              16056
Suisse             12003
Deutschland         6900
EspaÃ±a             3896
United Kingdom      3363
United States       2358
en:CH               2207
en:BE               1759
en:GB               1647
Australia           1483
en:FR,France        1483
Ð Ð¾ÑÑÐ¸Ñ        1377
en:ES               1104
en:DE               1093
en:US                909
Belgique             830
en:AU                714
en:IT                660
Name: countries, dtype: int64

In [7]:
food_abbr = food[food['countries'].isin(['US', 'United States', 'en:US'])]

In [8]:
food_abbr.shape

(173056, 163)

## Selecting Ingredients and The Target - Energy Value

In [9]:
food_abbr.shape

(173056, 163)

In [10]:
features_energy = food_abbr.loc[:, ['ingredients_text', 'energy_100g']]

In [11]:
features_energy = features_energy.dropna(how='any')

In [12]:
#verify that all the nan values were excluded
features_energy['ingredients_text'].isnull().sum()

0

In [13]:
features_energy['energy_100g'].isnull().sum()

0

In [14]:
features_energy.head()

Unnamed: 0,ingredients_text,energy_100g
1,"Bananas, vegetable oil (coconut oil, corn oil ...",2243.0
2,"Peanuts, wheat flour, sugar, rice flour, tapio...",1941.0
3,"Organic hazelnuts, organic cashews, organic wa...",2540.0
4,Organic polenta,1552.0
5,"Rolled oats, grape concentrate, expeller press...",1933.0


# Parsing The Ingredients Text

In [15]:
#making all word lower case
lower = []
for entry in features_energy['ingredients_text']:
    lower.append(entry.lower())

In [16]:
#removing the fullstops from the strings

no_fullstops = []
for entry in lower:
    entry = entry.replace('.', '')
    no_fullstops.append(entry)

In [17]:
#splitting strings at commas
split = []
for entry in no_fullstops:
    entry = entry.split(',')
    split.append(entry)

In [18]:
cleaned = []
for entry in split:
    for item in entry:
        item = item.split("(")
        cleaned.append(item)

In [19]:
cleaned_new = []
for entry in cleaned:
    for item in entry:
        item= item.strip(')')
        cleaned_new.append(item)

In [20]:
cleaned_newer = []
for entry in cleaned_new:
        if entry.count('[') >0:
            entry = entry.split('[') 
            cleaned_newer.append(entry)
        else:
            cleaned_newer.append(entry)

In [21]:
cleaned_newer_one = []
for entry in cleaned_newer:
    if entry.count('and/or') >0:
        entry = entry.split('and/or')
        cleaned_newer_one.append(entry)
    else:
        cleaned_newer_one.append(entry)

In [22]:
unique_words = {}
for entry in cleaned_newer_one:
    if type(entry) == list:
        for word in entry:
            if word.strip() not in unique_words:
                unique_words[word.strip()] = 1
            else:   
                unique_words[word.strip()] += 1
    else:
        if entry.strip() not in unique_words:
            unique_words[entry.strip()] = 1
        else:
            unique_words[entry.strip()] += 1

In [23]:
frequent = {key:value for (key,value) in unique_words.items() if value >= 100}

In [24]:
frequent_vocab = frequent.keys()

In [25]:
for word in frequent_vocab:
    features_energy[word] = features_energy['ingredients_text'].str.contains(word, case=False, regex=False)

In [26]:
features_energy.head()

Unnamed: 0,ingredients_text,energy_100g,bananas,vegetable oil,coconut oil,corn oil,peanuts,wheat flour,sugar,rice flour,tapioca starch,salt,leavening,ammonium bicarbonate,baking soda,soy sauce,water,soybeans,wheat,potato starch,organic cashews,organic sunflower oil,sea salt,rolled oats,expeller pressed canola oil,sunflower seeds,almonds,sesame seeds,cashews,natural vitamin e,evaporated cane juice,pear juice concentrate,tocopherols,organic chocolate liquor,organic cocoa butter,roasted peanuts,unbleached wheat flour,sunflower oil,beet powder,turmeric,rice,corn starch,salt],brown rice syrup,paprika,onion powder,garlic powder,water and salt,organic rolled oats,honey,raisins,walnuts,wheat germ,molasses,cinnamon,organic bananas,organic coconut oil,organic sugar,cranberries,pineapple,papaya,cocoa butter,soy lecithin,an emulsifier],natural vanilla,safflower,peanut,Unnamed: 68,canola oil,dry roasted almonds,dried cranberries,dried cherries,dried blueberries,blueberries,organic evaporated cane juice,organic raisins,organic expeller pressed canola oil,organic coconut,organic nonfat milk,onion,natural flavors,garlic,maltodextrin,spices,citric acid,peanut butter,dry roasted peanuts,palm oil,brown rice flour,calcium carbonate,barley malt,an emulsifier,locust bean gum,carrageenan gum,organic vanilla,black pepper,torula yeast,yeast extract,dried parsley,spice,coconut,unsweetened chocolate,organic pumpkin seeds,organic almonds,organic cranberries,organic sunflower seeds,organic cinnamon,organic apples,organic blueberries,organic maple syrup,hazelnuts,organic vanilla extract,organic black beans,organic brown rice syrup,almond butter,roasted almonds,organic spices,turbinado sugar,organic safflower oil,canola,milled cane sugar,whole rolled oats,natural flavor,maple syrup,vegetable glycerin,corn flour,apples,barley malt syrup,annatto,for color,dates,filling,parmesan cheese,soybean oil,whole milk,egg whites,thiamine,riboflavin,niacin,iron,folic acid,rye,corn,barley,brown rice,buttermilk powder,sodium phosphate,cornstarch,thiamin mononitrate,riboflavin and folic acid,coating,sucrose,wheat starch,xanthan gum,hydrolyzed corn protein,natural smoke flavor,pecans,lactose,sunflower kernels,brazil nuts,starch,whole milk powder,vanilla,organic dates,organic oat flour,onions,carrots,figs,cultured whey,lemon juice,fractionated palm kernel oil,peanut flour,whey,lecithin,chocolate chips,partially defatted peanut flour,whey powder,milk],glucose syrup,malted barley,milk,bicarbonate of soda,mono and diglycerides,seasoning,malted barley flour,reduced iron,malt,yeast,nonfat milk powder,yogurt powder,oats,oat bran,millet,organic popcorn,apricots,sulfur dioxide,milk chocolate,chocolate liquor,vanillin,pretzels,enriched wheat flour,thiamine mononitrate,silicon dioxide,sodium bicarbonate,soybean,apple juice concentrate,butter,cream,eggs,wheat gluten,ascorbic acid,enzymes,sodium acid pyrophosphate,s,calcium sulfate,sorbic acid,cream cheese,pasteurized milk and cream,cheese culture,carob bean gum,cheddar cheese,enriched flour,bleached wheat flour,beta carotene,for color],vitamin a palmitate added,brown sugar,unbleached enriched flour,distilled vinegar,unsalted butter,sour cream,cultured pasteurized milk,soy flour,vitamin a palmitate,and vanilla,potassium bicarbonate,unbleached enriched wheat flour,modified corn starch,cultured dextrose,dextrose,agar,corn syrup,contains sulfites,nonfat dry milk,shortening,beta-carotene,apple,chocolate,cocoa mass,natural vanilla flavor,...,organic cumin,carbon dioxide,liquid soybean oil,protein blend,organic evaporated cane sugar,soy protein,provides tartness,controls acidity,prevents caking,contains 2% or less of each of the following: salt,whole grain oats,raisin paste,fennel,ferrous lactate,organic potato starch,organic cornstarch,organic cocoa powder,white tuna,and natural flavor,whey protein,organic natural flavor,hydrolyzed soy,vegetable color],bell pepper,green olives,whey solids,trace of lime,contains 2% or less of the following: soybean oil,mono-diglycerides,icing sugar,monopotassium phosphate,light tuna,diced tomatoes in juice,tomatillos,zucchini,chicken base,sardines,tomato sauce,semisweet chocolate chips,romaine lettuce,expeller pressed sunflower oil,organic apple cider vinegar,organic ginger,organic jalapeno peppers,potato starch and powdered cellulose,organic kale,soy bean oil,anchovy,kosher salt,used to protect quality,ferrous sulfate],lactic acid],tara gum,acidifier,cane or beet sugar,fd&c blue no 1,fd&c red no 40,flaxseed,organic potatoes,semi-sweet chocolate chips,contains one or more of the following: canola,manzanilla olives,contains less than 2% salt,calcium disodium edta to protect flavor,cultured pasteurized part skim milk,organic onion,red wine,organic carrot,magnesium sulfate,vegetable stock,a b vitamin,apple puree concentrate,sesame,potassium bromate,prevents foam,microcrystalline cellulose,beef and pork,may contain coconut,cardamom,vitamin b3,ferrous sulfate] and b vitamins,beet,ferrous sulfate) and b vitamins,folic acid)],potassium sorbate as a preservative,crust,preserve freshness,organic unbleached wheat flour,vegetable color,contains one or more of the following: soybean oil,fd&c yellow #5,organic agave,to retain freshness,a source of calcium,niacinamide*,pyridoxine hydrochloride*,thiamin mononitrate*,a source of iron,riboflavin*,folic acid*,tomatoes*,spices*,canola oil*,sodium benzoate as a preservative,cultured pasteurized skim milk,bifidus and l casei,stevia extract,guar gums,lowfat milk,natural spices,eggplant,black olives,d,pasteurized sheep's milk,fresh garlic,red onion,including chili pepper,soy lecithin and pgpr,liquid sugar,flavor enhancers,sulphur dioxide,citrus pectin,kosher gelatin,fd&c yellow no 5,fd&c yellow #6,soda,nutritional yeast,habanero peppers,spirit vinegar,glutinous rice,coconut milk,sodium metabisulphite,acidulant,palm shortening,live and active cultures,l casei,milkfat and nonfat milk,peach,citric,folate,sweetcream buttermilk,enzymes and annatto,fish,molasses*,crushed red pepper,white rice flour,cultured pasteurized grade a nonfat milk,ferrous fumarate,fd&c yellow 6,mold inhibitor,colored with,cooked chicken,almond flour,corn starch and calcium sulfate added to prevent caking natamycin,acacia,contains less than 2% of natural and artificial flavor,caramel coloring,including paprika,food starch - modified,red #40 lake,iodized salt,imported olive oil,as color,calcium disodium edta added to protect flavor,fd&c red #3,reduced protein whey,an artificial flavor)},natural color,honey*,natural cane sugar,coconut water,pure vanilla,fruit juice from concentrate,l rhamnosus,organic inulin,tartrazine,organic apple puree,artificial colors including,filtered carbonated water,preserves freshness,stevia leaf extract,monk fruit extract,juice concentrate,organic tapioca syrup,tapioca syrup*,cane sugar*,sunflower oil*,vanilla extract*,cocoa butter*,pea protein,smoked paprika,organic peanut butter,coconut oil*,organic coconut milk,cultured pasteurized nonfat milk,chick peas,palm oil shortening,shallot,cayenne,fresh basil,potato starch and powdered cellulose added to prevent caking,contains 2% or less salt,organic agave syrup,chia seeds,fresh yeast,rice bran,green pepper,wheat],rice bran oil,organic dark chocolate,non-gmo canola oil,yellow onion,dairy cream,palm fruit oil,liquid smoke,b1,pepitas,unbleached,contains less than 2% of natural flavor,organic distilled vinegar,reb a,tahini,himalayan pink salt,organic egg yolks,organic rice syrup,organic banana puree,nigari,source of zinc,celtic sea salt,organic flax seeds,artificial flavors and artificial colors,organic soy flour,powdered cellulose to prevent caking,pgpr,nonfat yogurt,made from corn,organic coconut cream,vit b3,vit b6,vit b2,vit b12,natural sweetener,contains less than 1% of modified corn starch,cultured grade a non fat milk,vegan,vit e,ergocalciferol,vit b1,organic roasted soybeans,vit a
1,"Bananas, vegetable oil (coconut oil, corn oil ...",2243.0,True,True,True,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,"Peanuts, wheat flour, sugar, rice flour, tapio...",1941.0,False,False,False,False,True,True,True,True,True,True,True,True,True,True,True,True,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,"Organic hazelnuts, organic cashews, organic wa...",2540.0,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,True,True,False,False,False,True,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,Organic polenta,1552.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
5,"Rolled oats, grape concentrate, expeller press...",1933.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,True,True,True,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [27]:
features_energy.shape

(170830, 1625)

In [28]:
type(features_energy.iloc[0, 2])

numpy.bool_

In [29]:
X = features_energy.iloc[:, 2:]
y = features_energy['energy_100g']

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=108)

# Ridge

In [31]:
ridge = Ridge()
fit = ridge.fit(X_train, y_train)
y_pred = ridge.predict(X_test)

In [32]:
from sklearn.metrics import mean_squared_error
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

In [33]:
rmse_scores = []
alphas = []

for value in [1e-10, 1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20]:
    ridge = Ridge(alpha=value)
    rmse = np.sqrt(np.mean(-cross_val_score(ridge, X, y, scoring = 'neg_mean_squared_error')))
    rmse_scores.append(rmse)
    alphas.append(value)

In [34]:
df = pd.DataFrame(rmse_scores, columns = ['rmse'])
df['alphas'] = alphas

In [35]:
#alpha 20 is the winner

df.sort_values(by=['rmse'])

Unnamed: 0,rmse,alphas
8,800.92714,20.0
7,801.571105,10.0
6,802.097974,5.0
5,802.712348,1.0
4,802.912863,0.01
3,802.914834,0.001
2,802.915032,0.0001
1,802.915054,1e-08
0,802.915059,1e-10


# Lasso

In [None]:
alphas_l = []
rmses_l = []

for value in [1e-15, 1e-10, 1e-8, 1e-5, 1e-4, 1e-3, 1e-2, 1, 5, 10]:
    lasso = Lasso(alpha=value)
    rmse_l = np.sqrt(np.mean(-cross_val_score(lasso, X, y, scoring = 'neg_mean_squared_error')))
    rmses_l.append(rmse_l)
    alphas_l.append(value)

# Elastic Net

In [None]:
from sklearn.model_selection import ParameterGrid

param_grid = {'alpha': [1e-4, 3e-4, 6e-4, 1e-3, 3e-3, 6e-3, 1e-2, 3e-2, 6e-2, 1e-21, 3e-1, 6e-1, 1, 3, 6], 
              'l1_ratio': 0.1, 0.5, 0.7, 0.9, 0.95, 0.99}

grid = ParameterGrid(param_grid)

rmses_net = []
parameters = []

for params in grid:
    net = ElasticNet(**params)
    rmse_net = np.sqrt(np.mean(-cross_val_score(net, X, y, scoring = 'neg_mean_squared_error')))
    rmses_net.append(rmse_net)
    parameters.append(params)

# Random Forest

In [None]:
param_grid = {'n_estimators': [200, 300, 500, 700, 1000], 
              'max_depth': [3, 5, 10, 30, 50],
             }
              
grid = ParameterGrid(param_grid)

rmses_rfr = []
parameters_rfr = []

for params in grid:
    rfr = RandomForestRegressor(**params)
    rmses_rfr = np.sqrt(np.mean(-cross_val_score(rfr, X, y, scoring = 'neg_mean_squared_error')))
    rmses_rfr.append(rmse_net)
    parameters_rfr.append(params)

# Gradient Boosting

In [None]:
param_grid = {'loss': ['ls', 'lad', 'huber', 'quantile'], 
              'max_depth': [3, 5, 10, 30, 50],
             'learning_rate': [0.5, 0.1, 0.05, 0.01]
             'n_estimators'}
             'subsample': [0.25, 0.5, 0.75, 1]
grid = ParameterGrid(param_grid)

rmses_rfr = []
parameters_rfr = []

for params in grid:
    gbr = GradientBoostingRegressor(**params)
    rmses_rfr = np.sqrt(np.mean(-cross_val_score(rfr, X, y, scoring = 'neg_mean_squared_error')))
    rmses_rfr.append(rmse_net)
    parameters_rfr.append(params)