In [1]:
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
import boto3
import io

The dataset comes from *Open Food Facts* and was downloaded from: https://www.kaggle.com/openfoodfacts/world-food-facts/home.
'Open Food Facts is a free, open, collbarative database of food products from around the world, with ingredients, allergens, nutrition facts and all the tidbits of information we can find on product labels. Open Food Facts is a non-profit association of volunteers.' Over 5,000 contributors 'have added 100 000+ products from 150 countries using our Android, iPhone or Windows Phone app or their camera to scan barcodes and upload pictures of products and their labels.'

# DATA UPLOAD

In [2]:
#food = pd.read_csv(r'/Users/admin/Documents/food.tsv', delimiter = '\t', encoding='ISO-8859-1')

In [3]:
#s3 data

s3 = boto3.client('s3') 
obj = s3.get_object(Bucket='innawendell', Key='food.tsv') 
food = pd.read_csv(io.BytesIO(obj['Body'].read()), delimiter = '\t', encoding='ISO-8859-1')

In [4]:
food.shape

(356027, 163)

In [5]:
food_sample = food.sample(frac=0.2, replace=True, random_state=108)

In [6]:
food_sample.columns

Index(['code', 'url', 'creator', 'created_t', 'created_datetime',
       'last_modified_t', 'last_modified_datetime', 'product_name',
       'generic_name', 'quantity',
       ...
       'fruits-vegetables-nuts_100g', 'fruits-vegetables-nuts-estimate_100g',
       'collagen-meat-protein-ratio_100g', 'cocoa_100g', 'chlorophyl_100g',
       'carbon-footprint_100g', 'nutrition-score-fr_100g',
       'nutrition-score-uk_100g', 'glycemic-index_100g',
       'water-hardness_100g'],
      dtype='object', length=163)

In [7]:
pd.set_option('display.max_columns', 500)
food_sample.head()

Unnamed: 0,code,url,creator,created_t,created_datetime,last_modified_t,last_modified_datetime,product_name,generic_name,quantity,packaging,packaging_tags,brands,brands_tags,categories,categories_tags,categories_en,origins,origins_tags,manufacturing_places,manufacturing_places_tags,labels,labels_tags,labels_en,emb_codes,emb_codes_tags,first_packaging_code_geo,cities,cities_tags,purchase_places,stores,countries,countries_tags,countries_en,ingredients_text,allergens,allergens_en,traces,traces_tags,traces_en,serving_size,no_nutriments,additives_n,additives,additives_tags,additives_en,ingredients_from_palm_oil_n,ingredients_from_palm_oil,ingredients_from_palm_oil_tags,ingredients_that_may_be_from_palm_oil_n,ingredients_that_may_be_from_palm_oil,ingredients_that_may_be_from_palm_oil_tags,nutrition_grade_uk,nutrition_grade_fr,pnns_groups_1,pnns_groups_2,states,states_tags,states_en,main_category,main_category_en,image_url,image_small_url,energy_100g,energy-from-fat_100g,fat_100g,saturated-fat_100g,-butyric-acid_100g,-caproic-acid_100g,-caprylic-acid_100g,-capric-acid_100g,-lauric-acid_100g,-myristic-acid_100g,-palmitic-acid_100g,-stearic-acid_100g,-arachidic-acid_100g,-behenic-acid_100g,-lignoceric-acid_100g,-cerotic-acid_100g,-montanic-acid_100g,-melissic-acid_100g,monounsaturated-fat_100g,polyunsaturated-fat_100g,omega-3-fat_100g,-alpha-linolenic-acid_100g,-eicosapentaenoic-acid_100g,-docosahexaenoic-acid_100g,omega-6-fat_100g,-linoleic-acid_100g,-arachidonic-acid_100g,-gamma-linolenic-acid_100g,-dihomo-gamma-linolenic-acid_100g,omega-9-fat_100g,-oleic-acid_100g,-elaidic-acid_100g,-gondoic-acid_100g,-mead-acid_100g,-erucic-acid_100g,-nervonic-acid_100g,trans-fat_100g,cholesterol_100g,carbohydrates_100g,sugars_100g,-sucrose_100g,-glucose_100g,-fructose_100g,-lactose_100g,-maltose_100g,-maltodextrins_100g,starch_100g,polyols_100g,fiber_100g,proteins_100g,casein_100g,serum-proteins_100g,nucleotides_100g,salt_100g,sodium_100g,alcohol_100g,vitamin-a_100g,beta-carotene_100g,vitamin-d_100g,vitamin-e_100g,vitamin-k_100g,vitamin-c_100g,vitamin-b1_100g,vitamin-b2_100g,vitamin-pp_100g,vitamin-b6_100g,vitamin-b9_100g,folates_100g,vitamin-b12_100g,biotin_100g,pantothenic-acid_100g,silica_100g,bicarbonate_100g,potassium_100g,chloride_100g,calcium_100g,phosphorus_100g,iron_100g,magnesium_100g,zinc_100g,copper_100g,manganese_100g,fluoride_100g,selenium_100g,chromium_100g,molybdenum_100g,iodine_100g,caffeine_100g,taurine_100g,ph_100g,fruits-vegetables-nuts_100g,fruits-vegetables-nuts-estimate_100g,collagen-meat-protein-ratio_100g,cocoa_100g,chlorophyl_100g,carbon-footprint_100g,nutrition-score-fr_100g,nutrition-score-uk_100g,glycemic-index_100g,water-hardness_100g
43467,41290381257,http://world-en.openfoodfacts.org/product/0041...,usda-ndb-import,1489070248,2017-03-09T14:37:28Z,1489070249,2017-03-09T14:37:29Z,"Mr. Pig, Soda",,,,,"Piggly Wiggly, Piggly Wiggly Company","piggly-wiggly,piggly-wiggly-company",,,,,,,,,,,,,,,,,,US,en:united-states,United States,"Carbonated water, high fructose corn syrup, ca...",,,,,,355 ml (1 CAN),,5.0,[ carbonated-water -> en:carbonated-water ] ...,"en:e150a,en:e212,en:e270,en:e338,en:e340,en:e340i","E150a - Plain caramel,E212 - Potassium benzoat...",0.0,,,0.0,,,,,,,"en:to-be-completed, en:nutrition-facts-complet...","en:to-be-completed,en:nutrition-facts-complete...","To be completed,Nutrition facts completed,Ingr...",,,,,176.0,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,11.55,11.55,,,,,,,,,,0.0,,,,0.0254,0.01,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
46446,41345565120,http://world-en.openfoodfacts.org/product/0041...,usda-ndb-import,1489065648,2017-03-09T13:20:48Z,1489065648,2017-03-09T13:20:48Z,"Thompson Grapes, Seedless Grapes In Light Syrup",,,,,Oregon,oregon,,,,,,,,,,,,,,,,,,US,en:united-states,United States,"Grapes, water and cane sugar.",,,,,,140 g (0.5 cup),,0.0,[ grapes -> en:grapes ] [ water-and-cane-su...,,,0.0,,,0.0,,,,c,,,"en:to-be-completed, en:nutrition-facts-complet...","en:to-be-completed,en:nutrition-facts-complete...","To be completed,Nutrition facts completed,Ingr...",,,,,297.0,,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,16.43,15.0,,,,,,,,,0.7,0.71,,,,0.01778,0.007,,0.0,,,,,0.0,,,,,,,,,,,,0.104,,0.0,,0.00051,,,,,,,,,,,,,,,,,,,3.0,3.0,,
127830,688267165153,http://world-en.openfoodfacts.org/product/0688...,usda-ndb-import,1489079860,2017-03-09T17:17:40Z,1489079860,2017-03-09T17:17:40Z,Oat & Ancient Grain Granola,,,,,Ahold,ahold,,,,,,,,,,,,,,,,,,US,en:united-states,United States,"Whole rolled oats, milled cane sugar, oat syru...",,,,,,55 g (0.67 cup),,0.0,[ whole-rolled-oats -> en:whole-rolled-oats ...,,,0.0,,,0.0,,,,a,,,"en:to-be-completed, en:nutrition-facts-complet...","en:to-be-completed,en:nutrition-facts-complete...","To be completed,Nutrition facts completed,Ingr...",,,,,1598.0,,9.09,0.91,,,,,,,,,,,,,,,3.64,3.64,,,,,,,,,,,,,,,,,0.0,0.0,74.55,21.82,,,,,,,,,9.1,10.91,,,,0.34544,0.136,,0.0,,,,,0.0,,,,,,,,,,,,,,0.073,,0.00327,,,,,,,,,,,,,,,,,,,-1.0,-1.0,,
223348,3257984450902,http://world-en.openfoodfacts.org/product/3257...,tacite,1448644918,2015-11-27T17:21:58Z,1448741919,2015-11-28T20:18:39Z,Crevettes de Madagascar,,300 g,"Barquette,plastique","barquette,plastique",Cora,cora,"Produits de la mer,CrustacÃ©s,Crevettes,Crevet...","en:seafood,en:crustaceans,en:shrimps,en:prawns","Seafood,Crustaceans,Shrimps,Prawns",Madagascar,madagascar,France,france,Label Rouge,en:label-rouge,Label Rouge,FR 62.474.100 EC,fr-62-474-100-ec,"50.666667,1.65",,isques-pas-de-calais-france,"France,CourriÃ¨res",Cora,France,en:france,France,"_Crevettes_ (Penaeus monodon), sel, sucre, ant...","Crevettes, disulfite",,,,,300 g,,2.0,[ crevettes -> fr:crevettes ] [ penaeus-mon...,"en:e223,en:e330","E223 - Sodium metabisulphite,E330 - Citric acid",0.0,,,0.0,,,,b,Fish Meat Eggs,Fish and seafood,"en:to-be-checked, en:complete, en:nutrition-fa...","en:to-be-checked,en:complete,en:nutrition-fact...","To be checked,Complete,Nutrition facts complet...",en:seafood,Seafood,http://en.openfoodfacts.org/images/products/32...,http://en.openfoodfacts.org/images/products/32...,416.2,,0.5,0.5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.5,0.5,,,,,,,,,0.7,23.5,,,,1.5,0.590551,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,2.0,,
348725,8712566410231,http://world-en.openfoodfacts.org/product/8712...,date-limite-app,1475477889,2016-10-03T06:58:09Z,1486676576,2017-02-09T21:42:56Z,Spargel Cremesuppe mit Schnittlauch verfeinert,,,,,Knorr,knorr,Suppen,"en:meals,en:soups","Meals,Soups",,,,,,,,,,,,,,,Deutschland,en:germany,Germany,,,,,,,,,,,,,,,,,,,,,Composite foods,One-dish meals,"en:to-be-completed, en:nutrition-facts-to-be-c...","en:to-be-completed,en:nutrition-facts-to-be-co...","To be completed,Nutrition facts to be complete...",en:meals,Meals,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


# Selecting Retail Country: US

For our analysis, we will only focus on food that is on the market in the US.

In [8]:
food_sample['countries'].value_counts().head(20)

US                34048
France            21197
en:FR              3112
Suisse             2358
Deutschland        1444
EspaÃ±a             735
United Kingdom      690
United States       503
en:CH               453
en:BE               342
en:GB               339
Australia           304
en:FR,France        277
Ð Ð¾ÑÑÐ¸Ñ        261
en:DE               247
en:ES               210
en:US               176
Belgique            159
en:AU               144
Portugal            139
Name: countries, dtype: int64

In [9]:
food_abbr = food_sample[food_sample['countries'].isin(['US', 'United States', 'en:US'])]

In [10]:
food_abbr.shape

(34727, 163)

## Selecting Ingredients and The Target - Energy Value

In [11]:
food_abbr.shape

(34727, 163)

In [12]:
features_energy = food_abbr.loc[:, ['ingredients_text', 'energy_100g']]

In [13]:
features_energy = features_energy.dropna(how='any')

In [14]:
#verify that all the nan values were excluded
features_energy['ingredients_text'].isnull().sum()

0

In [15]:
features_energy['energy_100g'].isnull().sum()

0

In [16]:
features_energy.head()

Unnamed: 0,ingredients_text,energy_100g
43467,"Carbonated water, high fructose corn syrup, ca...",176.0
46446,"Grapes, water and cane sugar.",297.0
127830,"Whole rolled oats, milled cane sugar, oat syru...",1598.0
293254,"Soy bean (gmo) ,calcium sulfate, water, salt, ...",544.0
116086,"Organic concentrated apple puree, organic conc...",1343.0


# Parsing The Ingredients Text

In [17]:
#making all word lower case
lower = []
for entry in features_energy['ingredients_text']:
    lower.append(entry.lower())

In [18]:
#removing the fullstops from the strings

no_fullstops = []
for entry in lower:
    entry = entry.replace('.', '')
    no_fullstops.append(entry)

In [19]:
#splitting strings at commas
split = []
for entry in no_fullstops:
    entry = entry.split(',')
    split.append(entry)

In [20]:
cleaned = []
for entry in split:
    for item in entry:
        item = item.split("(")
        cleaned.append(item)

In [21]:
cleaned_new = []
for entry in cleaned:
    for item in entry:
        item= item.strip(')')
        cleaned_new.append(item)

In [22]:
cleaned_newer = []
for entry in cleaned_new:
        if entry.count('[') >0:
            entry = entry.split('[') 
            cleaned_newer.append(entry)
        else:
            cleaned_newer.append(entry)

In [23]:
cleaned_newer_one = []
for entry in cleaned_newer:
    if entry.count('and/or') >0:
        entry = entry.split('and/or')
        cleaned_newer_one.append(entry)
    else:
        cleaned_newer_one.append(entry)

In [24]:
unique_words = {}
for entry in cleaned_newer_one:
    if type(entry) == list:
        for word in entry:
            if word.strip() not in unique_words:
                unique_words[word.strip()] = 1
            else:   
                unique_words[word.strip()] += 1
    else:
        if entry.strip() not in unique_words:
            unique_words[entry.strip()] = 1
        else:
            unique_words[entry.strip()] += 1

In [25]:
frequent = {key:value for (key,value) in unique_words.items() if value >= 500}

In [26]:
frequent_vocab = frequent.keys()

In [27]:
for word in frequent_vocab:
    features_energy[word] = features_energy['ingredients_text'].str.contains(word, case=False, regex=False)

In [28]:
features_energy.head()

Unnamed: 0,ingredients_text,energy_100g,carbonated water,high fructose corn syrup,caramel color,natural and artificial flavor,lactic acid,honey,molasses,natural flavor,salt,water,ascorbic acid,vitamin c,for color,pectin,sucralose,wheat flour,niacin,iron,thiamin mononitrate,vitamin b1,riboflavin,vitamin b2,folic acid],sugar,vegetable oil,Unnamed: 28,soybean,citric acid,skim milk,baking soda,natural and artificial flavors,carnauba wax,sea salt,filtered water,natural flavors,vinegar,distilled vinegar,garlic powder,spice,sunflower,canola,corn syrup solids,maltodextrin,nonfat dry milk,butter,mono and diglycerides,milk,tomatoes,cinnamon,onion powder,yeast,corn syrup,cocoa butter,nonfat milk,soy lecithin,cream,almonds,monosodium glutamate,sodium phosphate,sunflower oil,rice flour,corn starch,enriched flour,reduced iron,thiamine mononitrate,folic acid,cocoa,tomato paste,soybean oil,enzymes,modified corn starch,cane sugar,dextrose,malic acid,spices,gelatin,titanium dioxide,malted barley flour,wheat,ferrous sulfate,cellulose gum,sodium citrate,red 40,blue 1,garlic,annatto,color,peanuts,canola oil,onion,eggs,whey,leavening,flour,xanthan gum,cornstarch,sodium erythorbate,carrageenan,vitamin a palmitate,cheese culture,potato starch,enriched wheat flour,chocolate liquor,palm oil,pasteurized milk,sodium bicarbonate,calcium chloride,yellow 5,brown sugar,onions,guar gum,cheese cultures,black pepper,turmeric,paprika,pork,coconut oil,artificial flavor,potassium sorbate,modified food starch,locust bean gum,yeast extract,potatoes,corn,vanilla,processed with alkali,yellow 6,preservative,sodium nitrite,sodium benzoate
43467,"Carbonated water, high fructose corn syrup, ca...",176.0,True,True,True,True,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,True,False,False
46446,"Grapes, water and cane sugar.",297.0,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
127830,"Whole rolled oats, milled cane sugar, oat syru...",1598.0,False,False,False,False,False,True,True,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
293254,"Soy bean (gmo) ,calcium sulfate, water, salt, ...",544.0,False,False,False,False,False,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
116086,"Organic concentrated apple puree, organic conc...",1343.0,False,False,False,False,False,False,False,True,False,False,True,True,True,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [29]:
features_energy.shape

(34304, 132)

In [30]:
type(features_energy.iloc[0, 2])

numpy.bool_

In [31]:
X = features_energy.iloc[:, 2:]
y = features_energy['energy_100g']

# PCA

In [32]:
from sklearn.decomposition import PCA
pca = PCA(n_components = 0.90)
X_pca = pca.fit_transform(X)

In [33]:
print(
    'The percentage of total variance in the dataset explained by each',
    'component from Sklearn PCA.\n',
    len(pca.explained_variance_ratio_)
)

The percentage of total variance in the dataset explained by each component from Sklearn PCA.
 75


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=108)

In [34]:
rfr_one = RandomForestRegressor(n_estimators = 20, max_depth = 10, n_jobs = -1)
rmse_rfr_one = np.sqrt(np.mean(-cross_val_score(rfr_one, X_pca, y, scoring = 'neg_mean_squared_error')))

In [35]:
print(rmse_rfr_one)

538.8571209671522


# Ridge

In [36]:
from sklearn.metrics import mean_squared_error

In [37]:
rmse_scores = []
alphas = []

for value in [1e-10,   1e-3,  1, 5, 20]:
    ridge = Ridge(alpha=value)
    rmse = np.sqrt(np.mean(-cross_val_score(ridge, X_pca, y, scoring = 'neg_mean_squared_error')))
    rmse_scores.append(rmse)
    alphas.append(value)

In [38]:
df = pd.DataFrame(rmse_scores, columns = ['rmse'])
df['alphas'] = alphas

In [39]:
#alpha 20 is the winner

df.sort_values(by=['rmse'])

Unnamed: 0,rmse,alphas
4,616.314299,20.0
3,616.321835,5.0
2,616.327002,1.0
1,616.32851,0.001
0,616.328512,1e-10


# Lasso

In [40]:
alphas_l = []
rmses_l = []

for value in [1e-15, 1e-3,  10]:
    lasso = Lasso(alpha=value)
    rmse_l = np.sqrt(np.mean(-cross_val_score(lasso, X_pca, y, scoring = 'neg_mean_squared_error')))
    rmses_l.append(rmse_l)
    alphas_l.append(value)

In [41]:
df_lasso = pd.DataFrame(rmses_l, columns = ['rmse'])
df_lasso['alphas'] = alphas_l
df_lasso.sort_values(by=['rmse'])

Unnamed: 0,rmse,alphas
1,616.328239,0.001
0,616.328512,1e-15
2,646.723504,10.0


# Elastic Net

In [42]:
from sklearn.model_selection import ParameterGrid

param_grid = {'alpha': [1e-4, 1e-2, 1, 6], 
              'l1_ratio': [0.1, 0.5, 0.95]}

grid = ParameterGrid(param_grid)

rmses_net = []
parameters = []

for params in grid:
    net = ElasticNet(**params)
    rmse_net = np.sqrt(np.mean(-cross_val_score(net, X_pca, y, scoring = 'neg_mean_squared_error')))
    rmses_net.append(rmse_net)
    parameters.append(params)

In [43]:
df_net = pd.DataFrame(rmses_net, columns = ['rmse'])
df_net['alphas'] = parameters
df_net.sort_values(by=['rmse'])

Unnamed: 0,rmse,alphas
5,616.315165,"{'alpha': 0.01, 'l1_ratio': 0.95}"
0,616.325498,"{'alpha': 0.0001, 'l1_ratio': 0.1}"
1,616.32678,"{'alpha': 0.0001, 'l1_ratio': 0.5}"
2,616.328309,"{'alpha': 0.0001, 'l1_ratio': 0.95}"
4,616.606355,"{'alpha': 0.01, 'l1_ratio': 0.5}"
3,617.24707,"{'alpha': 0.01, 'l1_ratio': 0.1}"
8,630.886434,"{'alpha': 1, 'l1_ratio': 0.95}"
11,689.015233,"{'alpha': 6, 'l1_ratio': 0.95}"
7,703.011225,"{'alpha': 1, 'l1_ratio': 0.5}"
6,730.92065,"{'alpha': 1, 'l1_ratio': 0.1}"


# Random Forest

In [44]:
n_est = []
rmses_rfr = []

for value in [100, 500,700]:
    rfr = RandomForestRegressor(n_jobs=-1, n_estimators=value)
    rmse_rfr = np.sqrt(np.mean(-cross_val_score(rfr, X_pca, y, scoring = 'neg_mean_squared_error', n_jobs=-1)))
    rmses_rfr.append(rmse_rfr)
    n_est.append(value)

OSError: [Errno 28] No space left on device

In [None]:
df_rfr_one = pd.DataFrame(rmses_rfr, columns = ['rmse'])
df_rfr_one['n_estimators] = n_est
df_rfr_one.sort_values(by=['rmse'])

In [None]:
max_depth = []
rmses_rfr = []

for value in [[8, 12, 20]:
    rfr = RandomForestRegressor(n_jobs=-1, max_depth = value)
    rmse_rfr = np.sqrt(np.mean(-cross_val_score(rfr, X_pca, y, scoring = 'neg_mean_squared_error', n_jobs=-1)))
    rmses_rfr.append(rmse_rfr)
    max_depth.append(value)


for params in grid:
    rfr = RandomForestRegressor(**params, n_jobs=-1)
    rmse_rfr = np.sqrt(np.mean(-cross_val_score(rfr, X_pca, y, scoring = 'neg_mean_squared_error')))
    rmses_rfr.append(rmse_rfr)
    parameters_rfr.append(params)

# XGBoost

In [None]:
params = {'booster': ['gbtree'], 
        'max_depth': [2, 4, 6, 8],
        'learning_rate': [0.1, 0.05, 0.01],
        'subsample': [0.25, 0.5, 1]}

xgb = XGBRegressor(n_jobs = -1)              
grid = GridSearchCV(estimator = xgb, cv=3, n_jobs=-1, param_grid = params,scoring = 'neg_mean_squared_error').fit(X_pca, y)

print(grid.best_params_)
print(grid.best_score_)

# KNN Regression

In [None]:
rmse_scores = []
alphas = []

params = {'number': np.arange(10, 100, 10), 
         'weight':['uniform', 'distance']}

knn = KNeighborsRegressor(n_jobs = -1)              
grid = GridSearchCV(estimator = knn, cv=3, n_jobs=-1, param_grid = params,scoring = 'neg_mean_squared_error').fit(X_pca, y)

print(grid.best_params_)
print(grid.best_score_)

# Gradient Boosting

In [None]:
param_grid = {'loss': ['ls', 'lad', 'huber', 'quantile'], 
              'max_depth': [3, 5, 10, 50],
             'learning_rate': [0.5, 0.1, 0.01]
             'n_estimators': [100, 500, 700]}
             'subsample': [0.25, 0.5, 0.75]

gbr = GradientBoostingRegressor(n_jobs = -1)
grid = GridSearchCV(estimator = gbr, cv=3, n_jobs=-1, param_grid = params, scoring = 'neg_mean_squared_error').fit(X_pca, y)

print(grid.best_params_)
print(grid.best_score_)

param_grid = {'loss': ['ls', 'lad', 'huber', 'quantile'], 
              'max_depth': [3, 5, 10, 50],
             'learning_rate': [0.5, 0.1, 0.01]
             'n_estimators': [100, 500, 700]}
             'subsample': [0.25, 0.5, 0.75]
grid = ParameterGrid(param_grid)

rmses_rfr = []
parameters_rfr = []

for params in grid:
    gbr = GradientBoostingRegressor(**params, n_jobs=-1)
    rmses_rfr = np.sqrt(np.mean(-cross_val_score(rfr, X_pca, y, scoring = 'neg_mean_squared_error')))
    rmses_rfr.append(rmse_gbr)
    parameters_rfr.append(params)