In [1]:
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import ElasticNet
from sklearn import ensemble
import boto3
import io

The dataset comes from *Open Food Facts* and was downloaded from: https://www.kaggle.com/openfoodfacts/world-food-facts/home.
'Open Food Facts is a free, open, collaborative database of food products from around the world, with ingredients, allergens, nutrition facts and all the tidbits of information we can find on product labels. Open Food Facts is a non-profit association of volunteers.' Over 5,000 contributors 'have added 100 000+ products from 150 countries using our Android, iPhone or Windows Phone app or their camera to scan barcodes and upload pictures of products and their labels.'

# DATA UPLOAD

In [2]:
# Fetch the tab-separated Open Food Facts dump from S3 and parse it straight
# from memory (no temporary file on disk).
s3 = boto3.client('s3')
obj = s3.get_object(Bucket='innawendell', Key='food.tsv')

food = pd.read_csv(
    io.BytesIO(obj['Body'].read()),
    sep='\t',
    encoding='ISO-8859-1',
)
# Local alternative: pass a path to a downloaded copy of food.tsv to
# pd.read_csv with the same separator and encoding.

NameError: name 'io' is not defined

In [None]:
# Dimensions of the raw table as (rows, columns).
food.shape

In [None]:
# List every column name available in the raw table.
food.columns

In [None]:
# Raise the display limit to 500 columns so the wide table is not truncated
# in the preview below. Note this option stays in effect for later cells too.
pd.set_option('display.max_columns', 500)
food.head()

# Selecting Retail Country: US

For our analysis, we will only focus on food that is on the market in the US.

In [None]:
# The 20 most frequent values of the 'countries' column, to see how the US is spelled.
food['countries'].value_counts().head(20)

In [None]:
# Keep only rows whose 'countries' value is exactly one of these US spellings.
# isin() is an exact match, so rows with any other spelling, or with several
# countries listed in the same field, are not selected.
food_abbr = food[food['countries'].isin(['US', 'United States', 'en:US'])]

In [None]:
# Dimensions of the US-only subset as (rows, columns).
food_abbr.shape

## Selecting Ingredients and The Target - Energy Value

In [None]:
# Dimensions of the US-only subset again (same check as the previous cell),
# as a baseline before narrowing down to two columns.
food_abbr.shape

In [None]:
# Keep just the free-text ingredient list (model input) and the
# 'energy_100g' column (prediction target).
features_energy = food_abbr.loc[:, ['ingredients_text', 'energy_100g']]

In [None]:
# Drop every row that is missing the ingredient text or the energy value.
features_energy = features_energy.dropna(how='any')

In [None]:
# Verify that no missing values remain in the ingredient text (expected: 0).
features_energy['ingredients_text'].isnull().sum()

In [None]:
# Verify that no missing values remain in the target column (expected: 0).
features_energy['energy_100g'].isnull().sum()

In [None]:
# Preview the two-column working table.
features_energy.head()

# Parsing The Ingredients Text

In [None]:
# Lower-case every ingredient string so that tokens compare case-insensitively.
lower = [text.lower() for text in features_energy['ingredients_text']]

In [None]:
# Strip every full stop from the lower-cased ingredient strings.
no_fullstops = [text.replace('.', '') for text in lower]

In [None]:
# Break each ingredient string into its comma-separated parts
# (one list of parts per product).
split = [text.split(',') for text in no_fullstops]

In [None]:
# Flatten across products and split every part at opening parentheses, so
# that sub-ingredients listed in parentheses become separate pieces.
cleaned = [part.split("(") for parts in split for part in parts]

In [None]:
# Flatten once more and trim closing parentheses from both ends of each piece.
cleaned_new = [piece.strip(')') for pieces in cleaned for piece in pieces]

In [None]:
# Pieces containing an opening square bracket are split there and stored as a
# list; all other pieces are kept as plain strings.
cleaned_newer = [
    piece.split('[') if '[' in piece else piece
    for piece in cleaned_new
]

In [None]:
# Split alternatives joined by 'and/or' into separate tokens.
#
# Entries are either plain strings or lists (produced by the square-bracket
# split in the previous step). Lists are handled element by element: calling
# .count('and/or') on a list only counts elements that are exactly 'and/or',
# so list elements were never split, and a list that did contain such an
# element would reach list.split() and raise AttributeError.
cleaned_newer_one = []
for entry in cleaned_newer:
    if isinstance(entry, list):
        # str.split returns [piece] when the separator is absent, so lists
        # without 'and/or' come through with the same contents.
        cleaned_newer_one.append(
            [part for piece in entry for part in piece.split('and/or')]
        )
    elif 'and/or' in entry:
        cleaned_newer_one.append(entry.split('and/or'))
    else:
        cleaned_newer_one.append(entry)

In [None]:
# Count how often each whitespace-trimmed token occurs. Entries may be a
# single string or a list of strings; keys keep first-seen order.
unique_words = {}
for entry in cleaned_newer_one:
    tokens = entry if type(entry) == list else [entry]
    for token in tokens:
        key = token.strip()
        unique_words[key] = unique_words.get(key, 0) + 1

In [None]:
# Keep only the tokens that occur at least 100 times.
frequent = {
    token: count
    for token, count in unique_words.items()
    if count >= 100
}

In [None]:
# The vocabulary is the set of frequent tokens; keys() is a live view on `frequent`.
frequent_vocab = frequent.keys()

In [None]:
# Add one boolean column per frequent token: True when the token occurs
# (case-insensitively, as a plain substring) in the product's ingredient text.
#
# All indicator columns are built first and attached with a single concat,
# which avoids inserting columns one at a time into the frame. Starting from
# the two base columns also makes the cell safe to re-run.
base_columns = ['ingredients_text', 'energy_100g']
ingredient_text = features_energy['ingredients_text']

indicator_columns = {
    word: ingredient_text.str.contains(word, case=False, regex=False)
    for word in frequent_vocab
    # Skip the empty token (it matches every row, giving a constant feature)
    # and any token that would collide with the text or target column.
    if word and word not in base_columns
}

features_energy = pd.concat(
    [
        features_energy[base_columns],
        pd.DataFrame(indicator_columns, index=features_energy.index),
    ],
    axis=1,
)

In [None]:
# Preview the table now that the ingredient indicator columns have been added.
features_energy.head()

In [None]:
# (rows, columns) after adding the indicator columns.
features_energy.shape

In [None]:
# Spot-check the type of one indicator value (first row, third column).
type(features_energy.iloc[0, 2])

In [None]:
# Features: every column from the third onward, i.e. everything after
# 'ingredients_text' and 'energy_100g'. This relies on those two being the
# first two columns of features_energy.
X = features_energy.iloc[:, 2:]
# Target: the 'energy_100g' column.
y = features_energy['energy_100g']

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=108)

# Ridge

In [None]:
# Baseline: Ridge with its default settings, fitted on the training split and
# used to predict the held-out split.
ridge = Ridge()
fit = ridge.fit(X=X_train, y=y_train)  # fit() returns the estimator itself
y_pred = fit.predict(X_test)

In [None]:
# Root-mean-squared error of the baseline Ridge predictions on the held-out split.
# The value is only stored here; nothing is displayed by this cell.
from sklearn.metrics import mean_squared_error
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

In [None]:
# Cross-validated RMSE of Ridge for a range of penalty strengths.
# rmse_scores[i] is the score obtained with alphas[i].
alphas = []
rmse_scores = []

for value in (1e-10, 1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20):
    ridge = Ridge(alpha=value)
    # The scorer returns negated MSE per fold; flip the sign, average the
    # folds, then take the square root.
    neg_mse = cross_val_score(ridge, X, y, scoring='neg_mean_squared_error')
    rmse = np.sqrt(np.mean(-neg_mse))
    rmse_scores.append(rmse)
    alphas.append(value)

In [None]:
# Tabulate the Ridge search results: one row per candidate alpha.
df = pd.DataFrame({'rmse': rmse_scores, 'alphas': alphas})

In [None]:
# Rank the candidate alphas by cross-validated RMSE, lowest (best) first.
# Author's note from the original run: alpha 20 is the winner.

df.sort_values(by=['rmse'])

# Lasso

In [None]:
# Cross-validated RMSE of Lasso for a range of penalty strengths.
# rmses_l[i] is the score obtained with alphas_l[i].
rmses_l = []
alphas_l = []

for value in (1e-15, 1e-10, 1e-8, 1e-5, 1e-4, 1e-3, 1e-2, 1, 5, 10):
    lasso = Lasso(alpha=value)
    # The scorer returns negated MSE per fold; flip the sign, average the
    # folds, then take the square root.
    neg_mse_l = cross_val_score(lasso, X, y, scoring='neg_mean_squared_error')
    rmse_l = np.sqrt(np.mean(-neg_mse_l))
    rmses_l.append(rmse_l)
    alphas_l.append(value)

# Elastic Net

In [None]:
from sklearn.model_selection import ParameterGrid

# Grid search for ElasticNet over penalty strength (alpha) and L1/L2 mix
# (l1_ratio), scored by cross-validated RMSE.
# alpha follows a 1-3-6 pattern per decade from 1e-4 up to 6; the original
# '1e-21' entry broke that pattern and is corrected to 1e-1.
# The l1_ratio values must be a list: the original line was missing its
# opening bracket, which is a syntax error.
param_grid = {'alpha': [1e-4, 3e-4, 6e-4, 1e-3, 3e-3, 6e-3, 1e-2, 3e-2, 6e-2, 1e-1, 3e-1, 6e-1, 1, 3, 6],
              'l1_ratio': [0.1, 0.5, 0.7, 0.9, 0.95, 0.99]}

grid = ParameterGrid(param_grid)

# rmses_net[i] is the score obtained with parameters[i].
rmses_net = []
parameters = []

for params in grid:
    net = ElasticNet(**params)
    # The scorer returns negated MSE per fold; flip the sign, average the
    # folds, then take the square root.
    rmse_net = np.sqrt(np.mean(-cross_val_score(net, X, y, scoring = 'neg_mean_squared_error')))
    rmses_net.append(rmse_net)
    parameters.append(params)

# Random Forest

In [None]:
# Grid search for a random forest over the number of trees and the maximum
# tree depth, scored by cross-validated RMSE.
# ParameterGrid is imported in the Elastic Net cell above.
param_grid = {'n_estimators': [200, 300, 500, 700, 1000],
              'max_depth': [3, 5, 10, 30, 50],
             }

grid = ParameterGrid(param_grid)

# rmses_rfr[i] is the score obtained with parameters_rfr[i].
rmses_rfr = []
parameters_rfr = []

for params in grid:
    # Only `ensemble` is imported at the top of the notebook, so the class is
    # reached through that module. The seed matches the one used for the
    # train/test split so that repeated runs give the same scores.
    rfr = ensemble.RandomForestRegressor(random_state=108, **params)
    # Store the score in its own scalar: the original assigned it to
    # `rmses_rfr`, replacing the list, and then appended the Elastic Net
    # score `rmse_net` instead of this model's score.
    rmse_rfr = np.sqrt(np.mean(-cross_val_score(rfr, X, y, scoring = 'neg_mean_squared_error')))
    rmses_rfr.append(rmse_rfr)
    parameters_rfr.append(params)

# Gradient Boosting

In [None]:
# Grid search for gradient boosting, scored by cross-validated RMSE.
# ParameterGrid is imported in the Elastic Net cell above.
#
# NOTE(review): the original cell listed 'n_estimators' without any values;
# the list below is a placeholder - TODO confirm the intended values.
# NOTE(review): the loss names 'ls' and 'lad' are the older scikit-learn
# spellings; newer releases call them 'squared_error' and 'absolute_error' -
# adjust to the installed version.
# NOTE(review): this grid is large (4 * 5 * 4 * 3 * 4 = 960 combinations,
# each cross-validated); consider trimming it before a full run.
param_grid = {'loss': ['ls', 'lad', 'huber', 'quantile'],
              'max_depth': [3, 5, 10, 30, 50],
              'learning_rate': [0.5, 0.1, 0.05, 0.01],
              'n_estimators': [100, 300, 500],
              'subsample': [0.25, 0.5, 0.75, 1.0],
             }

grid = ParameterGrid(param_grid)

# Results go into their own lists so the random forest results
# (rmses_rfr / parameters_rfr) are not overwritten.
# rmses_gbr[i] is the score obtained with parameters_gbr[i].
rmses_gbr = []
parameters_gbr = []

for params in grid:
    # Only `ensemble` is imported at the top of the notebook, so the class is
    # reached through that module. The seed keeps subsampled runs repeatable.
    gbr = ensemble.GradientBoostingRegressor(random_state=108, **params)
    # Score the boosting model itself: the original passed the random forest
    # `rfr` to cross_val_score and appended the Elastic Net score `rmse_net`.
    rmse_gbr = np.sqrt(np.mean(-cross_val_score(gbr, X, y, scoring = 'neg_mean_squared_error')))
    rmses_gbr.append(rmse_gbr)
    parameters_gbr.append(params)