In [106]:
# Selective code hiding for the rendered notebook.
# This must be the first cell: the script below hides the input at index 0
# (i.e. this cell) plus every input whose first `.cm-variable` element reads
# `hide_me`, and it makes the output prompts transparent.

from IPython.display import HTML

# Marker name: the script matches input cells whose first variable token is
# `hide_me`, so a cell can opt in to being hidden by starting with this name.
hide_me = ''
# The HTML object is the cell's last expression, so the notebook renders it.
# The script runs code_toggle once on document ready (hiding the matching
# inputs), and the form's submit button, styled with opacity:0 so it is
# invisible, calls code_toggle again to flip the state.
HTML('''<script>
code_show=true; 
function code_toggle() {
  if (code_show) {
    $('div.input').each(function(id) {
      el = $(this).find('.cm-variable:first');
      if (id == 0 || el.text() == 'hide_me') {
        $(this).hide();
      }
    });
    $('div.output_prompt').css('opacity', 0);
  } else {
    $('div.input').each(function(id) {
      $(this).show();
    });
    $('div.output_prompt').css('opacity', 1);
  }
  code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input style="opacity:0" type="submit" value="Click here to toggle on/off the raw code."></form>''')

In [121]:
# Global code toggle: hides (or shows) every `div.input` at once and renders
# a visible button that flips between the two states.
# NOTE(review): this script assigns the same global names (`code_show`,
# `code_toggle`) as the script emitted by the first cell of the notebook, so
# whichever script runs last decides what both forms' `code_toggle()` calls
# do. Confirm that is intended, or give one of the two functions its own name.
from IPython.display import HTML

# Runs code_toggle once on document ready, so all inputs start out hidden.
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')

In [108]:
'''Installing Necessary Libraries'''
from pandas import Series, DataFrame
import pandas as pd
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from matplotlib.pyplot import figure 
%pylab inline

from bokeh.io import output_notebook
from bokeh.plotting import figure, show
from bokeh.palettes import Set3, GnBu3, OrRd3, RdYlGn, Reds, Paired
from bokeh.transform import cumsum, dodge
from bokeh.models import ColumnDataSource, HoverTool, Panel
from bokeh.core.properties import value
from bokeh.models.widgets import Tabs


# Send all Bokeh output to inline notebook cells.
output_notebook()

# Stand-alone Bokeh figure with pan and box-zoom tools. No later cell shown in
# this notebook references `plot`; every chart builds its own figure.
plot = figure(
    tools='pan,box_zoom',
    plot_height=400,
    plot_width=600,
)



Populating the interactive namespace from numpy and matplotlib


# Exploring The Data

### Which Countries Offer The Healthiest Products?


In [109]:
# To analyze the general nutrition characteristics of the food products we
# need the saturated fat, trans fat, fiber, sugars, protein, and sodium content
# of each product, so we keep only the rows where none of these is missing.
# (Energy is NOT part of this filter; rows may still lack `energy_100g`.)
required_nutrient_cols = [
    'saturated-fat_100g', 'trans-fat_100g',
    'fiber_100g', 'sugars_100g', 'proteins_100g', 'sodium_100g',
]

df = pd.read_csv('final_food_data.csv')  # reading in the data

# Renaming countries concisely.
df['country'] = df['country'].replace(['United Kingdom', 'United States'], ['UK', 'US'])

# .copy() makes `nutrients` an independent frame: later cells add columns to
# it, and without the copy pandas flags those writes with a
# SettingWithCopyWarning (silenced here by the global warnings filter).
nutrients = df.dropna(subset=required_nutrient_cols, how='any').copy()

#### Let's begin by looking into the number of products we have per country:

In [110]:
# Custom palette. It is not used by the chart below (the wedges take their
# colors from Bokeh's Set3), but every entry must still be a valid hex color:
# the original list contained ' #BD3D3A' with a stray leading space.
chart_colors = ['#BE9EC9', '#feb236', '#006E6D',
                '#d64161', '#F1EA7F', '#6b5b95',
                '#EC9787', '#BD3D3A', '#D5AE41', '#00A591', '#7F4145']

# One row per country (index named 'country') holding its product count.
country = df['country'].value_counts().rename_axis('country').to_frame('counts')
# Each wedge spans the country's share of a full circle (2*pi radians).
# `pi` is injected into the namespace by `%pylab inline`.
country['angle'] = country['counts'] / country['counts'].sum() * 2 * pi
# Set3 is a dict keyed by palette size, so this selects exactly one color per
# country. It raises KeyError when the number of countries is not a size Set3
# provides.
country['color'] = Set3[len(country)]

p = figure(plot_height=350, title="Number of Products by Country", toolbar_location=None,
           tools="hover", tooltips="@country: @counts", x_range=(-0.5, 1.0))

# cumsum() turns the per-country angles into consecutive start/end angles.
p.wedge(x=0, y=1, radius=0.4,
        start_angle=cumsum('angle', include_zero=True), end_angle=cumsum('angle'),
        line_color="white", fill_color='color', legend_field='country', source=country)

# A pie chart has no meaningful axes or grid.
p.axis.axis_label = None
p.axis.visible = False
p.grid.grid_line_color = None

show(p)

The US and France definitely dominate this dataset. 

Let's try to figure out which countries offer the healthiest products. Though there are many schools of thought when it comes to evaluating nutrition labels, one should typically avoid foods high in sugar, saturated and trans fats (we will label these together as bad fats), and sodium. Foods high in protein and fiber are considered healthy to eat.

In [111]:
# Combining saturated and trans fat into one variable.
# assign() returns a new frame instead of writing into `nutrients` in place,
# so the cell is idempotent on re-run and cannot trigger pandas'
# SettingWithCopyWarning on a frame derived from `df`.
nutrients = nutrients.assign(**{
    'bad-fats_100g': nutrients['saturated-fat_100g'] + nutrients['trans-fat_100g'],
})

In [112]:
# Focusing on mean nutrient value per product by country.
nutr = nutrients.groupby('country', as_index=False)[
    ['sugars_100g', 'bad-fats_100g', 'sodium_100g', 'proteins_100g', 'fiber_100g']
].agg('mean')
data = nutr.to_dict(orient='list')
country = nutr['country'].tolist()


def _grouped_country_bars(frame, title, series, width=0.2, spacing=0.25):
    """Build a grouped vertical bar chart with one group of bars per country.

    Parameters
    ----------
    frame : DataFrame
        One row per country, with a 'country' column plus the plotted columns.
    title : str
        Figure title.
    series : list of (column, color, legend_label) tuples
        One entry per bar within each country group, drawn left to right.
    width : float
        Bar width, in units of one category slot.
    spacing : float
        Distance between the centers of neighbouring bars. It must be at
        least `width`, otherwise the bars overlap and hide each other.

    Returns
    -------
    The configured Bokeh figure (not yet shown).
    """
    columns = [column for column, _, _ in series]
    fig = figure(x_range=frame['country'].tolist(),
                 # tallest plotted bar plus some headroom for the legend
                 y_range=(0, frame[columns].values.max() + 3),
                 plot_height=250, plot_width=800, title=title,
                 toolbar_location=None, tools="",
                 y_axis_label='Mean Amount Per 100g')

    # One data source per figure, shared by all of that figure's glyphs.
    source = ColumnDataSource(frame.to_dict(orient='list'))

    # Center the group on the category: offsets are symmetric around 0.
    count = len(series)
    for position, (column, color, label) in enumerate(series):
        offset = (position - (count - 1) / 2) * spacing
        # Dodge against THIS figure's own x_range, never another figure's.
        fig.vbar(x=dodge('country', offset, range=fig.x_range), top=column,
                 width=width, source=source, color=color, legend_label=label)

    fig.x_range.range_padding = 0.1
    fig.xgrid.grid_line_color = None
    fig.legend.location = "top_left"
    fig.legend.orientation = "horizontal"
    return fig


p = _grouped_country_bars(nutr, "Unhealthy Nutrients By Country", [
    ('sugars_100g', Set3[8][7], "sugars"),
    ('bad-fats_100g', Set3[4][2], "bad-fats"),
    ('sodium_100g', Set3[4][3], "sodium"),
])

h = _grouped_country_bars(nutr, "Healthy Nutrients By Country", [
    ('proteins_100g', '#80ced6', 'protein'),
    ('fiber_100g', '#d5f4e6', "fiber"),
])

show(p)
show(h)

Sugar has fairly detrimental effects on health. It can lead to weight gain, increased risk of type 2 diabetes, food addictions, and chronic digestive issues. Saturated and trans fats can cause increases in harmful LDL cholesterol levels, create inflammation, and contribute to insulin resistance. Excess sodium can lead to elevated blood pressure and cause a variety of health problems.

Italy clearly takes home the gold when it comes to average sugar and fat content while Australia is the runner-up in terms of sodium. In general, most countries' products tend to be fairly high in sugar.

Protein is essential within a diet as it is used by our bodies to build and repair tissues. Our body is unable to store protein and needs relatively large quantities of it. Fiber aids our bodies in regulating digestion and lowers cholesterol levels. It is recommended to eat foods high in fiber to help achieve a healthy weight. 

From the above plots, we observe that Switzerland offers products with the highest average protein content while Australia's products have the highest average fiber content. 

__Relying upon the country's average score in terms of the above healthy/unhealthy nutrients, Italy provides some of the unhealthiest products while Switzerland, Russia, and Australia offer the healthiest.__ 

### Can We Verify This With External Information? 

According to the European Union Commission 2017 Health Report, the leading cause of death in Italy is cardiovascular disease, 40% among women and 33% among men. A recent study by clinical researchers from the University of Surrey found that the risk of heart disease is significantly increased by high-sugar diets, even amongst healthy populations. The average sugar content of Italy's products exceeds the next leading country by about 7 grams. Italy's consumption of high-sugar products may have a relationship with its population's predisposition towards cardiovascular disease.

Switzerland, Australia, and Russia ranked top for healthiest products offered. According to the Organization for Economic Cooperation and Development, Switzerland has one of the lowest obesity rates at 9% compared to other developed nations. In terms of overweight children, Switzerland, Australia, and Russia have some of the lowest rates: 19%, 18%, and 17% respectively. Italy's proportion of overweight children remains significantly higher, at 38%.  

While a variety of external and genetic factors influence an individual's health, this data suggests a relationship between the nutritional quality of the food products offered to a population and that population's health.

*Sources at Bottom of Report*

### Which Country Offers the Healthiest Food Items?

Let's see which country offers products with the greatest amount of protein and fiber per 100g. 

In [113]:
# Highest protein and fiber content found in any single product, per country.
max_nutrients = nutrients.groupby('country', as_index=False)[['proteins_100g', 'fiber_100g']].agg('max')
data2 = max_nutrients.to_dict(orient='list')
country = max_nutrients['country'].tolist()

# Get max possible value of plotted columns with some offset.
p = figure(x_range=country, y_range=(0, max_nutrients[['proteins_100g', 'fiber_100g']].values.max() + 3),
           plot_height=250, title="Healthiest Food Product By Country", plot_width=800,
           toolbar_location=None, tools="", y_axis_label='Max Amount Per 100g')

# One data source shared by both glyphs of this figure.
max_source = ColumnDataSource(data2)

# Two bars per country, placed symmetrically around the category center.
# The centers are 0.25 apart and each bar is 0.2 wide, so the bars sit side by
# side instead of overlapping (the original 0.4-wide bars offset by only 0.1
# largely covered one another).
bar_width = 0.2
bars = [
    ('proteins_100g', -0.125, '#f7cac9', 'protein'),
    ('fiber_100g', 0.125, '#92a8d1', "fiber"),
]
for column, offset, color, label in bars:
    p.vbar(x=dodge('country', offset, range=p.x_range), top=column, width=bar_width,
           source=max_source, color=color, legend_label=label)

p.x_range.range_padding = 0.1
p.xgrid.grid_line_color = None
p.legend.location = "top_left"
p.legend.orientation = "horizontal"

show(p)


Switzerland's products target the protein factor while Australia's contain high levels of fiber. __Given that both of these countries stood out for their healthy product offerings in the previous section, this may indicate the presence of a health-conscious market in Switzerland and Australia.__

It seems a little odd that the US has one product with 100g of protein and another with 100g of fiber ... for a product to have 100 grams of protein or fiber per 100 grams, it must be made up entirely of protein or fiber. Let's see what these products are.

In [114]:
# Keep only the products sold in the US.
is_us = nutrients['country'] == 'US'
usa = nutrients[is_us]

# Three US products with the highest protein content per 100g.
protein_view = usa[['product_name', 'serving_size', 'brands', 'energy_100g',
                    'fat_100g', 'fiber_100g', 'proteins_100g']]
protein_view.sort_values(by='proteins_100g', ascending=False).iloc[:3]

Unnamed: 0,product_name,serving_size,brands,energy_100g,fat_100g,fiber_100g,proteins_100g
152137,Tcho-A-Day Dark Chocolate,8 g (1 BAR),Tcho,2301.0,35.0,0.0,100.0
41498,Unflavored Gelatin,7 g (7 g),Meijer,1795.0,0.0,0.0,100.0
18852,"Fisherman's Wharf, Cocktail Shrimp",22 g (8 SHRIMP AND 0.8 OZ SAUCE | ABOUT),Winn-Dixie Stores Inc.,2092.0,6.82,0.0,86.36


Unless the Tcho-A-Day Dark Chocolate bar actually is made of pure protein (if so, please tell me where to find it 😋), its protein content is probably the result of a data entry error. Otherwise, unflavored gelatin (a protein product derived from collagen) takes the gold for the US. This product is typically 99% protein and often taken as a protein supplement. 

In [115]:
# Three US products with the highest fiber content per 100g.
fiber_view = usa[['product_name', 'serving_size', 'brands', 'energy_100g',
                  'fat_100g', 'fiber_100g', 'proteins_100g']]
fiber_view.sort_values(by='fiber_100g', ascending=False).iloc[:3]

Unnamed: 0,product_name,serving_size,brands,energy_100g,fat_100g,fiber_100g,proteins_100g
87624,"Yamashin, Powdered Agar-Agar",4 g (1 PACKAGE),Wismettac Asian Foods Inc.,1046.0,0.0,100.0,0.0
37059,Premium Guar Gum,10 g (1 Tbsp),Bob's Red Mill,1255.0,0.0,90.0,0.0
77233,Xanthan Gum,1.15 g (0.5 tsp),,0.0,0.0,87.0,0.0


The products the US offers with the highest fiber content are all a little unusual. Agar-agar is a jelly-like product derived from seaweed. Guar gum consists of the fiber from the seed of the guar plant, and xanthan gum is a plant-based thickening and stabilizing agent. All three of these products are typically used as thickening or stabilizing agents within vegan cooking to help food imitate the consistency or creaminess provided by animal products. __These products may indicate that a market for vegan, animal-product substitutes exists in the US.__

While we're on the topic of vegan products...

### Let's Look At The Popularity of Alternative Diets By Country

If a product fits a vegetarian diet, then it contains no animal meat products. If it fits a vegan diet, it contains no animal or dairy products. Foods that are vegan are also vegetarian.

In the data quality file, I categorized the foods by their diet type. Let's use these tags to discover which countries have preferences for the different diets.

In [116]:
# Drop the products whose diet category was never specified.
has_diet = nutrients['Special Diet'] != 'Not Specified'
diets = nutrients[has_diet]

# Count products per (country, diet), then divide every row by its own total.
# This gives each diet's share within a country, which normalizes for the
# very different number of products per country.
diet_counts = pd.crosstab(diets['country'], diets['Special Diet'])
diets2 = diet_counts.div(diet_counts.sum(axis=1), axis=0)

In [117]:
from bokeh.models.ranges import FactorRange

# Stacked horizontal bar per country: each segment's length is the share of
# that country's products belonging to one diet category.
countries = list(diets2.index)
p = figure(title="Special Diet Popularity",
           y_axis_label='Country', x_axis_label='Proportion of Products',
           y_range=FactorRange(factors=countries),
           plot_height=600, plot_width=800)

# (column in diets2, color, legend label), in stacking order from the left.
segments = [
    ('Dairy Product', '#ff6f69', 'Dairy'),
    ('Meat Product', '#ffcc5c', 'Meat'),
    ('Vegan', '#96ceb4', 'Vegan'),
    ('Vegetarian', '#ffeead', 'Vegetarian'),
]

# Every segment must start where the previous one ended, so the edges are
# running (cumulative) sums. The original code used the raw proportions as
# left/right edges, so the segments were not stacked and their lengths did not
# represent the proportions.
left_edge = pd.Series(0.0, index=diets2.index)
for column, color, label in segments:
    right_edge = left_edge + diets2[column]
    p.hbar(y=countries, left=left_edge.values, right=right_edge.values,
           height=0.2, color=color, legend_label=label)
    left_edge = right_edge

show(p)

__Wow! The majority of Spain's products in this dataset are vegan and no other country comes close. All of the countries seem to have fairly high proportions of dairy products with Russia taking the lead. France offers the greatest proportion of meat products while the UK and Australia are tied for the number of vegetarian products.__

*Remember that any vegan products can also be classified as vegetarian. We chose to separate them here to see how countries differ in products offered between these two categories.*

### Does The Nutritional Content of Products Differ By Special Diet Category?

In [118]:
# Display label for each nutrient column. An explicit mapping ties every label
# to its column instead of relying on two lists staying in the same order.
nutr_labels = {
    'proteins_100g': 'Proteins',
    'fiber_100g': 'Fibers',
    'fat_100g': 'Fat',
    'sodium_100g': 'Sodium',
}
# Defined BEFORE it is needed: the original cell assigned `nutr_content` to the
# index one line before creating it, which raises NameError on a fresh kernel.
nutr_content = list(nutr_labels.values())

# Mean nutrient content per diet category, transposed so that the rows are
# nutrients and the columns are diet categories.
nutr_cat = diets.groupby(['Special Diet'])[list(nutr_labels)].mean()
nutr_cat = nutr_cat.T
nutr_cat.index = nutr_content

In [119]:
from bokeh.models.annotations import Label

# (diet category, color) for each plotted series, in drawing order.
diet_styles = [
    ('Dairy Product', "tomato"),
    ('Meat Product', '#96ceb4'),
    ('Vegan', "indigo"),
    ('Vegetarian', "gold"),
]
nutrient_axis = nutr_cat.index

# Initializing our plot: one categorical x position per nutrient.
p = figure(x_range=nutrient_axis.tolist(), title='Average Nutritional Content Per Special Diet')

# First pass: one line per diet category (these carry the legend entries).
for diet, shade in diet_styles:
    p.line(nutrient_axis, nutr_cat[diet],
           line_color=shade, legend_label=diet, line_width=1)

# Second pass: mark the data points as circles, drawn after all the lines.
for diet, shade in diet_styles:
    p.circle(nutrient_axis, nutr_cat[diet],
             radius=.05, fill_color=shade, line_color=shade)

p.legend.location = "top_left"

show(p)

Meat products generally have high average values of protein, fat, and sodium while vegetarian and vegan products take the lead in average fiber content. 

All types of diets tend to be fairly high in fat, so let's take a closer look at how the type of fat differs by diet. Remember, bad fat per 100 grams measures the total amount of both saturated and trans fats per 100 grams of the product.

In [120]:
# Color used for each diet category.
colormap = {'Dairy Product': "tomato", 'Meat Product': '#96ceb4', 'Vegan': 'indigo', 'Vegetarian': 'gold'}
# Per-row color list. It is not used by the plot below, which colors each
# category as a whole; kept so the name stays available to other cells.
colors = [colormap[x] for x in diets['Special Diet']]

# Total fat plotted against "bad" (saturated + trans) fat, one point per product.
p = figure(title="Relationship Between Fat and Bad Fats")
p.xaxis.axis_label = "Bad Fats Per 100g"
p.yaxis.axis_label = 'Fats Per 100g'

for diet, color in colormap.items():
    # Deliberately NOT named `df`: the original loop variable overwrote the
    # raw dataset loaded at the top of the notebook, so re-running an earlier
    # cell after this one silently operated on the last diet subset instead.
    diet_rows = diets[diets['Special Diet'] == diet]
    p.circle(x=diet_rows['bad-fats_100g'], y=diet_rows['fat_100g'], legend_label=diet,
             color=color, fill_alpha=0.2, size=10)

p.legend.location = "top_left"
show(p)

It looks like the fat content of dairy products is primarily derived from the unhealthy types of fats. Vegan and vegetarian tend to have lower values of bad fats per fat content while overall, meat products tend to be fairly low in fat.  

__Concluding Thoughts__

We found that Italy may have a bit of a sweet tooth, offering products that tend to have the highest average amounts of sugar and bad-fats per 100 grams. Given the majority of fat in dairy products stems from bad-fats, it might be helpful to stress the benefits of low dairy diets. 

Australia's products contain the highest average amounts of sodium and fiber. Given the sodium content tends to be higher in packaged goods, this could imply that Australia eats more processed and packaged food than other countries. Australia also offers a high proportion of vegetarian products, which tend to have high fiber content. When combined with the fact that Australia's products contained the highest average amounts of fiber, this may indicate a strong preference of vegetarian food items in Australia. 

While the US may not have had the highest averages for specific nutrients, they offer products with the most extreme amounts of protein and fiber. This may imply the existence of a health conscious market within the US, focused on taking supplements or food products merely for their nutritional value. 

Spain earns the gold medal when it comes to the amount of vegan products offered. This may suggest the existence of a large market for plant-based, meat alternatives. 

Overall, it is important to remember that the insights in this dataset reflect the products uploaded to the Open Food Facts database. The data offers interesting tidbits about the ways different countries eat; however, it may not be representative of the general eating habits of the whole population of these countries.

Sources:


__[EC Report on Italy's Health](https://ec.europa.eu/health/sites/health/files/state/docs/chp_it_english.pdf#:~:text=Close%20to%20two-thirds%20of%20all%20deaths%20in%20Italy,to%20cardiovascular%20diseases%20and%20another%20one-third%20to%20cancer)__
__[High-Sugar Diets and Cardiovascular Disease](https://www.medicalnewstoday.com/articles/319663)__
__[OECD Report On Obesity](http://www.oecd.org/switzerland/Obesity-Update-2014-SWITZERLAND.pdf)__