# McDonald's Nutritional Information Dataset
### Found at https://www.kaggle.com/mcdonalds/nutrition-facts
#### Using unsupervised clustering algorithms, what groups within McDonald's menu can be created soelyl on nutrional inf

In [90]:
import pandas as pd
import numpy as np
from sklearn import preprocessing, cluster

In [91]:
df = pd.read_csv("data/menu.csv")

df.dropna(inplace=True)

X = df.drop(['Category', 'Item'],1)
y = df['Category']

## Format serving column
X['Serving Size'] = X['Serving Size'].apply(lambda x : (x[x.find("(")+1:x.rfind(")")]).split(" ")[0])
X['Serving Size'] = X['Serving Size'].astype(float)
X = preprocessing.scale(X)

# 1. Hierarchical Clustering (MeanShift)

In [92]:
clf = cluster.MeanShift()
clf.fit(X)
labels = clf.labels_
cluster_centers = clf.cluster_centers_

df['cluster_group'] = np.nan

for i in range(len(X)):
    df['cluster_group'].iloc[i] = labels[i]
    
n_clusters_ = len(np.unique(labels))

df[ (df['cluster_group']==0) ].describe()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,Calories,Calories from Fat,Total Fat,Total Fat (% Daily Value),Saturated Fat,Saturated Fat (% Daily Value),Trans Fat,Cholesterol,Cholesterol (% Daily Value),Sodium,...,Carbohydrates (% Daily Value),Dietary Fiber,Dietary Fiber (% Daily Value),Sugars,Protein,Vitamin A (% Daily Value),Vitamin C (% Daily Value),Calcium (% Daily Value),Iron (% Daily Value),cluster_group
count,229.0,229.0,229.0,229.0,229.0,229.0,229.0,229.0,229.0,229.0,...,229.0,229.0,229.0,229.0,229.0,229.0,229.0,229.0,229.0,229.0
mean,328.362445,103.231441,11.510917,17.729258,5.310044,26.454148,0.172489,40.829694,13.681223,387.423581,...,15.100437,1.39738,5.611354,31.065502,11.39738,13.790393,4.091703,21.235808,6.20524,0.0
std,185.075979,86.729281,9.6109,14.799718,4.680581,23.395416,0.377182,51.035005,16.998933,427.2663,...,9.113006,1.345621,5.458138,28.91139,8.617821,25.380952,9.597851,17.442546,6.975947,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,200.0,10.0,1.0,2.0,0.5,3.0,0.0,5.0,2.0,90.0,...,10.0,0.0,0.0,6.0,3.0,2.0,0.0,6.0,0.0,0.0
50%,330.0,90.0,10.0,16.0,4.5,22.0,0.0,30.0,10.0,180.0,...,14.0,1.0,5.0,22.0,11.0,8.0,0.0,20.0,4.0,0.0
75%,450.0,180.0,20.0,30.0,8.0,42.0,0.0,50.0,17.0,700.0,...,20.0,2.0,8.0,51.0,16.0,15.0,2.0,35.0,10.0,0.0
max,850.0,290.0,33.0,50.0,20.0,101.0,1.5,300.0,100.0,1480.0,...,47.0,7.0,28.0,123.0,40.0,170.0,70.0,70.0,30.0,0.0


In [93]:
for i in range(n_clusters_):
    temp_df = df[ (df['cluster_group']==i) ]
    print("cluster {} caloric mean: {}".format(i, temp_df.mean()['Calories']))
    print(np.unique(temp_df['Category']))
    print(len(temp_df))
    print(temp_df.head(3)['Item'].tolist())
    print("------------------")

cluster 0 caloric mean: 328.3624454148472
['Beef & Pork' 'Beverages' 'Breakfast' 'Chicken & Fish' 'Coffee & Tea'
 'Desserts' 'Salads' 'Smoothies & Shakes' 'Snacks & Sides']
229
['Egg McMuffin', 'Egg White Delight', 'Sausage McMuffin']
------------------
cluster 1 caloric mean: 180.71428571428572
['Beverages' 'Breakfast' 'Snacks & Sides']
7
['Fruit & Maple Oatmeal', 'Fruit & Maple Oatmeal without Brown Sugar', 'Apple Slices']
------------------
cluster 2 caloric mean: 772.5
['Breakfast']
8
['Sausage Biscuit with Egg (Large Biscuit)', 'Steak & Egg Biscuit (Regular Biscuit)', 'Bacon, Egg & Cheese Bagel']
------------------
cluster 3 caloric mean: 748.6666666666666
['Beef & Pork' 'Breakfast' 'Chicken & Fish' 'Smoothies & Shakes']
15
['Big Breakfast with Egg Whites (Regular Biscuit)', 'Big Breakfast with Egg Whites (Large Biscuit)', 'Big Breakfast with Hotcakes and Egg Whites (Regular Biscuit)']
------------------
cluster 4 caloric mean: 1880.0
['Chicken & Fish']
1
['Chicken McNuggets (40 p

## Observations:
#### Cluster 0: Breakfast/Lightweight entrees.
#### Cluster 1: Lowest mean calories, sides.
#### Cluster 2: Full meal entrees (burgers, sandwiches).
#### Cluster 3: Large multi-side meals (breakfast).
#### Cluster 4: Outlier, highest calorie party dish.

### Clusters most organized by calories, total fat, and Cholesterol

# 2. Flat Clustering (KMeans) 

In [98]:
clf = cluster.KMeans(n_clusters=5)
clf.fit(X)
labels = clf.labels_
cluster_centers = clf.cluster_centers_

df['cluster_group'] = np.nan

for i in range(len(X)):
    df['cluster_group'].iloc[i] = labels[i]
    
n_clusters_ = len(np.unique(labels))
print(n_clusters_)
df[ (df['cluster_group']==0) ].describe()

5


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,Calories,Calories from Fat,Total Fat,Total Fat (% Daily Value),Saturated Fat,Saturated Fat (% Daily Value),Trans Fat,Cholesterol,Cholesterol (% Daily Value),Sodium,...,Carbohydrates (% Daily Value),Dietary Fiber,Dietary Fiber (% Daily Value),Sugars,Protein,Vitamin A (% Daily Value),Vitamin C (% Daily Value),Calcium (% Daily Value),Iron (% Daily Value),cluster_group
count,70.0,70.0,70.0,70.0,70.0,70.0,70.0,70.0,70.0,70.0,...,70.0,70.0,70.0,70.0,70.0,70.0,70.0,70.0,70.0,70.0
mean,501.428571,229.0,25.428571,39.2,8.957143,44.885714,0.328571,101.714286,33.857143,1171.857143,...,14.542857,2.957143,11.871429,7.285714,24.771429,20.128571,10.157143,17.728571,17.5,0.0
std,121.53947,76.331656,8.606656,13.2431,3.697573,18.445016,0.563926,81.155093,27.117183,285.145744,...,3.701491,1.082612,4.665387,4.440077,8.406642,35.159645,11.676355,7.630833,5.602148,0.0
min,220.0,80.0,8.0,13.0,2.0,9.0,0.0,0.0,0.0,290.0,...,3.0,1.0,5.0,0.0,6.0,0.0,0.0,2.0,6.0,0.0
25%,430.0,190.0,21.0,32.0,6.25,32.25,0.0,50.0,16.0,960.0,...,12.0,2.0,8.0,3.0,19.25,4.0,2.0,15.0,15.0,0.0
50%,480.0,220.0,24.5,38.0,9.0,44.0,0.0,77.5,25.5,1210.0,...,14.0,3.0,12.0,7.0,23.5,8.0,8.0,17.5,15.0,0.0
75%,565.0,280.0,31.0,47.75,12.0,60.5,0.5,105.0,35.0,1362.5,...,17.0,3.0,13.75,10.0,30.0,10.0,15.0,23.75,20.0,0.0
max,940.0,530.0,59.0,91.0,19.0,96.0,2.5,300.0,100.0,1800.0,...,23.0,7.0,28.0,16.0,48.0,170.0,70.0,30.0,35.0,0.0


In [99]:
for i in range(n_clusters_):
    temp_df = df[ (df['cluster_group']==i) ]
    print("cluster {} caloric mean: {}".format(i, temp_df.mean()['Calories']))
    print(np.unique(temp_df['Category']))
    print(len(temp_df))
    print(temp_df.head(3)['Item'].tolist())
    print("------------------")

cluster 0 caloric mean: 501.42857142857144
['Beef & Pork' 'Breakfast' 'Chicken & Fish' 'Salads' 'Snacks & Sides']
70
['Egg McMuffin', 'Sausage McMuffin', 'Sausage McMuffin with Egg']
------------------
cluster 1 caloric mean: 123.78787878787878
['Beverages' 'Breakfast' 'Chicken & Fish' 'Coffee & Tea' 'Desserts'
 'Smoothies & Shakes' 'Snacks & Sides']
66
['Hash Brown', 'Chicken McNuggets (4 piece)', 'Chicken McNuggets (6 piece)']
------------------
cluster 2 caloric mean: 294.6511627906977
['Beef & Pork' 'Beverages' 'Breakfast' 'Chicken & Fish' 'Coffee & Tea'
 'Desserts' 'Salads' 'Smoothies & Shakes' 'Snacks & Sides']
86
['Egg White Delight', 'Hotcakes', 'Cinnamon Melts']
------------------
cluster 3 caloric mean: 1100.0
['Breakfast' 'Chicken & Fish']
7
['Big Breakfast (Regular Biscuit)', 'Big Breakfast (Large Biscuit)', 'Big Breakfast with Hotcakes (Regular Biscuit)']
------------------
cluster 4 caloric mean: 627.0967741935484
['Coffee & Tea' 'Smoothies & Shakes']
31
['Mocha (Large)',

## Observations:
### (Using same amount of clusters as the last)
### (Cluster numbers do scramble in KMeans)
#### Cluster 0: Highest calorie meals and full meals
#### Cluster 1: Lowest caloric mean, small sides
#### Cluster 2: Only large drinks
#### Cluster 3: Single entrees
#### Cluster 4: Desserts and single entrees

### Cluster groups are a lot more diverse (more items per group). Seems more clustered according to calorie and other dietary information. Last clustering model focused solely calories creating uneven groups.