# Report 3

### by Kaitlyn Keil and Kevin Zhang
### April 2017

<a href="http://tna.europarchive.org/20110116113217/http:/www.food.gov.uk/science/dietarysurveys/dietsurveys/">UK Food Network Dataset</a>

http://tna.europarchive.org/20110116113217/http://www.food.gov.uk/multimedia/pdfs/cofuserdoc.pdf

In [3]:
""" This file contains code used to analyze the
UK Food Nutrient Database, the information for which can be found
at this link: http://tna.europarchive.org/20110116113217/http:/www.food.gov.uk/science/dietarysurveys/dietsurveys/

The following packages are required to run this code.
"""

from __future__ import print_function, division
import pandas as pd

import sys
import numpy as np
import math
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
%matplotlib inline
import seaborn as sns

from collections import defaultdict, Counter
import statsmodels.formula.api as smf

In [4]:
### Helpers for the other, much larger dataset: the per-cuisine ingredients CSV

def ReadIngredients(region='African'):
    """ Loads the ingredients dataset and one-hot encodes a single cuisine.

    Only the rows whose cuisine_type equals `region` are kept. Every cell
    value found in the other columns of a row becomes a column of the
    result, set to 1 for that row and 0 where the row does not contain it.
    The cuisine_type column is carried over unchanged.

    Cell values are used as column names without any filtering, so an
    empty (NaN) cell also ends up as a column name.

    Region can be: African, EastAsian, EasternEuropean, LatinAmerican,
        MiddleEastern, NorthAmerican, NorthernEuropean, SouthAsian,
        SoutheastAsian, SouthernEuropean, WesternEuropean"""
    # The raw file is ragged: lots of NaNs and little order.
    raw = pd.read_csv("FoodDataset/ingredients.csv")
    regional = raw[raw.cuisine_type == region]

    def encode(record):
        # Turn one row ({column: cell}) into {ingredient: 1, ...} while
        # keeping the cuisine label under its original key.
        flags = {}
        for column, cell in record.items():
            if column == 'cuisine_type':
                flags[column] = cell
            else:
                flags[cell] = 1
        return flags

    encoded_rows = [encode(record)
                    for record in regional.to_dict('index').values()]

    # Ingredients absent from a row come out as NaN; make them 0.
    return pd.DataFrame(encoded_rows).fillna(0)

def corrPlot(df):
    """ Given the ingredients dataframe, selects
    the most used ingredients and creates a correlation
    matrix from them.

    An ingredient counts as "most used" when its column sum is above 50.
    The selected usage counts are printed, and a seaborn heatmap of the
    pairwise correlations is drawn.

    df: dataframe with a 'cuisine_type' column plus one indicator column
        per ingredient (as produced by ReadIngredients).

    returns: a dataframe holding the selected ingredient columns plus an
        'index1' column containing the integer row labels. 'index1' is
        part of the plotted correlation matrix as well."""
    # `.ix` has been removed from pandas; a boolean mask through `.loc`
    # selects the same columns (everything except the cuisine label).
    ingredient_cols = df.loc[:, (df.columns != 'cuisine_type')]
    usage = ingredient_cols.sum()

    # NOTE(review): the first qualifying entry is skipped by position.
    # Presumably this discards a placeholder column (e.g. the one created
    # from empty cells) -- confirm before relying on it.
    mdf = usage[usage > 50].iloc[1:]
    print(mdf)

    corr_df = pd.DataFrame()
    for x in mdf.index:
        if x != 'index1':
            corr_df[x] = df[x]

    corr_df['index1'] = [int(x) for x in corr_df.index.values]
    sns.heatmap(corr_df.corr())
    return corr_df

#ingredients = ReadIngredients('SoutheastAsian')

# Shows what ingredients are most/least likely to be used together.
#  For example, in African cuisine, cumin and coriander are fairly 
#  correlated, whereas vegetable oil and olive oil together are not.
#%matplotlib inline
#corr_df = corrPlot(ingredients)

In [17]:
def ReadProximates():
    """ Loads the 'Proximates' sheet of dietary.xls and adds numeric
    versions of the main nutrient columns.

    For each of 'Water (g)', 'Protein (g)', 'Fat (g)', 'Carbohydrate (g)'
    and 'Total sugars (g)' a new column ('Water', 'Protein', 'Fat',
    'Carbohydrate', 'Sugars') is added. Cells that cannot be parsed as a
    number are coerced to NaN and then replaced by 0. The original
    columns are left untouched.

    returns: the sheet as a dataframe with the five extra numeric columns."""
    # The sheet name is passed positionally because the keyword was renamed
    # from `sheetname` to `sheet_name` across pandas releases; the
    # positional form works with both.
    df = pd.read_excel('dietary.xls', 'Proximates')

    # (source column, numeric column) pairs, in the order they are added.
    numeric_columns = (
        ('Water (g)', 'Water'),
        ('Protein (g)', 'Protein'),
        ('Fat (g)', 'Fat'),
        ('Carbohydrate (g)', 'Carbohydrate'),
        ('Total sugars (g)', 'Sugars'),
    )

    # NOTE(review): an earlier hand-written parser mapped 'Tr' and 'N' to
    # 0.0 and unwrapped parenthesised values such as '(1.2)' into floats.
    # With errors='coerce' all of these become NaN and therefore 0 --
    # confirm that zeroing the parenthesised values is intended.
    for source, target in numeric_columns:
        # Assign the filled result instead of mutating the column in place,
        # which is unreliable on a column pulled out of the frame.
        df[target] = pd.to_numeric(df[source], errors='coerce').fillna(0)

    return df

In [7]:
#tester = pd.read_excel('dietary.xls', sheetname='Proximates')
# tester = ReadProximates()
# TO DO
# For cleaning: 
#  Get rid of Ns and Trs <- Done!
#  Figure out how to deal with parentheses <- Done!
#  Convert rows into floats (.astype) <- Done!
#  Get ready for lots of brackets...
#  Potentially just make different columns for the integer values <- ...done?

# For report:
#  Figure out if we can actually classify based on these values
#  Make a dictionary mapping from the Group codes to actual names
#  Figure out a good model
#  Do the thing
#  PCA to figure out the components we will want to look at
#  Unsupervised ML to figure out natural groups?


In [19]:
# Load the proximates table and display the numeric 'Sugars' column that
# ReadProximates adds, as a quick check of the conversion.
tester = ReadProximates()
tester['Sugars']

0        1.6
1        2.3
2        2.2
3        6.1
4        4.6
5        1.5
6        2.9
7        2.0
8        2.4
9        1.0
10       2.1
11       2.3
12       5.0
13       4.2
14       1.2
15       2.2
16       1.2
17       3.0
18       0.0
19       2.3
20       2.6
21       3.6
22       4.3
23       7.3
24       6.9
25       5.9
26       1.4
27      12.0
28      12.4
29      32.8
        ... 
3393     2.5
3394     2.1
3395     2.8
3396     2.0
3397     7.3
3398    20.6
3399    24.8
3400     3.6
3401     3.0
3402     1.0
3403     1.7
3404     1.9
3405     2.0
3406     5.5
3407     0.0
3408     5.6
3409     0.0
3410     1.4
3411     1.4
3412     0.8
3413     0.4
3414     0.9
3415     0.6
3416     3.6
3417     2.5
3418     3.9
3419     0.2
3420     0.2
3421     4.2
3422     0.0
Name: Sugars, dtype: float64

In [21]:
from sklearn.cluster import KMeans

# Build the feature matrix for clustering from the numeric nutrient columns
# of `tester` (source columns: 'Water (g)', 'Protein (g)', 'Fat (g)',
# 'Carbohydrate (g)', 'Total sugars (g)').
water = pd.Series(tester.Water, name='Water')
protein = pd.Series(tester.Protein, name='Protein')
fat = pd.Series(tester.Fat, name='Fat')
carbs = pd.Series(tester.Carbohydrate, name='Carbs')
sugars = pd.Series(tester['Sugars'], name='Sugars')
X = pd.concat([water, protein, fat, carbs, sugars], axis=1)

# fillna returns a new frame; the result has to be assigned or the call
# has no effect. With the gaps filled no rows need to be dropped, so the
# fitted labels stay aligned one-to-one with the rows of `tester`.
X = X.fillna(0)

# 15 clusters with a fixed seed so reruns give the same assignment.
kmeans = KMeans(n_clusters=15, random_state=0)
kmeans.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=15, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=0, tol=0.0001, verbose=0)