In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from scipy import stats
import math

In [2]:
#reference daily amount for each nutrient; a recipe's nutrient content is
#divided by these to express it relative to the daily value
#NOTE(review): units are not recorded here -- presumably the same units the
#nutrition columns are reported in (g / mg / IU / mcg); confirm
dailyvals = {
    'Total Fat': 65,
    'Saturated Fat': 20,
    'Cholesterol': 300,
    'Sodium': 2400,
    'Potassium': 3500,
    'Total Carbohydrates': 300,
    'Dietary Fiber': 25,
    'Protein': 50,
    'Sugars': 31.5,
    'Vitamin A': 5000,
    'Vitamin C': 60,
    'Calcium': 1000,
    'Iron': 18,
    'Thiamin': 1.5,
    'Niacin': 20,
    'Vitamin B6': 2,
    'Folate': 400,
    'Magnesium': 400,
}

In [3]:
#load the recipe nutrition database: it is split over seven csv files that
#are read one by one and stacked into a single table
dir = 'allrecipes_data/' #note: this name shadows the builtin dir()
nutrdb_files = ['allrecipes_nutrdb_all_0to1000.csv',
                'allrecipes_nutrdb_all_1000to2000.csv',
                'allrecipes_nutrdb_all_2000to3000.csv',
                'allrecipes_nutrdb_all_3000to4000.csv',
                'allrecipes_nutrdb_all_4000to5000.csv',
                'allrecipes_nutrdb_all_5000to6000.csv',
                'allrecipes_nutrdb_all_6000to6899.csv']
db = pd.concat([pd.read_csv(dir+fname) for fname in nutrdb_files])
#the column read in as 'Unnamed: 0' holds the recipe names; after stacking,
#renumber the rows 0..n-1 so every row has a unique label
db = db.rename(columns={'Unnamed: 0':'recipename'}).reset_index(drop=True)
db.head()

Unnamed: 0,recipename,Total Fat,Saturated Fat,Cholesterol,Sodium,Potassium,Total Carbohydrates,Dietary Fiber,Protein,Sugars,Vitamin A,Vitamin C,Calcium,Iron,Thiamin,Niacin,Vitamin B6,Magnesium,Folate,Calories
0,a good easy garlic chicken,10.1g,6.0g,91mg,368mg,329mg,1.7g,0.2g,27.6g,1g,352IU,2mg,23mg,1mg,0mg,19mg,1mg,35mg,6mcg,214
1,a jerky chicken,2.7g,1.0g,68mg,982mg,385mg,13.5g,0.3g,28.5g,10g,60IU,4mg,32mg,1mg,0mg,19mg,1mg,43mg,8mcg,197
2,a simply perfect roast turkey,33.8g,10.0g,211mg,710mg,794mg,13.7g,0.9g,72.2g,2g,320IU,4mg,82mg,5mg,0mg,26mg,1mg,62mg,28mcg,663
3,absolute best liver and onions,20.7g,11.0g,578mg,309mg,939mg,74.2g,4.4g,48.9g,11g,37510IU,13mg,165mg,12mg,1mg,39mg,2mg,71mg,518mcg,687
4,acapulco chicken,13.9g,3.0g,72mg,635mg,742mg,23.8g,4.9g,30.1g,6g,2269IU,86mg,64mg,3mg,0mg,19mg,1mg,59mg,42mcg,333


In [4]:
#check recipes that are "versions" of each other, usually end with a roman numeral
def _find_versions(recipenames, numeral):
    """Return (position, name) pairs for every recipe that has `numeral` as a word.

    recipenames: sequence of recipe-name strings, in table order.
    numeral: the roman numeral to look for, e.g. 'ii'.

    A name matches when the numeral is any of its space-separated words,
    not only the last one.
    """
    return [(num, name) for num, name in enumerate(recipenames)
            if numeral in name.split(' ')]

#read the names once; looking each one up with db.iloc[num] builds a whole row per recipe
_recipenames = db['recipename'].tolist()
firstversions = _find_versions(_recipenames, 'i')
secondversions = _find_versions(_recipenames, 'ii')
thirdversions = _find_versions(_recipenames, 'iii')
fourthversions = _find_versions(_recipenames, 'iv')
fifthversions = _find_versions(_recipenames, 'v')
sixthversions = _find_versions(_recipenames, 'vi')
firstversions

[(39, 'anniversary chicken i'),
 (43, 'apricot chicken i'),
 (83, 'baked macaroni and cheese i'),
 (88, 'baked pork chops i'),
 (96, 'baked spaghetti i'),
 (101, 'baked ziti i'),
 (122, 'bbq chicken pizza i'),
 (174, 'broccoli chicken casserole i'),
 (235, 'chicken a la king i'),
 (254, 'chicken cordon bleu i'),
 (258, 'chicken enchiladas i'),
 (277, 'chicken noodle casserole i'),
 (283, 'chicken pot pie i'),
 (293, 'chicken spaghetti casserole i'),
 (306, 'chili dog casserole i'),
 (310, 'chinese chicken fried rice i'),
 (374, 'easy cheesy chicken i'),
 (387, 'easy lasagna i'),
 (404, 'eggplant parmesan i'),
 (437, 'fried rice i'),
 (460, 'gnocchi i'),
 (488, 'grilled salmon i'),
 (543, 'italian style meatloaf i'),
 (559, 'king ranch chicken casserole i'),
 (614, 'mexican chicken i'),
 (661, 'parmesan chicken i'),
 (693, 'pizza on the grill i'),
 (722, 'puttanesca i'),
 (757, 'salmon patties i'),
 (815, 'slow cooker beef stroganoff i'),
 (870, 'spaghetti squash i'),
 (895, 'stuffed gr

In [5]:
#get the indices of replicate recipes to remove (but keep the min/max calorie replicates)
#NOTE(review): replicates are found with a substring test, so a short base name
#such as 'fried rice' (from 'fried rice i') also groups recipes like
#'chinese chicken fried rice i' -- confirm that this breadth is intended
db_indices_to_remv = []
versions = [firstversions,secondversions,thirdversions,fourthversions,fifthversions,sixthversions]
recipenames = db['recipename'].tolist() #read names once; db.iloc[num] per comparison builds a whole row each time
calories = db['Calories']
removal_by_base = {} #base name -> positions to remove, so each base name is scanned only once
for ver in versions:
    for repl in ver:
        #base name = the recipe name without its last word (normally the version numeral)
        basename = ' '.join(recipenames[repl[0]].split(' ')[:-1])
        if not basename:
            #a one-word name leaves an empty base, and '' is contained in every
            #name: it would flag almost the whole table as replicates
            continue
        if basename not in removal_by_base:
            #array of positions of all recipes whose name contains the base name
            replind = np.array([num for num, name in enumerate(recipenames) if basename in name])
            replcalories = calories.iloc[replind].reset_index(drop=True)
            #positions (within the group) of the lowest and highest calorie recipes
            keepind = [replcalories.idxmin(),replcalories.idxmax()]
            removeind = [x for x in range(0,len(replind)) if x not in keepind]
            removal_by_base[basename] = replind[removeind]
        #one entry per versioned recipe; duplicates are collapsed later with np.unique
        db_indices_to_remv.append(removal_by_base[basename])

In [6]:
#merge the per-recipe removal arrays into one sorted array of unique row labels
if len(db_indices_to_remv) > 0:
    db_indices_to_remv = np.unique(np.concatenate(db_indices_to_remv))
else:
    #np.concatenate raises ValueError on an empty list; no versioned recipes means nothing to remove
    db_indices_to_remv = np.array([], dtype=int)
db_repl_removed = db.drop(db_indices_to_remv)
print('number of replicates to remove:',len(db_indices_to_remv))
print('number of samples after repls removed:',len(db_repl_removed))
#continue with the reduced table, rows renumbered 0..n-1
db = db_repl_removed
db = db.reset_index(drop=True)

number of replicates to remove: 620
number of samples after repls removed: 5900


In [7]:
#preview the first five rows of the current table
db.head(n=5)

Unnamed: 0,recipename,Total Fat,Saturated Fat,Cholesterol,Sodium,Potassium,Total Carbohydrates,Dietary Fiber,Protein,Sugars,Vitamin A,Vitamin C,Calcium,Iron,Thiamin,Niacin,Vitamin B6,Magnesium,Folate,Calories
0,a good easy garlic chicken,10.1g,6.0g,91mg,368mg,329mg,1.7g,0.2g,27.6g,1g,352IU,2mg,23mg,1mg,0mg,19mg,1mg,35mg,6mcg,214
1,a jerky chicken,2.7g,1.0g,68mg,982mg,385mg,13.5g,0.3g,28.5g,10g,60IU,4mg,32mg,1mg,0mg,19mg,1mg,43mg,8mcg,197
2,a simply perfect roast turkey,33.8g,10.0g,211mg,710mg,794mg,13.7g,0.9g,72.2g,2g,320IU,4mg,82mg,5mg,0mg,26mg,1mg,62mg,28mcg,663
3,absolute best liver and onions,20.7g,11.0g,578mg,309mg,939mg,74.2g,4.4g,48.9g,11g,37510IU,13mg,165mg,12mg,1mg,39mg,2mg,71mg,518mcg,687
4,acapulco chicken,13.9g,3.0g,72mg,635mg,742mg,23.8g,4.9g,30.1g,6g,2269IU,86mg,64mg,3mg,0mg,19mg,1mg,59mg,42mcg,333


In [8]:
#convert nutrient values to floats and strip the unit, add predicted health score, convert to %daily vals
#(the final values are fractions of the daily value: 1.0 means 100%)

def _to_number(series, unit):
    """Drop the trailing unit from every string in `series` and return floats.

    series: pandas Series of strings such as '10.1g' or '352IU'.
    unit: the unit suffix the values carry; only its length is used -- the
        last len(unit) characters are cut off before calling float().
    Raises ValueError (from float) when what is left is not a number.
    """
    return series.apply(lambda x: float(x[:-len(unit)]))

#entries without a usable number are set to zero before parsing
for col in ['Sodium', 'Cholesterol']:
    ind = db[col].apply(lambda x: '< 1' in x)
    db.loc[ind,col] = '0mg'
ind = db['Sugars'].apply(lambda x: 'g' not in x)
db.loc[ind,'Sugars'] = '0g'

#vitamins/minerals: strip the unit and convert straight to a fraction of the daily value
micro_units = {'Vitamin A': 'IU', 'Vitamin C': 'mg', 'Calcium': 'mg', 'Iron': 'mg', 'Thiamin': 'mg',
               'Niacin': 'mg', 'Vitamin B6': 'mg', 'Magnesium': 'mg', 'Folate': 'mcg'}
for col, unit in micro_units.items():
    db[col] = _to_number(db[col], unit)/dailyvals[col]

#remaining nutrients: strip the unit only, the health score below uses the raw amounts
macro_units = {'Total Fat': 'g', 'Saturated Fat': 'g', 'Cholesterol': 'mg', 'Sodium': 'mg', 'Potassium': 'mg',
               'Total Carbohydrates': 'g', 'Dietary Fiber': 'g', 'Protein': 'g', 'Sugars': 'g'}
for col, unit in macro_units.items():
    db[col] = _to_number(db[col], unit)

#Health Score from Martin et al 2009
#NOTE(review): at this point Vitamin A/C, Calcium and Iron are fractions of the
#daily value (1.0 == 100%) while the other terms are raw amounts -- confirm the
#coefficients and expected scales against the cited paper
db['Health Score'] = (0.710-0.0538*db['Total Fat']-0.423*db['Saturated Fat']-0.00398*db['Cholesterol']-
0.00254*db['Sodium']-0.03*db['Total Carbohydrates']+0.0561*db['Dietary Fiber']-
0.0245*db['Sugars']+0.123*db['Protein']+0.00562*db['Vitamin A']+0.0137*db['Vitamin C']+
0.0685*db['Calcium']-0.0186*db['Iron'])

#now that the score is computed, convert the raw amounts to fractions of the daily value too
for col in macro_units:
    db[col] = db[col]/dailyvals[col]

db.head()

Unnamed: 0,recipename,Total Fat,Saturated Fat,Cholesterol,Sodium,Potassium,Total Carbohydrates,Dietary Fiber,Protein,Sugars,...,Vitamin C,Calcium,Iron,Thiamin,Niacin,Vitamin B6,Magnesium,Folate,Calories,Health Score
0,a good easy garlic chicken,0.155385,0.3,0.303333,0.153333,0.094,0.005667,0.008,0.552,0.031746,...,0.033333,0.023,0.055556,0.0,0.95,0.5,0.0875,0.015,214,-0.336366
1,a jerky chicken,0.041538,0.05,0.226667,0.409167,0.11,0.045,0.012,0.57,0.31746,...,0.066667,0.032,0.055556,0.0,0.95,0.5,0.1075,0.02,197,0.251289
2,a simply perfect roast turkey,0.52,0.5,0.703333,0.295833,0.226857,0.045667,0.036,1.444,0.063492,...,0.066667,0.082,0.277778,0.0,1.3,0.5,0.155,0.07,663,0.491193
3,absolute best liver and onions,0.318462,0.55,1.926667,0.12875,0.268286,0.247333,0.176,0.978,0.349206,...,0.216667,0.165,0.666667,0.666667,1.95,1.0,0.1775,1.295,687,-4.331888
4,acapulco chicken,0.213846,0.15,0.24,0.264583,0.212,0.079333,0.196,0.602,0.190476,...,1.433333,0.064,0.166667,0.0,0.95,0.5,0.1475,0.105,333,-0.066619


In [10]:
#combine nutrition data with reviews, links information
db = db.set_index('recipename')
#remove outliers: keep only rows where every column lies within 3 standard deviations of its mean
within_3_sd = (np.abs(stats.zscore(db))<3).all(axis=1)
db = db[within_3_sd]

ratings = (
    pd.read_csv('allrecipes_db_all.csv')
    .drop(columns=['Unnamed: 0'])
    .set_index('recipename')
)
#remove duplicated recipes, keeping the first occurrence of each name
ratings = ratings.loc[~ratings.index.duplicated(keep='first')]

#transform average review using lower confidence bound of binomial proportion:
#rescale the rating to 0-1, subtract sqrt(p*(1-p)/n), then scale back to 0-5
proportion = ratings['ratings']/5
spread = (proportion*(1-proportion)/ratings['numreviews']).apply(math.sqrt)
ratings['ratings'] = (proportion - spread)*5

#left join keeps every nutrition row, whether or not it has a ratings entry
nutr_ratings = (
    db.join(ratings,on='recipename',how='left')
    .reset_index('recipename')
    .drop_duplicates('recipename')
)
len(nutr_ratings)

#uncomment to save the combined table:
# nutr_ratings.to_csv('allrecipes_nutr_ratingsdb.csv')

4801

In [15]:
#preview the first five rows of the combined nutrition + ratings table
nutr_ratings.head(n=5)

Unnamed: 0,recipename,Total Fat,Saturated Fat,Cholesterol,Sodium,Potassium,Total Carbohydrates,Dietary Fiber,Protein,Sugars,...,Thiamin,Niacin,Vitamin B6,Magnesium,Folate,Calories,Health Score,ratings,numreviews,links
0,a good easy garlic chicken,0.155385,0.3,0.303333,0.153333,0.094,0.005667,0.008,0.552,0.031746,...,0.0,0.95,0.5,0.0875,0.015,214,-0.336366,4.289474,1122,https://www.allrecipes.com/recipe/23998/a-good...
1,a jerky chicken,0.041538,0.05,0.226667,0.409167,0.11,0.045,0.012,0.57,0.31746,...,0.0,0.95,0.5,0.1075,0.02,197,0.251289,4.477107,292,https://www.allrecipes.com/recipe/50726/a-jerk...
2,acapulco chicken,0.213846,0.15,0.24,0.264583,0.212,0.079333,0.196,0.602,0.190476,...,0.0,0.95,0.5,0.1475,0.105,333,-0.066619,4.207489,283,https://www.allrecipes.com/recipe/19123/acapul...
3,actually delicious turkey burgers,0.146154,0.15,0.3,0.1475,0.083429,0.007667,0.008,0.418,0.0,...,0.0,0.4,0.0,0.06,0.035,183,0.185556,4.453058,1678,https://www.allrecipes.com/recipe/39748/actual...
4,adobo chicken with ginger,0.527692,0.5,0.566667,0.567917,0.150286,0.017667,0.02,0.884,0.0,...,0.0,1.2,0.5,0.1475,0.0425,517,-4.196161,4.344913,275,https://www.allrecipes.com/recipe/28363/adobo-...
