In [1]:
import pandas as pd
import numpy as np
import copy
from geopy.geocoders import Nominatim
from geopy.distance import vincenty

In [2]:
def _read_table(csv_path):
    """Read one CSV file and discard the rows in which every field is missing."""
    return pd.read_csv(csv_path).dropna(how='all')


# Raw inputs: the eat-in meal table, the eat-out meal table and the profiles.
eat_in = _read_table('data/eat_in.csv')
eat_out = _read_table('data/eat_out.csv')
profiles = _read_table('data/profiles.csv')

In [3]:
# Convert to metric
KG_PER_LB = 0.453592
CM_PER_INCH = 2.54


def _split_preferences(raw):
    """Lower-case an underscore-separated preference string and split it into a list."""
    return raw.lower().split('_')


def _parse_location(raw):
    """Drop the first and last character, then read the comma-separated floats as a tuple."""
    return tuple(float(part) for part in raw[1:-1].split(','))


profiles['weight'] = profiles['weight'].mul(KG_PER_LB)
profiles['height'] = profiles['height'].mul(CM_PER_INCH)
profiles['preferences'] = profiles['preferences'].apply(_split_preferences)
profiles['location'] = profiles['location'].apply(_parse_location)
profiles.index.name = 'id'

In [4]:
def _price_to_float(text):
    """Parse a price string, skipping its first character (presumably a currency symbol)."""
    return float(text[1:])


# Normalise the column names of both meal tables to lower case.
eat_out.columns = [name.lower() for name in eat_out.columns]
eat_in.columns = [name.lower() for name in eat_in.columns]

# Eat-out meals carry a single price.
eat_out['price'] = eat_out['price'].apply(_price_to_float)

# Eat-in meals carry a price range; 'price' is the midpoint of that range.
for price_col in ('price low', 'price high'):
    eat_in[price_col] = eat_in[price_col].apply(_price_to_float)
eat_in['price'] = eat_in[['price low', 'price high']].mean(axis=1)

In [5]:
def fill_missing_profile(series):
    """Fill missing age, weight and height in one profile row with defaults.

    Missing means NaN/None as detected by ``pd.isnull``.  Age defaults to 30.
    Weight and height default to sex-specific values for 'M' and 'F'; every
    other value of ``sex`` (missing, or any other label) uses the generic
    defaults, so a profile never leaves this function with a missing weight
    or height.  The numbers are in the same units as the 'weight' and
    'height' fields of the row (the notebook converts those to kg / cm).

    Parameters
    ----------
    series : pandas.Series or dict-like
        Profile row with the keys 'age', 'sex', 'weight' and 'height'.

    Returns
    -------
    The same object that was passed in, updated in place.
    """
    default_age = 30
    defaults_by_sex = {
        'M': {'weight': 88.3, 'height': 176.4},
        'F': {'weight': 74.7, 'height': 162.9},
    }
    generic_defaults = {'weight': 81.5, 'height': 169.7}

    if pd.isnull(series['age']):
        series['age'] = default_age

    # str() mirrors the original comparison, so 'M' / 'F' match exactly as
    # before; anything else (including NaN) falls back to the generic values.
    defaults = defaults_by_sex.get(str(series['sex']), generic_defaults)
    for field, fallback in defaults.items():
        if pd.isnull(series[field]):
            series[field] = fallback

    return series

In [6]:
def get_bmr(series):
    """Estimate daily energy needs for one profile.

    The base value is 10*weight + 6.25*height - 5*age plus a sex-dependent
    constant (+5 for 'M', -161 for 'F', -78 otherwise); these coefficients
    match the Mifflin-St Jeor equation.  The base is then scaled by a factor
    chosen from 'activity level' (1.3 when the level is not recognised) and
    rounded to two decimals.

    Parameters
    ----------
    series : mapping
        Must provide 'sex', 'weight', 'height', 'age' and 'activity level'.

    Returns
    -------
    float
        Activity-adjusted energy estimate, rounded to 2 decimals.
    """
    sex_offsets = {'M': 5, 'F': -161}
    activity_factors = {
        'little to no': 1.2,
        'light': 1.375,
        'moderate': 1.55,
        'heavy': 1.725,
        'very heavy': 1.9,
    }

    base = (10 * series['weight']) + (6.25 * series['height']) - (5 * series['age'])
    energy = base + sex_offsets.get(series['sex'], -78)
    energy *= activity_factors.get(series['activity level'], 1.3)

    return round(energy, 2)

In [7]:
def get_daily_macros(series,bmr,grams=True):
    """Split a daily energy budget into protein, fat and carbohydrate.

    Protein is 2.2 * weight grams and fat is 0.4 * 2.2 * weight grams.
    Protein and carbohydrate are counted at 4 kcal per gram, fat at 9 kcal
    per gram; carbohydrate receives whatever energy is left of ``bmr``.
    Every intermediate value is rounded to two decimals.

    Parameters
    ----------
    series : mapping
        Must provide 'weight'.
    bmr : float
        Daily energy budget in kcal.
    grams : bool, default True
        Return grams when True, kcal when False.

    Returns
    -------
    tuple
        (protein, fat, carb) in the requested unit.
    """
    weight = series['weight']

    protein_g = round(2.2 * weight, 2)
    fat_g = round(0.4 * 2.2 * weight, 2)

    protein_kcal = round(4 * protein_g, 2)
    fat_kcal = round(9 * fat_g, 2)
    carb_kcal = round(bmr - protein_kcal - fat_kcal, 2)

    if not grams:
        return protein_kcal, fat_kcal, carb_kcal

    carb_g = round(carb_kcal / 4, 2)
    return protein_g, fat_g, carb_g

In [8]:
def get_nutrition(df):
    """Add daily 'calories', 'protein', 'fat' and 'carb' targets to every profile.

    Each row is passed through ``fill_missing_profile``, ``get_bmr`` and
    ``get_daily_macros`` (in grams).  Only the four target columns are
    written back; the filled-in defaults themselves are not stored.

    Parameters
    ----------
    df : pandas.DataFrame
        Profile table; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame, with the four target columns added.
    """
    target_cols = ('calories', 'protein', 'fat', 'carb')
    for name in target_cols:
        df[name] = np.nan

    for row_id, profile in df.iterrows():
        completed = fill_missing_profile(profile)
        energy = get_bmr(completed)
        targets = (energy,) + tuple(get_daily_macros(completed, energy, grams=True))

        for name, value in zip(target_cols, targets):
            df.loc[row_id, name] = value

    return df

In [9]:
# Attach daily calorie / protein / fat / carb targets to every profile.
# get_nutrition adds the columns to the frame it is given and returns it.
profiles = get_nutrition(profiles)

In [10]:
def get_preference_list():
    """Return the food-preference labels used for matching, in a fixed order.

    A fresh list is built on every call, so callers may modify the result.
    """
    labels = (
        'fast food',
        'mexican',
        'thai',
        'indian',
        'chinese',
        'japanese',
        'caribbean',
        'mediterranean',
        'vegetarian',
        'vegan',
        'sandwiches',
        'american',
        'meat',
        'seafood',
    )
    return list(labels)

def get_preference_signal(series,eat_df):
    """Score how poorly each meal matches a client's food preferences.

    Every label from ``get_preference_list`` gets a profile value of 1.0
    when it appears in ``series['preferences']`` and 0.4 otherwise.  A meal's
    raw loss is the sum over labels of |profile value - meal flag|, so:

    * a stated preference the meal lacks costs 1.0,
    * a meal type the client did not ask for costs 0.6,
    * a label absent from both still costs 0.4.

    Profile preferences that are not in the label list are ignored.  The raw
    loss is min-max scaled to [0, 1]; when all meals have the same raw loss
    the result is all zeros rather than 0/0 = NaN.

    Parameters
    ----------
    series : mapping
        Profile row; 'preferences' must be an iterable of label strings.
    eat_df : pandas.DataFrame
        Meal table with one 0/1 column per preference label.

    Returns
    -------
    pandas.Series
        Float loss per meal, indexed like ``eat_df``.
    """
    preferences_lst = get_preference_list()
    stated = set(series['preferences'])

    profile_vector = pd.Series(
        [1.0 if label in stated else 0.4 for label in preferences_lst],
        index=preferences_lst,
    )

    # Frame - Series aligns the Series on the columns, i.e. one subtraction
    # per meal without a Python-level row loop.
    meal_flags = eat_df[preferences_lst].astype(int)
    loss = (meal_flags - profile_vector).abs().sum(axis=1).astype(float)

    shifted = loss - loss.min()
    spread = shifted.max()
    if not spread > 0:
        # No spread (or no meals at all): nothing to scale by.
        return shifted

    return shifted / spread

In [11]:
# Empty frame with one column per signal name.
# NOTE(review): eat_out_signals is not referenced by any other visible cell,
# and its 'preference' column is spelled differently from the 'preferences'
# key used by get_function / get_weights -- confirm whether it is still needed.
eat_out_signals = pd.DataFrame(columns=['preference','macros','distance','cost','sugar','sodium'])

In [12]:
# Short names used by the signal functions for the nutrient columns.
NUTRIENT_RENAMES = {
    'fat (g)' : 'fat',
    'carb (g)' : 'carb',
    'protein (g)' : 'protein',
    'sugar (g)' : 'sugar',
    'sodium (mg)' : 'sodium',
}


def _with_double_portions(meals, scaled_cols):
    """Build the 'x2' variant of a meal table.

    Returns ``(doubled, full)``: ``doubled`` is a copy of ``meals`` whose
    'meal title' has ' x2' appended and whose ``scaled_cols`` are multiplied
    by two; ``full`` is ``meals`` followed by ``doubled`` with a fresh
    0..n-1 index and the nutrient columns renamed to their short names.
    """
    doubled = meals.copy()
    doubled['meal title'] = doubled['meal title'] + ' x2'
    doubled[scaled_cols] = meals[scaled_cols] * 2

    # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
    full = pd.concat([meals, doubled], ignore_index=True)
    return doubled, full.rename(columns=NUTRIENT_RENAMES)


num_cols = [
    'price',
    'fat (g)',
    'carb (g)',
    'protein (g)',
    'sugar (g)',
    'sodium (mg)',
    'calories',
]

eat_out_double, eat_out_full = _with_double_portions(eat_out, num_cols)

# 'price' (the midpoint of 'price low' / 'price high') has to be doubled too;
# otherwise the x2 meals keep the single-portion price in the cost signal.
num_cols = [
    'price low',
    'price high',
    'price',
    'fat (g)',
    'carb (g)',
    'protein (g)',
    'sugar (g)',
    'sodium (mg)',
    'calories',
]

eat_in_double, eat_in_full = _with_double_portions(eat_in, num_cols)

In [13]:
def get_macros_signal(series,eat_df):
    """Score how far each meal is from one third of the client's daily targets.

    For 'calories', 'protein', 'fat' and 'carb' the absolute gap between the
    meal and a third of the profile's daily value is divided by the largest
    gap found for that column; the four scaled gaps are then averaged.

    Parameters
    ----------
    series : pandas.Series
        Profile row holding the four daily targets.
    eat_df : pandas.DataFrame
        Meal table holding the same four columns.

    Returns
    -------
    pandas.Series
        Mean scaled gap per meal, indexed like ``eat_df``.
    """
    macro_lst = ['calories','protein','fat','carb']

    # A row taken from a mixed-type profile table is object-typed; force
    # floats so the arithmetic below stays numeric.
    ideal_macros = series[macro_lst].astype(float) / 3

    loss = (eat_df[macro_lst] - ideal_macros).abs()
    norm_loss = loss / loss.max()

    return norm_loss.mean(axis=1)

In [14]:
def get_coords(eat_df,coord_dict):
    """Add a 'coordinates' column of (latitude, longitude) float tuples.

    Each row's 'location' is looked up in ``coord_dict``.  The column is
    built in one pass, without scratch 'latitude' / 'longitude' columns, so
    columns of those names that already exist in ``eat_df`` are left alone.

    Parameters
    ----------
    eat_df : pandas.DataFrame
        Meal table with a 'location' column; modified in place.
    coord_dict : mapping
        Maps each location value to a (latitude, longitude) pair.

    Returns
    -------
    pandas.DataFrame
        The same frame, with 'coordinates' added or overwritten.

    Raises
    ------
    KeyError
        If a location is missing from ``coord_dict``.
    """
    coordinates = []
    for place in eat_df['location']:
        latitude, longitude = coord_dict[place]
        coordinates.append((float(latitude), float(longitude)))

    eat_df['coordinates'] = coordinates

    return eat_df

# Geocoding results shared by all calls, so each address is resolved over the
# network only once instead of once per client.
_GEOCODE_CACHE = {}


def get_distance_signal(series,eat_df):
    """Score each meal by its distance from the client, scaled to at most 1.

    The unique values of ``eat_df['location']`` are geocoded with Nominatim
    (results are cached in ``_GEOCODE_CACHE``), the distance in miles between
    each meal and ``series['location']`` is computed, and every distance is
    divided by the largest one.

    Parameters
    ----------
    series : mapping
        Profile row; 'location' must be a (latitude, longitude) pair.
    eat_df : pandas.DataFrame
        Meal table with a 'location' column.  ``get_coords`` adds a
        'coordinates' column to this frame as a side effect.

    Returns
    -------
    pandas.Series
        Float distance score per meal, indexed like ``eat_df``.

    Raises
    ------
    ValueError
        If the geocoder finds no match for a location.
    """
    pending = [place for place in eat_df['location'].unique()
               if place not in _GEOCODE_CACHE]
    if pending:
        geolocator = Nominatim()
        for place in pending:
            match = geolocator.geocode(place)
            if match is None:
                raise ValueError('Could not geocode location: {!r}'.format(place))
            # The last element of a geopy Location is its (lat, lon) pair.
            _GEOCODE_CACHE[place] = match[-1]

    eat_df = get_coords(eat_df,_GEOCODE_CACHE)

    client_location = series['location']
    distances = pd.Series(
        [vincenty(coords, client_location).miles for coords in eat_df['coordinates']],
        index=eat_df.index,
        dtype=float,
    )

    return distances / distances.max()

In [15]:
def get_cost_signal(series,eat_df):
    """Return every meal's 'price' as a fraction of the highest price.

    ``series`` (the client profile) is part of the common signal signature
    but is not used here.
    """
    prices = eat_df['price']
    return prices / prices.max()

In [16]:
def get_sugar_signal(series,eat_df):
    """Return every meal's 'sugar' as a fraction of the highest sugar value.

    ``series`` (the client profile) is part of the common signal signature
    but is not used here.
    """
    sugar = eat_df['sugar']
    return sugar / sugar.max()

In [21]:
def get_sodium_signal(series,eat_df):
    """Return every meal's 'sodium' as a fraction of the highest sodium value.

    ``series`` (the client profile) is part of the common signal signature
    but is not used here.
    """
    sodium = eat_df['sodium']
    return sodium / sodium.max()

In [27]:
def get_function():
    """Return the mapping from signal name to the function that computes it.

    Every function takes ``(series, eat_df)`` and returns one loss value per
    meal.  The keys are the same as those of ``get_weights``.
    """
    signal_functions = [
        ('preferences', get_preference_signal),
        ('macros', get_macros_signal),
        ('distance', get_distance_signal),
        ('cost', get_cost_signal),
        ('sugar', get_sugar_signal),
        ('sodium', get_sodium_signal),
    ]
    return dict(signal_functions)

def get_weights():
    """Return the relative weight of every signal in the combined loss.

    The keys are the same as those of ``get_function``.  ``get_loss``
    divides each weight by the total, so the values need not sum to one.
    """
    signal_weights = [
        ('preferences', 0.25),
        ('macros', 0.3),
        ('distance', 0.15),
        ('cost', 0.2),
        ('sugar', 0.05),
        ('sodium', 0.05),
    ]
    return dict(signal_weights)

def get_loss(series,eat_df):
    """Combine all signals into one weighted loss per meal for one client.

    The weights from ``get_weights`` are divided by their total, then each
    signal from ``get_function`` is multiplied by its share and summed.

    Parameters
    ----------
    series : mapping
        Profile row, passed unchanged to every signal function.
    eat_df : pandas.DataFrame
        Meal table, passed unchanged to every signal function.

    Returns
    -------
    pandas.Series
        Float loss per meal, indexed like ``eat_df``.
    """
    weights = get_weights()
    function_dict = get_function()

    sum_weights = sum(weights.values())

    # Start from a float accumulator: writing float signals into an integer
    # Series depends on an implicit upcast that newer pandas deprecates.
    loss = pd.Series(0.0,index=eat_df.index)

    for signal, signal_function in function_dict.items():
        share = weights[signal] / sum_weights
        loss = loss + share * signal_function(series,eat_df)

    return loss

In [36]:
def getLoss(profile_df,eat_df):
    """Build the client x meal loss matrix.

    Row ``i`` holds ``get_loss`` for profile ``i`` against every meal.

    The axes are renamed copies of the input indexes.  Setting ``.name`` on
    the result's axes directly could also rename ``profile_df.index`` and
    ``eat_df.index`` when pandas reuses the Index objects passed to the
    constructor; ``Index.rename`` returns a new object and avoids that.

    Parameters
    ----------
    profile_df : pandas.DataFrame
        One row per client.
    eat_df : pandas.DataFrame
        One row per meal.

    Returns
    -------
    pandas.DataFrame
        Float frame with index 'client_id' and columns 'meal_id'.
    """
    loss_df = pd.DataFrame(
        index=profile_df.index.rename('client_id'),
        columns=eat_df.index.rename('meal_id'),
        dtype=float,
    )

    for client_id, profile in profile_df.iterrows():
        loss_df.loc[client_id,:] = get_loss(profile,eat_df)

    return loss_df

In [37]:
# Client x meal loss matrices for the eat-in and eat-out meal tables.
# NOTE(review): neither result is referenced by a later visible cell --
# topnLoss calls getLoss itself -- so this cell repeats that work.
eat_in_loss = getLoss(profiles,eat_in_full)
eat_out_loss = getLoss(profiles,eat_out_full)

In [117]:
def topnLoss(profile_df,eat_df,n=5):
    """Report the ``n`` lowest-loss meals for every client.

    The loss matrix is computed with ``getLoss``; for each client the ``n``
    smallest losses are selected and joined with the client's name and the
    meal's details.  A client gets fewer than ``n`` rows when fewer meals
    have a usable (non-NaN) loss, instead of the function failing.

    Parameters
    ----------
    profile_df : pandas.DataFrame
        Client table with 'name_first' and 'name_last' columns.
    eat_df : pandas.DataFrame
        Meal table containing every column listed in ``eat_cols``.
    n : int, default 5
        Maximum number of meals to report per client.

    Returns
    -------
    pandas.DataFrame
        One row per (client, recommended meal), ordered by client and then
        by increasing loss, with a 0..k-1 index.  Columns: the profile
        columns, 'client_id', 'meal_id', 'loss' (rounded to 2 decimals) and
        the meal columns.
    """
    eat_cols = [
        'resaurant',  # spelled as the column appears in the meal tables
        'location',
        'foodstamps',
        'delivery',
        'meal title',
        'price',
        'fat',
        'carb',
        'protein',
        'sugar',
        'sodium',
        'calories',
    ]

    profile_cols = ['name_first', 'name_last']
    report_cols = profile_cols + ['client_id','meal_id','loss'] + eat_cols

    loss_matrix = getLoss(profile_df,eat_df)

    sections = []
    for profile_id in profile_df.index:
        print('Client ID:',profile_id)

        topn = loss_matrix.loc[profile_id].astype(float).nsmallest(n=n)
        topn_ids = topn.index.tolist()

        section = eat_df.loc[topn_ids,eat_cols].reset_index(drop=True)
        section['client_id'] = profile_id
        section['meal_id'] = topn_ids
        section['loss'] = topn.round(2).values
        for col in profile_cols:
            section[col] = profile_df.loc[profile_id,col]

        sections.append(section[report_cols])

    if not sections:
        return pd.DataFrame(columns=report_cols)

    return pd.concat(sections,ignore_index=True)

In [123]:
# Five lowest-loss eat-out meals per client, written to CSV.
# topnLoss prints one 'Client ID:' line per profile while it runs.
report_out = topnLoss(profiles,eat_out_full,n=5)
report_out.to_csv('top_recs_out.csv')

Client ID: 0
Client ID: 1


In [121]:
# Five lowest-loss eat-in meals per client, written to CSV.
# topnLoss prints one 'Client ID:' line per profile while it runs.
report_in = topnLoss(profiles,eat_in_full,n=5)
report_in.to_csv('top_recs_in.csv')

Client ID: 0
Client ID: 1
