In [90]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors
import warnings
warnings.filterwarnings('ignore', module='sklearn')

In [3]:

df = pd.read_csv('data/Nutritions_US.csv', encoding='latin1')


In [6]:
categorical_columns = df.select_dtypes(include=['object', 'category']).columns
numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns
print('categorical_columns')
print(categorical_columns )
print("=" * 20)
print('numerical_columns')
print(numerical_columns)


categorical_columns
Index(['Shrt_Desc', 'GmWt_Desc1', 'GmWt_Desc2'], dtype='object')
numerical_columns
Index(['NDB_No', 'Water_(g)', 'Energ_Kcal', 'Protein_(g)', 'Lipid_Tot_(g)',
       'Ash_(g)', 'Carbohydrt_(g)', 'Fiber_TD_(g)', 'Sugar_Tot_(g)',
       'Calcium_(mg)', 'Iron_(mg)', 'Magnesium_(mg)', 'Phosphorus_(mg)',
       'Potassium_(mg)', 'Sodium_(mg)', 'Zinc_(mg)', 'Copper_mg)',
       'Manganese_(mg)', 'Selenium_(¾g)', 'Vit_C_(mg)', 'Thiamin_(mg)',
       'Riboflavin_(mg)', 'Niacin_(mg)', 'Panto_Acid_mg)', 'Vit_B6_(mg)',
       'Folate_Tot_(¾g)', 'Folic_Acid_(¾g)', 'Food_Folate_(¾g)',
       'Folate_DFE_(¾g)', 'Choline_Tot_ (mg)', 'Vit_B12_(¾g)', 'Vit_A_IU',
       'Vit_A_RAE', 'Retinol_(¾g)', 'Alpha_Carot_(¾g)', 'Beta_Carot_(¾g)',
       'Beta_Crypt_(¾g)', 'Lycopene_(¾g)', 'Lut+Zea_ (¾g)', 'Vit_E_(mg)',
       'Vit_D_¾g', 'Vit_D_IU', 'Vit_K_(¾g)', 'FA_Sat_(g)', 'FA_Mono_(g)',
       'FA_Poly_(g)', 'Cholestrl_(mg)', 'GmWt_1', 'GmWt_2'],
      dtype='object')


In [7]:
# check categorical columns#
for c in categorical_columns:
    print("column : " +c)
    print("number of unique values: ", len(df[c].unique()))
    print("sample of unied values: ", df[c].unique()[:10])
    print("="*40)
    
print()
print("="*40)
print("All categorical columns are valid")
print("="*40)

column : Shrt_Desc
number of unique values:  8787
sample of unied values:  ['BUTTER,WITH SALT' 'BUTTER,WHIPPED,W/ SALT' 'BUTTER OIL,ANHYDROUS'
 'CHEESE,BLUE' 'CHEESE,BRICK' 'CHEESE,BRIE' 'CHEESE,CAMEMBERT'
 'CHEESE,CARAWAY' 'CHEESE,CHEDDAR' 'CHEESE,CHESHIRE']
column : GmWt_Desc1
number of unique values:  912
sample of unied values:  ['1 pat,  (1" sq, 1/3" high)' '1 tbsp' '1 oz' '1 cup, diced' '4 oz'
 '1 cup,  (not packed)' '1 cup, crumbled' '1 cup' '1 cup, shredded'
 '.5 cup']
column : GmWt_Desc2
number of unique values:  922
sample of unied values:  ['1 tbsp' '1 cup' '1 cubic inch' '1 cup, shredded' '1 cup, sliced' nan
 '1 cup, melted' '1 cup, large curd (not packed)' '1 cup,  (not packed)'
 '4 oz']

All categorical columns are valid


In [69]:
# df.to_csv('data/Nutritions_US_modified.csv', encoding='latin1', index = False, header=True)

In [230]:
class Meal:
    def __init__(self, df):
        self.df = df
        self.df_norm = df.copy()

        #dropping categorical columns
        self.df_norm = self.df_norm.drop(df.select_dtypes(include=['object', 'category']).columns,axis=1)
        
        # Normalization
        self.df_scalers = {}
        for c in self.df_norm:
            scaler = MinMaxScaler()
            scaler.fit(self.df_norm[[c]])
            self.df_scalers[c] = scaler
            self.df_norm[c]=scaler.transform(self.df_norm[[c]])


        self.display_columns = df.columns
        self.number_suggestions = 5


    def set_display_columns(display_columns):
        return display_columns

    def select_columns(self, dict_point_columns, df_target):
        temp_df = df_target[dict_point_columns].copy()
        temp_df = temp_df.dropna()
        return temp_df

    def scale_point_dict(self, dict_point, df_point_columns):
        point = {column:self.df_scalers[column].transform([[value]])[0][0] for column, value in dict_point.items()}
        return [point[column] for column in df_point_columns]

    def get_nearest_meal(self,dict_point, k=None,df_target=None):
        dict_point_columns = dict_point.keys()

        if k is None:
            k = self.number_suggestions
        
        if df_target is None:
            df_target = self.df_norm

        temp_df = self.select_columns(dict_point_columns, df_target)
        point = self.scale_point_dict(dict_point, temp_df.columns)
    
        model = NearestNeighbors(n_neighbors=k, algorithm='auto').fit(temp_df)

        distances, indices = model.kneighbors(np.array(point).reshape(1,-1))
        indices = temp_df.iloc[indices[0]].index
        nearest_points = df.iloc[indices]

        print(f"The nearest distances are:")
        display(distances)
        print(f"The nearest points to {point} are:")

        display(nearest_points[self.display_columns])

        return nearest_points[self.display_columns]
        

    
    def suggest_meals(self, ingredients, strict_search=False):
        ingredients = [i.lower() for i in ingredients]

        if strict_search:
            masks = [self.df["Shrt_Desc"].str.contains(word, case=False, na=False) for word in ingredients]
            df_target = self.df.copy()
            mask = masks[0]
            for m in masks[1:]:
                mask *= m
            
            df_target = df_target[mask]

        else:
            regex_pattern = '|'.join(ingredients)
            mask = self.df["Shrt_Desc"].str.contains(regex_pattern, na=False, case=False)
            df_target = self.df[mask]


        display(df_target)
        return df_target
    
    def suggest_meals_with_focus(self, ingredients, point, k, strict_search=False):
        if strict_search:
            masks = [self.df["Shrt_Desc"].str.contains(word, case=False, na=False) for word in ingredients]
            df_target = self.df.copy()
            mask = masks[0]
            for m in masks[1:]:
                mask *= m
            
            df_target = df_target[mask]
        else:
            regex_pattern = '|'.join(ingredients)
            mask = self.df["Shrt_Desc"].str.contains(regex_pattern, na=False, case=False)
            df_target = self.df_norm[mask].copy()

        if k > len(df_target.index):
            k = len(df_target.index)

        return self.get_nearest_meal(point, k, df_target)



In [231]:
engine = Meal(df)
rs1 = engine.get_nearest_meal({'Lipid_Tot_(g)':16,'Protein_(g)':26, "Calcium_(mg)":15}, k=4)
rs2 = engine.suggest_meals(['goat','CHEESE'],strict_search=True)
rs3 = engine.suggest_meals_with_focus(['Cheese'],{'Protein_(g)':30,'Energ_Kcal':500, "Sugar_Tot_(g)":1}, k=4, strict_search=True)

The nearest distances are:


array([[0.00222607, 0.00349893, 0.00356165, 0.00412741]])

The nearest points to [0.16, 0.2943840579710145, 0.0020369364475828354] are:


Unnamed: 0,NDB_No,Shrt_Desc,Water_(g),Energ_Kcal,Protein_(g),Lipid_Tot_(g),Ash_(g),Carbohydrt_(g),Fiber_TD_(g),Sugar_Tot_(g),...,Vit_D_IU,Vit_K_(¾g),FA_Sat_(g),FA_Mono_(g),FA_Poly_(g),Cholestrl_(mg),GmWt_1,GmWt_Desc1,GmWt_2,GmWt_Desc2
859,5037,"CHICKEN,BROILERS OR FRYERS,DK MEAT,MEAT&SKN,CK...",58.63,253,25.97,15.78,0.92,0.0,0.0,,...,,,4.37,6.19,3.49,91.0,101.0,"1 unit, (yield from 1 lb ready-to-cook chicken)",167.0,".5 chicken, bone removed"
7309,23229,"BEEF,RIB EYE STEK,BNLES,LIP OFF,LN & FAT,0"" FA...",57.42,248,26.29,15.9,1.05,0.0,0.0,0.0,...,5.0,1.6,7.226,7.786,0.81,77.0,85.0,3 oz,266.0,1 steak
7573,23502,"USDA COMMODITY,BF,GROUND BULK/COARSE GROUND,FR...",56.49,259,26.06,16.34,0.95,0.0,0.0,,...,,,5.744,7.504,0.62,89.0,28.35,1 oz,,
2642,10195,"PORK,FRSH,LOIN,CNTR RIB (CHOPS),BNLESS,LN&FAT,...",58.15,255,26.29,15.79,1.2,0.0,0.0,,...,,,6.12,7.21,1.32,73.0,85.0,3 oz,81.0,"1 chop, excluding refuse (yield from 1 raw cho..."


Unnamed: 0,NDB_No,Shrt_Desc,Water_(g),Energ_Kcal,Protein_(g),Lipid_Tot_(g),Ash_(g),Carbohydrt_(g),Fiber_TD_(g),Sugar_Tot_(g),...,Vit_D_IU,Vit_K_(¾g),FA_Sat_(g),FA_Mono_(g),FA_Poly_(g),Cholestrl_(mg),GmWt_1,GmWt_Desc1,GmWt_2,GmWt_Desc2
138,1156,"CHEESE,GOAT,HARD TYPE",29.01,452,30.52,35.59,3.72,2.17,0.0,2.17,...,26.0,3.0,24.609,8.117,0.845,105.0,28.35,1 oz,,
139,1157,"CHEESE,GOAT,SEMISOFT TYPE",45.52,364,21.58,29.84,2.94,0.12,0.0,0.12,...,22.0,2.5,20.639,6.808,0.709,79.0,28.35,1 oz,,
140,1159,"CHEESE,GOAT,SOFT TYPE",60.75,264,18.52,21.08,1.58,0.0,0.0,0.0,...,15.0,1.8,14.575,4.807,0.501,46.0,28.35,1 oz,,


The nearest distances are:


array([[264.07224145, 364.06582255, 452.45852431]])

The nearest points to [0.3396739130434783, 0.5543237250554324, 0.01002004008016032] are:


Unnamed: 0,NDB_No,Shrt_Desc,Water_(g),Energ_Kcal,Protein_(g),Lipid_Tot_(g),Ash_(g),Carbohydrt_(g),Fiber_TD_(g),Sugar_Tot_(g),...,Vit_D_IU,Vit_K_(¾g),FA_Sat_(g),FA_Mono_(g),FA_Poly_(g),Cholestrl_(mg),GmWt_1,GmWt_Desc1,GmWt_2,GmWt_Desc2
140,1159,"CHEESE,GOAT,SOFT TYPE",60.75,264,18.52,21.08,1.58,0.0,0.0,0.0,...,15.0,1.8,14.575,4.807,0.501,46.0,28.35,1 oz,,
139,1157,"CHEESE,GOAT,SEMISOFT TYPE",45.52,364,21.58,29.84,2.94,0.12,0.0,0.12,...,22.0,2.5,20.639,6.808,0.709,79.0,28.35,1 oz,,
138,1156,"CHEESE,GOAT,HARD TYPE",29.01,452,30.52,35.59,3.72,2.17,0.0,2.17,...,26.0,3.0,24.609,8.117,0.845,105.0,28.35,1 oz,,
