In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
from xgboost import plot_importance
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import roc_curve, auc


In [3]:
#Defining the characteristics for a liquid
#
nutrients_dataset = "Food And Nutrient Database For Dietary Studies/2021-2023 FNDDS At A Glance - FNDDS Nutrient Values.xlsx"
df = pd.read_excel(nutrients_dataset, sheet_name=0, header=1)

# keywords = ['milk', 'dips', 'spread', 'cream', 'butter', 'sauces', 'dressing', 'ice cream', 'yogurt'
#             , 'formula', 'drinks', 'liquor', 'tea', 'coffee', 'milk', 'water']
keywords = ['milk', 'dips', 'spread', 'cream', 'butter', 'sauces', 'dressing', 'ice cream', 'yogurt', 'formula', 'drinks', 'liquor', 'tea', 'coffee', 'milk', 'water', 'juice', 'soups', 'beverages', 'wine', 'beer']
            

df['Is Liquid'] = df['WWEIA Category description'].apply(lambda x: 1 if any(keyword in str(x).lower() for keyword in keywords) else 0)

In [4]:
df.head()

Unnamed: 0,Food code,Main food description,WWEIA Category number,WWEIA Category description,Energy (kcal),Protein (g),Carbohydrate (g),"Sugars, total\n(g)","Fiber, total dietary (g)",Total Fat (g),...,22:1\n(g),18:2\n(g),18:3\n(g),18:4\n(g),20:4\n(g),20:5 n-3\n(g),22:5 n-3\n(g),22:6 n-3\n(g),Water\n(g),Is Liquid
0,11100000,"Milk, NFS",1004,"Milk, reduced fat",52,3.33,4.83,4.88,0.0,2.14,...,0.0,0.074,0.008,0.0,0.003,0.0,0.001,0.0,88.92,1
1,11111000,"Milk, whole",1002,"Milk, whole",61,3.27,4.63,4.81,0.0,3.2,...,0.0,0.115,0.013,0.0,0.004,0.001,0.002,0.0,88.1,1
2,11112110,"Milk, reduced fat (2%)",1004,"Milk, reduced fat",50,3.36,4.9,4.89,0.0,1.9,...,0.0,0.061,0.007,0.0,0.003,0.0,0.001,0.0,89.1,1
3,11112210,"Milk, low fat (1%)",1006,"Milk, lowfat",43,3.38,5.18,4.96,0.0,0.95,...,0.0,0.033,0.004,0.0,0.001,0.0,0.0,0.0,89.7,1
4,11113000,"Milk, fat free (skim)",1008,"Milk, nonfat",34,3.43,4.92,5.05,0.0,0.08,...,0.0,0.005,0.0,0.0,0.0,0.0,0.0,0.0,90.8,1


In [5]:
df['Is Liquid'].value_counts()

Is Liquid
0    4472
1     959
Name: count, dtype: int64

Nutritional Density Score - Solids using the NRF9.3 Model

In [6]:
# alt method to display the top 10 foods in df_solid by NRF9.3 Score


def calculate_nrf9_3(df):
    """
    Calculate the NRF9.3 score for a DataFrame of food items, normalizing nutrients per 100 kcal.

 """
  

    beneficial_nutrients = [
        "Protein (g)",
        "Fiber, total dietary (g)",
        "Vitamin A, RAE (mcg_RAE)",
        "Vitamin C (mg)",
        "Vitamin E (alpha-tocopherol) (mg)",
        "Calcium (mg)",
        "Iron\n(mg)",
        "Magnesium (mg)",
        "Potassium (mg)",
    ]

    limiting_nutrients = [
        "Fatty acids, total saturated (g)",
        "Sugars, total\n(g)",
        "Sodium (mg)",
    ]

    # Normalize nutrient values per 100 kcal
    for nutrient in beneficial_nutrients + limiting_nutrients:
        df[nutrient + "_per_100kcal"] = (df[nutrient] / df["Energy (kcal)"]) * 100

    # Calculate NRF9.3
    df["NRF9.3"] = df[[n + "_per_100kcal" for n in beneficial_nutrients]].sum(
        axis=1
    ) - df[[n + "_per_100kcal" for n in limiting_nutrients]].sum(axis=1)

    return df


#drop all rows that are liquid = 1
# df_solid = df[df['Is Liquid'] == 0]


df_NR9b = calculate_nrf9_3(df)
# df_NR9b = calculate_nrf9_3(df_solid)
print(df_NR9b[["Food code", "Main food description", "NRF9.3"]])

      Food code                 Main food description       NRF9.3
0      11100000                             Milk, NFS   594.867308
1      11111000                           Milk, whole   451.885246
2      11112110                Milk, reduced fat (2%)   677.180000
3      11112210                    Milk, low fat (1%)   729.934884
4      11113000                 Milk, fat free (skim)   977.444118
...         ...                                   ...          ...
5426   99997810     Vegetables as ingredient in curry   842.752830
5427   99997815     Vegetables as ingredient in soups   836.560000
5428   99997820     Vegetables as ingredient in stews  1027.135593
5429   99998130     Sauce as ingredient in hamburgers  -225.313971
5430   99998210  Industrial oil as ingredient in food    -2.481166

[5431 rows x 3 columns]


In [7]:
df['Is Liquid'].value_counts()

Is Liquid
0    4472
1     959
Name: count, dtype: int64

In [8]:
df_NR9b['Is Liquid'].value_counts()

Is Liquid
0    4472
1     959
Name: count, dtype: int64

In [9]:
df_NR9b.nlargest(10, 'NRF9.3')

Unnamed: 0,Food code,Main food description,WWEIA Category number,WWEIA Category description,Energy (kcal),Protein (g),Carbohydrate (g),"Sugars, total\n(g)","Fiber, total dietary (g)",Total Fat (g),...,Vitamin C (mg)_per_100kcal,Vitamin E (alpha-tocopherol) (mg)_per_100kcal,Calcium (mg)_per_100kcal,Iron\n(mg)_per_100kcal,Magnesium (mg)_per_100kcal,Potassium (mg)_per_100kcal,"Fatty acids, total saturated (g)_per_100kcal","Sugars, total\n(g)_per_100kcal",Sodium (mg)_per_100kcal,NRF9.3
5107,92303100,"Tea, hot, leaf, green, decaffeinated",7304,Tea,0,0.0,0.0,0.0,0.0,0.0,...,,,,,inf,inf,,,,inf
5142,92308550,"Tea, iced, brewed, green, decaffeinated, unswe...",7304,Tea,0,0.0,0.0,0.0,0.0,0.0,...,,,,,inf,inf,,,,inf
5328,94200200,"Water, enhanced, diet",7804,Enhanced water,0,0.0,0.0,0.0,0.0,0.0,...,inf,,,,,inf,,,,inf
4255,72130100,"Watercress, raw",6411,Other dark green vegetables,11,2.3,1.29,0.2,0.5,0.1,...,390.909091,9.090909,1090.909091,1.818182,190.909091,3000.0,0.245455,1.818182,372.727273,5788.845455
4989,92100000,"Coffee, NS as to type",7302,Coffee,1,0.12,0.0,0.0,0.0,0.02,...,0.0,1.0,200.0,1.0,300.0,4900.0,0.2,0.0,200.0,5213.8
4990,92100500,"Coffee, NS as to brewed or instant",7302,Coffee,1,0.12,0.0,0.0,0.0,0.02,...,0.0,1.0,200.0,1.0,300.0,4900.0,0.2,0.0,200.0,5213.8
4991,92101000,"Coffee, brewed",7302,Coffee,1,0.12,0.0,0.0,0.0,0.02,...,0.0,1.0,200.0,1.0,300.0,4900.0,0.2,0.0,200.0,5213.8
4996,92101700,"Coffee, brewed, flavored",7302,Coffee,1,0.12,0.0,0.0,0.0,0.02,...,0.0,1.0,200.0,1.0,300.0,4900.0,0.2,0.0,200.0,5213.8
5086,92152000,"Coffee and chicory, brewed",7302,Coffee,1,0.12,0.0,0.0,0.0,0.02,...,0.0,1.0,200.0,1.0,300.0,4900.0,0.2,0.0,200.0,5213.8
4186,72101100,"Beet greens, raw",6411,Other dark green vegetables,22,2.2,4.33,0.5,3.7,0.13,...,136.363636,6.818182,531.818182,11.681818,318.181818,3463.636364,0.090909,2.272727,1027.272727,4902.045455


In [10]:
#print the top 10 foods in df_solid by NRF9.3 Score - include main food description and NRF9.3 score
print(df_NR9b.nlargest(10, 'NRF9.3')[["Main food description", "NRF9.3"]])

                                  Main food description       NRF9.3
5107               Tea, hot, leaf, green, decaffeinated          inf
5142  Tea, iced, brewed, green, decaffeinated, unswe...          inf
5328                              Water, enhanced, diet          inf
4255                                    Watercress, raw  5788.845455
4989                              Coffee, NS as to type  5213.800000
4990                 Coffee, NS as to brewed or instant  5213.800000
4991                                     Coffee, brewed  5213.800000
4996                           Coffee, brewed, flavored  5213.800000
5086                         Coffee and chicory, brewed  5213.800000
4186                                   Beet greens, raw  4902.045455


In [11]:
df_NR9b.nsmallest(10, 'NRF9.3')

Unnamed: 0,Food code,Main food description,WWEIA Category number,WWEIA Category description,Energy (kcal),Protein (g),Carbohydrate (g),"Sugars, total\n(g)","Fiber, total dietary (g)",Total Fat (g),...,Vitamin C (mg)_per_100kcal,Vitamin E (alpha-tocopherol) (mg)_per_100kcal,Calcium (mg)_per_100kcal,Iron\n(mg)_per_100kcal,Magnesium (mg)_per_100kcal,Potassium (mg)_per_100kcal,"Fatty acids, total saturated (g)_per_100kcal","Sugars, total\n(g)_per_100kcal",Sodium (mg)_per_100kcal,NRF9.3
5147,92309040,"Tea, iced, bottled, black, unsweetened",7304,Tea,0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,inf,-inf
4367,74406060,Buffalo sauce,8406,Mustard and other condiments,11,1.21,0.75,0.12,0.6,0.71,...,38.181818,0.090909,127.272727,9.909091,100.0,1090.909091,0.909091,1.090909,26518.18,-24437.36
1073,27150210,Fish sauce,8404,Soy-based condiments,35,5.06,3.64,3.64,0.0,0.01,...,1.428571,0.0,122.857143,2.228571,500.0,822.857143,0.008571,10.4,22431.43,-20966.58
2058,41420300,Soy sauce,8404,Soy-based condiments,53,8.14,4.93,0.4,0.8,0.57,...,0.0,0.0,62.264151,2.735849,139.622642,820.754717,0.137736,0.754717,10364.15,-9322.798
2059,41420350,"Soy sauce, reduced sodium",8404,Soy-based condiments,57,9.05,5.59,0.5,0.7,0.3,...,0.0,0.719298,52.631579,2.368421,121.052632,617.54386,0.061404,0.877193,6312.281,-5501.798
1748,28340120,"Soup, broth",3804,"Soups, broth-based",6,0.89,0.24,0.22,0.0,0.22,...,0.0,0.333333,83.333333,2.0,33.333333,600.0,1.033333,3.666667,6200.0,-5470.867
4711,75511040,"Peppers, hot, pickled",8408,"Olives, pickles, pickled vegetables",22,0.8,4.56,1.6,2.6,0.4,...,55.909091,4.090909,277.272727,1.454545,27.272727,513.636364,0.277273,7.272727,6500.0,-5417.005
1766,28520100,Oyster sauce,8404,Soy-based condiments,51,1.35,10.92,0.0,0.3,0.25,...,0.196078,0.0,62.745098,0.352941,7.843137,105.882353,0.084314,0.0,5358.824,-5178.653
4694,75503010,"Pickles, dill",8408,"Olives, pickles, pickled vegetables",14,0.48,1.99,1.28,1.0,0.43,...,15.0,0.857143,385.714286,1.642857,50.0,800.0,0.564286,9.142857,5771.429,-4488.779
4713,75511100,"Pickles, NFS",8408,"Olives, pickles, pickled vegetables",14,0.48,1.99,1.28,1.0,0.43,...,15.0,0.857143,385.714286,1.642857,50.0,800.0,0.564286,9.142857,5771.429,-4488.779


In [12]:
#print the bottom 10 foods in df_solid by NRF9.3 Score - include main food description and NRF9.3 score
print(df_NR9b.nsmallest(10, 'NRF9.3')[["Main food description", "NRF9.3"]])

                       Main food description        NRF9.3
5147  Tea, iced, bottled, black, unsweetened          -inf
4367                           Buffalo sauce -2.443736e+04
1073                              Fish sauce -2.096658e+04
2058                               Soy sauce -9.322798e+03
2059               Soy sauce, reduced sodium -5.501798e+03
1748                             Soup, broth -5.470867e+03
4711                   Peppers, hot, pickled -5.417005e+03
1766                            Oyster sauce -5.178653e+03
4694                           Pickles, dill -4.488779e+03
4713                            Pickles, NFS -4.488779e+03


In [13]:
# Create 'Junk Food' column based on NRF9.3 score
df_NR9b['Junk Food'] = (df_NR9b['NRF9.3'] <= 0).astype(int)

print(df_NR9b[['Main food description', 'NRF9.3', 'Junk Food']])


                     Main food description       NRF9.3  Junk Food
0                                Milk, NFS   594.867308          0
1                              Milk, whole   451.885246          0
2                   Milk, reduced fat (2%)   677.180000          0
3                       Milk, low fat (1%)   729.934884          0
4                    Milk, fat free (skim)   977.444118          0
...                                    ...          ...        ...
5426     Vegetables as ingredient in curry   842.752830          0
5427     Vegetables as ingredient in soups   836.560000          0
5428     Vegetables as ingredient in stews  1027.135593          0
5429     Sauce as ingredient in hamburgers  -225.313971          1
5430  Industrial oil as ingredient in food    -2.481166          1

[5431 rows x 3 columns]


In [14]:
#write the data to a csv file only for rows where liquid = 0
df_solid = df_NR9b[df_NR9b['Is Liquid'] == 0]
df_solid.to_csv('data_with_junk_food_solid - NVA.csv', index=False)

In [15]:
#write the data to a csv file only for rows where liquid = 1
df_liquid = df_NR9b[df_NR9b['Is Liquid'] == 1]
df_liquid.to_csv('data_with_junk_food_liquid - NVA.csv', index=False)


In [16]:
df_liquid.head()

Unnamed: 0,Food code,Main food description,WWEIA Category number,WWEIA Category description,Energy (kcal),Protein (g),Carbohydrate (g),"Sugars, total\n(g)","Fiber, total dietary (g)",Total Fat (g),...,Vitamin E (alpha-tocopherol) (mg)_per_100kcal,Calcium (mg)_per_100kcal,Iron\n(mg)_per_100kcal,Magnesium (mg)_per_100kcal,Potassium (mg)_per_100kcal,"Fatty acids, total saturated (g)_per_100kcal","Sugars, total\n(g)_per_100kcal",Sodium (mg)_per_100kcal,NRF9.3,Junk Food
0,11100000,"Milk, NFS",1004,"Milk, reduced fat",52,3.33,4.83,4.88,0.0,2.14,...,0.057692,240.384615,0.0,23.076923,300.0,2.401923,9.384615,75.0,594.867308,0
1,11111000,"Milk, whole",1002,"Milk, whole",61,3.27,4.63,4.81,0.0,3.2,...,0.081967,201.639344,0.0,19.672131,245.901639,3.04918,7.885246,62.295082,451.885246,0
2,11112110,"Milk, reduced fat (2%)",1004,"Milk, reduced fat",50,3.36,4.9,4.89,0.0,1.9,...,0.06,252.0,0.0,24.0,318.0,2.22,9.78,78.0,677.18,0
3,11112210,"Milk, low fat (1%)",1006,"Milk, lowfat",43,3.38,5.18,4.96,0.0,0.95,...,0.046512,293.023256,0.0,27.906977,369.767442,1.32093,11.534884,90.697674,729.934884,0
4,11113000,"Milk, fat free (skim)",1008,"Milk, nonfat",34,3.43,4.92,5.05,0.0,0.08,...,0.0,388.235294,0.0,35.294118,491.176471,0.144118,14.852941,120.588235,977.444118,0
