# ðŸ¥— Meal Recommendation Chatbot  


This notebook converts raw nutrition data into a clean, structured dataset that serves as the foundation for the project.

---

#1. Environment Configuration and Data Sources

In [7]:
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', None)
survey_foods = pd.read_csv("food.csv")
survey_foods_por = pd.read_csv("food_portion.csv")
nut_id = pd.read_csv("nutrient.csv")
survey_foods_nut = pd.read_csv("food_nutrient.csv")

# 2. Data Cleaning

In [8]:
survey_foods.head(5)

Unnamed: 0,fdc_id,data_type,description,food_category_id,publication_date
0,2705383,survey_fndds_food,"Milk, human",9602,2022-10-28
1,2705384,survey_fndds_food,"Milk, NFS",1004,2022-10-28
2,2705385,survey_fndds_food,"Milk, whole",1002,2022-10-28
3,2705386,survey_fndds_food,"Milk, reduced fat (2%)",1004,2022-10-28
4,2705387,survey_fndds_food,"Milk, low fat (1%)",1006,2022-10-28


## 2.1 Nutrient Selection

In [9]:
RELEVANT_NUTRIENTS = [
    "Energy",
    # Macros
    "Protein",
    "Total lipid (fat)",
    "Carbohydrate, by difference",
    "Fiber, total dietary",
    "Sugars, Total",
    "Sugars, added",
    "Cholesterol",
    # Minerals
    "Sodium, Na",
    "Potassium, K",
    "Calcium, Ca",
    "Iron, Fe",
    # Vitamins
    "Vitamin A",
    "Vitamin C, total ascorbic acid",
    "Vitamin D (D2 + D3)",
    "Vitamin E (alpha-tocopherol)",
    "Vitamin B-12"]

nut_id_relevant = nut_id[nut_id["name"].isin(RELEVANT_NUTRIENTS)]
nut_id_relevant["nutrient_nbr"] = nut_id_relevant["nutrient_nbr"].astype(int)
nut_id_relevant.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nut_id_relevant["nutrient_nbr"] = nut_id_relevant["nutrient_nbr"].astype(int)


Unnamed: 0,id,name,unit_name,nutrient_nbr,rank
4,1003,Protein,G,203,600.0
5,1004,Total lipid (fat),G,204,800.0
6,1005,"Carbohydrate, by difference",G,205,1110.0
9,1008,Energy,KCAL,208,300.0
63,1062,Energy,kJ,268,400.0


In [10]:
survey_foods_nut_enriched = survey_foods_nut.merge(
    nut_id_relevant[["nutrient_nbr", "name","unit_name"]],
    left_on="nutrient_id",
    right_on="nutrient_nbr",
    how="left")

survey_foods_nut_clean = survey_foods_nut_enriched[["fdc_id", "amount", "nutrient_nbr", "name"]].copy()
survey_foods_nut_clean = survey_foods_nut_clean.dropna(subset=["name"])
survey_foods_nut_clean.head()

Unnamed: 0,fdc_id,amount,nutrient_nbr,name
0,2705384,125.0,301.0,"Calcium, Ca"
6,2705384,4.88,269.0,"Sugars, Total"
8,2705384,2.14,204.0,Total lipid (fat)
14,2705384,52.0,208.0,Energy
16,2705384,0.03,323.0,Vitamin E (alpha-tocopherol)


## 3. Data Transformation

In [11]:
nutrients_wide = (survey_foods_nut_clean.pivot_table(index="fdc_id", columns="name",values="amount",aggfunc="first").reset_index())
nutrients_wide.head()

name,fdc_id,"Calcium, Ca","Carbohydrate, by difference",Cholesterol,Energy,"Fiber, total dietary","Iron, Fe","Potassium, K",Protein,"Sodium, Na","Sugars, Total",Total lipid (fat),Vitamin B-12,"Vitamin C, total ascorbic acid",Vitamin D (D2 + D3),Vitamin E (alpha-tocopherol)
0,2705384,125.0,4.83,9.0,52.0,0.0,0.0,156.0,3.33,39.0,4.88,2.14,0.56,0.1,1.1,0.03
1,2705385,123.0,4.63,12.0,61.0,0.0,0.0,150.0,3.27,38.0,4.81,3.2,0.54,0.0,1.1,0.05
2,2705386,126.0,4.9,8.0,50.0,0.0,0.0,159.0,3.36,39.0,4.89,1.9,0.55,0.2,1.1,0.03
3,2705387,126.0,5.18,5.0,43.0,0.0,0.0,159.0,3.38,39.0,4.96,0.95,0.61,0.0,1.1,0.02
4,2705388,132.0,4.92,3.0,34.0,0.0,0.0,167.0,3.43,41.0,5.05,0.08,0.58,0.0,1.1,0.0


In [12]:
final_df = survey_foods.merge(
    nutrients_wide,
    on="fdc_id",how="outer")

portion_default = (survey_foods_por
    .sort_values(["fdc_id", "seq_num"])
    .groupby("fdc_id", as_index=False)
    .first())

final_df = final_df.merge(
    portion_default[["fdc_id", "portion_description", "gram_weight"]],
    on="fdc_id",
    how="outer")

final_df = final_df.drop(columns=[
    "food_category_id",
    "publication_date"])


# 4. Final Output

In [13]:
ordered_cols = [
    # Identifiers / metadata
    "fdc_id",
    "data_type",
    "description",

    # Energy & macros
    "Energy",
    "Protein",
    "Total lipid (fat)",
    "Carbohydrate, by difference",
    "Fiber, total dietary",
    "Sugars, Total",
    "Cholesterol",

    # Minerals
    "Sodium, Na",
    "Potassium, K",
    "Calcium, Ca",
    "Iron, Fe",

    # Vitamins
    "Vitamin B-12",
    "Vitamin C, total ascorbic acid",
    "Vitamin D (D2 + D3)",
    "Vitamin E (alpha-tocopherol)",

    # Portion information
    "portion_description",
    "gram_weight"]

final_df = final_df[ordered_cols]
final_df = final_df.dropna(subset=["Energy"])
final_df.head(10)

Unnamed: 0,fdc_id,data_type,description,Energy,Protein,Total lipid (fat),"Carbohydrate, by difference","Fiber, total dietary","Sugars, Total",Cholesterol,"Sodium, Na","Potassium, K","Calcium, Ca","Iron, Fe",Vitamin B-12,"Vitamin C, total ascorbic acid",Vitamin D (D2 + D3),Vitamin E (alpha-tocopherol),portion_description,gram_weight
1,2705384,survey_fndds_food,"Milk, NFS",52.0,3.33,2.14,4.83,0.0,4.88,9.0,39.0,156.0,125.0,0.0,0.56,0.1,1.1,0.03,1 cup,244.0
2,2705385,survey_fndds_food,"Milk, whole",61.0,3.27,3.2,4.63,0.0,4.81,12.0,38.0,150.0,123.0,0.0,0.54,0.0,1.1,0.05,1 cup,244.0
3,2705386,survey_fndds_food,"Milk, reduced fat (2%)",50.0,3.36,1.9,4.9,0.0,4.89,8.0,39.0,159.0,126.0,0.0,0.55,0.2,1.1,0.03,1 cup,244.0
4,2705387,survey_fndds_food,"Milk, low fat (1%)",43.0,3.38,0.95,5.18,0.0,4.96,5.0,39.0,159.0,126.0,0.0,0.61,0.0,1.1,0.02,1 cup,244.0
5,2705388,survey_fndds_food,"Milk, fat free (skim)",34.0,3.43,0.08,4.92,0.0,5.05,3.0,41.0,167.0,132.0,0.0,0.58,0.0,1.1,0.0,1 cup,244.0
6,2705389,survey_fndds_food,"Milk, lactose free, low fat (1%)",43.0,3.38,0.95,5.18,0.0,4.96,5.0,39.0,159.0,126.0,0.0,0.61,0.0,1.1,0.02,1 cup,244.0
7,2705390,survey_fndds_food,"Milk, lactose free, fat free (skim)",34.0,3.43,0.08,4.92,0.0,5.05,3.0,41.0,167.0,132.0,0.0,0.58,0.0,1.1,0.0,1 cup,244.0
8,2705391,survey_fndds_food,"Milk, lactose free, reduced fat (2%)",50.0,3.36,1.9,4.9,0.0,4.89,8.0,39.0,159.0,126.0,0.0,0.55,0.2,1.1,0.03,1 cup,244.0
9,2705392,survey_fndds_food,"Milk, lactose free, whole",61.0,3.27,3.2,4.63,0.0,4.81,12.0,38.0,150.0,123.0,0.0,0.54,0.0,1.1,0.05,1 cup,244.0
10,2705393,survey_fndds_food,Buttermilk,43.0,3.46,1.08,4.81,0.0,5.36,5.0,92.0,158.0,120.0,0.01,0.22,1.0,0.5,0.05,1 cup,244.0


In [14]:
len(final_df)

5431

In [17]:
final_df.to_csv("structured_food_dataset.csv", index=False)