# Cleaning nutritional information data

In [2]:
from pathlib import Path

import pandas as pd

In [2]:
# load the data
# nutrient information 
df_nutrients = pd.read_csv("../../data/food_csv/nutrient.csv")
# food name
df_food = pd.read_csv("../../data/food_csv/food.csv")
# food nutrient information
df_food_nutrient = pd.read_csv("../../data/food_csv/food_nutrient.csv")

In [None]:
df_nutrients = df_nutrients.dropna()
df_nutrients = df_nutrients.astype({"nutrient_nbr": int})

Join tables to include all nutrients in a single df

In [None]:
df_cleaned_nutrient = df_food_nutrient.merge(
    df_nutrients[["nutrient_nbr", "name", "unit_name"]],
    left_on="nutrient_id",
    right_on="nutrient_nbr",
    how="inner",
)
df_cleaned_nutrient = df_cleaned_nutrient[
    ["fdc_id", "nutrient_id", "amount", "name", "unit_name"]
]

Show the most common nutrients

In [23]:
# For each nutrient name, count the rows whose amount is strictly positive,
# and list the most frequently reported nutrients first.
(
    df_cleaned_nutrient.loc[lambda frame: frame["amount"] > 0]
    .groupby("name")
    .count()
    .sort_values(by="fdc_id", ascending=False)
)

Unnamed: 0_level_0,fdc_id,nutrient_id,amount,unit_name
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Water,5413,5413,5413,5413
Energy,5403,5403,5403,5403
"Potassium, K",5389,5389,5389,5389
"Sodium, Na",5365,5365,5365,5365
"Zinc, Zn",5351,5351,5351,5351
...,...,...,...,...
Caffeine,403,403,403,403
Theobromine,275,275,275,275
"Vitamin B-12, added",265,265,265,265
"Vitamin E, added",193,193,193,193


Keep only the rows for the required nutrients

In [None]:
# Nutrient names (as spelled in the `name` column) that the final table needs.
required_nutrients = (
    "Protein",
    "Energy",
    "Total lipid (fat)",
    "Carbohydrate, by summation",
    "Fiber, total dietary",
)
# Discard every measurement row whose nutrient name is not listed above.
df_cleaned_nutrient = df_cleaned_nutrient.loc[
    lambda frame: frame["name"].isin(required_nutrients)
]

In [25]:
# Display the filtered measurements.
# NOTE(review): the saved output below still lists names that are not in
# `required_nutrients` (e.g. "Sugars, Total"), so it appears to predate the
# current filter — re-run this cell to refresh it.
df_cleaned_nutrient

Unnamed: 0,fdc_id,nutrient_id,amount,name,unit_name
8,2705384,269,4.880,"Sugars, Total",G
9,2705384,269,4.880,Total Sugars,G
13,2705384,204,2.140,Total lipid (fat),G
19,2705384,208,52.000,Energy,KCAL
27,2705384,606,1.249,"Fatty acids, total saturated",G
...,...,...,...,...,...
407298,2710814,269,0.000,"Sugars, Total",G
407299,2710814,269,0.000,Total Sugars,G
407301,2710814,204,100.000,Total lipid (fat),G
407306,2710814,203,0.000,Protein,G


Append the unit name to the end of the nutrient name, to simplify data structure

In [None]:
# Build a "<name> (<unit>)" label for every row, e.g. "Energy (KCAL)", so that
# each nutrient/unit pair can become a single column when the table is pivoted.
# `.assign` returns a new DataFrame instead of writing a column into the frame
# produced by the boolean filter above, which avoids pandas'
# SettingWithCopyWarning and keeps the cell safe to re-run.
df_cleaned_nutrient = df_cleaned_nutrient.assign(
    nutrient=lambda frame: frame["name"] + " (" + frame["unit_name"] + ")"
)

Pivot the table, turning each nutrient into a column, to be joined with the food table

In [27]:
# Reshape from long to wide: one row per fdc_id, one column per nutrient
# label, cells holding the measured amount. Raises if the same
# (fdc_id, nutrient) pair occurs more than once.
df_pivot_nutrient = pd.pivot(
    df_cleaned_nutrient,
    values="amount",
    columns="nutrient",
    index="fdc_id",
)

In [28]:
# Left join on fdc_id: every food row is kept, and the nutrient columns are
# left empty (NaN) for foods that have no pivoted measurements.
df_nutrional_information = df_food.merge(
    df_pivot_nutrient,
    how="left",
    on="fdc_id",
)

Rename columns to follow a standard and avoid whitespace

In [34]:
# Switch to snake_case column names that carry the unit as a suffix, then keep
# only the food name and the five nutrient columns, in this order.
df_nutrional_information = df_nutrional_information.rename(
    columns={
        "description": "food_name",
        "Protein (G)": "protein_g",
        "Energy (KCAL)": "energy_kcal",
        "Total lipid (fat) (G)": "lipid_g",
        "Carbohydrate, by summation (G)": "carbohydrates_g",
        "Fiber, total dietary (G)": "fiber_g",
    },
)[["food_name", "carbohydrates_g", "energy_kcal", "lipid_g", "protein_g", "fiber_g"]]

Save the cleaned data

In [None]:
# Keep only the rows that have no missing value in any remaining column.
df_nutrional_information = df_nutrional_information.dropna()

# `DataFrame.to_csv` does not create missing directories, so make sure the
# target folder exists before writing; otherwise this cell fails on a fresh
# checkout where "data/" has not been created yet.
# NOTE(review): the inputs are read from "../../data/..." while this writes to
# "data/..." relative to the working directory — confirm the intended location.
output_path = Path("data/cleaned_food_data.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df_nutrional_information.to_csv(output_path, index=False)

ModuleNotFoundError: No module named 'daia'