# Thanksgiving Food Data

In [3]:
import pandas as pd
import numpy as np

## Read in data and explore data attributes

In [4]:
food_info = pd.read_csv("food_info.csv")

In [5]:
food_info.head()

Unnamed: 0,NDB_No,Shrt_Desc,Water_(g),Energ_Kcal,Protein_(g),Lipid_Tot_(g),Ash_(g),Carbohydrt_(g),Fiber_TD_(g),Sugar_Tot_(g),...,Vit_A_IU,Vit_A_RAE,Vit_E_(mg),Vit_D_mcg,Vit_D_IU,Vit_K_(mcg),FA_Sat_(g),FA_Mono_(g),FA_Poly_(g),Cholestrl_(mg)
0,1001,BUTTER WITH SALT,15.87,717,0.85,81.11,2.11,0.06,0.0,0.06,...,2499.0,684.0,2.32,1.5,60.0,7.0,51.368,21.021,3.043,215.0
1,1002,BUTTER WHIPPED WITH SALT,15.87,717,0.85,81.11,2.11,0.06,0.0,0.06,...,2499.0,684.0,2.32,1.5,60.0,7.0,50.489,23.426,3.012,219.0
2,1003,BUTTER OIL ANHYDROUS,0.24,876,0.28,99.48,0.0,0.0,0.0,0.0,...,3069.0,840.0,2.8,1.8,73.0,8.6,61.924,28.732,3.694,256.0
3,1004,CHEESE BLUE,42.41,353,21.4,28.74,5.11,2.34,0.0,0.5,...,721.0,198.0,0.25,0.5,21.0,2.4,18.669,7.778,0.8,75.0
4,1005,CHEESE BRICK,41.11,371,23.24,29.68,3.18,2.79,0.0,0.51,...,1080.0,292.0,0.26,0.5,22.0,2.5,18.764,8.598,0.784,94.0


In [6]:
dimension = food_info.shape #8618 rows and 36 columns (8618, 36)
num_rows = dimension[0]
num_cols = dimension[1]

In [7]:
hundredth_row = food_info.loc[100] #selects 100th row

In [8]:
food_info.dtypes #data type of a dataframe = series containing each column's type

NDB_No               int64
Shrt_Desc           object
Water_(g)          float64
Energ_Kcal           int64
Protein_(g)        float64
Lipid_Tot_(g)      float64
Ash_(g)            float64
Carbohydrt_(g)     float64
Fiber_TD_(g)       float64
Sugar_Tot_(g)      float64
Calcium_(mg)       float64
Iron_(mg)          float64
Magnesium_(mg)     float64
Phosphorus_(mg)    float64
Potassium_(mg)     float64
Sodium_(mg)        float64
Zinc_(mg)          float64
Copper_(mg)        float64
Manganese_(mg)     float64
Selenium_(mcg)     float64
Vit_C_(mg)         float64
Thiamin_(mg)       float64
Riboflavin_(mg)    float64
Niacin_(mg)        float64
Vit_B6_(mg)        float64
Vit_B12_(mcg)      float64
Vit_A_IU           float64
Vit_A_RAE          float64
Vit_E_(mg)         float64
Vit_D_mcg          float64
Vit_D_IU           float64
Vit_K_(mcg)        float64
FA_Sat_(g)         float64
FA_Mono_(g)        float64
FA_Poly_(g)        float64
Cholestrl_(mg)     float64
dtype: object

In [9]:
food_info.loc[3:6] #selects rows at index 3, 4, 5, 6
food_info.loc[[2,5,10]] #selects rows at index 2, 5, 10

Unnamed: 0,NDB_No,Shrt_Desc,Water_(g),Energ_Kcal,Protein_(g),Lipid_Tot_(g),Ash_(g),Carbohydrt_(g),Fiber_TD_(g),Sugar_Tot_(g),...,Vit_A_IU,Vit_A_RAE,Vit_E_(mg),Vit_D_mcg,Vit_D_IU,Vit_K_(mcg),FA_Sat_(g),FA_Mono_(g),FA_Poly_(g),Cholestrl_(mg)
2,1003,BUTTER OIL ANHYDROUS,0.24,876,0.28,99.48,0.0,0.0,0.0,0.0,...,3069.0,840.0,2.8,1.8,73.0,8.6,61.924,28.732,3.694,256.0
5,1006,CHEESE BRIE,48.42,334,20.75,27.68,2.7,0.45,0.0,0.45,...,592.0,174.0,0.24,0.5,20.0,2.3,17.41,8.013,0.826,100.0
10,1011,CHEESE COLBY,38.2,394,23.76,32.11,3.36,2.57,0.0,0.52,...,994.0,264.0,0.28,0.6,24.0,2.7,20.218,9.28,0.953,95.0


In [10]:
ndb_col = food_info["NDB_No"] #selects column called "NDB_No"
saturated_fat_col = food_info["FA_Sat_(g)"]
selenium_thiamin = food_info[["Selenium_(mcg)", "Thiamin_(mg)"]] #selects multiple columns

## Make a dataframe consisting only of columns that have gram values

In [11]:
column_list = list(food_info.columns)
gram_column_list = list()
for column_name in column_list:
    if column_name.endswith("(g)"): #endswith = 
        gram_column_list.append(column_name)
gram_df = food_info[gram_column_list]

## Arithmetic operators to transform numerical columns
1. Convert mg to g by dividing the value in mg by 1000
2. Calculate the amount of protein per gram of water
3. Calculate the nutritional index (= 2 * protein - 0.75 * lipid)

In [16]:
food_info["Sodium_(mg)"] = food_info["Sodium_(mg)"] / 1000

In [21]:
grams_of_protein_per_gram_of_water = food_info["Protein_(g)"] / food_info["Water_(g)"]
milligrams_of_calcium_and_iron = food_info["Calcium_(mg)"] + food_info["Iron_(mg)"]

In [23]:
initial_rating = food_info["Protein_(g)"] * 2 - (food_info["Lipid_Tot_(g)"]*0.75)

## Normalizing Columns in a Data Set
Rescaling: results in all values in a numeric column ranging between 0 and 1 <br>
x' = x - min(x) / max(x) - min(x)

In [28]:
#Normalize protein column
max_protein = food_info["Protein_(g)"].max()
min_protein = food_info["Protein_(g)"].min()
range_protein = max_protein - min_protein
normalized_protein = (food_info["Protein_(g)"] - min_protein) / range_protein 

In [31]:
#Normalize fat column
max_fat = food_info["Lipid_Tot_(g)"].max()
min_fat = food_info["Lipid_Tot_(g)"].min()
range_fat = max_fat - min_fat
normalized_fat = (food_info["Lipid_Tot_(g)"] - min_fat) / range_fat

In [32]:
food_info["Normalized_Protein"] = normalized_protein
food_info["Normalized_fat"] = normalized_fat

In [33]:
Normalized_Nutrition_Index = 2*normalized_protein - 0.75*normalized_fat

In [34]:
food_info["Norm_Nutr_Index"] = Normalized_Nutrition_Index

## Sorting Values
DataFrame.sort_values(name_of_the_column, inplace=True, ascending=False) <br>
sorts by descending order <br>
sorts the dataframe in-place instead of returning a new dataframe

In [36]:
food_info.sort_values("Norm_Nutr_Index", inplace=True)

In [37]:
food_info

Unnamed: 0,NDB_No,Shrt_Desc,Water_(g),Energ_Kcal,Protein_(g),Lipid_Tot_(g),Ash_(g),Carbohydrt_(g),Fiber_TD_(g),Sugar_Tot_(g),...,Vit_D_mcg,Vit_D_IU,Vit_K_(mcg),FA_Sat_(g),FA_Mono_(g),FA_Poly_(g),Cholestrl_(mg),Normalized_Protein,Normalized_fat,Norm_Nutr_Index
637,4055,OIL PALM,0.00,884,0.00,100.00,0.00,0.00,0.0,0.00,...,,,8.0,49.300,37.000,9.300,0.0,0.000000,1.0000,-0.750000
774,4685,OIL COOKING & SALAD ENOVA 80% DIGLYCERIDES,0.00,884,0.00,100.00,0.00,0.00,,,...,,,,4.630,37.016,53.370,,0.000000,1.0000,-0.750000
770,4678,OIL VEG NATREON CANOLA HI STABILITY NON TRANS ...,0.00,884,0.00,100.00,0.00,0.00,0.0,0.00,...,0.0,0.0,,6.511,71.991,17.098,0.0,0.000000,1.0000,-0.750000
695,4583,OIL MUSTARD,0.00,884,0.00,100.00,0.00,0.00,0.0,,...,,,,11.582,59.187,21.230,,0.000000,1.0000,-0.750000
673,4541,OIL CUPU ASSU,0.00,884,0.00,100.00,0.00,0.00,0.0,0.00,...,,,,53.200,38.700,3.800,0.0,0.000000,1.0000,-0.750000
696,4584,OIL SUNFLOWER HI OLEIC (70% & OVER),0.00,884,0.00,100.00,0.00,0.00,0.0,0.00,...,0.0,0.0,5.4,9.859,83.689,3.798,0.0,0.000000,1.0000,-0.750000
671,4536,OIL SHEANUT,0.00,884,0.00,100.00,0.00,0.00,0.0,0.00,...,,,,46.600,44.000,5.200,0.0,0.000000,1.0000,-0.750000
764,4670,USDA CMDTY FD OIL VEG LO SATURATED FAT,0.00,884,0.00,100.00,0.00,0.00,0.0,0.00,...,,,197.6,7.429,22.730,65.138,0.0,0.000000,1.0000,-0.750000
670,4534,OIL BABASSU,0.00,884,0.00,100.00,0.00,0.00,0.0,0.00,...,,,,81.200,11.400,1.600,0.0,0.000000,1.0000,-0.750000
669,4532,OIL HAZELNUT,0.00,884,0.00,100.00,0.00,0.00,0.0,0.00,...,,,,7.400,78.000,10.200,0.0,0.000000,1.0000,-0.750000


## Working with Missing Data
1. 