In [22]:
# import libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Preprocessing for Nutrient data

Target: Vitamin C (milligrams)

Features (all per 100 grams): 
- Water (grams)
- Energy (kcal)
- Protein (grams)
- Carbohydrates (grams)
- Sodium (milligrams)
- Sugar (grams)


Exporting five files
1) df_raw.csv: Includes ID and description; none of the data are scaled or standardized.
2) features_train.csv: No ID or description, only scaled feature data for training
3) features_test.csv: No ID or description, only scaled feature data for test
4) y_train.csv: Only the training labels (vitamin C values)
5) y_test.csv: Only the test labels (vitamin C values)


In [23]:
# Load the nutrient table from the Excel workbook in the working directory.
# The sheet carries a leftover 'Unnamed: 0' column (visible in the preview);
# later cells pick columns by name, so it is simply ignored.
data = pd.read_excel('ABBREV.xlsx')

# Preview the first five rows as the cell's rendered output.
data.head(n=5)

Unnamed: 0,NDB_No,Shrt_Desc,Water_(g),Energ_Kcal,Protein_(g),Lipid_Tot_(g),Ash_(g),Carbohydrt_(g),Fiber_TD_(g),Sugar_Tot_(g),Calcium_(mg),Iron_(mg),Magnesium_(mg),Phosphorus_(mg),Potassium_(mg),Sodium_(mg),Zinc_(mg),Copper_mg),Manganese_(mg),Selenium_(µg),Vit_C_(mg),Thiamin_(mg),Riboflavin_(mg),Niacin_(mg),Panto_Acid_mg),Vit_B6_(mg),Folate_Tot_(µg),Folic_Acid_(µg),Food_Folate_(µg),Folate_DFE_(µg),Choline_Tot_ (mg),Vit_B12_(µg),Vit_A_IU,Vit_A_RAE,Retinol_(µg),Alpha_Carot_(µg),Beta_Carot_(µg),Beta_Crypt_(µg),Lycopene_(µg),Lut+Zea_ (µg),Vit_E_(mg),Vit_D_µg,Vit_D_IU,Vit_K_(µg),FA_Sat_(g),FA_Mono_(g),FA_Poly_(g),Cholestrl_(mg),GmWt_1,GmWt_Desc1,GmWt_2,GmWt_Desc2,Refuse_Pct
0,1001,"BUTTER,WITH SALT",15.87,717,0.85,81.11,2.11,0.06,0.0,0.06,24.0,0.02,2.0,24.0,24.0,643.0,0.09,0.0,0.0,1.0,0.0,0.005,0.034,0.042,0.11,0.003,3.0,0.0,3.0,3.0,18.8,0.17,2499.0,684.0,671.0,0.0,158.0,0.0,0.0,0.0,2.32,0.0,0.0,7.0,51.368,21.021,3.043,215.0,5.0,"1 pat, (1"" sq, 1/3"" high)",14.2,1 tbsp,0.0
1,1002,"BUTTER,WHIPPED,W/ SALT",16.72,718,0.49,78.3,1.62,2.87,0.0,0.06,23.0,0.05,1.0,24.0,41.0,583.0,0.05,0.01,0.001,0.0,0.0,0.007,0.064,0.022,0.097,0.008,4.0,0.0,4.0,4.0,18.8,0.07,2468.0,683.0,671.0,1.0,135.0,6.0,0.0,13.0,1.37,0.0,0.0,4.6,45.39,19.874,3.331,225.0,3.8,"1 pat, (1"" sq, 1/3"" high)",9.4,1 tbsp,0.0
2,1003,"BUTTER OIL,ANHYDROUS",0.24,876,0.28,99.48,0.0,0.0,0.0,0.0,4.0,0.0,0.0,3.0,5.0,2.0,0.01,0.001,0.0,0.0,0.0,0.001,0.005,0.003,0.01,0.001,0.0,0.0,0.0,0.0,22.3,0.01,3069.0,840.0,824.0,0.0,193.0,0.0,0.0,0.0,2.8,0.0,0.0,8.6,61.924,28.732,3.694,256.0,12.8,1 tbsp,205.0,1 cup,0.0
3,1004,"CHEESE,BLUE",42.41,353,21.4,28.74,5.11,2.34,0.0,0.5,528.0,0.31,23.0,387.0,256.0,1146.0,2.66,0.04,0.009,14.5,0.0,0.029,0.382,1.016,1.729,0.166,36.0,0.0,36.0,36.0,15.4,1.22,721.0,198.0,192.0,0.0,74.0,0.0,0.0,0.0,0.25,0.5,21.0,2.4,18.669,7.778,0.8,75.0,28.35,1 oz,17.0,1 cubic inch,0.0
4,1005,"CHEESE,BRICK",41.11,371,23.24,29.68,3.18,2.79,0.0,0.51,674.0,0.43,24.0,451.0,136.0,560.0,2.6,0.024,0.012,14.5,0.0,0.014,0.351,0.118,0.288,0.065,20.0,0.0,20.0,20.0,15.4,1.26,1080.0,292.0,286.0,0.0,76.0,0.0,0.0,0.0,0.26,0.5,22.0,2.5,18.764,8.598,0.784,94.0,132.0,"1 cup, diced",113.0,"1 cup, shredded",0.0


### Extract and rename columns

In [24]:
# Map each source column we keep to a shorter, easier name.
# The dict's insertion order also fixes the column order of `df`.
col_names = {
    'NDB_No': 'ID',
    'Shrt_Desc': 'Description',
    'Water_(g)': 'Water',
    'Energ_Kcal': 'Energy',
    'Protein_(g)': 'Protein',
    'Carbohydrt_(g)': 'Carbohydrates',
    'Sodium_(mg)': 'Sodium',
    'Sugar_Tot_(g)': 'Sugar',
    'Vit_C_(mg)': 'VitaminC',
}

# extract relevant columns (derived from the mapping so the two cannot drift apart)
columns = list(col_names)

# Select and rename in one step; `rename` returns a new frame, so `data` is untouched.
df = data[columns].rename(columns=col_names)

# export as shortened raw dataset (unscaled); the row index is written as the
# first, unnamed column of the CSV, as before
df.to_csv('df_raw.csv')

# Preview the result. This has to be the LAST expression of the cell:
# Jupyter only renders the final expression, so a `df.head()` placed before
# the export is silently discarded.
df.head()

### Train-test split, Min-max scale

75-25 train-test split


In [25]:
# Separate the target (vitamin C) from the features; ID and Description are
# identifiers, not predictors, so they are dropped along with the target.
y = df['VitaminC']
X = df.drop(columns=['ID', 'Description', 'VitaminC'])

# 75-25 train-test split with a fixed seed. Each of the four pieces gets a
# fresh 0..n-1 index so the exported files do not carry the shuffled row labels.
X_train, X_test, y_train, y_test = (
    part.reset_index(drop=True)
    for part in train_test_split(X, y, test_size=0.25, random_state=42)
)

# export labels
y_train.to_csv('y_train.csv')
y_test.to_csv('y_test.csv')

# Min-max scale the features. The scaler is fitted on the training set only,
# and the test set is transformed with the training min/max for consistency.
# The scaler returns NumPy arrays, so each result is wrapped back into a
# DataFrame with the original column names for exporting.
scaler = MinMaxScaler()
feature_cols = X_train.columns
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=feature_cols)
X_test = pd.DataFrame(scaler.transform(X_test), columns=feature_cols)

# export features
X_train.to_csv('features_train.csv')
X_test.to_csv('features_test.csv')
