# Creating Dataset of Food Items
This list will be used in the chatbot to provide lookup tables of food items. The data is taken from the appendix of https://www.dietaryguidelines.gov/sites/default/files/2020-12/Dietary_Guidelines_for_Americans_2020-2025.pdf.

Each food item will be categorized by their food groups as determined by USDA:

* Vegetables
* Fruits
* Grains
* Dairy
* Protein Foods

The dataset will then be converted to a .txt file to be used in Rasa's NLU training data.

In [1]:
import pandas as pd
import matplotlib
import seaborn as sns
import numpy as np

## Pre-processing List of Vegetables in their sub-groups

I will convert strings of food items in their sub-groups to a list of food items, removing unnecessary words and whitespaces.

In [110]:
dark_green_vegetables = "amaranth leaves, basil, beet greens, bitter melon leaves, bok choy, broccoli, chamnamul, chrysanthemum leaves, chard, cilantro, collards, cress, dandelion greens, kale, lambsquarters, mustard greens, poke greens, romaine lettuce, spinach, nettles, taro leaves, turnip greens, and watercress"

red_orange_vegetables = "calabaza, carrots, red chili peppers, red or orange bell peppers, pimento/pimiento, sweet potatoes, tomatoes, 100% tomato juice, and winter squash such as acorn, butternut, kabocha, and pumpkin"

pulses = "black beans, black-eyed peas, bayo beans, brown beans, chickpeas (garbanzo beans), cowpeas, edamame, fava beans, kidney beans, lentils, lima beans, mung beans, navy beans, pigeon peas, pink beans, pinto beans, split peas, soybeans, and white beans"

starchy_vegetables = "breadfruit, burdock root, cassava, corn, jicama, lotus root, lima beans, immature or raw (not dried) peas (e.g., cowpeas, black-eyed peas, green peas, pigeon peas), plantains, white potatoes, salsify, tapioca, taro root (dasheen or yautia), water chestnuts, yam, and yucca"

others_vegetables = "artichoke, asparagus, avocado, bamboo shoots, bean sprouts, beets, bitter melon (bitter gourd, balsam pear), broccoflower, Brussels sprouts, cabbage, cactus pads (nopales), cauliflower, celeriac, celery, chayote (mirliton), chives, cucumber, eggplant, fennel bulb, garlic, ginger root, green beans, iceberg lettuce, kohlrabi, leeks, luffa (Chinese okra), mushrooms, okra, onions, green chili peppers, green bell peppers, radicchio, sprouted beans, radish, rutabaga, seaweed, snow peas, summer squash, tomatillos, turnips, and winter melons"

In [111]:
# Remove the word 'and', split string by comma
dark_green_vegetables = dark_green_vegetables.replace('and', '').split(",")
# Remove whitespaces at start and end of each food item
dark_green_vegetables = [x.strip() for x in dark_green_vegetables]
print(dark_green_vegetables)

['amaranth leaves', 'basil', 'beet greens', 'bitter melon leaves', 'bok choy', 'broccoli', 'chamnamul', 'chrysanthemum leaves', 'chard', 'cilantro', 'collards', 'cress', 'delion greens', 'kale', 'lambsquarters', 'mustard greens', 'poke greens', 'romaine lettuce', 'spinach', 'nettles', 'taro leaves', 'turnip greens', 'watercress']


In [112]:
# Remove the word 'and', split string by comma
red_orange_vegetables = red_orange_vegetables.replace('and', '').split(",")
# Remove whitespaces at start and end of each food item
red_orange_vegetables = [x.strip() for x in red_orange_vegetables]
print(red_orange_vegetables)

['calabaza', 'carrots', 'red chili peppers', 'red or orange bell peppers', 'pimento/pimiento', 'sweet potatoes', 'tomatoes', '100% tomato juice', 'winter squash such as acorn', 'butternut', 'kabocha', 'pumpkin']


In [113]:
temp = []

for veg in red_orange_vegetables:
    
    # Split "red or orange bell peppers" into 2 separate items
    if "bell peppers" in veg:
        temp.append("red bell peppers")
        temp.append("orange bell peppers")
    # Remove "winter squash such as" and rename "acorn" to "acorn squash"
    elif "winter" in veg:
        temp.append("acorn squash")
    # Rename "butternut" to "butternut squash"
    elif "butternut" in veg:
        temp.append("butternut squash")
    else:
        temp.append(veg)

red_orange_vegetables = temp
print(red_orange_vegetables)

['calabaza', 'carrots', 'red chili peppers', 'red bell peppers', 'orange bell peppers', 'pimento/pimiento', 'sweet potatoes', 'tomatoes', '100% tomato juice', 'acorn squash', 'butternut squash', 'kabocha', 'pumpkin']


In [114]:
# Remove the word 'and', split string by comma
pulses = pulses.replace('and', '').split(",")
# Remove whitespaces at start and end of each food item
pulses = [x.strip() for x in pulses]
print(pulses)

['black beans', 'black-eyed peas', 'bayo beans', 'brown beans', 'chickpeas (garbanzo beans)', 'cowpeas', 'edamame', 'fava beans', 'kidney beans', 'lentils', 'lima beans', 'mung beans', 'navy beans', 'pigeon peas', 'pink beans', 'pinto beans', 'split peas', 'soybeans', 'white beans']


In [104]:
# Function to add alternative name of food in brackets as separate food item
def alternative_name(food_item_list):

    temp = []

    for food in food_item_list:

        # Replace '(' with '+' for easier splitting
        if ("(") in food:
            food = food.replace("(", '+')
        # Replace ')' with '+' for easier splitting
        if (")") in food:
            food = food.replace(")", '+')
        # Split string into separate food items
        if ("+" in food):
            # Remove 'or' in string
            food = food.replace(" or ", '+')
            food_split = food.split("+")

            for sub_food in food_split:
                temp.append(sub_food.strip())

        else:
            temp.append(food)

    # Remove empty strings
    temp = [x for x in temp if x]
    return temp

In [116]:
# Split alternative name for vegetable in brackets into a separate food item
pulses = alternative_name(pulses)
print(pulses)

['black beans', 'black-eyed peas', 'bayo beans', 'brown beans', 'chickpeas', 'garbanzo beans', 'cowpeas', 'edamame', 'fava beans', 'kidney beans', 'lentils', 'lima beans', 'mung beans', 'navy beans', 'pigeon peas', 'pink beans', 'pinto beans', 'split peas', 'soybeans', 'white beans']


In [117]:
# Remove the word 'and', split string by comma
starchy_vegetables = starchy_vegetables.replace('and', '').split(",")
# Remove whitespaces at start and end of each food item
starchy_vegetables = [x.strip() for x in starchy_vegetables]
print(starchy_vegetables)

['breadfruit', 'burdock root', 'cassava', 'corn', 'jicama', 'lotus root', 'lima beans', 'immature or raw (not dried) peas (e.g.', 'cowpeas', 'black-eyed peas', 'green peas', 'pigeon peas)', 'plantains', 'white potatoes', 'salsify', 'tapioca', 'taro root (dasheen or yautia)', 'water chestnuts', 'yam', 'yucca']


In [118]:
# Remove 'immature or raw (not dried) peas (e.g.' from list
starchy_vegetables.pop(7)
print(starchy_vegetables)

['breadfruit', 'burdock root', 'cassava', 'corn', 'jicama', 'lotus root', 'lima beans', 'cowpeas', 'black-eyed peas', 'green peas', 'pigeon peas)', 'plantains', 'white potatoes', 'salsify', 'tapioca', 'taro root (dasheen or yautia)', 'water chestnuts', 'yam', 'yucca']


In [119]:
# Split alternative name for vegetable in brackets into a separate food item
starchy_vegetables = alternative_name(starchy_vegetables)
print(starchy_vegetables)

['breadfruit', 'burdock root', 'cassava', 'corn', 'jicama', 'lotus root', 'lima beans', 'cowpeas', 'black-eyed peas', 'green peas', 'pigeon peas', 'plantains', 'white potatoes', 'salsify', 'tapioca', 'taro root', 'dasheen', 'yautia', 'water chestnuts', 'yam', 'yucca']


In [122]:
# Remove the word 'and', split string by comma
others_vegetables = others_vegetables.replace('and', '').split(",")
# Remove whitespaces at start and end of each food item
others_vegetables = [x.strip() for x in others_vegetables]
print(others_vegetables)

['artichoke', 'asparagus', 'avocado', 'bamboo shoots', 'bean sprouts', 'beets', 'bitter melon (bitter gourd', 'balsam pear)', 'broccoflower', 'Brussels sprouts', 'cabbage', 'cactus pads (nopales)', 'cauliflower', 'celeriac', 'celery', 'chayote (mirliton)', 'chives', 'cucumber', 'eggplant', 'fennel bulb', 'garlic', 'ginger root', 'green beans', 'iceberg lettuce', 'kohlrabi', 'leeks', 'luffa (Chinese okra)', 'mushrooms', 'okra', 'onions', 'green chili peppers', 'green bell peppers', 'radicchio', 'sprouted beans', 'radish', 'rutabaga', 'seaweed', 'snow peas', 'summer squash', 'tomatillos', 'turnips', 'winter melons']


In [123]:
others_vegetables = alternative_name(others_vegetables)
print(others_vegetables)

['artichoke', 'asparagus', 'avocado', 'bamboo shoots', 'bean sprouts', 'beets', 'bitter melon', 'bitter gourd', 'balsam pear', 'broccoflower', 'Brussels sprouts', 'cabbage', 'cactus pads', 'nopales', 'cauliflower', 'celeriac', 'celery', 'chayote', 'mirliton', 'chives', 'cucumber', 'eggplant', 'fennel bulb', 'garlic', 'ginger root', 'green beans', 'iceberg lettuce', 'kohlrabi', 'leeks', 'luffa', 'Chinese okra', 'mushrooms', 'okra', 'onions', 'green chili peppers', 'green bell peppers', 'radicchio', 'sprouted beans', 'radish', 'rutabaga', 'seaweed', 'snow peas', 'summer squash', 'tomatillos', 'turnips', 'winter melons']


## Create dataframe of vegetables with their sub-groups

In [127]:
cols = ['food_group', 'sub_group']

dark_green_df = pd.DataFrame(columns=cols)
dark_green_df['name'] = dark_green_vegetables
dark_green_df['sub_group'] = 'dark green'
dark_green_df['food_group'] = 'vegetables'

dark_green_df.head()

Unnamed: 0,food_group,sub_group,name
0,vegetables,dark green,amaranth leaves
1,vegetables,dark green,basil
2,vegetables,dark green,beet greens
3,vegetables,dark green,bitter melon leaves
4,vegetables,dark green,bok choy


In [128]:
red_orange_df = pd.DataFrame(columns=cols)
red_orange_df['name'] = red_orange_vegetables
red_orange_df['sub_group'] = 'red and orange'
red_orange_df['food_group'] = 'vegetables'

red_orange_df.head()

Unnamed: 0,food_group,sub_group,name
0,vegetables,red and orange,calabaza
1,vegetables,red and orange,carrots
2,vegetables,red and orange,red chili peppers
3,vegetables,red and orange,red bell peppers
4,vegetables,red and orange,orange bell peppers


In [129]:
pulses_df = pd.DataFrame(columns=cols)
pulses_df['name'] = pulses
pulses_df['sub_group'] = 'beans, peas, lentils (pulses)'
pulses_df['food_group'] = 'vegetables'

pulses_df.head()

Unnamed: 0,food_group,sub_group,name
0,vegetables,"beans, peas, lentils (pulses)",black beans
1,vegetables,"beans, peas, lentils (pulses)",black-eyed peas
2,vegetables,"beans, peas, lentils (pulses)",bayo beans
3,vegetables,"beans, peas, lentils (pulses)",brown beans
4,vegetables,"beans, peas, lentils (pulses)",chickpeas


In [130]:
starchy_df = pd.DataFrame(columns=cols)
starchy_df['name'] = starchy_vegetables
starchy_df['sub_group'] = 'starchy'
starchy_df['food_group'] = 'vegetables'

starchy_df.head()

Unnamed: 0,food_group,sub_group,name
0,vegetables,starchy,breadfruit
1,vegetables,starchy,burdock root
2,vegetables,starchy,cassava
3,vegetables,starchy,corn
4,vegetables,starchy,jicama


In [131]:
others_df = pd.DataFrame(columns=cols)
others_df['name'] = others_vegetables
others_df['sub_group'] = 'others'
others_df['food_group'] = 'vegetables'

others_df.head()

Unnamed: 0,food_group,sub_group,name
0,vegetables,others,artichoke
1,vegetables,others,asparagus
2,vegetables,others,avocado
3,vegetables,others,bamboo shoots
4,vegetables,others,bean sprouts


In [133]:
# Concat all veg dataframes into single dataframe
veg_df = pd.concat([dark_green_df, red_orange_df, pulses_df, starchy_df, others_df])

# Get sum of rows to ensure the dataframe concatenated correctly
print("Sum of rows across all dataframes: ", len(dark_green_df) + len(red_orange_df) + len(pulses_df) + len(starchy_df) + len(others_df))
print(len(veg_df.index))

Sum of rows across all dataframes:  123
123


In [134]:
# Pickle dataframe to reuse
veg_df.to_pickle('veg_dataset.pkl')

## Pre-processing List of Fruits

In [145]:
fruits = "Asian pears, bananas, (blackberries, blueberries, cranberries, currants, dewberries, huckleberries, kiwifruit, loganberries, mulberries, raspberries, and strawberries); (calamondin, grapefruit, kumquats, lemons, limes, mandarin oranges, pomelos, tangerines, and tangelos); cherries, dates, figs, grapes, guava, jackfruit, lychee, mangoes, melons (cantaloupe, casaba, honeydew, and watermelon); nectarines, papaya, passion fruit, peaches, pears, persimmons, pineapple, plums, pomegranates, prunes, raisins, rhubarb, sapote, soursop, starfruit, and tamarind"

In [146]:
# Remove the word 'and', split string by comma
fruits = fruits.replace(' and ', '').replace(';', ',').replace('(', '').replace(')', '').split(",")
# Remove whitespaces at start and end of each food item
fruits = [x.strip() for x in fruits]
print(fruits)

['Asian pears', 'bananas', 'blackberries', 'blueberries', 'cranberries', 'currants', 'dewberries', 'huckleberries', 'kiwifruit', 'loganberries', 'mulberries', 'raspberries', 'strawberries', 'calamondin', 'grapefruit', 'kumquats', 'lemons', 'limes', 'mandarin oranges', 'pomelos', 'tangerines', 'tangelos', 'cherries', 'dates', 'figs', 'grapes', 'guava', 'jackfruit', 'lychee', 'mangoes', 'melons cantaloupe', 'casaba', 'honeydew', 'watermelon', 'nectarines', 'papaya', 'passion fruit', 'peaches', 'pears', 'persimmons', 'pineapple', 'plums', 'pomegranates', 'prunes', 'raisins', 'rhubarb', 'sapote', 'soursop', 'starfruit', 'tamarind']


In [149]:
# Create fruits dataframe

fruit_df = pd.DataFrame(columns=cols)
fruit_df['name'] = fruits
fruit_df['sub_group'] = 'None'
fruit_df['food_group'] = 'Fruits'

fruit_df.head()

Unnamed: 0,food_group,sub_group,name
0,Fruits,,Asian pears
1,Fruits,,bananas
2,Fruits,,blackberries
3,Fruits,,blueberries
4,Fruits,,cranberries


In [150]:
fruit_df.to_pickle('fruit_dataset.pkl')

## Pre-processing List of Grains

In [153]:
whole_grains = "amaranth, hulled barley, brown rice, buckwheat, bulgur, millet, oats, popcorn, quinoa, dark rye, triticale, whole-grain cornmeal, whole-wheat bread, whole-wheat chapati, whole-grain cereals, whole-grain crackers, and wild rice" 

refined_grains = "white breads, cereals, crackers, corn grits, cream of rice, cream of wheat, barley, masa, pasta, and white rice"


In [154]:
# Remove the word 'and', split string by comma
whole_grains = whole_grains.replace(' and ', '').split(",")
# Remove whitespaces at start and end of each food item
whole_grains = [x.strip() for x in whole_grains]
print(whole_grains)

['amaranth', 'hulled barley', 'brown rice', 'buckwheat', 'bulgur', 'millet', 'oats', 'popcorn', 'quinoa', 'dark rye', 'triticale', 'whole-grain cornmeal', 'whole-wheat bread', 'whole-wheat chapati', 'whole-grain cereals', 'whole-grain crackers', 'wild rice']


In [155]:
# Remove the word 'and', split string by comma
refined_grains = refined_grains.replace(' and ', '').split(",")
# Remove whitespaces at start and end of each food item
refined_grains = [x.strip() for x in refined_grains]
print(refined_grains)

['white breads', 'cereals', 'crackers', 'corn grits', 'cream of rice', 'cream of wheat', 'barley', 'masa', 'pasta', 'white rice']


In [156]:
# Create whole-grains dataframe

whole_grains_df = pd.DataFrame(columns=cols)
whole_grains_df['name'] = whole_grains
whole_grains_df['sub_group'] = 'Whole Grains'
whole_grains_df['food_group'] = 'Grains'

whole_grains_df.head()

Unnamed: 0,food_group,sub_group,name
0,Grains,Whole Grains,amaranth
1,Grains,Whole Grains,hulled barley
2,Grains,Whole Grains,brown rice
3,Grains,Whole Grains,buckwheat
4,Grains,Whole Grains,bulgur


In [157]:
# Create refined-grains dataframe

refined_grains_df = pd.DataFrame(columns=cols)
refined_grains_df['name'] = refined_grains
refined_grains_df['sub_group'] = 'Refined Grains'
refined_grains_df['food_group'] = 'Grains'

refined_grains_df.head()

Unnamed: 0,food_group,sub_group,name
0,Grains,Refined Grains,white breads
1,Grains,Refined Grains,cereals
2,Grains,Refined Grains,crackers
3,Grains,Refined Grains,corn grits
4,Grains,Refined Grains,cream of rice


In [158]:
# Concat all veg dataframes into single dataframe
grains_df = pd.concat([whole_grains_df, refined_grains_df])

# Get sum of rows to ensure the dataframe concatenated correctly
print("Sum of rows across all dataframes: ", len(whole_grains_df) + len(refined_grains_df))
print(len(grains_df.index))

Sum of rows across all dataframes:  27
27


In [159]:
grains_df.to_pickle('grains_dataset.pkl')

## Pre-processing List of Dairy

In [166]:
healthy_milks = "milk, soy milk, buttermilk, yogurt, kefir, frozen yogurt, evaporated milk, milk powder, condensed milk, whole milk"

healthy_cheeses = "cheese, brie, camembert, cheddar, cottage cheese, colby, edam, feta, fontina, goats cheese, gouda, gruyere, limburger, queso anejo, queso asadero, queso chihuahua, monterey, mozzarella, muenster, parmesan, provolone, ricotta, quark, and Swiss cheese"

unhealthy_dairy = "Cream, heavy cream, sour cream, whipped cream, butter, ice-cream, ice cream, ghee, and cream cheese"

others_dairy = "whey, cultured milk, curds, curd whey"

In [None]:
synonyms = "fat free milk, fat-free milk, skim milk, semi-skim milk, skimmed milk, semi-skimmed milk, semi skimmed milk, semi skim milk, goat's cheese, goat cheese, swiss, monterey jack, mozzarella cheese, cheddar cheese, low fat milk, low-fat milk"

In [167]:
# Remove the word 'and', split string by comma
healthy_milks = healthy_milks.replace(' and ', '').split(",")
# Remove whitespaces at start and end of each food item
healthy_milks = [x.strip() for x in healthy_milks]
print(healthy_milks)

['milk', 'soy milk', 'buttermilk', 'yogurt', 'kefir', 'frozen yogurt', 'evaporated milk', 'milk powder', 'condensed milk', 'whole milk']


In [168]:
# Remove the word 'and', split string by comma
healthy_cheeses = healthy_cheeses.replace(' and ', '').split(",")
# Remove whitespaces at start and end of each food item
healthy_cheeses = [x.strip() for x in healthy_cheeses]
print(healthy_cheeses)

['cheese', 'brie', 'camembert', 'cheddar', 'cottage cheese', 'colby', 'edam', 'feta', 'fontina', 'goats cheese', 'gouda', 'gruyere', 'limburger', 'queso anejo', 'queso asadero', 'queso chihuahua', 'monterey', 'mozzarella', 'muenster', 'parmesan', 'provolone', 'ricotta', 'quark', 'Swiss cheese']


In [169]:
# Remove the word 'and', split string by comma
unhealthy_dairy = unhealthy_dairy.replace(' and ', '').split(",")
# Remove whitespaces at start and end of each food item
unhealthy_dairy = [x.strip() for x in unhealthy_dairy]
print(unhealthy_dairy)

['Cream', 'heavy cream', 'sour cream', 'whipped cream', 'butter', 'ice-cream', 'ice cream', 'ghee', 'cream cheese']


In [170]:
# Remove the word 'and', split string by comma
others_dairy = others_dairy.replace(' and ', '').split(",")
# Remove whitespaces at start and end of each food item
others_dairy = [x.strip() for x in others_dairy]
print(others_dairy)

['whey', 'cultured milk', 'curds', 'curd whey']


In [171]:
# Create healthy milks dataframe

healthy_milks_df = pd.DataFrame(columns=cols)
healthy_milks_df['name'] = healthy_milks
healthy_milks_df['sub_group'] = 'Healthy Milk'
healthy_milks_df['food_group'] = 'Dairy'

healthy_milks_df.head()

Unnamed: 0,food_group,sub_group,name
0,Dairy,Healthy Milk,milk
1,Dairy,Healthy Milk,soy milk
2,Dairy,Healthy Milk,buttermilk
3,Dairy,Healthy Milk,yogurt
4,Dairy,Healthy Milk,kefir


In [172]:
# Create healthy cheeses dataframe

healthy_cheeses_df = pd.DataFrame(columns=cols)
healthy_cheeses_df['name'] = healthy_cheeses
healthy_cheeses_df['sub_group'] = 'Healthy Cheese'
healthy_cheeses_df['food_group'] = 'Dairy'

healthy_cheeses_df.head()

Unnamed: 0,food_group,sub_group,name
0,Dairy,Healthy Cheese,cheese
1,Dairy,Healthy Cheese,brie
2,Dairy,Healthy Cheese,camembert
3,Dairy,Healthy Cheese,cheddar
4,Dairy,Healthy Cheese,cottage cheese


In [173]:
# Create unhealthy dairy dataframe

unhealthy_dairy_df = pd.DataFrame(columns=cols)
unhealthy_dairy_df['name'] = unhealthy_dairy
unhealthy_dairy_df['sub_group'] = 'Unhealthy'
unhealthy_dairy_df['food_group'] = 'Dairy'

unhealthy_dairy_df.head()

Unnamed: 0,food_group,sub_group,name
0,Dairy,Unhealthy,Cream
1,Dairy,Unhealthy,heavy cream
2,Dairy,Unhealthy,sour cream
3,Dairy,Unhealthy,whipped cream
4,Dairy,Unhealthy,butter


In [174]:
# Create others dairy dataframe

others_dairy_df = pd.DataFrame(columns=cols)
others_dairy_df['name'] = others_dairy
others_dairy_df['sub_group'] = 'Others'
others_dairy_df['food_group'] = 'Dairy'

others_dairy_df.head()

Unnamed: 0,food_group,sub_group,name
0,Dairy,Others,whey
1,Dairy,Others,cultured milk
2,Dairy,Others,curds
3,Dairy,Others,curd whey


In [175]:
# Concat all veg dataframes into single dataframe
dairy_df = pd.concat([healthy_milks_df, healthy_cheeses_df, unhealthy_dairy_df, others_dairy_df])

# Get sum of rows to ensure the dataframe concatenated correctly
print("Sum of rows across all dataframes: ", len(healthy_milks_df) + len(healthy_cheeses_df) + len(unhealthy_dairy_df) + len(others_dairy_df))
print(len(dairy_df.index))

Sum of rows across all dataframes:  47
47


In [176]:
dairy_df.to_pickle('dairy_dataset.pkl')

## Pre-processing List of Proteins

In [192]:
red_meat = "beef, goat, lamb, pork, bear, bison, deer, elk, moose, opossum, rabbit, raccoon, and squirrel"

poultry = "chicken, Cornish hens, dove, duck, ostrich, pheasant, and quail, goose, and turkey"

eggs = "chicken eggs, duck eggs, quail eggs, ostrich eggs"

others_seafood = "anchovy, light tuna"

oily_fish = "herring, pilchards, salmon, sardines, sprats, trout, mackerel"

white_fish = "Cod, black sea bass, catfish, crawfish, haddock, hake, plaice, perch, squid, pollock, coley, dab, flounder, red mullet, sole, gurnard, and tilapia, whiting"

shellfish = "crab, clams, lobster, oyster, prawns, shrimp, mussels, scallops, squid, and langoustine"

nuts_seeds_soy = "tree nuts, peanuts, nut butter, peanut butter, chia seeds, flax seeds, pumpkin seeds, sesame seeds, and sunflower seeds, seed butter, sesame butter, tahini butter, sunflower butter, tofu, tempeh, soy flour"

In [193]:
# Remove the word 'and', split string by comma
red_meat = red_meat.replace(' and ', '').split(",")
# Remove whitespaces at start and end of each food item
red_meat = [x.strip() for x in red_meat]
print(red_meat)

['beef', 'goat', 'lamb', 'pork', 'bear', 'bison', 'deer', 'elk', 'moose', 'opossum', 'rabbit', 'raccoon', 'squirrel']


In [194]:
# Remove the word 'and', split string by comma
poultry = poultry.replace(' and ', '').split(",")
# Remove whitespaces at start and end of each food item
poultry = [x.strip() for x in poultry]
print(poultry)

['chicken', 'Cornish hens', 'dove', 'duck', 'ostrich', 'pheasant', 'quail', 'goose', 'turkey']


In [195]:
# Remove the word 'and', split string by comma
eggs = eggs.replace(' and ', '').split(",")
# Remove whitespaces at start and end of each food item
eggs = [x.strip() for x in eggs]
print(eggs)

['chicken eggs', 'duck eggs', 'quail eggs', 'ostrich eggs']


In [196]:
# Remove the word 'and', split string by comma
oily_fish = oily_fish.replace(' and ', '').split(",")
# Remove whitespaces at start and end of each food item
oily_fish = [x.strip() for x in oily_fish]
print(oily_fish)

['herring', 'pilchards', 'salmon', 'sardines', 'sprats', 'trout', 'mackerel']


In [197]:
# Remove the word 'and', split string by comma
white_fish = white_fish.replace(' and ', '').split(",")
# Remove whitespaces at start and end of each food item
white_fish = [x.strip() for x in white_fish]
print(white_fish)

['Cod', 'black sea bass', 'catfish', 'crawfish', 'haddock', 'hake', 'plaice', 'perch', 'squid', 'pollock', 'coley', 'dab', 'flounder', 'red mullet', 'sole', 'gurnard', 'tilapia', 'whiting']


In [198]:
# Remove the word 'and', split string by comma
shellfish = shellfish.replace(' and ', '').split(",")
# Remove whitespaces at start and end of each food item
shellfish = [x.strip() for x in shellfish]
print(shellfish)

['crab', 'clams', 'lobster', 'oyster', 'prawns', 'shrimp', 'mussels', 'scallops', 'squid', 'langoustine']


In [199]:
# Remove the word 'and', split string by comma
nuts_seeds_soy = nuts_seeds_soy.replace(' and ', '').split(",")
# Remove whitespaces at start and end of each food item
nuts_seeds_soy = [x.strip() for x in nuts_seeds_soy]
print(nuts_seeds_soy)

['tree nuts', 'peanuts', 'nut butter', 'peanut butter', 'chia seeds', 'flax seeds', 'pumpkin seeds', 'sesame seeds', 'sunflower seeds', 'seed butter', 'sesame butter', 'tahini butter', 'sunflower butter', 'tofu', 'tempeh', 'soy flour']


In [200]:
# Remove the word 'and', split string by comma
others_seafood = others_seafood.replace(' and ', '').split(",")
# Remove whitespaces at start and end of each food item
others_seafood = [x.strip() for x in others_seafood]
print(others_seafood)

['anchovy', 'light tuna']


In [201]:
# Create red meat dataframe

red_meat_df = pd.DataFrame(columns=cols)
red_meat_df['name'] = red_meat
red_meat_df['sub_group'] = 'Red Meat'
red_meat_df['food_group'] = 'Proteins'

red_meat_df.head()

Unnamed: 0,food_group,sub_group,name
0,Proteins,Red Meat,beef
1,Proteins,Red Meat,goat
2,Proteins,Red Meat,lamb
3,Proteins,Red Meat,pork
4,Proteins,Red Meat,bear


In [202]:
# Create poultry dataframe

poultry_df = pd.DataFrame(columns=cols)
poultry_df['name'] = poultry
poultry_df['sub_group'] = 'Poultry'
poultry_df['food_group'] = 'Proteins'

poultry_df.head()

Unnamed: 0,food_group,sub_group,name
0,Proteins,Poultry,chicken
1,Proteins,Poultry,Cornish hens
2,Proteins,Poultry,dove
3,Proteins,Poultry,duck
4,Proteins,Poultry,ostrich


In [203]:
# Create eggs dataframe

eggs_df = pd.DataFrame(columns=cols)
eggs_df['name'] = eggs
eggs_df['sub_group'] = 'Eggs'
eggs_df['food_group'] = 'Proteins'

eggs_df.head()

Unnamed: 0,food_group,sub_group,name
0,Proteins,Eggs,chicken eggs
1,Proteins,Eggs,duck eggs
2,Proteins,Eggs,quail eggs
3,Proteins,Eggs,ostrich eggs


In [204]:
# Create white fish dataframe

whitefish_df = pd.DataFrame(columns=cols)
whitefish_df['name'] = white_fish
whitefish_df['sub_group'] = 'Seafood White Fish'
whitefish_df['food_group'] = 'Proteins'

whitefish_df.head()

Unnamed: 0,food_group,sub_group,name
0,Proteins,Seafood White Fish,Cod
1,Proteins,Seafood White Fish,black sea bass
2,Proteins,Seafood White Fish,catfish
3,Proteins,Seafood White Fish,crawfish
4,Proteins,Seafood White Fish,haddock


In [205]:
# Create oily fish dataframe

oilyfish_df = pd.DataFrame(columns=cols)
oilyfish_df['name'] = oily_fish
oilyfish_df['sub_group'] = 'Seafood Oily Fish'
oilyfish_df['food_group'] = 'Proteins'

oilyfish_df.head()

Unnamed: 0,food_group,sub_group,name
0,Proteins,Seafood Oily Fish,herring
1,Proteins,Seafood Oily Fish,pilchards
2,Proteins,Seafood Oily Fish,salmon
3,Proteins,Seafood Oily Fish,sardines
4,Proteins,Seafood Oily Fish,sprats


In [207]:
# Create shellfish dataframe

shellfish_df = pd.DataFrame(columns=cols)
shellfish_df['name'] = shellfish
shellfish_df['sub_group'] = 'Seafood Shellfish'
shellfish_df['food_group'] = 'Proteins'

shellfish_df.head()

Unnamed: 0,food_group,sub_group,name
0,Proteins,Seafood Shellfish,crab
1,Proteins,Seafood Shellfish,clams
2,Proteins,Seafood Shellfish,lobster
3,Proteins,Seafood Shellfish,oyster
4,Proteins,Seafood Shellfish,prawns


In [208]:
# Create others seafood dataframe

others_seafood_df = pd.DataFrame(columns=cols)
others_seafood_df['name'] = others_dairy
others_seafood_df['sub_group'] = 'Seafood Others'
others_seafood_df['food_group'] = 'Proteins'

others_seafood_df.head()

Unnamed: 0,food_group,sub_group,name
0,Proteins,Seafood Others,whey
1,Proteins,Seafood Others,cultured milk
2,Proteins,Seafood Others,curds
3,Proteins,Seafood Others,curd whey


In [206]:
# Create nuts seeds soy dataframe

nuts_seeds_soy_df = pd.DataFrame(columns=cols)
nuts_seeds_soy_df['name'] = nuts_seeds_soy
nuts_seeds_soy_df['sub_group'] = 'Nuts Seeds Soy'
nuts_seeds_soy_df['food_group'] = 'Proteins'

nuts_seeds_soy_df.head()

Unnamed: 0,food_group,sub_group,name
0,Proteins,Nuts Seeds Soy,tree nuts
1,Proteins,Nuts Seeds Soy,peanuts
2,Proteins,Nuts Seeds Soy,nut butter
3,Proteins,Nuts Seeds Soy,peanut butter
4,Proteins,Nuts Seeds Soy,chia seeds


In [209]:
# Concat all protein dataframes into single dataframe
proteins_df_list = [red_meat_df, poultry_df, eggs_df, whitefish_df, oilyfish_df, shellfish_df, others_seafood_df, nuts_seeds_soy_df]
rows_count = [len(x) for x in proteins_df_list]

proteins_df = pd.concat(proteins_df_list)

# Get sum of rows to ensure the dataframe concatenated correctly
print("Sum of rows across all dataframes: ", sum(rows_count))
print(len(proteins_df.index))

Sum of rows across all dataframes:  81
81


In [210]:
proteins_df.to_pickle('proteins_dataset.pkl')

## Exporting Data as .txt For NLU

In [212]:
all_vegetables = veg_df['name'].tolist()
all_fruits = fruit_df['name'].tolist()
all_grains = grains_df['name'].tolist()
all_dairy = dairy_df['name'].tolist()
all_proteins = proteins_df['name'].tolist()

In [220]:
# Write all food items to .txt files in yml format
def write_to_textfile(list_to_convert, file_name):

    with open(f"{file_name}.txt", 'w') as f:
        # File header
        f.write("nlu:\n" + 
                f"- lookup: {file_name}\n" +
                "  examples: |\n")
        # Write food items
        for item in list_to_convert:
            f.write(f"    - {item}\n")
    
    print(f"{file_name}.txt has been saved.")


In [222]:
write_to_textfile(all_vegetables, "vegetables")
write_to_textfile(all_fruits, "fruits")
write_to_textfile(all_grains, "grains")
write_to_textfile(all_dairy, "dairy")
write_to_textfile(all_proteins, "proteins")

vegetables.txt has been saved.
fruits.txt has been saved.
grains.txt has been saved.
dairy.txt has been saved.
proteins.txt has been saved.


In [1]:
import tensorflow as tf

2021-07-17 21:29:33.295157: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1


In [2]:
tf.config.list_physical_devices('GPU')

2021-07-17 21:30:14.985212: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcuda.so.1
2021-07-17 21:30:15.010274: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:982] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-07-17 21:30:15.010868: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1716] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: Quadro P4000 computeCapability: 6.1
coreClock: 1.48GHz coreCount: 14 deviceMemorySize: 7.93GiB deviceMemoryBandwidth: 226.62GiB/s
2021-07-17 21:30:15.010889: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1
2021-07-17 21:30:15.013886: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcublas.so.10
2021-07-17 21:30:15.015863: I tensorflow/stream_executor/platform/defaul

[]

In [1]:
import pandas as pd

In [43]:
dairy_df = pd.read_pickle('../data/datasets/dairy_dataset.pkl')
dairy_df.head()

Unnamed: 0,food_group,sub_group,name
0,Dairy,Healthy Milk,milk
1,Dairy,Healthy Milk,soy milk
2,Dairy,Healthy Milk,buttermilk
3,Dairy,Healthy,yogurt
4,Dairy,Healthy,kefir


In [7]:
print(dairy_df)

   food_group       sub_group             name
0       Dairy    Healthy Milk             milk
1       Dairy    Healthy Milk         soy milk
2       Dairy    Healthy Milk       buttermilk
3       Dairy    Healthy Milk           yogurt
4       Dairy    Healthy Milk            kefir
5       Dairy    Healthy Milk    frozen yogurt
6       Dairy    Healthy Milk  evaporated milk
7       Dairy    Healthy Milk      milk powder
8       Dairy    Healthy Milk   condensed milk
9       Dairy    Healthy Milk       whole milk
0       Dairy  Healthy Cheese           cheese
1       Dairy  Healthy Cheese             brie
2       Dairy  Healthy Cheese        camembert
3       Dairy  Healthy Cheese          cheddar
4       Dairy  Healthy Cheese   cottage cheese
5       Dairy  Healthy Cheese            colby
6       Dairy  Healthy Cheese             edam
7       Dairy  Healthy Cheese             feta
8       Dairy  Healthy Cheese          fontina
9       Dairy  Healthy Cheese     goats cheese
10      Dairy

In [3]:
fruit = pd.read_pickle('fruit_dataset.pkl')
fruit.head()

Unnamed: 0,food_group,sub_group,name
0,Fruits,,Asian pears
1,Fruits,,bananas
2,Fruits,,blackberries
3,Fruits,,blueberries
4,Fruits,,cranberries


In [4]:
grains = pd.read_pickle('grains_dataset.pkl')
grains.head()

Unnamed: 0,food_group,sub_group,name
0,Grains,Whole Grains,amaranth
1,Grains,Whole Grains,hulled barley
2,Grains,Whole Grains,brown rice
3,Grains,Whole Grains,buckwheat
4,Grains,Whole Grains,bulgur


In [5]:
proteins = pd.read_pickle('proteins_dataset.pkl')
proteins.head()

Unnamed: 0,food_group,sub_group,name
0,Proteins,Red Meat,beef
1,Proteins,Red Meat,goat
2,Proteins,Red Meat,lamb
3,Proteins,Red Meat,pork
4,Proteins,Red Meat,bear


In [6]:
veg = pd.read_pickle('veg_dataset.pkl')
veg.head()

Unnamed: 0,food_group,sub_group,name
0,vegetables,dark green,amaranth leaves
1,vegetables,dark green,basil
2,vegetables,dark green,beet greens
3,vegetables,dark green,bitter melon leaves
4,vegetables,dark green,bok choy


In [11]:
dairy[dairy['sub_group'].str.contains('milk', case=False) | dairy['name'].str.contains('milk', case=False)]

Unnamed: 0,food_group,sub_group,name
0,Dairy,Healthy Milk,milk
1,Dairy,Healthy Milk,soy milk
2,Dairy,Healthy Milk,buttermilk
3,Dairy,Healthy Milk,yogurt
4,Dairy,Healthy Milk,kefir
5,Dairy,Healthy Milk,frozen yogurt
6,Dairy,Healthy Milk,evaporated milk
7,Dairy,Healthy Milk,milk powder
8,Dairy,Healthy Milk,condensed milk
9,Dairy,Healthy Milk,whole milk


In [37]:
dairy.at[5, 'sub_group'] = 'Healthy'
dairy.at[3, 'sub_group'] = 'Healthy'
dairy.at[4, 'sub_group'] = 'Healthy'

In [39]:
dairy.at[43, 'sub_group'] = 'Healthy'
dairy.at[44, 'sub_group'] = 'Healthy'
dairy.at[45, 'sub_group'] = 'Healthy'
dairy.at[46, 'sub_group'] = 'Healthy'

In [35]:
dairy.reset_index(drop=True, inplace=True)

In [30]:
dairy.drop(index=[47,48,49], inplace=True)

In [40]:
print(dairy)

   food_group       sub_group             name
0       Dairy    Healthy Milk             milk
1       Dairy    Healthy Milk         soy milk
2       Dairy    Healthy Milk       buttermilk
3       Dairy         Healthy           yogurt
4       Dairy         Healthy            kefir
5       Dairy         Healthy    frozen yogurt
6       Dairy    Healthy Milk  evaporated milk
7       Dairy    Healthy Milk      milk powder
8       Dairy    Healthy Milk   condensed milk
9       Dairy    Healthy Milk       whole milk
10      Dairy  Healthy Cheese           cheese
11      Dairy  Healthy Cheese             brie
12      Dairy  Healthy Cheese        camembert
13      Dairy  Healthy Cheese          cheddar
14      Dairy  Healthy Cheese   cottage cheese
15      Dairy  Healthy Cheese            colby
16      Dairy  Healthy Cheese             edam
17      Dairy  Healthy Cheese             feta
18      Dairy  Healthy Cheese          fontina
19      Dairy  Healthy Cheese     goats cheese
20      Dairy

In [41]:
dairy.to_pickle('dairy_dataset.pkl')

In [48]:
dairy_df[(dairy_df['name'].str.contains('milk', case=False)
            | dairy_df['sub_group'].str.contains('milk', case=False))
                & ~dairy_df['sub_group'].str.contains('unhealthy', case=False)]

Unnamed: 0,food_group,sub_group,name
0,Dairy,Healthy Milk,milk
1,Dairy,Healthy Milk,soy milk
2,Dairy,Healthy Milk,buttermilk
6,Dairy,Healthy Milk,evaporated milk
7,Dairy,Healthy Milk,milk powder
8,Dairy,Healthy Milk,condensed milk
9,Dairy,Healthy Milk,whole milk
44,Dairy,Healthy,cultured milk


In [60]:
entity_fooditem = 'whey'
unhealthy_message = ''
df = None

for index, rows in dairy_df.iterrows():

    # If food item is unhealthy
    if (entity_fooditem in rows['name'].lower()
    and 'unhealthy' in rows['sub_group'].lower()):
        unhealthy_message = f"{entity_fooditem} is not a healthy source of dairy!"

    df = dairy_df[(dairy_df['name'].str.contains(entity_fooditem, case=False)
    | dairy_df['sub_group'].str.contains(entity_fooditem, case=False))
    & ~dairy_df['sub_group'].str.contains('unhealthy', case=False)] # Excludes 'unhealthy'

print(unhealthy_message)
print(df)


   food_group sub_group       name
43      Dairy   Healthy       whey
46      Dairy   Healthy  curd whey


In [91]:
entity_fooditem = 'ice-cream'
unhealthy = list(dairy_df.loc[dairy_df['name'].str.contains(entity_fooditem), 'sub_group'].str.lower())[0]

if unhealthy == 'unhealthy':
    print('yes')

yes


In [93]:
if list(dairy_df.loc[dairy_df['name'].str.contains(entity_fooditem), 'sub_group'].str.lower())[0] == 'unhealthy':
    print(f"{entity_fooditem} is not a healthy source of dairy!")

ice-cream is not a healthy source of dairy!


In [98]:
entity_fooditem = 'tomato'
sub_group = list(veg.loc[veg['name'].str.contains(entity_fooditem, case=False), 'sub_group'])[0]
df = veg[veg['sub_group'].str.contains(sub_group, case=False)]
print(df)

    food_group       sub_group                 name
0   vegetables  red and orange             calabaza
1   vegetables  red and orange              carrots
2   vegetables  red and orange    red chili peppers
3   vegetables  red and orange     red bell peppers
4   vegetables  red and orange  orange bell peppers
5   vegetables  red and orange     pimento/pimiento
6   vegetables  red and orange       sweet potatoes
7   vegetables  red and orange             tomatoes
8   vegetables  red and orange    100% tomato juice
9   vegetables  red and orange         acorn squash
10  vegetables  red and orange     butternut squash
11  vegetables  red and orange              kabocha
12  vegetables  red and orange              pumpkin


In [61]:
print(veg)

    food_group   sub_group                 name
0   vegetables  dark green      amaranth leaves
1   vegetables  dark green                basil
2   vegetables  dark green          beet greens
3   vegetables  dark green  bitter melon leaves
4   vegetables  dark green             bok choy
..         ...         ...                  ...
41  vegetables      others            snow peas
42  vegetables      others        summer squash
43  vegetables      others           tomatillos
44  vegetables      others              turnips
45  vegetables      others        winter melons

[123 rows x 3 columns]


In [99]:
print(grains)

   food_group       sub_group                  name
0      Grains    Whole Grains              amaranth
1      Grains    Whole Grains         hulled barley
2      Grains    Whole Grains            brown rice
3      Grains    Whole Grains             buckwheat
4      Grains    Whole Grains                bulgur
5      Grains    Whole Grains                millet
6      Grains    Whole Grains                  oats
7      Grains    Whole Grains               popcorn
8      Grains    Whole Grains                quinoa
9      Grains    Whole Grains              dark rye
10     Grains    Whole Grains             triticale
11     Grains    Whole Grains  whole-grain cornmeal
12     Grains    Whole Grains     whole-wheat bread
13     Grains    Whole Grains   whole-wheat chapati
14     Grains    Whole Grains   whole-grain cereals
15     Grains    Whole Grains  whole-grain crackers
16     Grains    Whole Grains             wild rice
0      Grains  Refined Grains          white breads
1      Grain

In [102]:
sub_groups = ['whole-grains', 'whole grains', 'whole-grain', 'whole grain'
                        'refined-grains', 'refined grains', 'refined-grain', 'refined grain'
                        'processed-grains', 'processed grains', 'processed grain', 'processed-grain']

entity_fooditem = 'whole-grain'

for subgroup in sub_groups:

    # If asking for sub-group only
    if subgroup in entity_fooditem:
        df = grains[('sub_group')].str.contains(entity_fooditem, case=False)
    
    # If asking if specific grain food is part of a subgroup
    else:

        sub_group = list(grains.loc[grains['name'].str.contains(entity_fooditem, case=False), 'sub_group'])[0]
        df = grains[grains['sub_group'].str.contains(sub_group, case=False)]

print(df)

AttributeError: 'str' object has no attribute 'str'

In [1]:
import pandas as pd

In [4]:
proteins_df = pd.read_pickle('../data/datasets/proteins_dataset.pkl')

In [6]:
print(proteins_df['sub_group'].unique())

['Red Meat' 'Poultry' 'Eggs' 'Seafood White Fish' 'Seafood Oily Fish'
 'Seafood Shellfish' 'Seafood Others' 'Nuts Seeds Soy']


In [33]:
sub_groups = ['red meat', 'poultry', 'seafood', 'white fish', 'oily fish', 'shellfish', 'nuts', 'seeds', 'soy', 'seafood']

In [16]:
entity_fooditem = "shellfish"

if (entity_fooditem == "red meat" or entity_fooditem == "poultry" 
or entity_fooditem == "nuts" or entity_fooditem == "seeds" or entity_fooditem == "soy"):

    df = proteins_df[proteins_df[('sub_group')].str.contains(entity_fooditem, case=False)]

elif (entity_fooditem == "shellfish" or entity_fooditem == "white fish"
or entity_fooditem == "oily fish" or  entity_fooditem == "shellfish"):

    df = proteins_df[proteins_df[('sub_group')].str.contains('seafood', case=False)]

In [43]:
entity_fooditem = "seeds"

for sg in sub_groups:
    if sg in entity_fooditem:
        df = proteins_df[proteins_df[('sub_group')].str.contains(entity_fooditem, case=False)]
        print(entity_fooditem)

seeds


In [44]:
print(df)

   food_group       sub_group              name
0    Proteins  Nuts Seeds Soy         tree nuts
1    Proteins  Nuts Seeds Soy           peanuts
2    Proteins  Nuts Seeds Soy        nut butter
3    Proteins  Nuts Seeds Soy     peanut butter
4    Proteins  Nuts Seeds Soy        chia seeds
5    Proteins  Nuts Seeds Soy        flax seeds
6    Proteins  Nuts Seeds Soy     pumpkin seeds
7    Proteins  Nuts Seeds Soy      sesame seeds
8    Proteins  Nuts Seeds Soy   sunflower seeds
9    Proteins  Nuts Seeds Soy       seed butter
10   Proteins  Nuts Seeds Soy     sesame butter
11   Proteins  Nuts Seeds Soy     tahini butter
12   Proteins  Nuts Seeds Soy  sunflower butter
13   Proteins  Nuts Seeds Soy              tofu
14   Proteins  Nuts Seeds Soy            tempeh
15   Proteins  Nuts Seeds Soy         soy flour


In [45]:
entity_fooditem = 'beef'

sub_group = list(proteins_df.loc[proteins_df['name'].str.contains(entity_fooditem, case=False), 'sub_group'])[0]
df = proteins_df[proteins_df['sub_group'].str.contains(sub_group, case=False)]

print(df)

   food_group sub_group      name
0    Proteins  Red Meat      beef
1    Proteins  Red Meat      goat
2    Proteins  Red Meat      lamb
3    Proteins  Red Meat      pork
4    Proteins  Red Meat      bear
5    Proteins  Red Meat     bison
6    Proteins  Red Meat      deer
7    Proteins  Red Meat       elk
8    Proteins  Red Meat     moose
9    Proteins  Red Meat   opossum
10   Proteins  Red Meat    rabbit
11   Proteins  Red Meat   raccoon
12   Proteins  Red Meat  squirrel


In [46]:
fruit_df = pd.read_pickle('../data/datasets/fruit_dataset.pkl')

In [55]:
print(fruit_df)

   food_group sub_group               name
0      Fruits      None        Asian pears
1      Fruits      None            bananas
2      Fruits   Berries       blackberries
3      Fruits   Berries        blueberries
4      Fruits   Berries        cranberries
5      Fruits      None           currants
6      Fruits   Berries         dewberries
7      Fruits   Berries      huckleberries
8      Fruits      None          kiwifruit
9      Fruits   Berries       loganberries
10     Fruits   Berries         mulberries
11     Fruits   Berries        raspberries
12     Fruits   Berries       strawberries
13     Fruits      None         calamondin
14     Fruits      None         grapefruit
15     Fruits      None           kumquats
16     Fruits      None             lemons
17     Fruits      None              limes
18     Fruits      None   mandarin oranges
19     Fruits      None            pomelos
20     Fruits      None         tangerines
21     Fruits      None           tangelos
22     Frui

In [81]:
fruit_df['sub_group'] = fruit_df['sub_group'].fillna('Others')

In [83]:
fruit_df.at[0, 'sub_group'] = 'Others'
fruit_df.at[1, 'sub_group'] = 'Others'

In [85]:
fruit_df.at[2, 'sub_group'] = 'Berries'
fruit_df.at[3, 'sub_group'] = 'Berries'
fruit_df.at[4, 'sub_group'] = 'Berries'
fruit_df.at[5, 'sub_group'] = 'Berries'
fruit_df.at[6, 'sub_group'] = 'Berries'
fruit_df.at[7, 'sub_group'] = 'Berries'
fruit_df.at[8, 'sub_group'] = 'Berries'
fruit_df.at[9, 'sub_group'] = 'Berries'
fruit_df.at[10, 'sub_group'] = 'Berries'
fruit_df.at[11, 'sub_group'] = 'Berries'
fruit_df.at[12, 'sub_group'] = 'Berries'

In [87]:
fruit_df.at[13, 'sub_group'] = 'Citrus'
fruit_df.at[14, 'sub_group'] = 'Citrus'
fruit_df.at[15, 'sub_group'] = 'Citrus'
fruit_df.at[16, 'sub_group'] = 'Citrus'
fruit_df.at[17, 'sub_group'] = 'Citrus'
fruit_df.at[18, 'sub_group'] = 'Citrus'
fruit_df.at[19, 'sub_group'] = 'Citrus'
fruit_df.at[20, 'sub_group'] = 'Citrus'
fruit_df.at[21, 'sub_group'] = 'Citrus'

In [89]:
fruit_df.at[30, 'sub_group'] = 'Melons'
fruit_df.at[31, 'sub_group'] = 'Melons'
fruit_df.at[32, 'sub_group'] = 'Melons'
fruit_df.at[33, 'sub_group'] = 'Melons'

In [90]:
print(fruit_df)

   food_group sub_group                       name
0      Fruits    Others                Asian pears
1      Fruits    Others             bananas/banana
2      Fruits   Berries    blackberries/blackberry
3      Fruits   Berries      blueberries/blueberry
4      Fruits   Berries      cranberries/cranberry
5      Fruits   Berries                   currants
6      Fruits   Berries        dewberries/dewberry
7      Fruits   Berries  huckleberries/huckleberry
8      Fruits   Berries                  kiwifruit
9      Fruits   Berries    loganberries/loganberry
10     Fruits   Berries        mulberries/mulberry
11     Fruits   Berries      raspberries/raspberry
12     Fruits   Berries    strawberries/strawberry
13     Fruits    Citrus                 calamondin
14     Fruits    Citrus                 grapefruit
15     Fruits    Citrus                   kumquats
16     Fruits    Citrus                     lemons
17     Fruits    Citrus                      limes
18     Fruits    Citrus        

In [75]:
fruit_df = fruit_df.append({'food_group': 'Fruits', 'sub_group': 'Others', 'name': 'apricots'}, ignore_index=True)

In [None]:
fruit_df.to_pickle('fruit_dataset.pkl')