# Identify macro-categories of ingredients

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Read the ingredient dictionary (ingredients_v_10k_balanced.csv) into a DataFrame
# and show its (rows, columns) shape as a quick sanity check.
ingredients_csv = './dataset/ingredients_v_10k_balanced.csv'
df = pd.read_csv(ingredients_csv)
df.shape

(31852, 1)

In [3]:
# Count the missing entries in each column
df.isna().sum()

0    1
dtype: int64

In [4]:
# Preview the first rows of the raw table (the very first row is the empty entry)
df.head()

Unnamed: 0,0
0,
1,oasted cauliflower
2,minced jalapeno taste
3,mixed mushrooms minced cup
4,cheese mix


In [5]:
# Discard every row that contains a missing value, then re-check the shape
df = df.dropna(axis='index', how='any')
df.shape

(31851, 1)

In [6]:
# Label the single column 'Ingredient' and make sure every value in it is a
# Python string, in one chained expression that yields a fresh DataFrame.
df = (
    df
    .set_axis(['Ingredient'], axis='columns')
    .astype({'Ingredient': str})
)

# Summarise index, column dtype and non-null count after the clean-up
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 31851 entries, 1 to 31851
Data columns (total 1 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Ingredient  31851 non-null  object
dtypes: object(1)
memory usage: 497.7+ KB


In [7]:
# Preview the cleaned table: rows now start at index label 1 and the column is 'Ingredient'
df.head()

Unnamed: 0,Ingredient
1,oasted cauliflower
2,minced jalapeno taste
3,mixed mushrooms minced cup
4,cheese mix
5,cooked shrimp salt pepper


In [8]:
# Spot-check the row at position 25: its ingredient is one word repeated twice
df.iloc[25]

Ingredient    mayonnaise mayonnaise
Name: 26, dtype: object

In [9]:
# Collapse ingredients that are one word written twice (e.g. "mayonnaise mayonnaise")
# down to that single word; every other ingredient is left untouched.

# Whitespace-tokenise each ingredient; the token lists live in a local Series,
# so no helper column has to be added to df and dropped again afterwards.
tokens = df['Ingredient'].str.split()

# True only for rows with exactly two tokens that are equal to each other
mask = tokens.str.len().eq(2) & tokens.str[0].eq(tokens.str[1])

# Overwrite just the flagged rows with their first token
df.loc[mask, 'Ingredient'] = tokens[mask].str[0]

In [10]:
# Re-inspect the row at position 25 to confirm the repeated word was collapsed
df.iloc[25]

Ingredient    mayonnaise
Name: 26, dtype: object

## Definition of Main Categories

In [12]:
# Main categories used in this notebook, listed in alphabetical order.
Categories = [
    "Beverages",
    "Cereal Grains and Pasta",
    "Dairy",
    "Egg Products",
    "Fish",
    "Fruits",
    "Legumes",
    "Meat",
    "Nuts and Seeds",
    "Spices and Herbs",
    "Vegetables",
]

In [13]:
# Load the FoodData Central category table and preview its first rows
category_csv = './dataset/food_category.csv'
df_category = pd.read_csv(category_csv)
df_category.head()

Unnamed: 0,id,code,description
0,1,100,Dairy and Egg Products
1,2,200,Spices and Herbs
2,3,300,Baby Foods
3,4,400,Fats and Oils
4,5,500,Poultry Products


## Assign Ingredients to Categories

## Test the Procedure