In [1]:
import pandas as pd
import json
import unicodedata
import transformers
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
df = pd.read_csv("cleanedData1.csv")

In [3]:
def get_unique_ingredient_name_list(df):
    ingredients_set = set()
    for index, row in df.iterrows():
        ingredients = json.loads(row["ingredients"])
        for ingredient in ingredients:
            if "ingredient" in ingredient:
                ingredients_set.add(ingredient["ingredient"].lower())

    return list(ingredients_set)

In [4]:
unique_ingredients = get_unique_ingredient_name_list(df)

In [5]:
print(len(unique_ingredients))
unique_ingredients[:25]

41423


['',
 'butternut squash, halved lengthwise and seeded',
 'mint, for garnish',
 'french vanilla cake mix (such as duncan hines®)',
 'prepared angel food cake, cut into cubes',
 'eggs, separated, divided',
 'sea salt, or more to taste',
 'spicy ranch-style seasoning mix',
 'dry lentils, rinsed and drained',
 'fresh sage, bruised',
 'hot yellow chile peppers, finely chopped (optional)',
 'orange-flavored liqueur, or to taste',
 'bottled salsa',
 'pico de gallo salsa, drained',
 'chopped italian flat leaf parsley',
 'country ham slices, diced',
 'sambal oelek',
 'carrots, peeled and cut into matchsticks',
 'vegan pie crust',
 'apple juice, in a spray bottle (optional)',
 'hard-boiled egg, chopped',
 'roasted red pepper hummus, such as athenos®',
 'sliced green bell peppers',
 'pimientos, drained and diced',
 'refrigerated biscuit dough, baked according to package instructions']

In [15]:
vectorizer = CountVectorizer(analyzer="word", ngram_range=(1, 2))
X = vectorizer.fit_transform(unique_ingredients)
feature_names = vectorizer.get_feature_names_out()

In [16]:
print(len(feature_names))
print(feature_names[-100:])

35895
['zealand' 'zealand leg' 'zero' 'zero calorie' 'zest' 'zest and'
 'zest curl' 'zest cut' 'zest divided' 'zest finely' 'zest for'
 'zest from' 'zest grated' 'zest if' 'zest minced' 'zest of' 'zest only'
 'zest optional' 'zest or' 'zest oranges' 'zest plus' 'zest preferably'
 'zested' 'zested and' 'zested divided' 'zested in' 'zested into'
 'zested optional' 'zested or' 'zested peeled' 'zested then' 'zesty'
 'zesty italian' 'zesty sour' 'zesty style' 'zinfandel' 'zinfandel or'
 'zinfandel wine' 'zing' 'zing zang' 'zinger' 'zinger tea' 'ziti'
 'ziti or' 'ziti pasta' 'ziyad' 'ziyad cut' 'zucchini'
 'zucchini alternating' 'zucchini and' 'zucchini blossoms'
 'zucchini chopped' 'zucchini chunks' 'zucchini coarsely' 'zucchini cubed'
 'zucchini cut' 'zucchini diced' 'zucchini ends' 'zucchini excess'
 'zucchini finely' 'zucchini frozen' 'zucchini grated' 'zucchini halved'
 'zucchini inch' 'zucchini julienned' 'zucchini lightly'
 'zucchini matchsticks' 'zucchini noodles' 'zucchini or' 'zucc

In [19]:
print(X)

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 358669 stored elements and shape (41423, 35895)>
  Coords	Values
  (1, 4990)	1
  (1, 30576)	1
  (1, 15296)	1
  (1, 18399)	1
  (1, 821)	1
  (1, 28063)	1
  (1, 4993)	1
  (1, 30590)	1
  (1, 15323)	1
  (1, 18401)	1
  (1, 1111)	1
  (2, 20284)	1
  (2, 12738)	1
  (2, 14034)	1
  (2, 20298)	1
  (2, 12779)	1
  (3, 13014)	1
  (3, 34010)	1
  (3, 5113)	1
  (3, 20360)	1
  (3, 31369)	1
  (3, 1518)	1
  (3, 11191)	1
  (3, 15780)	1
  (3, 13038)	1
  :	:
  (41419, 23584)	1
  (41419, 23585)	1
  (41419, 29166)	1
  (41419, 1133)	1
  (41419, 20932)	1
  (41419, 20937)	1
  (41420, 21863)	1
  (41420, 28631)	1
  (41420, 6078)	1
  (41420, 25434)	1
  (41420, 28742)	1
  (41420, 20617)	1
  (41420, 20621)	1
  (41420, 22165)	1
  (41420, 25436)	1
  (41421, 10222)	1
  (41421, 20733)	1
  (41421, 5014)	1
  (41421, 5015)	1
  (41421, 10249)	1
  (41422, 6325)	1
  (41422, 31004)	1
  (41422, 6410)	1
  (41422, 31542)	1
  (41422, 31545)	1


In [22]:
X.nonzero()

(array([    1,     1,     1, ..., 41422, 41422, 41422],
       shape=(358669,), dtype=int32),
 array([  821,  1111,  4990, ..., 31004, 31542, 31545],
       shape=(358669,), dtype=int32))

In [25]:
non_z = X.nonzero()

ing_num = 1
ing_inds = []
ing_inds_tmp = []
for i, ing in enumerate(non_z[0]):
    if ing == ing_num:
        ing_inds_tmp.append(non_z[1][i])
    else:
        ing_inds.append(ing_inds_tmp)
        ing_inds_tmp = []
        ing_inds_tmp.append(non_z[1][i])
        ing_num = ing

if len(ing_inds_tmp) > 0:
    ing_inds.append(ing_inds_tmp)

print(len(ing_inds))

41410


In [30]:
for i in range(115, 120):
    print(f"index {i}: ")
    for ind in ing_inds[i]:
        print(feature_names[ind])

index 115: 
and
and greens
fresh
fresh herbs
greens
greens lettuce
herbs
herbs and
lettuce
lettuce mix
mix
prewashed
prewashed fresh
index 116: 
halved
halved lengthwise
hearts
hearts of
lengthwise
lettuce
lettuce halved
of
of romaine
romaine
romaine lettuce
index 117: 
aluminum
aluminum foil
duty
duty aluminum
foil
heavy
heavy duty
index 118: 
as
as farmer
farmer
farmer john
john
link
link sausages
pork
pork link
sausages
sausages such
such
such as
index 119: 
bacon
bacon cut
cut
cut bacon
cut into
inch
inch pieces
into
into inch
pieces
thick
thick cut


In [34]:
from datasets import load_dataset

ds = load_dataset("Scuccorese/food-ingredients-dataset")

README.md:   0%|          | 0.00/407 [00:00<?, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/367M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/476M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/484M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/6676 [00:00<?, ? examples/s]

In [35]:
print(ds)

DatasetDict({
    train: Dataset({
        features: ['category', 'subcategory', 'ingredient', 'image'],
        num_rows: 6676
    })
})


In [36]:
ing_ds = ds["train"].remove_columns("image")

In [39]:
ing_ds = pd.DataFrame(ing_ds)

In [41]:
ing_ds = ing_ds.drop_duplicates()

In [43]:
ing_cat_dict = dict()
for _, row in ing_ds.iterrows():
    ing_cat_dict[row["ingredient"]] = row["category"]

In [49]:
for i in range(100):
    for ind in ing_inds[i]:
        if feature_names[ind] in ing_cat_dict.keys():
            print(f"index {i}: ")
            print(feature_names[ind])

index 5: 
sea salt
index 10: 
orange
index 25: 
celery
index 33: 
beef
index 35: 
orange
index 36: 
orange
index 37: 
chicken
index 38: 
rice
index 38: 
white rice
index 42: 
pineapple
index 44: 
lemon
index 46: 
crab
index 49: 
pasta
index 49: 
rotini
index 50: 
lemon
index 56: 
snow peas
index 62: 
garlic
index 66: 
polenta
index 69: 
onion
index 72: 
garlic
index 78: 
cherry
index 80: 
spinach
index 81: 
chicken
index 84: 
peach
index 87: 
lemon
index 94: 
pork
index 96: 
carrot
index 97: 
green olives
