In [32]:
from models import spacy_base_en
from ner_core import NER

In [33]:
# Wrap the small English pipeline in the project's NER helper,
# constructed with use_ruler=False.
ner = NER(
	spacy_base_en("small"),
	use_ruler=False,
)

In [34]:
from utils import re2pat
import json

# Load the sample texts: a dict mapping each recipe source to a list of passages.
# JSON is UTF-8 by specification and these passages contain non-ASCII
# characters (e.g. ’ and ½), so the encoding is stated explicitly instead of
# relying on the platform's default text encoding.
with open("sample_texts.json", encoding="utf-8") as f:
	texts = json.load(f)

print(texts)

{'archanas-kitchen': ["The Onion Vada Sambar Recipe is a simple healthy and nutritious recipe of the urad dal vada that is pan fried in a paniyaram pan.  In this recipe, I have made the Vada’s healthy by using the Kuzhi Paniyaram pan as opposed to the traditional method of deep frying the vada's.", "Serve the delicious Onion Vada's with a Mixed Vegetable Sambar topped with finely chopped onions.", 'To begin making the Onion Vada Sambar Recipe;  we will first soak the dal for about 3 hours.\n\nUsing a strainer, strain all the excess water from the urad dal. We need to use very little water to grind the urad dal, so that you can shape them well if you are deep frying.\n\nNext we will grind the dal, along with ginger and green chillies using a food processor or a mixer grinder into a very smooth batter. Make sure add very little water while grinding the dal to make a smooth batter. Transfer the batter to a bowl. \n\nAdd in the chopped onions, coriander leaves into the batter. Stir well to

### spaCy default pipeline for English - small

In [35]:
# Entities found with the small pipeline, keyed by recipe source.
small = {}

# Load the pipeline once and share it across sources. The original reloaded it
# after every source just to reset `ner`, and its first iteration used whatever
# `ner` the most recently executed cell had left behind, so re-running this
# cell after a later section evaluated the first source with the wrong model.
# NOTE(review): assumes NER(..., use_ruler=False) does not modify the pipeline
# object it is given — confirm in ner_core.
shared_nlp = spacy_base_en("small")

for key, passages in texts.items():
	print(key)
	# Fresh NER per source so each source starts from an empty result
	# (the original reset `ner` per source, presumably because `entities`
	# accumulates across eval() calls).
	ner = NER(shared_nlp, use_ruler=False)
	for text in passages:
		ner.eval(text)
	print(ner.entities)
	print(len(ner.entities))
	small[key] = ner.entities

# Leave an unused NER bound to `ner`, matching the state the original cell left.
ner = NER(shared_nlp, use_ruler=False)

archanas-kitchen
{'Vada': 'PERSON', 'first': 'ORDINAL', 'about 3 hours': 'TIME', 'the Kuzhi Paniyaram Pan': 'ORG', 'a couple minutes': 'TIME', '2': 'CARDINAL', '3': 'CARDINAL', 'Vada Sambar': 'PERSON', 'Vengaya Sambar': 'FAC'}
9
bharatz-kitchen
{'biryani': 'PERSON', 'first': 'ORDINAL', 'i.e': 'NORP', '1': 'CARDINAL', '30': 'CARDINAL', 'Gms Salt': 'ORG', '1.5 Tbsp\n\n\n1': 'QUANTITY', 'RECIPE': 'ORG', '400': 'CARDINAL', '16': 'CARDINAL', 'Spice Mix': 'PERSON', '1.5': 'CARDINAL', '2': 'CARDINAL', 'Salt': 'PERSON', 'Tbsp Ginger Garlic Paste': 'PERSON', 'Mint Paste': 'ORG', 'Juice': 'ORG', '1/3': 'CARDINAL', 'Milk Mix': 'PERSON', 'Biryani Masala Mix': 'PERSON', '350': 'CARDINAL', 'Gms Basmati Rice': 'PRODUCT', '2 Cinnamon Sticks': 'PERSON', '4': 'CARDINAL', 'Kali Mirch Sabut': 'ORG', 'Cubeb': 'ORG', 'Laung': 'GPE', 'Seeds': 'PERSON', 'Shahi Jeera': 'PERSON', '60': 'CARDINAL', '40': 'CARDINAL', '1-Inch': 'QUANTITY', '5': 'CARDINAL', 'MIX': 'ORG', '1/4': 'CARDINAL', 'Saffron': 'ORG', 'BIRYAN

### Insights from spaCy's base en_small model
- Quantities & Durations of time are well-understood
	- Besides using NOUN–VERB dependencies to identify food items and cooking methods in the later POS/dependency-parsing step, we could also exploit DURATION–VERB or QUANTITY–NOUN (ADVERB–NOUN) relationships to identify them more reliably (to be tested).
- Food items, even those of Indian origin, are sometimes recognised as named entities, but whether they are caught varies greatly with how they are capitalised (TODO: manually record the % caught in the example texts)
	- Most commonly as PERSON, sometimes as ORG, and rarely as GPE or PRODUCT



### spaCy default pipeline for English - Large
`$ python -m spacy download en_core_web_lg`

In [36]:
# Load the large pipeline once. The original reloaded it once per source and
# once more after the last source, only to obtain an empty NER each time.
# NOTE(review): assumes NER(..., use_ruler=False) does not modify the pipeline
# object it is given — confirm in ner_core.
shared_nlp = spacy_base_en("large")

# Entities found with the large pipeline, keyed by recipe source.
lg = {}

for key, passages in texts.items():
	print(key)
	# Fresh NER per source so each source starts from an empty result.
	ner = NER(shared_nlp, use_ruler=False)
	for text in passages:
		ner.eval(text)
	print(ner.entities)
	print(len(ner.entities))
	lg[key] = ner.entities

# Leave an unused NER bound to `ner`, matching the state the original cell left.
ner = NER(shared_nlp, use_ruler=False)

archanas-kitchen
{'Vada Sambar Recipe': 'PERSON', 'the urad dal vada': 'ORG', 'paniyaram pan': 'ORG', 'Vada': 'ORG', 'Kuzhi Paniyaram': 'ORG', 'vada': 'GPE', "Onion Vada's": 'ORG', 'Sambar': 'PERSON', 'first': 'ORDINAL', 'about 3 hours': 'TIME', 'the Kuzhi Paniyaram Pan': 'ORG', 'a couple minutes': 'TIME', 'toor dal': 'ORG', '2': 'CARDINAL', '3 to 4': 'CARDINAL', 'tadka': 'GPE', 'Vada Sambar': 'PERSON'}
17
bharatz-kitchen
{'biryani': 'PERSON', 'gol gappe': 'PERSON', 'first': 'ORDINAL', 'Chicken Biryani': 'PERSON', '1': 'CARDINAL', '1.5 Tbsp': 'MONEY', 'Gms Chicken': 'PERSON', '16': 'CARDINAL', '1.5 tsp': 'QUANTITY', 'Red Chili Powder': 'FAC', 'Salt': 'GPE', '½ tsp': 'QUANTITY', 'Turmeric / Haldi Powder': 'WORK_OF_ART', 'Ginger Garlic Paste': 'PERSON', '2': 'CARDINAL', 'Curd': 'PERSON', 'Saffron': 'ORG', 'Biryani Masala Mix\r\n\r\n': 'PERSON', 'Bhaap': 'PERSON', '350': 'CARDINAL', '400': 'CARDINAL', 'Gms Basmati Rice': 'PERSON', 'Cinnamon Sticks': 'PERSON', 'Daal Chini': 'ORG', '4': 'CA

### spaCy default pipeline for English - Transformer (RoBERTa)

`$ python -m spacy download en_core_web_trf`

In [37]:
# Print the installed spaCy version so the results in this notebook can be
# tied to a specific release (3.7.2 in the recorded run).
import spacy
print(spacy.__version__)

3.7.2


In [38]:
import spacy_transformers
import spacy_curated_transformers

# Load the transformer pipeline once. The original reloaded it once per source
# and once more after the last source, only to obtain an empty NER each time.
# NOTE(review): assumes NER(..., use_ruler=False) does not modify the pipeline
# object it is given — confirm in ner_core.
shared_nlp = spacy_base_en("transformer")

# Entities found with the transformer pipeline, keyed by recipe source.
trf = {}

for key, passages in texts.items():
	print(key)
	# Fresh NER per source so each source starts from an empty result.
	ner = NER(shared_nlp, use_ruler=False)
	for text in passages:
		ner.eval(text)
	print(ner.entities)
	print(len(ner.entities))
	trf[key] = ner.entities

# Leave an unused NER bound to `ner`, matching the state the original cell left.
ner = NER(shared_nlp, use_ruler=False)

archanas-kitchen
{'The Onion Vada Sambar Recipe': 'WORK_OF_ART', 'Kuzhi Paniyaram pan': 'PRODUCT', 'first': 'ORDINAL', 'about 3 hours': 'TIME', 'the Kuzhi Paniyaram Pan': 'PRODUCT', 'a couple minutes': 'TIME', 'the Mini Onion Vadas': 'PRODUCT', '2 cups': 'QUANTITY', '3 to 4 whistles': 'QUANTITY', '2 whistles': 'QUANTITY', 'the Onion Vada Sambar': 'PRODUCT'}
11
bharatz-kitchen
{'few long weeks': 'DATE', 'first': 'ORDINAL', ':\n\n': 'QUANTITY', '1 Liter': 'QUANTITY', '30 Gms': 'QUANTITY', '1.5 Tbsp': 'QUANTITY', '1 Tbsp': 'QUANTITY', '400 Gms': 'QUANTITY', '16': 'CARDINAL', '1.5 tsp': 'QUANTITY', '2 tsp': 'QUANTITY', '½ tsp': 'QUANTITY', '2': 'CARDINAL', '1/3 Cup': 'QUANTITY', '350 – 400  ': 'QUANTITY', '4': 'CARDINAL', '60 Gms': 'QUANTITY', '40': 'CARDINAL', '1-Inch': 'QUANTITY', '5': 'CARDINAL', '1/8 Cup': 'QUANTITY', '1/4 Cup': 'QUANTITY', '2 Tbsp': 'QUANTITY'}
23
zayka-ka-tadka
{'moong dal idli': 'PRODUCT', '1 cup': 'QUANTITY', '2 tablespoon': 'QUANTITY', '1/2': 'CARDINAL', '1 teaspo

### Trained model

In [39]:
import spacy

# Load the trained pipeline from "model-last" once. The original re-read it
# from disk once per source and once more after the last source, only to
# obtain an empty NER each time.
# NOTE(review): assumes NER(..., use_ruler=False) does not modify the pipeline
# object it is given — confirm in ner_core.
shared_nlp = spacy.load("model-last")

# Entities found with the trained pipeline, keyed by recipe source.
trained = {}

for key, passages in texts.items():
	print(key)
	# Fresh NER per source so each source starts from an empty result.
	ner = NER(shared_nlp, use_ruler=False)
	for text in passages:
		ner.eval(text)
	print(ner.entities)
	print(len(ner.entities))
	trained[key] = ner.entities

# Leave an unused NER bound to `ner`, matching the state the original cell left.
ner = NER(shared_nlp, use_ruler=False)

archanas-kitchen
{'Onion': 'DISH', 'Sambar': 'UTENSIL', 'simple': 'UTENSIL', 'urad dal': 'INGREDIENT', 'paniyaram pan': 'UTENSIL', 'Kuzhi Paniyaram pan': 'UTENSIL', 'vada': 'INGREDIENT', "'s": 'METHOD', 'Mixed': 'UTENSIL', 'onions.': 'INGREDIENT', ';': 'INGREDIENT', 'soak': 'METHOD', 'dal': 'UTENSIL', 'strainer': 'UTENSIL', 'urad dal.': 'INGREDIENT', 'ginger': 'INGREDIENT', 'green chillies': 'INGREDIENT', 'batter': 'DISH', 'add': 'METHOD', 'batter.': 'INGREDIENT', 'Add': 'METHOD', 'onions': 'INGREDIENT', 'coriander leaves': 'INGREDIENT', 'Stir': 'METHOD', 'Kuzhi Paniyaram': 'UTENSIL', 'Pan': 'METHOD', 'pan': 'UTENSIL', 'Cover': 'METHOD', 'couple minutes of': 'TIME', 'onion': 'INGREDIENT', 'oil': 'INGREDIENT', 'flip': 'METHOD', 'platter': 'DISH', '2': 'MEAUREMENT', '3 to 4 whistles and': 'TIME', 'pressure': 'DISH', 'mash': 'METHOD', 'toor dal': 'INGREDIENT', 'pressure cooker': 'UTENSIL', 'water': 'INGREDIENT', 'radish': 'METHOD', 'carrot': 'INGREDIENT', 'sambar powder': 'INGREDIENT', 's