## Importing

In [1]:
import pandas as pd
import csv

df = pd.read_csv('training_data_w_ingredients.csv', sep=';', error_bad_lines=False)
df.iloc[0]

Unnamed: 0                                                     1
index                                                          1
name                                         Vietnamese Pho Soup
ingredients    ['8 cups beef broth', '4 cups water', '1 yello...
cuisine                                               Vietnamese
Name: 0, dtype: object

In [2]:
print("Document Count")
print(df.groupby('cuisine')['ingredients'].count())

Document Count
cuisine
Chinese       60
Greek         50
Iberic        40
Italian       50
Thai          50
Vietnamese    50
french        50
korean        50
Name: ingredients, dtype: int64


# Data Exploration

## General look up
The goal is to get a first impression of the most common words.

In [3]:
from collections import Counter
df['ingredients'] = df['ingredients']
Counter(" ".join(df["ingredients"]).split()).most_common(5)

[("'1", 1635),
 ("'2", 789),
 ('cup', 729),
 ('teaspoon', 685),
 ('tablespoons', 605)]

None of these words are relevant for our further analysis of the Data.

Let's have a look at the typical count of ingredients.

In [4]:
df['ingredients'].str.split().str.len().head()

0    100
1     57
2     59
3     44
4     70
Name: ingredients, dtype: int64

## Let's make it more intresting
With scattertextplot we can visualize the words and the frequenz of them

In [5]:
##import
import string
%matplotlib inline
import scattertext as st
import re, io
from pprint import pprint
import pandas as pd
import numpy as np
from scipy.stats import rankdata, hmean, norm
import spacy
import os, pkgutil, json, urllib
from urllib.request import urlopen
from IPython.display import IFrame
from IPython.core.display import display, HTML
from scattertext import CorpusFromPandas, produce_scattertext_explorer

In [15]:
# error handling https://stackoverflow.com/questions/66149878/e053-could-not-read-config-cfg-resumeparser
nlp = spacy.load('en_core_web_sm')

In [16]:
df.head()

Unnamed: 0.1,Unnamed: 0,index,name,ingredients,cuisine,cleanText,cleanTextList,parsed
0,1,1,Vietnamese Pho Soup,"['8 cups beef broth', '4 cups water', '1 yello...",Vietnamese,beef broth water yellow onion cloves garlic ro...,"[beef, broth, water, yellow, onion, cloves, ga...","(beef, broth, water, yellow, onion, cloves, ga..."
1,2,2,Vietnamese Chicken Meatballs,"['1 pound ground chicken', '3 tablespoons fish...",Vietnamese,chicken fish sauce onion cloves garlic lemongr...,"[chicken, fish, sauce, onion, cloves, garlic, ...","(chicken, fish, sauce, onion, cloves, garlic, ..."
2,3,3,Vietnamese Restaurant-Style Grilled Lemongrass...,"['1 pound pork blade steaks (boneless, about 1...",Vietnamese,pork blade steaks boneless light brown sugar g...,"[pork, blade, steaks, boneless, light, brown, ...","(pork, blade, steaks, boneless, light, brown, ..."
3,4,4,Vietnamese Caramel Chicken,"['1 pound chicken thighs (with skin, deboned)'...",Vietnamese,chicken thighs skin deboned oil cloves garlic ...,"[chicken, thighs, skin, deboned, oil, cloves, ...","(chicken, thighs, skin, deboned, oil, cloves, ..."
4,5,5,Cheater Vietnamese Pho (Pho Bo),"['3 marrow bones (beef bone)', '50 ounces beef...",Vietnamese,marrow bones beef bone beef water onion half g...,"[marrow, bones, beef, bone, beef, water, onion...","(marrow, bones, beef, bone, beef, water, onion..."


## Words to Remove 
This code and these word lists are copied over as is.

In [17]:
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

#Source for list below
#https://en.wikipedia.org/wiki/Cooking_weights_and_measures
#https://thebakingpan.com/ingredient-weights-and-measures/
measures=['ounces','tablespoons','litrbes','liter','millilitres','mL','grams','g', 'kg','teaspoon','tsp', 'tablespoon','tbsp','fluid', 'ounce','oz','fl.oz', 'cup','pint','pt','quart','qt','gallon','gal','smidgen','drop','pinch','dash','scruple','dessertspoon','teacup','cup','cups','c','pottle','gill','dram','wineglass','coffeespoon','pound','pounded','lb','tbsp','plus','firmly', 'packed','lightly','level','even','rounded','heaping','heaped','sifted','bushel','peck','stick','chopped','sliced','halves', 'shredded','slivered','sliced','whole','paste','whole',' fresh', 'peeled', 'diced','mashed','dried','frozen','fresh','peeled','candied','no', 'pulp','crystallized','canned','crushed','minced','julienned','clove','head', 'small','large','medium', 'torn', 'cleaned', 'degree']

#measures = [lemmatizer.lemmatize(m) for m in measures]
#some of these include data leakage words, like 'italian' - ok to remove after including bigrams
data_leaks = ['iberic','greek', 'korean','italianstyle', 'french','thai', 'chinese', 'mexican','spanish','indian','italian']

common_remove=['ground','to','taste', 'and', 'or',  'can',  'into', 'cut', 'grated', 'leaf','package','finely','divided','a','piece','optional','inch','needed','more','drained','for','flake','dry','thinly','cubed','bunch','cube','slice','pod','beaten','seeded','uncooked','root','plain','heavy','halved','crumbled','sweet','with','hot','room','temperature','trimmed','allpurpose','deveined','bulk','seasoning','jar','food','if','bag','mix','in','each','roll','instant','double','such','frying','thawed','whipping','stock','rinsed','mild','sprig','freshly','toasted','link','boiling','cooked','unsalted','container',
'cooking','thin','lengthwise','warm','softened','thick','quartered','juiced','pitted','chunk','melted','cold','coloring','puree','cored','stewed','floret','coarsely','the','blanched','zested','sweetened','powdered','garnish','dressing','soup','at','active','lean','chip','sour','long','ripe','skinned','fillet','from','stem','flaked','removed','stalk','unsweetened','cover','crust', 'extra', 'prepared', 'blend', 'of', 'ring',  'undrained', 'about', 'zest', ' ', '', 'spray', 'round', 'herb', 'seasoned', 'wedge', 'bitesize', 'broken', 'square', 'freshly', 'thickly', 'diagonally']
#common_remove = [lemmatizer.lemmatize(c) for c in common_remove]
#data_leaks = [lemmatizer.lemmatize(d) for d in data_leaks]
# due to using bigrams not including 
useless_singles=['','black','white','red','yellow','seed','breast','confectioner','sundried','broth','bell','baby','juice','crumb','sauce','condensed','smoked','basmati','extravirgin','brown','clarified', 'soy', 'filling', 'pine', 'virgin', 'romano', 'heart', 'shell', 'thigh', 'boneless','skinless','split', 'dark', 'wheat', 'light', 'green', 'vegetable', 'curry', 'orange', 'garam', 'sesame', 'strip', 'sea', 'canola', 'mustard','powder', 'ice', 'bay', 'roasted', 'loaf', 'roast', 'powder']
useless_singles = [lemmatizer.lemmatize(u) for u in useless_singles]

[nltk_data] Downloading package wordnet to /home/l/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Clean up

In [18]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.corpus import stopwords
import re
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer() 

def preprocess(sentence):
    sentence=str(sentence)
    sentence = sentence.lower()
    sentence=sentence.replace('{html}',"") 
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', sentence)
    rem_url=re.sub(r'http\S+', '',cleantext)
    rem_num = re.sub('[0-9]+', '', rem_url)
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(rem_num)
    filtered_words = [w for w in tokens if len(w) > 2 if not w in stopwords.words('english')]
    filtered_words = [w for w in filtered_words if not w in measures]
    filtered_words = [w for w in filtered_words if not w in data_leaks]
    filtered_words = [w for w in filtered_words if not w in common_remove]
    stem_words=[stemmer.stem(w) for w in filtered_words]
    lemma_words=[lemmatizer.lemmatize(w) for w in stem_words]

    return " ".join(filtered_words)


# Apply to the DF series
df['cleanText']=df['ingredients'].map(lambda s:preprocess(s)) 
df['cleanTextList']=df['ingredients'].map(lambda s:preprocess(s).split()) 
df['parsed'] = df.cleanText.apply(nlp)
df.sample(10)
df.head(5)

Unnamed: 0.1,Unnamed: 0,index,name,ingredients,cuisine,cleanText,cleanTextList,parsed
0,1,1,Vietnamese Pho Soup,"['8 cups beef broth', '4 cups water', '1 yello...",Vietnamese,beef broth water yellow onion cloves garlic ro...,"[beef, broth, water, yellow, onion, cloves, ga...","(beef, broth, water, yellow, onion, cloves, ga..."
1,2,2,Vietnamese Chicken Meatballs,"['1 pound ground chicken', '3 tablespoons fish...",Vietnamese,chicken fish sauce onion cloves garlic lemongr...,"[chicken, fish, sauce, onion, cloves, garlic, ...","(chicken, fish, sauce, onion, cloves, garlic, ..."
2,3,3,Vietnamese Restaurant-Style Grilled Lemongrass...,"['1 pound pork blade steaks (boneless, about 1...",Vietnamese,pork blade steaks boneless light brown sugar g...,"[pork, blade, steaks, boneless, light, brown, ...","(pork, blade, steaks, boneless, light, brown, ..."
3,4,4,Vietnamese Caramel Chicken,"['1 pound chicken thighs (with skin, deboned)'...",Vietnamese,chicken thighs skin deboned oil cloves garlic ...,"[chicken, thighs, skin, deboned, oil, cloves, ...","(chicken, thighs, skin, deboned, oil, cloves, ..."
4,5,5,Cheater Vietnamese Pho (Pho Bo),"['3 marrow bones (beef bone)', '50 ounces beef...",Vietnamese,marrow bones beef bone beef water onion half g...,"[marrow, bones, beef, bone, beef, water, onion...","(marrow, bones, beef, bone, beef, water, onion..."


In [19]:
#only our cuisines
new_df = df.loc[(df['cuisine'] == 'Chinese') | (df['cuisine'] == 'Iberic')]
new_df.sample(10)

Unnamed: 0.1,Unnamed: 0,index,name,ingredients,cuisine,cleanText,cleanTextList,parsed
376,377,377,Portuguese_Scallops,"['1 pound sea scallops (fresh)', '1 teaspoon s...",Iberic,sea scallops salt black pepper olive oil port ...,"[sea, scallops, salt, black, pepper, olive, oi...","(sea, scallops, salt, black, pepper, olive, oi..."
373,374,374,Portuguese_Grilled_Pork_Ribs,"['4 pork rib (strips)', '500 milliliters white...",Iberic,pork rib strips milliliters white wine cloves ...,"[pork, rib, strips, milliliters, white, wine, ...","(pork, rib, strips, milliliters, white, wine, ..."
392,393,393,Spanish_Rice,"['1 cup long grain rice', '1 pound lean ground...",Iberic,grain rice beef onion vegetable oil bacon drip...,"[grain, rice, beef, onion, vegetable, oil, bac...","(grain, rice, beef, onion, vegetable, oil, bac..."
302,303,303,Chinese_Fried_Rice,"['2 tablespoons butter (divided)', '2 eggs (be...",Chinese,butter eggs yellow onion carrots peas cloves g...,"[butter, eggs, yellow, onion, carrots, peas, c...","(butter, eggs, yellow, onion, carrots, peas, c..."
372,373,373,Portuguese_Bacalhau,['1 pound salted cod (soaked in cold water for...,Iberic,salted cod soaked water hours potatoes white y...,"[salted, cod, soaked, water, hours, potatoes, ...","(salted, cod, soaked, water, hours, potatoes, ..."
334,335,335,Sesame_Chinese_Chicken_with_Rice,"['4 skinless chicken breasts (boneless)', 'sal...",Chinese,skinless chicken breasts boneless salt pepper ...,"[skinless, chicken, breasts, boneless, salt, p...","(skinless, chicken, breasts, boneless, salt, p..."
348,349,349,Chinese_Green_Beans_with_Ground_Turkey,"['1 cup medium grain rice (uncooked)', '1 tabl...",Chinese,grain rice sesame oil green onions cloves garl...,"[grain, rice, sesame, oil, green, onions, clov...","(grain, rice, sesame, oil, green, onions, clov..."
324,325,325,Chinese_Chicken_and_Broccoli,"['1 head broccoli (cut into florets)', '3 tabl...",Chinese,broccoli florets vegetable oil boneless chicke...,"[broccoli, florets, vegetable, oil, boneless, ...","(broccoli, florets, vegetable, oil, boneless, ..."
394,395,395,Spanish_Omelette,"['1 pound potato (/ 3 medium potatoes, washed ...",Iberic,potato potatoes washed eggs onion olive oil to...,"[potato, potatoes, washed, eggs, onion, olive,...","(potato, potatoes, washed, eggs, onion, olive,..."
349,350,350,Chinese_Chicken_Stir_Fry,"['1 teaspoon chili oil', '2 cloves garlic (min...",Chinese,chili oil cloves garlic ginger spring onions w...,"[chili, oil, cloves, garlic, ginger, spring, o...","(chili, oil, cloves, garlic, ginger, spring, o..."


In [20]:
corpus = st.CorpusFromParsedDocuments(new_df, category_col='cuisine', parsed_col='parsed').build()

print(corpus)

<scattertext.ParsedCorpus.ParsedCorpus object at 0x7f53d8e0abe0>


In [21]:
html = produce_scattertext_explorer(corpus,
                                    category='Chinese',
                                    category_name='Chinese',
                                    not_category_name='Iberic',
                                    width_in_pixels=1000,
                                    minimum_term_frequency=5,
                                    transform=st.Scalers.scale,
                                    metadata=new_df['name'])
file_name = 'output/ChineseIbericCusineScattertextScale.html'
open(file_name, 'wb').write(html.encode('utf-8'))
IFrame(src=file_name, width = 1200, height=700)

TypeError: scale() takes from 1 to 2 positional arguments but 3 were given

In [None]:
new_df.dtypes

In [None]:
#bigram function based on spacy
def bigram(doc):
    # create a list for the result
    result = list()
    # create a list that contains no punctuation
    sentence = list()
    # parse through the document to add all tokens that are words to the sentence list
    for token in doc:
        if token.is_alpha:
            sentence.append(token)
    # parse through the sentence while adding words in groups of two to the result
    for word in range(len(sentence) - 1):
        first_word = sentence[word]
        second_word = sentence[word + 1]
        element = [first_word.text, second_word.text]
        result.append(element)

    return result

In [None]:
def generate_bigrams(text):
    # load English model
    nlp = spacy.load('en')

    # create a document
    doc = nlp(text)

    result = bigram(doc)
    bigrams = ''
    for element in result:
        joined_string = " ".join(element)
        js = joined_string + ","
        bigrams = bigrams + js
        
    
    return bigrams
    

new_df['bigrams'] = new_df['cleanText'].apply(generate_bigrams)

In [None]:
new_df.head(5)

## Absolute number of ingredients

In [None]:
new_df['num_ing'] = [len(x.split()) for x in new_df['cleanText'].tolist()]

In [None]:
Total = new_df['num_ing'].sum()
Total

## Relative number of different ingredients

In [None]:
new_df['rel_ing'] = new_df['num_ing'].div(Total)
new_df.head()

## Top10 most common ingredients

In [None]:
Counter(" ".join(new_df["cleanText"]).split()).most_common(10)

In [None]:
#colors: 

from sklearn.feature_extraction.text import CountVectorizer

import matplotlib.pyplot as plt 
import pandas as pd

def get_top_n_words(corpus, n=None):
    vec = CountVectorizer().fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]
common_words = get_top_n_words(new_df['cleanText'], 10)

df1 = pd.DataFrame(common_words, columns = ['word' , 'count'])

df1.head(10)

In [None]:
plt.figure()
df1.plot.bar(x='word',y='count', color=['coral'], alpha=0.8,  fontsize=20, figsize=(20, 8),edgecolor = "grey", grid= True)
plt.xlabel('Words', fontsize=40)
plt.ylabel('Frequenz', fontsize=40)
plt.show()

## Top10 most frequent ingredient bigrams

In [None]:
def get_top_n_bigram(corpus, n=None):
    vec = CountVectorizer(ngram_range=(2, 2)).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]
common_words = get_top_n_bigram(df['cleanText'], 10)

df2 = pd.DataFrame(common_words, columns = ['words' , 'count'])

df2.head(10)


In [None]:
plt.figure()
df2.plot.bar(x='words',y='count', color=['coral'], alpha=0.8,  fontsize=20, figsize=(20, 8),edgecolor = "grey", grid= True)
plt.xlabel('Words', fontsize=40)
plt.ylabel('Frequenz', fontsize=40)
plt.show()

In [None]:
print(new_df['parsed'].iloc[0])



In [None]:
import itertools
from tqdm import tqdm

a_list= new_df['cleanTextList']
# a_list = ['hund','katze', 'maus']

pairs = []

for i in a_list:
    for pair in itertools.combinations(i,2):
        pairs.append(pair)

print(pairs[0:10])

    
from collections import Counter
Counter(elem[0:2] for elem in pairs).most_common(20)
