# Data loading

In [1]:
import numpy as np
import pandas as pd

import os
print(os.listdir('./data/'))

['beer_pairing (ENG).xlsx', 'recipes_82k.csv', 'test.json', 'test.json.zip', 'train.json', 'train.json.zip', 'Wine_pairing.xlsx']


In [7]:
df = pd.concat([pd.read_json('./data/train/train.json'), 
                pd.read_json('./data/test/test.json')]).reset_index()

# Lower-casing all ingredients' name.
df['ingredients'] = df['ingredients'].apply(lambda ings : [ing.lower() for ing in ings])

df.head()

Unnamed: 0,index,id,cuisine,ingredients
0,0,10259,greek,"[romaine lettuce, black olives, grape tomatoes..."
1,1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g..."
2,2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,3,22213,indian,"[water, vegetable oil, wheat, salt]"
4,4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe..."


# Calculating ingredient co-occurrences.

Simply calculating the number of recipes in which two ingredients occurred together.

Using itertools

<strong>permutations</strong> : 순열 구하기

<strong>combinations</strong> : 조합 구하기

In [8]:
import itertools

# Test
list(itertools.combinations(df['ingredients'][0][:6], 2))

[('romaine lettuce', 'black olives'),
 ('romaine lettuce', 'grape tomatoes'),
 ('romaine lettuce', 'garlic'),
 ('romaine lettuce', 'pepper'),
 ('romaine lettuce', 'purple onion'),
 ('black olives', 'grape tomatoes'),
 ('black olives', 'garlic'),
 ('black olives', 'pepper'),
 ('black olives', 'purple onion'),
 ('grape tomatoes', 'garlic'),
 ('grape tomatoes', 'pepper'),
 ('grape tomatoes', 'purple onion'),
 ('garlic', 'pepper'),
 ('garlic', 'purple onion'),
 ('pepper', 'purple onion')]

In [9]:
# counting

from collections import Counter

cooc_counts = Counter()
ing_counts = Counter()
for ingredients in df['ingredients']:
    for ing in ingredients:
        ing_counts[ing] += 1    # a ingredient counting
    for (ing_x, ing_y) in itertools.combinations(set(ingredients), 2):
        # add only in a consitent order (x, y)
        if ing_x > ing_y:
            ing_x, ing_y = ing_y, ing_x
        cooc_counts[(ing_x, ing_y)] += 1    # co-occurrences counting

In [13]:
# dataframing

cooc_df = pd.DataFrame(((ing_x, ing_y, ing_counts[ing_x], ing_counts[ing_y], cooc) for (ing_x, ing_y), cooc in cooc_counts.items()), columns = ['ing_x', 'ing_y', 'x_count', 'y_count', 'cooc'])

cooc_df.sample(10)

Unnamed: 0,ing_x,ing_y,x_count,y_count,cooc
5995,fresh lemon juice,large shrimp,2105,517,50
189677,garlic cloves,soba,7772,40,10
229166,green chile,swiss chard,384,134,1
41847,fresh herbs,milk,61,2851,5
413698,baby spinach,low sodium vegetable stock,330,8,1
314306,dark soy sauce,pork rind,377,8,1
421337,egg substitute,egg yolks,87,681,1
77256,green bell pepper,parsley sprigs,1471,176,7
339751,lemon pepper,yellow corn meal,43,430,2
90953,canned tomatoes,dried rosemary,94,209,5


In [14]:
# Pick up Example

cooc_df[cooc_df['ing_x'] == 'pork'].sort_values('cooc', ascending = False).head(8)

Unnamed: 0,ing_x,ing_y,x_count,y_count,cooc
566,pork,salt,367,22534,199
7225,pork,soy sauce,367,4120,114
5144,pork,water,367,9293,107
5131,pork,sugar,367,8064,89
51115,pork,sesame oil,367,2183,55
72121,pork,vegetable oil,367,5516,50
29023,pork,scallions,367,2372,43
73420,pork,shrimp,367,1145,32


In [15]:
# Pick up Example

cooc_df[cooc_df['ing_x'] == 'chillies'].sort_values('cooc', ascending = False).head(8)

Unnamed: 0,ing_x,ing_y,x_count,y_count,cooc
3137,chillies,salt,148,22534,67
3126,chillies,onions,148,10008,63
3125,chillies,garlic,148,9171,59
3130,chillies,ginger,148,2190,47
58906,chillies,garam masala,148,1179,37
19369,chillies,vegetable oil,148,5516,36
3124,chillies,water,148,9293,32
3133,chillies,tomatoes,148,3812,31


Most of the highest co-occurrence counts  are usually popular ingredients.

Simple co-occurrences don't get us the imformation that we actually want, the valid co-occurrence

# Point-wise Mutual Information


공식 : PMI(A,B)=logP(A,B) / (P(A)×P(B))

이 PMI는 x와 y가 독립적으로 발생했다고 가정했을 때의 x,y의 동시 발생 확률과, 측정된 x와 y의 동시발생 확률을 비교함으로써 두 변수가 얼마만큼의 상관도를 가지는지 판단한다.

P(X) = X occurrence counting / number of all recipes

P(X, Y) = X, Y co-occurrences / sum of all co-occurrences


Descriptions : https://www.slideshare.net/RetrieverJo/pmi-twitter-57723391

In [16]:
cooc_df.head()

Unnamed: 0,ing_x,ing_y,x_count,y_count,cooc
0,garbanzo beans,seasoning,181,176,1
1,feta cheese crumbles,seasoning,476,176,3
2,pepper,seasoning,5508,176,25
3,romaine lettuce,seasoning,350,176,3
4,purple onion,seasoning,2372,176,7


In [17]:
ing_counts

Counter({'romaine lettuce': 350,
         'black olives': 283,
         'grape tomatoes': 276,
         'garlic': 9171,
         'pepper': 5508,
         'purple onion': 2372,
         'seasoning': 176,
         'garbanzo beans': 181,
         'feta cheese crumbles': 476,
         'plain flour': 206,
         'ground pepper': 500,
         'salt': 22534,
         'tomatoes': 3812,
         'ground black pepper': 5990,
         'thyme': 467,
         'eggs': 4262,
         'green tomatoes': 125,
         'yellow corn meal': 430,
         'milk': 2851,
         'vegetable oil': 5516,
         'mayonaise': 983,
         'cooking oil': 605,
         'green chilies': 964,
         'grilled chicken breasts': 6,
         'garlic powder': 1785,
         'yellow onion': 1487,
         'soy sauce': 4120,
         'butter': 6078,
         'chicken livers': 78,
         'water': 9293,
         'wheat': 35,
         'black pepper': 3291,
         'shallots': 1837,
         'cornflour': 129,
       

In [18]:
sum(ing_counts.values())

535670

In [19]:
len(ing_counts.values())

7126

In [22]:
# origin - y : len으로 나눔

px = cooc_df['x_count'] / sum(ing_counts.values())
py = cooc_df['y_count'] / (len(ing_counts.values()))    # 여기가 왜 len일까 내가 붙여넣음.. 그래야 되는 거 아닌가?

pxy = cooc_df['cooc'] / cooc_df['cooc'].sum()

cooc_df['PMI'] = np.log(pxy / (px * py))

cooc_df.head()

Unnamed: 0,ing_x,ing_y,x_count,y_count,cooc,PMI
0,garbanzo beans,seasoning,181,176,1,-3.255466
1,feta cheese crumbles,seasoning,476,176,3,-3.123774
2,pepper,seasoning,5508,176,25,-3.45205
3,romaine lettuce,seasoning,350,176,3,-2.81629
4,purple onion,seasoning,2372,176,7,-3.882547


In [24]:
cooc_df.sort_values('PMI', ascending = False).head(10)

Unnamed: 0,ing_x,ing_y,x_count,y_count,cooc,PMI
473897,chocolate extract,chocolate graham crackers,1,1,1,7.113515
275199,ragu classic alfredo sauce,ragu golden veggie fettuccine pasta,1,1,1,7.113515
171257,johnsonville andouille fully cooked sausage,klondike rose red skin potato,1,1,1,7.113515
435670,snow crab,spot prawns,1,1,1,7.113515
252710,dried oysters,wood mushrooms,1,1,1,7.113515
163025,conimex wok olie,conimex woksaus specials vietnamese gember kno...,1,1,1,7.113515
208234,brownie layer,chocolate ice cream mix,1,1,1,7.113515
484430,cooked cut green beans,wish-bone deluxe french dressing,1,1,1,7.113515
200525,dumpling dough,red vinegar,1,1,1,7.113515
460860,bermuda onion,snip fresh dill,1,1,1,7.113515


need min_count