In [1]:
import json
import pandas as pd 
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from tqdm import tqdm
import re
import networkx as nx
from collections import Counter
import time
# Ignore matplotlib warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
# read the train.json file as data
j_file = open('train.json',)
train_data = json.load(j_file)
#train_data

In [3]:
# read in the data from the dictionary into pandas dataframe in one line of code
# set index to the id column 
df_train = pd.DataFrame.from_dict(train_data)
df_train = df_train.set_index('id')
df_train.head()

Unnamed: 0_level_0,cuisine,ingredients
id,Unnamed: 1_level_1,Unnamed: 2_level_1
10259,greek,"[romaine lettuce, black olives, grape tomatoes..."
25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g..."
20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
22213,indian,"[water, vegetable oil, wheat, salt]"
13162,indian,"[black pepper, shallots, cornflour, cayenne pe..."


In [4]:
#List of all ingredients: list of lists
nested_list_ings = list(df_train.ingredients)

In [5]:
# convert a list of lists to one list 
flat_list_ings =[item for sublist in nested_list_ings for item in sublist]

In [6]:
#len(flat_list_ings)

428275

In [7]:
# stop word search
stop_words_all = [w for w in flat_list_ings if re.search(r'\b ?salt|pepper|sauce|water|seasoning|powder?\b', w)]
#len(stop_words_all)

85323

In [8]:
#convert list to set 
stop_words_set = set(stop_words_all)

In [9]:
#add new column which do not contain stop words
df_train['ingredients_nostopw'] = df_train['ingredients'].apply(lambda x : list(set(x)-stop_words_set))

In [10]:
#df_train.shape

(39774, 3)

In [11]:
#df_train.head()

Unnamed: 0_level_0,cuisine,ingredients,ingredients_nostopw
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10259,greek,"[romaine lettuce, black olives, grape tomatoes...","[black olives, garbanzo beans, garlic, romaine..."
25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g...","[eggs, yellow corn meal, green tomatoes, thyme..."
20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g...","[eggs, yellow onion, cooking oil, grilled chic..."
22213,indian,"[water, vegetable oil, wheat, salt]","[vegetable oil, wheat]"
13162,indian,"[black pepper, shallots, cornflour, cayenne pe...","[onions, ground cumin, natural yogurt, boneles..."


In [12]:
#new dataframe with removing list with single items on ingredients_nostopw column
df_train_ext1 = df_train[df_train['ingredients_nostopw'].apply(lambda x: len(x) > 1)]

In [13]:
df_train_ext1.shape

(39633, 3)

In [14]:
#start = time.time()
# from the list of ingredients in a column in given dataframe, this piece will
# return a dictionary of tuples and their weight depending on their presence in a given cuisine.
dic_ings = {}
for i in range(len(df_train_ext1)):
    ing_rows = df_train_ext1['ingredients_nostopw'].iloc[i]
    for ing in ing_rows:
        for ing2 in ing_rows:
            if ing != ing2:
                tup_ings = (ing,ing2)
                if tup_ings in dic_ings.keys():
                    dic_ings[tup_ings] += 1
                else:
                    dic_ings[tup_ings]  = 1
#print(f'{time.time()-start} seconds')

In [15]:
#dic_ings

{('black olives', 'garbanzo beans'): 2,
 ('black olives', 'garlic'): 55,
 ('black olives', 'romaine lettuce'): 7,
 ('black olives', 'grape tomatoes'): 4,
 ('black olives', 'purple onion'): 34,
 ('black olives', 'feta cheese crumbles'): 12,
 ('garbanzo beans', 'black olives'): 2,
 ('garbanzo beans', 'garlic'): 54,
 ('garbanzo beans', 'romaine lettuce'): 5,
 ('garbanzo beans', 'grape tomatoes'): 2,
 ('garbanzo beans', 'purple onion'): 16,
 ('garbanzo beans', 'feta cheese crumbles'): 6,
 ('garlic', 'black olives'): 55,
 ('garlic', 'garbanzo beans'): 54,
 ('garlic', 'romaine lettuce'): 48,
 ('garlic', 'grape tomatoes'): 49,
 ('garlic', 'purple onion'): 399,
 ('garlic', 'feta cheese crumbles'): 58,
 ('romaine lettuce', 'black olives'): 7,
 ('romaine lettuce', 'garbanzo beans'): 5,
 ('romaine lettuce', 'garlic'): 48,
 ('romaine lettuce', 'grape tomatoes'): 16,
 ('romaine lettuce', 'purple onion'): 74,
 ('romaine lettuce', 'feta cheese crumbles'): 20,
 ('grape tomatoes', 'black olives'): 4,
 

In [16]:
df_train_dict1 = pd.DataFrame.from_dict(dic_ings, orient = 'index').reset_index()
df_train_dict1.head()

Unnamed: 0,index,0
0,"(black olives, garbanzo beans)",2
1,"(black olives, garlic)",55
2,"(black olives, romaine lettuce)",7
3,"(black olives, grape tomatoes)",4
4,"(black olives, purple onion)",34


In [17]:
df_train_dict1['Source']= df_train_dict1['index'].apply(lambda x: str(x).split(',')[0].replace('(',''))

In [18]:
df_train_dict1['Target']= df_train_dict1['index'].apply(lambda x: str(x).split(',')[1].replace(')',''))

In [19]:
#df_train_dict1.head()

In [20]:
df_train_dict1 = df_train_dict1.rename(columns = {0: 'Weight'})

In [21]:
#df_train_dict1.head()

In [22]:
df_train_dict1['Source'] = df_train_dict1['Source'].replace("\'",'', regex = True) 
df_train_dict1['Target'] = df_train_dict1['Target'].replace("\'",'', regex = True) 

In [23]:
df_train_dict1['Source'] = df_train_dict1['Source'].replace(r"^\s?",'', regex = True) 
df_train_dict1['Target'] = df_train_dict1['Target'].replace(r"^\s?",'', regex = True) 

In [24]:
df_train_dict1.head()

Unnamed: 0,index,Weight,Source,Target
0,"(black olives, garbanzo beans)",2,black olives,garbanzo beans
1,"(black olives, garlic)",55,black olives,garlic
2,"(black olives, romaine lettuce)",7,black olives,romaine lettuce
3,"(black olives, grape tomatoes)",4,black olives,grape tomatoes
4,"(black olives, purple onion)",34,black olives,purple onion


In [25]:
type(df_train_dict1['Source'][0])

str

In [26]:
df_train_final = df_train_dict1[['Source', 'Target', 'Weight']].copy() 

In [27]:
df_train_final.head()

Unnamed: 0,Source,Target,Weight
0,black olives,garbanzo beans,2
1,black olives,garlic,55
2,black olives,romaine lettuce,7
3,black olives,grape tomatoes,4
4,black olives,purple onion,34


In [28]:
len(df_train_final)

729862

In [29]:
df_train_final.Source

0                                  black olives
1                                  black olives
2                                  black olives
3                                  black olives
4                                  black olives
                          ...                  
729857                     toasted sesame seeds
729858                     toasted sesame seeds
729859                     Chinese rice vinegar
729860                       steamed white rice
729861    store bought low sodium chicken stock
Name: Source, Length: 729862, dtype: object

In [30]:
#unique source
uni_source = list(df_train_final.Source.unique())
len(uni_source)

6157

In [31]:
#unique target
uni_target = list(df_train_final.Target.unique())
len(uni_target)

6184

In [32]:
G_all = nx.from_pandas_edgelist(df_train_final,'Source', 'Target',["Weight"])

In [33]:
#print(nx.info(G_all))

Name: 
Type: Graph
Number of nodes: 6192
Number of edges: 364435
Average degree: 117.7116


In [34]:
#G_all.edges()

In [35]:
len(G_all.nodes())

6192

In [36]:
#G_all.nodes(data=True)

In [37]:
#G_all.edges(data = True)

In [38]:
all_node_list = [f for f, to, edata in G_all.edges(data = True)]
all_node_list_unique = list(set(all_node_list))
len(all_node_list_unique)

3696

In [39]:
selected_edges = [(u,v,e) for u,v,e in G_all.edges(data=True) if e['Weight'] == 150]
#print(selected_edges)

[('garlic', 'parsley', {'Weight': 150}), ('eggs', 'scallions', {'Weight': 150}), ('onions', 'fresh lemon juice', {'Weight': 150}), ('sugar', 'cinnamon sticks', {'Weight': 150}), ('olive oil', 'ground beef', {'Weight': 150}), ('chopped cilantro fresh', 'black beans', {'Weight': 150}), ('all-purpose flour', 'shortening', {'Weight': 150}), ('sesame seeds', 'rice vinegar', {'Weight': 150})]


In [40]:
def island_fun(G, weight):
    '''
    Returns a subgraph from a big graph based on the
    selected weight.
    '''
    G1 = nx.Graph()
    for f, to, edata in G.edges(data=True):
         if edata['Weight'] == weight:
                G1.add_edge(f,to, weight = edata['Weight'])
    return G1

In [41]:
island_fun(G_all, 150).edges(data = True)

EdgeDataView([('garlic', 'parsley', {'weight': 150}), ('eggs', 'scallions', {'weight': 150}), ('onions', 'fresh lemon juice', {'weight': 150}), ('sugar', 'cinnamon sticks', {'weight': 150}), ('olive oil', 'ground beef', {'weight': 150}), ('chopped cilantro fresh', 'black beans', {'weight': 150}), ('all-purpose flour', 'shortening', {'weight': 150}), ('sesame seeds', 'rice vinegar', {'weight': 150})])

In [42]:
# sub_graph100 = island_fun(G_all, 100)
# node_list = [f for f, to, edata in sub_graph100.edges(data = True)]
# node_list_unique = list(set(node_list))
# node_list_unique

In [43]:
# node_size = [sub_graph100.degree(sub)*100 for sub in node_list_unique]
#node_size

In [44]:
# plt.figure(figsize=(12, 12))

# layout = nx.spring_layout(sub_graph100,iterations=50)

# nx.draw_networkx_nodes(sub_graph100, 
#                        layout, 
#                        nodelist=node_list_unique, 
#                        node_size=node_size, # a LIST of sizes, based on g.degree
#                        node_color='purple');
# # 

In [45]:
#G_plot = island_fun(G_all, 100)
#figure(figsize=(10, 8))
#nx.draw_shell(G_plot, with_labels=True)

In [46]:
len(G_all['garlic'])

3217

In [47]:
#list(G_all.nodes())

In [48]:
#G_all['garlic'].keys()#

In [49]:
node_set = set(G_all.nodes())

In [50]:
#%%time
#'shoes' in node_set 

Wall time: 0 ns


False

In [51]:
#%%time
#'garlic' in G_all.nodes()

Wall time: 0 ns


True

In [52]:
#G_all.nodes()

In [56]:
list_items = [itm for itm in G_all['garlic'].items()]

In [57]:
G_all.neighbors('garlic')

<dict_keyiterator at 0x1fe97b9df40>

In [58]:
def extract_neigh(G_in, item_search):
    '''
    Returns list of 5 nearest neighbor items 
    sorted with highest value weight for a given search item.
    '''
    list_nodes = [itm for itm, edata in G_in[item_search].items()]
    list_wt = [edata['Weight'] for itm, edata in G_in[item_search].items()]
    data = {'Target_name': list_nodes, 'Weight': list_wt}
    df_out = pd.DataFrame(data)    
    df_out = df_out.sort_values('Weight', ascending = False).reset_index(drop=True)
    nei_list = df_out.Target_name[0:5]    
    return list(nei_list)

In [59]:
#%%time
#extract_neigh(G_all, 'oil')

Wall time: 4.99 ms


['onions', 'garlic', 'sugar', 'ginger', 'tomatoes']

In [60]:
G_all.degree['garlic']

3217