In [1]:
import pandas as pd
import matplotlib as plt
import seaborn as sns
import os
import json
import torch
import numpy as np
import pickle
import torch.utils.data as data
import torch.nn as nn
from functools import partial
from tqdm import tqdm
from itertools import chain
from datetime import datetime
import torch
import torch.nn.init as init
# Filters
MAX_NAME = 15
MAX_INGR = 5
MAX_INGR_TOK = 20
MAX_STEP_TOK = 256

df_r = pd.read_csv('../input/food-com-recipes-and-user-interactions/PP_recipes.csv')
# raw_recipe_df = pd.read_csv('../input/food-com-recipes-and-user-interactions/RAW_recipes.csv')
# ingr_map_df = pd.read_pickle('../input/food-com-recipes-and-user-interactions/ingr_map.pkl')


In [2]:
# Remember to enable the internet to install packages
!pip install pytorch-pretrained-bert==0.6.2


from pytorch_pretrained_bert import OpenAIGPTTokenizer

Collecting pytorch-pretrained-bert==0.6.2
  Downloading pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m123.8/123.8 KB[0m [31m829.8 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: pytorch-pretrained-bert
Successfully installed pytorch-pretrained-bert-0.6.2
[0m

In [3]:
TOKENIZER = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
GPT_VOCAB_SIZE = len(TOKENIZER.encoder)

# Extra tokens
EOS_INDEX = GPT_VOCAB_SIZE          # End of step
PAD_INDEX = GPT_VOCAB_SIZE + 1      # PAD
START_INDEX = GPT_VOCAB_SIZE + 2    # START document
END_INDEX = GPT_VOCAB_SIZE + 3      # END document
SOS_INDEX = GPT_VOCAB_SIZE + 4      # Start of step

100%|██████████| 815973/815973 [00:00<00:00, 2620807.26B/s]
100%|██████████| 458495/458495 [00:00<00:00, 1827663.38B/s]


In [4]:
def get_device(use_cuda=True):
    cuda_available = torch.cuda.is_available()
    use_cuda = use_cuda and cuda_available

    # Prompt user to use CUDA if available
    if cuda_available and not use_cuda:
        print("WARNING: You have a CUDA device, so you should probably run with --cuda")

    # Set device
    device = torch.device('cuda:0' if use_cuda else 'cpu')

    print('Device: {}'.format(device))
    if use_cuda:
        print('Using CUDA {}'.format(torch.cuda.current_device()))
    return use_cuda, device

In [5]:
TECHNIQUES_LIST = [
    'bake',
    'barbecue',
    'blanch',
    'blend',
    'boil',
    'braise',
    'brine',
    'broil',
    'caramelize',
    'combine',
    'crock pot',
    'crush',
    'deglaze',
    'devein',
    'dice',
    'distill',
    'drain',
    'emulsify',
    'ferment',
    'freez',
    'fry',
    'grate',
    'griddle',
    'grill',
    'knead',
    'leaven',
    'marinate',
    'mash',
    'melt',
    'microwave',
    'parboil',
    'pickle',
    'poach',
    'pour',
    'pressure cook',
    'puree',
    'refrigerat',
    'roast',
    'saute',
    'scald',
    'scramble',
    'shred',
    'simmer',
    'skillet',
    'slow cook',
    'smoke',
    'smooth',
    'soak',
    'sous-vide',
    'steam',
    'stew',
    'strain',
    'tenderize',
    'thicken',
    'toast',
    'toss',
    'whip',
    'whisk',
]

# Including the padding technique
PAD_TECHNIQUE_INDEX = len(TECHNIQUES_LIST)
N_TECHNIQUES = len(TECHNIQUES_LIST) + 1

USE_CUDA, DEVICE = get_device()

Device: cuda:0
Using CUDA 0


In [7]:
def pad_recipe_info(df_r, max_name_tokens=15, min_ingredients=3, max_ingredients=20,
                    max_ingr_tokens=20, max_step_tokens=256):
    """
    Pads relevant recipe tokenized representations

    Arguments:
        df_r {pd.DataFrame} -- Recipe information DataFrame, containing the columns:
                                name_tokens
                                step_tokens
                                ingredient_tokens

    Keyword Arguments:
        max_name_tokens {int} -- Maximum tokens in a recipe name (default: {15})
        min_ingredients {int} -- Minimum # ingredients in a recipe (default: {3})
        max_ingredients {int} -- Maximum # ingredients in a recipe (default: {20})
        max_ingr_tokens {int} -- Maximum # tokens in an ingredient (default: {20})
        max_step_tokens {int} -- Maximum # steps in a recipe (default: {256})
    
    Returns:
        pd.DataFrame -- Padded recipe information DataFrame
    """
    start = datetime.now()

    # Pad name
    df_r['name_tokens'] = df_r['name_tokens'].agg(lambda n: pad_name(n, max_name_tokens))
    print('{} - Padded names to maximum {} tokens'.format(
        datetime.now() - start, max_name_tokens
    ))

    # Pad steps
    df_r['steps_tokens'] = df_r['steps_tokens'].agg(lambda s: pad_steps(s, max_step_tokens))
    print('{} - Padded steps to maximum {} tokens'.format(
        datetime.now() - start, max_step_tokens
    ))

    # Clip ingredients
    df_r['ingredient_tokens'] = df_r['ingredient_tokens'].agg(lambda i:
        i[:np.random.randint(min_ingredients, max_ingredients+1)]
    )
    df_r['ingredient_ids'] = df_r['ingredient_ids'].agg(lambda i:
        i[:np.random.randint(min_ingredients, max_ingredients+1)]
    )
    print('{} - Clipped ingredients randomly between {} and {} incidents'.format(
        datetime.now() - start, min_ingredients, max_ingredients
    ))

    # Pad ingredients + generate mask
    df_r['ingredient_mask'] = df_r['ingredient_tokens'].agg(lambda i: get_ingr_mask(i, max_ingredients))
    df_r['ingredient_tokens'] = df_r['ingredient_tokens'].agg(
        lambda i: pad_ingredients(i, max_ingredients, max_ingr_tokens)
    )
    print('{} - Padded ingredients to maximum {} tokens and {} ingredients'.format(
        datetime.now() - start, max_ingr_tokens, max_ingredients
    ))

    # Pad ingredient IDs
    df_r['ingredient_id_mask'] = df_r['ingredient_ids'].agg(lambda i: get_ingr_mask(i, max_ingredients))
    n_ingredients_og = max(chain.from_iterable(df_r['ingredient_ids'].values)) + 1
    pad_ingr = n_ingredients_og
    # n_ingredients = n_ingredients_og + 1
    df_r['ingredient_ids'] = df_r['ingredient_ids'].agg(
        lambda i: pad_ingredient_ids(i, max_ingredients, pad_ingr)
    )
    print('{} - Padded ingredient IDs to maximum {} ingredients w pad ingredient {}'.format(
        datetime.now() - start, max_ingredients, pad_ingr
    ))

    # Pad techniques mask - we will never attend on the pad technique
    df_r['techniques'] =  df_r['techniques'].apply(json.loads)
    df_r['techniques_mask'] = df_r['techniques'].agg(lambda x: x + [sum(x) == 0])
#     print(df_r['techniques_mask'])
#     for t_mask in df_r['techniques_mask']:
#         print((type(t_mask), t_mask))
    df_r['techniques'] = df_r['techniques_mask'].agg(get_technique_embedding_indices)
    print('{} - Processed techniques and the associated masks'.format(
        datetime.now() - start
    ))

    return df_r

In [8]:
def load_recipe_tensors(df_r, device,
                        cols=['name_tokens', 'ingredient_tokens', 'ingredient_mask', 'steps_tokens'],
                        types=[torch.LongTensor, torch.LongTensor, torch.LongTensor, torch.LongTensor]):
    """
    Loads recipe component data into the GPU as tensors

    Arguments:
        df_r {pd.DataFrame} -- Recipe information DataFrame
        device {torch.Device} -- Device onto which to load the data
        cols {list} -- List of columns from which to retrieve tensors
        types {list} -- List of types of tensors to create

    Returns:
        list -- Tensor with tokens for each column specified
    """
    start = datetime.now()

    df_r = df_r.set_index('i').sort_index()
    print('{} - Sorted recipes DF by recipe ID'.format(datetime.now() - start))
    print(cols)
    print(types)
    created_tensors = []
    max_col_len = max(len(k) for k in cols if k is not None)
    
    # Load only relevant tensors to GPU
    total_size = 0
    for col, tens_type in zip(cols, types):
        print("{} - Creating tensor for {}.".format(datetime.now() - start, col))
        # Short-circuit
        if col is None:
            created_tensors.append(None)
            continue

        # Load column tensor
        if col == 'ingredient_tokens':
            flattened_ingredients = df_r['ingredient_tokens'].agg(
                lambda i_list: list(chain.from_iterable(i_list))
            ).values.tolist()
            col_tensor = tens_type(flattened_ingredients).to(device)
        else:
            # There mare functional changes that will likely need to be made each created tensor copies data. not serten
            col_tensor = tens_type(df_r[col].values.tolist()).to(device) 
        created_tensors.append(col_tensor)
        col_size = col_tensor.element_size() * col_tensor.nelement()
        total_size += col_size
        print('{} {} ({:,.3f} MB)'.format(
            '{}:'.format(col).ljust(max_col_len + 1),
            ' x '.join(['{:,}'.format(s) for s in col_tensor.size()]).ljust(15),
            col_size / 1024 / 1024
        ))

    print('{} - Loaded tensors to GPU - TOTAL {:,.3f} MB'.format(
        datetime.now() - start, total_size / 1024 / 1024
    ))
    return created_tensors


In [9]:

def __standard_token_padding(tokens:list, max_tokens:int, padding=PAD_INDEX):
    padded_tokens = None
    try:
        padded_tokens = tokens + [padding]*(max_tokens - len(tokens))
    except TypeError:
        if isinstance(tokens, str) :
            tokens = json.loads(tokens)
            padded_tokens =  tokens + [padding]*(max_tokens - len(tokens))
        else:
            raise TypeError(f"tokens is {type(tokens)}")
    except Exception:
        raise Exception
    finally:
        return padded_tokens


def pad_name(name_tokens:list, max_name_tokens:int=15) -> list:
    """
    Pads the name tokens
    """
    return __standard_token_padding(name_tokens, max_name_tokens)

def pad_steps(step_tokens:list, max_step_tokens:int=256) -> list:
    """
    Padds the step tokens
    """
    return __standard_token_padding(step_tokens, max_step_tokens)

def pad_ingredients(ingredient_tokens:list, max_ingredients:int=20, max_ingr_tokens:int=20):
    # Pad ingredients to maximum ingredient length
    new_tokens = [ __standard_token_padding(i[:max_ingr_tokens], max_ingr_tokens)  for i in ingredient_tokens[:max_ingredients] ]

    # Pad with empty ingredients
    #new_tokens += [[PAD_INDEX]*max_ingr_tokens] * (max_ingredients - len(ingredient_tokens[:max_ingredients]))
    new_tokens += [__standard_token_padding([],max_ingr_tokens )]
    return new_tokens

def pad_ingredient_ids(ingredient_ids:list, max_ingredients:int, pad_ingredient:int):
    if not isinstance(ingredient_ids, list):
        raise TypeError(f"step_tokens is {type(ingredient_ids)}")

    # Pad ingredients to maximum ingredient length
    return ingredient_ids[:max_ingredients] + \
        [pad_ingredient]*(max_ingredients - len(ingredient_ids[:max_ingredients]))


In [10]:
def get_ingr_mask(ingredient_tokens, max_ingredients=20):
    return [1]*len(ingredient_tokens[:max_ingredients]) + [0]*(max_ingredients - len(ingredient_tokens[:max_ingredients]))


def get_technique_embedding_indices(technique_mask):
    technique_onehot = np.array(technique_mask)
    return np.arange(N_TECHNIQUES) * technique_onehot + (technique_onehot == 0) * PAD_TECHNIQUE_INDEX


In [11]:
df_r['ingredient_ids'] = df_r['ingredient_ids'].apply(json.loads).values
print(type(df_r['ingredient_ids']))
n_ingredients_og = max(chain.from_iterable(df_r['ingredient_ids'])) + 1
PAD_INGR = n_ingredients_og
N_INGREDIENTS = n_ingredients_og + 1
df_r

<class 'pandas.core.series.Series'>


Unnamed: 0,id,i,name_tokens,ingredient_tokens,steps_tokens,techniques,calorie_level,ingredient_ids
0,424415,23,"[40480, 37229, 2911, 1019, 249, 6878, 6878, 28...","[[2911, 1019, 249, 6878], [1353], [6953], [153...","[40480, 40482, 21662, 481, 6878, 500, 246, 161...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",0,"[389, 7655, 6270, 1527, 3406]"
1,146223,96900,"[40480, 18376, 7056, 246, 1531, 2032, 40481]","[[17918], [25916], [2507, 6444], [8467, 1179],...","[40480, 40482, 729, 2525, 10906, 485, 43, 8393...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",0,"[2683, 4969, 800, 5298, 840, 2499, 6632, 7022,..."
2,312329,120056,"[40480, 21044, 16954, 8294, 556, 10837, 40481]","[[5867, 24176], [1353], [6953], [1301, 11332],...","[40480, 40482, 8240, 481, 24176, 296, 1353, 66...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...",1,"[1257, 7655, 6270, 590, 5024, 1119, 4883, 6696..."
3,74301,168258,"[40480, 10025, 31156, 40481]","[[1270, 1645, 28447], [21601], [27952, 29471, ...","[40480, 40482, 5539, 21601, 1073, 903, 2324, 4...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,"[7940, 3609, 7060, 6265, 1170, 6654, 5003, 3561]"
4,76272,109030,"[40480, 17841, 252, 782, 2373, 1641, 2373, 252...","[[1430, 11434], [1430, 17027], [1615, 23, 695,...","[40480, 40482, 14046, 1430, 11434, 488, 17027,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",0,"[3484, 6324, 7594, 243]"
...,...,...,...,...,...,...,...,...
178260,323143,76862,"[40480, 6444, 1964, 9369, 486, 569, 17551, 40481]","[[8780], [11835, 1762, 4465, 31494], [6812], [...","[40480, 40482, 729, 2525, 715, 485, 26641, 404...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,"[840, 208, 2499, 2683, 1925, 335, 1511]"
178261,149114,145962,"[40480, 17027, 24715, 974, 11877, 40481]","[[6812], [5940], [30645, 4785, 6821], [6953], ...","[40480, 40482, 729, 2525, 10906, 485, 43, 8393...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,"[2499, 4717, 1168, 6270, 6324, 7040]"
178262,34200,65066,"[40480, 12187, 11434, 1738, 2627, 40481]","[[6167, 20930, 510], [1353], [15022, 6953], [6...","[40480, 40482, 500, 246, 1719, 5024, 240, 2366...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",2,"[2378, 7655, 3219, 2320, 5168, 5319, 4189, 268..."
178263,30618,77358,"[40480, 870, 488, 1325, 519, 2220, 2417, 488, ...","[[12395, 38308, 40118], [3137, 15022], [30878,...","[40480, 40482, 562, 481, 10734, 240, 23667, 58...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",0,"[5627, 2807, 5412, 3399, 7979, 1093, 1257, 780..."


In [12]:
df_r = pad_recipe_info(
    df_r, max_name_tokens=MAX_NAME, min_ingredients=3, max_ingredients=MAX_INGR,
    max_ingr_tokens=MAX_INGR_TOK, max_step_tokens=MAX_STEP_TOK
)

0:00:01.955170 - Padded names to maximum 15 tokens
0:00:06.398871 - Padded steps to maximum 256 tokens
0:00:07.993531 - Clipped ingredients randomly between 3 and 5 incidents
0:00:13.142409 - Padded ingredients to maximum 20 tokens and 5 ingredients
0:00:14.192449 - Padded ingredient IDs to maximum 5 ingredients w pad ingredient 8023
0:00:18.033866 - Processed techniques and the associated masks


In [13]:
df_r

Unnamed: 0,id,i,name_tokens,ingredient_tokens,steps_tokens,techniques,calorie_level,ingredient_ids,ingredient_mask,ingredient_id_mask,techniques_mask
0,424415,23,"[40480, 37229, 2911, 1019, 249, 6878, 6878, 28...","[None, None, None, None, [40479, 40479, 40479,...","[40480, 40482, 21662, 481, 6878, 500, 246, 161...","[58, 58, 58, 58, 58, 58, 58, 58, 58, 9, 58, 58...",0,"[389, 7655, 6270, 1527, 8023]","[1, 1, 1, 1, 0]","[1, 1, 1, 1, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
1,146223,96900,"[40480, 18376, 7056, 246, 1531, 2032, 40481, 4...","[None, None, None, None, [40479, 40479, 40479,...","[40480, 40482, 729, 2525, 10906, 485, 43, 8393...","[0, 58, 58, 58, 58, 58, 58, 58, 58, 9, 58, 58,...",0,"[2683, 4969, 800, 5298, 8023]","[1, 1, 1, 1, 0]","[1, 1, 1, 1, 0]","[1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
2,312329,120056,"[40480, 21044, 16954, 8294, 556, 10837, 40481,...","[None, None, None, None, None, [40479, 40479, ...","[40480, 40482, 8240, 481, 24176, 296, 1353, 66...","[58, 58, 58, 58, 4, 58, 58, 58, 58, 58, 58, 11...",1,"[1257, 7655, 6270, 8023, 8023]","[1, 1, 1, 1, 1]","[1, 1, 1, 0, 0]","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
3,74301,168258,"[40480, 10025, 31156, 40481, 40479, 40479, 404...","[None, None, None, None, [40479, 40479, 40479,...","[40480, 40482, 5539, 21601, 1073, 903, 2324, 4...","[0, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58...",0,"[7940, 3609, 7060, 6265, 8023]","[1, 1, 1, 1, 0]","[1, 1, 1, 1, 0]","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,76272,109030,"[40480, 17841, 252, 782, 2373, 1641, 2373, 252...","[None, None, None, None, [40479, 40479, 40479,...","[40480, 40482, 14046, 1430, 11434, 488, 17027,...","[58, 58, 58, 58, 58, 58, 58, 58, 58, 9, 58, 58...",0,"[3484, 6324, 7594, 8023, 8023]","[1, 1, 1, 1, 0]","[1, 1, 1, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...,...,...,...,...
178260,323143,76862,"[40480, 6444, 1964, 9369, 486, 569, 17551, 404...","[None, None, None, [40479, 40479, 40479, 40479...","[40480, 40482, 729, 2525, 715, 485, 26641, 404...","[0, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58...",1,"[840, 208, 2499, 2683, 8023]","[1, 1, 1, 0, 0]","[1, 1, 1, 1, 0]","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
178261,149114,145962,"[40480, 17027, 24715, 974, 11877, 40481, 40479...","[None, None, None, [40479, 40479, 40479, 40479...","[40480, 40482, 729, 2525, 10906, 485, 43, 8393...","[0, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58, 58...",0,"[2499, 4717, 1168, 6270, 6324]","[1, 1, 1, 0, 0]","[1, 1, 1, 1, 1]","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
178262,34200,65066,"[40480, 12187, 11434, 1738, 2627, 40481, 40479...","[None, None, None, None, None, [40479, 40479, ...","[40480, 40482, 500, 246, 1719, 5024, 240, 2366...","[0, 58, 58, 58, 58, 58, 58, 58, 58, 9, 58, 58,...",2,"[2378, 7655, 3219, 2320, 5168]","[1, 1, 1, 1, 1]","[1, 1, 1, 1, 1]","[1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
178263,30618,77358,"[40480, 870, 488, 1325, 519, 2220, 2417, 488, ...","[None, None, None, None, [40479, 40479, 40479,...","[40480, 40482, 562, 481, 10734, 240, 23667, 58...","[58, 58, 58, 58, 4, 58, 58, 58, 58, 9, 58, 58,...",0,"[5627, 2807, 5412, 8023, 8023]","[1, 1, 1, 1, 0]","[1, 1, 1, 0, 0]","[0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."


In [14]:
tensors_to_load = [
        ('name_tensor', 'name_tokens'),
        ('calorie_level_tensor', 'calorie_level'),
        ('technique_tensor', 'techniques'),
        ('ingr_tensor', 'ingredient_ids'),
        ('steps_tensor', 'steps_tokens'),
        ('ingr_mask_tensor', 'ingredient_id_mask'),
        ('tech_mask_tensor', 'techniques_mask'),
    ]
tensor_names, tensor_cols = zip(*tensors_to_load)
print(tensor_names)
print(tensor_cols)
# Load tensors into memory


('name_tensor', 'calorie_level_tensor', 'technique_tensor', 'ingr_tensor', 'steps_tensor', 'ingr_mask_tensor', 'tech_mask_tensor')
('name_tokens', 'calorie_level', 'techniques', 'ingredient_ids', 'steps_tokens', 'ingredient_id_mask', 'techniques_mask')


In [None]:
memory_tensors = load_recipe_tensors(
    df_r, DEVICE, cols=tensor_cols, types=[torch.LongTensor] * len(tensors_to_load)
)