In [1]:
import json
import pandas as pd
import numpy as np
import pickle
import os
from sklearn.feature_extraction.text import CountVectorizer
def make_dataset_recipe(file = 'full_format_recipes.json',
                        save = 'full_format_recipes.csv',
                        expand = False):
    """Build the recipe DataFrame from a JSON dump, or load it from a CSV cache.

    If ``save`` already exists it is read with ``pd.read_csv`` and returned
    unchanged; ``file`` and ``expand`` are not used in that case. Otherwise
    the JSON at ``file`` is parsed, rows with missing keys or null values are
    discarded, and the resulting frame is written to ``save``.

    Parameters
    ----------
    file : str
        Path of the JSON file holding a list of recipe objects. Each recipe
        is expected to provide ``title``, ``categories``, ``fat`` and
        ``calories``.
    save : str
        Path of the CSV cache to read from (when present) or write to.
    expand : bool
        When True, the ``Recipe Ingredients`` list column is spread into
        ``ingredient_<n>`` columns (missing positions filled with 0) and the
        original list column is dropped.

    Returns
    -------
    pandas.DataFrame
        Columns ``Recipe Title``, ``Recipe Ingredients``, ``Fat Value`` and
        ``Calories`` (or the expanded ``ingredient_<n>`` columns).

    Notes
    -----
    The CSV is written with the default ``to_csv`` settings, so a frame
    loaded from the cache carries the saved index as an extra column and
    holds ``Recipe Ingredients`` as text rather than as Python lists.
    """
    if os.path.exists(save):
        # Read the same path that was checked, not a hard-coded file name.
        df = pd.read_csv(save)
        print(f'{len(df)} Register loaded')
        return df

    # Open the recipe file (JSON is UTF-8; do not rely on the locale default).
    with open(file, 'r', encoding='utf-8') as f:
        recipes = json.load(f)

    # Iterate recipes. All four fields are read *before* anything is stored,
    # so a recipe with a missing key is skipped as a whole and can never
    # leave the columns misaligned.
    records = []
    missing = 0
    for recipe in recipes:
        try:
            record = (recipe['title'],
                      recipe['categories'],
                      recipe['fat'],
                      recipe['calories'])
        except KeyError:
            missing += 1
            continue
        records.append(record)
    print(f'{len(records)} Registers loaded {missing} missed')

    # Make dataset
    columns = ['Recipe Title', 'Recipe Ingredients', 'Fat Value', 'Calories']
    df = pd.DataFrame(records, columns=columns)
    df = df.dropna()

    if expand:
        # One column per list position: ingredient_0, ingredient_1, ...
        tags = df['Recipe Ingredients'].apply(pd.Series)
        tags = tags.rename(columns = lambda x: 'ingredient_' + str(x))
        df = pd.concat([df, tags], axis=1)
        df = df.drop(['Recipe Ingredients'], axis=1)
        df = df.fillna(value=0)
    df[['Fat Value', 'Calories']] = df[['Fat Value', 'Calories']].apply(pd.to_numeric)
    # NOTE: despite the wording, this count is taken after dropna() above.
    print(f'Before drop Na: {len(df)} registers')
    df.to_csv(save)
    print(f'File saved to: {save}')
    return df

def bag_builder(savepath='vectorizer.model'):
    """Return a fitted ``CountVectorizer``, reusing a pickled copy when available.

    When ``savepath`` exists, the object stored there is unpickled and
    returned. Otherwise a new ``CountVectorizer`` is fitted on the
    ``Recipe Ingredients`` column produced by ``make_dataset_recipe()``,
    pickled to ``savepath`` and returned.

    Parameters
    ----------
    savepath : str
        Location of the pickled vectorizer to load or create.

    Returns
    -------
    The loaded or newly fitted vectorizer.

    Notes
    -----
    ``pickle.load`` can execute arbitrary code; only point ``savepath`` at
    files this notebook produced itself.
    """
    if not os.path.exists(savepath):
        recipes = make_dataset_recipe()
        # Each entry is rendered as text and stripped of the list punctuation
        # ("[", "]" and single quotes) before tokenisation.
        strip_table = str.maketrans('', '', "[]'")
        list_ingredients = [
            str(entry).translate(strip_table)
            for entry in recipes['Recipe Ingredients']
        ]
        # Bag of words
        bag = CountVectorizer()
        bag.fit(list_ingredients)
        with open(savepath, 'wb') as sink:
            pickle.dump(bag, sink)
        print(f'Bag of words saved, Vocab Size: {len(bag.vocabulary_)}')
        return bag

    with open(savepath, 'rb') as source:
        bag = pickle.load(source)
    print(f'Bag of words loaded, Vocab Size: {len(bag.vocabulary_)}')
    return bag

In [2]:
# Build the bag-of-words vectorizer, or load it from the default
# 'vectorizer.model' cache when that file already exists.
vectorizer = bag_builder()

Bag of words loaded, Vocab Size: 715


{'sandwich': 565,
 'bean': 43,
 'fruit': 247,
 'tomato': 660,
 'turkey': 667,
 'vegetable': 675,
 'kid': 331,
 'friendly': 243,
 'apple': 18,
 'lentil': 351,
 'lettuce': 352,
 'cookie': 157,
 'food': 233,
 'processor': 517,
 'onion': 448,
 'pork': 507,
 'bake': 34,
 'bastille': 40,
 'day': 183,
 'new': 426,
 'years': 707,
 'eve': 219,
 'dried': 202,
 'port': 508,
 'winter': 702,
 'chill': 127,
 'bon': 60,
 'appétit': 19,
 'soup': 603,
 'stew': 622,
 'dairy': 180,
 'potato': 511,
 'fennel': 224,
 'gourmet': 262,
 'york': 712,
 'cheese': 119,
 'pasta': 474,
 'side': 590,
 'vegetarian': 676,
 'quick': 527,
 'easy': 207,
 'fall': 220,
 'california': 95,
 'basil': 38,
 'bacon': 33,
 'summer': 629,
 'beef': 44,
 'ginger': 256,
 'sauté': 571,
 'stir': 623,
 'fry': 248,
 'spring': 614,
 'soy': 607,
 'sauce': 569,
 'salad': 561,
 'mustard': 422,
 'picnic': 493,
 'lunch': 368,
 'mayonnaise': 390,
 'ham': 283,
 'pea': 477,
 'sugar': 627,
 'conscious': 152,
 'free': 240,
 'wheat': 695,
 'gluten': 