In this exercise, I build and expand a cheese and wine pairings database with the high-level goal of linking wine varieties and recipe ingredients.

This is part of an assignment for my Data Science 511 course.

In [26]:
from bs4 import BeautifulSoup
from collections import Counter, defaultdict
from pprint import pprint
import json
import re
import requests

In [27]:
# Source page to scrape: the sedimentality.com list of wine and food pairings
url = 'http://sedimentality.com/drinking-wine/list-of-wine-and-food-pairings/'

In [28]:
# Request the info at the URL. The timeout keeps the cell from hanging forever
# on an unresponsive server, and raise_for_status() stops here on an HTTP error
# instead of letting an error page be parsed as if it were the pairings list.
request = requests.get(url, timeout=30)
request.raise_for_status()

In [29]:
# Parse the HTML text of the response into a BeautifulSoup object,
# using the 'html.parser' backend
soup = BeautifulSoup(request.text, 'html.parser')

In [30]:
# Looked through soup object to see what class I needed for targeted data.
# Keep the first <div> found for the class string 'entry-content mt-80 text-left';
# the [0] raises IndexError if the search returns nothing.
div = soup.find_all('div', {'class' : 'entry-content mt-80 text-left'})[0]
# pprint(div)

In [31]:
def get_wine_info():
    """Collect the wine headings and the pairing lines that sit between them.

    Reads the module-level ``div`` element.

    Returns:
        A ``(wine_types, wine_info)`` tuple of lists of strings.
        ``wine_types`` is the text of every <h2> inside ``div`` that contains
        a <strong> tag (ie. Sauvignon Blanc, Merlot, etc.). ``wine_info`` is
        the text of every <h2> or <p> inside ``div`` that contains a <strong>
        tag, so each wine heading is followed by its pairing lines.
    """
    # Get a list of the different wine types (ie. Sauvignon Blanc, Merlot, etc.)
    wine_types = [par.text for par in div.find_all('h2') if par.find('strong')]
    # A list containing the different types of wine and their following pairings.
    # A list of names matches tag names exactly; the earlier unanchored regex
    # 'h2|p' was applied as a search, so it also matched any tag whose name
    # merely contains "p" (span, pre, option, ...).
    wine_info = [par.text for par in div.find_all(['h2', 'p']) if par.find('strong')]
    return wine_types, wine_info

In [41]:
def create_wine_pairing_dict():
    """Build a mapping of wine type -> food category -> list of paired foods.

    Uses ``get_wine_info()`` to obtain the wine headings and the flat list of
    headings plus pairing lines, then slices that flat list into one segment
    per wine.

    Returns:
        A ``defaultdict(dict)`` keyed by wine name. Each value maps a food
        category (the text before the colon) to the list of foods after it,
        split on commas and semi-colons with surrounding whitespace removed.

    Lines that do not look like ``category: foods`` are printed and skipped.
    """
    pairings = defaultdict(dict)

    wine_types, wine_info = get_wine_info()
    # Indices where each wine type appears in the flat wine info list
    intervals = [wine_info.index(wine) for wine in wine_types]

    last = 0
    # Walk from one wine heading to the next; everything in between is that
    # wine's list of food categories and paired foods.
    # NOTE(review): the segment that starts at the final heading is never
    # processed. Presumably the page ends with a non-wine heading - confirm.
    for interval in intervals[1:]:
        # The wine type is the first element and everything up to the next
        # heading is the pairing info.
        # Ex. of item in info - 'Seafood: fatty white fish, oysters, scallops, lobster, shrimp, sushi'
        wine, *info = wine_info[last:interval]
        for pairing in info:
            # Split info on the colon to get the category and its paired foods
            match = re.search(r'(.+):(.+)', pairing)
            if match is None:
                # Show the unparseable line and move on, so values left over
                # from the previous line are never stored under the wrong
                # category (or referenced before they exist).
                print(pairing)
                continue
            food_category, foods = match.groups()
            # Split foods on commas and semi colons, then strip out white space
            foods_list = [elemt.strip() for elemt in re.split(';|,', foods)]
            # Use the default dict to create food category dictionaries for each wine.
            pairings[wine][food_category] = foods_list
        # Update the interval start point
        last = interval

    return pairings


pairings object:
```
defaultdict(<class 'dict'>,
            {'Chardonnay': {'Cheese/nuts': ['mild',
                                            'semi-soft cheeses with unoaked '
                                            'Chardonnay',
                                            'asiago',
                                            'havarti',
                                            'Stilton or other blue-veined '
                                            'cheeses with oaky Chardonnay',
                                            'almonds and nearly any toasted '
                                            'nut'],
                            'Desserts': ['banana bread', 'vanilla pudding'],
                            'Fruits and Veggies': ['potato',
                                                   'apple',
                                                   'squash',
                                                   'mango'],
                            'Herbs and Spices': ['tarragon', 'sesame', 'basil'],
                            'Meat/poultry': ['veal', 'chicken', 'pork'],
                            'Sauces': ['cream sauces', 'pesto'],
                            'Seafood': ['halibut',
                                        'shrimp',
                                        'crab',
                                        'lobster']},
             'Merlot': {'Cheese/nuts': ['Parmesan',
                                        'Pecorino-Romano',
                                        'chestnuts',
                                        'walnuts'],
                        'Desserts': ['dark chocolate', 'berries', 'fondue'],
                        'Fruits and Veggies': ['caramelized onions',
                                               'tomatoes',
                                               'plums'],
                        'Herbs and Spices': ['mint', 'rosemary', 'juniper'],
                        'Meat/poultry': ['grilled meats', 'steak'],
                        'Sauces': ['bolognese', 'bearnaise'],
                        'Seafood': ['grilled meatier fish', 'ahi tuna']},
             'Pinot Noir': {'Cheese/nuts': ['goat cheese', 'brie', 'walnuts'],
                            'Desserts': ['creme brulee', 'white chocolate'],
                            'Fruits and Veggies': ['mushrooms',
                                                   'dried fruits',
                                                   'figs',
                                                   'strawberries'],
                            'Herbs and Spices': ['truffle',
                                                 'nutmeg',
                                                 'cinnamon',
                                                 'clove'],
                            'Meat/poultry': ['lamb',
                                             'sausage',
                                             'filet mignon',
                                             'chicken'],
                            'Sauces': ['mushroom sauces',
                                       'light-medium red sauces'],
                            'Seafood': ['ahi tuna', 'salmon']},
             'Riesling': {'Cheese/nust': ['Havarti',
                                          'gouda',
                                          'candied walnuts or pecans'],
                          'Desserts': ['apple pie', 'caramel sauce'],
                          'Fruits and Veggies': ['apricots',
                                                 'chili peppers',
                                                 'pears'],
                          'Herbs and Spices': ['rosemary',
                                               'ginger',
                                               'Thai or Indian spices'],
                          'Meat/poultry': ['smoked sausage',
                                           'duck',
                                           'foie gras'],
                          'Sauces': ['BBQ', 'spicy', 'chutney'],
                          'Seafood': ['sea bass', 'trout']},
             'Sauvignon Blanc': {'Cheese/nuts': ['feta',
                                                 'goat cheese',
                                                 'pine nuts'],
                                 'Desserts': ['sorbet',
                                              'key lime pie',
                                              'meringue',
                                              'mango'],
                                 'Fruits and Veggies': ['citrus',
                                                        'green apple',
                                                        'asparagus'],
                                 'Herbs and Spices': ['chives',
                                                      'tarragon',
                                                      'cilantro'],
                                 'Meat/poultry': ['chicken', 'turkey', 'pork'],
                                 'Sauces': ['citrus and light cream sauces'],
                                 'Seafood': ['fatty white fish',
                                             'oysters',
                                             'scallops',
                                             'lobster',
                                             'shrimp',
                                             'sushi']},
             'Syrah': {'Cheese/nuts': ['sharp cheddar',
                                       'Roquefort/bleu-veined cheeses',
                                       'hazelnuts',
                                       'walnuts'],
                       'Desserts': ['black forest cake',
                                    'rhubarb pie',
                                    'coffee-based desserts'],
                       'Fruits and Veggies': ['currants',
                                              'stewed tomatoes',
                                              'beets'],
                       'Herbs and Spices': ['oregano', 'sage'],
                       'Meat/poultry': ['roasted game',
                                        'pepperoni',
                                        'spicy sausage',
                                        'braised pork shoulder'],
                       'Sauces': ['BBQ', 'heavy red sauces'],
                       'Seafood': ['ahi tuna', 'salmon']}})
```

_I had to provide the option to split on semi-colons, since one ingredient description had complete thoughts separated by semi-colons. However, this does make the Cheese/nuts pairings for Syrah less clear._


In [43]:
def export_data(pairings, path='./data/wine_ingredient-pairings.json'):
    """Write the pairings mapping to a JSON file.

    Args:
        pairings: Mapping of wine -> food category -> list of foods. It is
            converted to a plain ``dict`` before being serialised.
        path: Destination file. Defaults to the location the rest of the
            notebook reads from.
    """
    # Local import so this cell does not depend on editing the import cell
    import os

    # Create the target folder if needed so a fresh checkout without ./data
    # does not fail on open()
    directory = os.path.dirname(path)
    if directory:
        os.makedirs(directory, exist_ok=True)

    with open(path, 'w') as outfile:
        json.dump(dict(pairings), outfile)
        
# Build the pairings dictionary and write it straight to the JSON file
export_data(create_wine_pairing_dict())

In [44]:
def load_pairings_dict(path):
    """Read the JSON file at ``path`` and return the decoded pairings object."""
    with open(path) as source:
        return json.load(source)

Now, I will use the cuisine prediction dataset -- https://www.kaggle.com/kaggle/recipe-ingredients-dataset -- to pair each food from the wine pairings with the ingredients it most often appears alongside in the cuisine prediction recipes.

In [45]:
def get_cuisine_prediction_data(path):
    """Open the JSON file at ``path`` for reading and return its decoded contents."""
    with open(path, 'r') as source:
        return json.load(source)

In [46]:
def get_recipes_list():
    """Return the 'ingredients' value of every record in ./data/train.json.

    The file is read with ``get_cuisine_prediction_data``; the result is a
    list with one entry per record, in file order.
    """
    records = get_cuisine_prediction_data('./data/train.json')
    return [record['ingredients'] for record in records]
    
# Check that function works
# pprint(get_recipes_list()[0])

In [47]:
def rec_dd():
    """Return a defaultdict whose missing keys become further recursive defaultdicts."""
    # Using rec_dd itself as the factory lets lookups nest to any depth
    nested = defaultdict(rec_dd)
    return nested

# Module-level recursive defaultdict
wines = rec_dd()

In [48]:
def create_food_prediction_data():
    """Print, for each paired food, the ingredients it most often shares a recipe with.

    Loads the wine pairings from ./data/wine_ingredient-pairings.json and the
    recipe ingredient lists from ``get_recipes_list()``. For every food in
    every food category of every wine, it counts the other ingredients of each
    recipe that contains that food and keeps the 10 most common as
    ``(ingredient, count)`` tuples.

    Nothing is returned; the wine names and the results for 'Sauvignon Blanc'
    and 'Merlot' are printed.
    """
    # Recursive defaultdict: wines[wine][food_cat][ingrd] needs no set-up
    wines = rec_dd()

    # Load json data containing pairings object
    path = './data/wine_ingredient-pairings.json'
    pairings = load_pairings_dict(path)

    # List of recipe lists
    recipes_list = get_recipes_list()

    # Keys of pairings are wine types
    # Value of pairings is a dictionary where keys are food categories, values are list of foods
    for wine, food_cat_dict in pairings.items():
        for food_cat, ingrds in food_cat_dict.items():
            # Loop over each food in the category
            for ingrd in ingrds:
                companions = Counter()
                for recipe in recipes_list:
                    # A one-ingredient recipe has nothing to pair with
                    if len(recipe) == 1:
                        continue
                    # Not an exact match, so things like "feta" will match with "feta cheese"
                    if not any(ingrd in item for item in recipe):
                        continue
                    # Count the other ingredients once per recipe, even when
                    # several items in the recipe contain this food; counting
                    # once per matching item would inflate the totals.
                    companions.update(item for item in recipe if ingrd not in item)

                # Keeps data manageable to an extent by only grabbing the 10 most common.
                # This assumes there are no repeat ingredients in a food category for a wine type.
                wines[wine][food_cat][ingrd] = companions.most_common(10)
    print(wines.keys())
    pprint(wines['Sauvignon Blanc'])
    pprint(wines['Merlot'])
    
# Run the analysis; the function prints its results
create_food_prediction_data()

dict_keys(['Sauvignon Blanc', 'Chardonnay', 'Riesling', 'Pinot Noir', 'Syrah', 'Merlot'])
defaultdict(<function rec_dd at 0x10d3dd1e0>,
            {'Cheese/nuts': defaultdict(<function rec_dd at 0x10d3dd1e0>,
                                        {'feta': [('olive oil', 302),
                                                  ('salt', 259),
                                                  ('garlic cloves', 150),
                                                  ('purple onion', 147),
                                                  ('dried oregano', 147),
                                                  ('extra-virgin olive oil',
                                                   121),
                                                  ('tomatoes', 117),
                                                  ('pepper', 113),
                                                  ('garlic', 107),
                                                  ('ground black pepper', 106)],
                