### In this notebook I pull cocktails and their ingredients from the web and build a simple algorithm that finds cocktails that are similar to each other based on their ingredients. The end goal is to create a cocktail recommendation engine.

#### Step 1: Pull a list of cocktails and their characteristics from thecocktaildb.com. The website offers a free API with several endpoints; the one I chose to use is *List all cocktails by first letter*. Details about the API can be found at this [link](https://www.thecocktaildb.com/api.php).


In [1]:
# import necessary libraries

import pandas as pd
import numpy as np

import requests
from requests.auth import HTTPBasicAuth
import string
import warnings
import json

warnings.filterwarnings("ignore")

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

In [2]:
def get_url_list(characters=string.printable):
    """Build one "search by first letter" URL per candidate first character.

    Parameters
    ----------
    characters : iterable of str, optional
        The characters to query. Defaults to ``string.printable`` (digits,
        letters, punctuation and whitespace), which is what this notebook
        has always iterated over.

    Returns
    -------
    list of str
        One URL per character, in the same order as ``characters``.
    """
    # Function-scope import: urllib.parse is only needed here.
    from urllib.parse import quote

    main_url = 'https://www.thecocktaildb.com/api/json/v1/1/search.php?f='

    # Percent-encode every character so that reserved ones ('&', '#', '%',
    # '+', whitespace, ...) are sent as the literal search character instead
    # of being interpreted as URL syntax. Letters and digits are unchanged.
    return [main_url + quote(ch, safe='') for ch in characters]

In [3]:
def scrape_cocktail_list():
    """Download the JSON payload for every URL produced by ``get_url_list``.

    The download is best-effort: a URL whose request fails, returns an HTTP
    error status, or whose body is not valid JSON is reported and skipped, so
    the result can be shorter than the URL list.

    Returns
    -------
    list
        The decoded JSON payloads, in the order the URLs were requested.
    """
    cocktail_list = []

    for url in get_url_list():
        try:
            # TLS certificate verification is left at its default (enabled),
            # and a timeout keeps a stalled connection from hanging the cell.
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            cocktail_list.append(response.json())
        except (requests.exceptions.RequestException, ValueError) as err:
            # Only network / HTTP / JSON-decoding problems are tolerated;
            # anything else is a programming error and should surface.
            # print() is used because warnings are silenced notebook-wide.
            print('Skipping %s: %s' % (url, err))

    return cocktail_list

In [58]:
# Run the scraper once and keep the raw API responses; the cells below all read from cocktail_list.
cocktail_list = scrape_cocktail_list()

In [59]:
# Peek at the first two raw responses to see the shape of the payload
cocktail_list[0:2]

[{'drinks': None},
 {'drinks': [{'dateModified': '2016-10-05 12:36:28',
    'idDrink': '15346',
    'strAlcoholic': 'Alcoholic',
    'strCategory': 'Cocktail',
    'strCreativeCommonsConfirmed': 'No',
    'strDrink': '155 Belmont',
    'strDrinkAlternate': None,
    'strDrinkThumb': 'https://www.thecocktaildb.com/images/media/drink/yqvvqs1475667388.jpg',
    'strGlass': 'White wine glass',
    'strIBA': None,
    'strImageAttribution': None,
    'strImageSource': None,
    'strIngredient1': 'Dark rum',
    'strIngredient10': None,
    'strIngredient11': None,
    'strIngredient12': None,
    'strIngredient13': None,
    'strIngredient14': None,
    'strIngredient15': None,
    'strIngredient2': 'Light rum',
    'strIngredient3': 'Vodka',
    'strIngredient4': 'Orange juice',
    'strIngredient5': None,
    'strIngredient6': None,
    'strIngredient7': None,
    'strIngredient8': None,
    'strIngredient9': None,
    'strInstructions': 'Blend with ice. Serve in a wine glass. Garnish wit

In [5]:
# here are all the columns that might be available for each cocktail
# (the keys of the first drink in the second response)

cocktail_list[1]['drinks'][0].keys()

dict_keys(['idDrink', 'strDrink', 'strDrinkAlternate', 'strTags', 'strVideo', 'strCategory', 'strIBA', 'strAlcoholic', 'strGlass', 'strInstructions', 'strInstructionsES', 'strInstructionsDE', 'strInstructionsFR', 'strInstructionsIT', 'strInstructionsZH-HANS', 'strInstructionsZH-HANT', 'strDrinkThumb', 'strIngredient1', 'strIngredient2', 'strIngredient3', 'strIngredient4', 'strIngredient5', 'strIngredient6', 'strIngredient7', 'strIngredient8', 'strIngredient9', 'strIngredient10', 'strIngredient11', 'strIngredient12', 'strIngredient13', 'strIngredient14', 'strIngredient15', 'strMeasure1', 'strMeasure2', 'strMeasure3', 'strMeasure4', 'strMeasure5', 'strMeasure6', 'strMeasure7', 'strMeasure8', 'strMeasure9', 'strMeasure10', 'strMeasure11', 'strMeasure12', 'strMeasure13', 'strMeasure14', 'strMeasure15', 'strImageSource', 'strImageAttribution', 'strCreativeCommonsConfirmed', 'dateModified'])

In [43]:
# Look at one complete drink record (second drink of the second response).
# NOTE(review): the saved output already contains ingredient_and_quantity* keys, so this cell
# appears to have been executed after the enrichment loop further down — confirm by re-running in order.
cocktail_list[1]['drinks'][1]

{'dateModified': '2016-07-18 22:27:04',
 'idDrink': '15395',
 'ingredient_and_quantity1': 'Absolut Kurant - 1/2 oz ',
 'ingredient_and_quantity10': None,
 'ingredient_and_quantity11': None,
 'ingredient_and_quantity12': None,
 'ingredient_and_quantity13': None,
 'ingredient_and_quantity14': None,
 'ingredient_and_quantity15': None,
 'ingredient_and_quantity2': 'Grand Marnier - 1/4 oz ',
 'ingredient_and_quantity3': 'Chambord raspberry liqueur - 1/4 oz ',
 'ingredient_and_quantity4': 'Midori melon liqueur - 1/4 oz ',
 'ingredient_and_quantity5': 'Malibu rum - 1/4 oz ',
 'ingredient_and_quantity6': 'Amaretto - 1/4 oz ',
 'ingredient_and_quantity7': 'Cranberry juice - 1/2 oz ',
 'ingredient_and_quantity8': 'Pineapple juice - 1/4 oz ',
 'ingredient_and_quantity9': None,
 'strAlcoholic': 'Alcoholic',
 'strCategory': 'Shot',
 'strCreativeCommonsConfirmed': 'No',
 'strDrink': '1-900-FUK-MEUP',
 'strDrinkAlternate': None,
 'strDrinkThumb': 'https://www.thecocktaildb.com/images/media/drink/uxyw

In [15]:
# create columns that show ingredients and their quantity
#
# For every drink, add 'ingredient_and_quantity1' ... 'ingredient_and_quantity15':
#   "<ingredient> - <measure>"  when both are present,
#   "<ingredient>"              when the measure is missing,
#   None                        when the ingredient slot is empty.
# The drink dicts inside cocktail_list are modified in place.

for letter_result in cocktail_list:
    # a first character with no matching drinks comes back as {'drinks': None}
    drinks = letter_result.get('drinks') if isinstance(letter_result, dict) else None

    for drink in drinks or []:
        for k in range(1, 16):
            ingredient = drink.get('strIngredient' + str(k))
            measure = drink.get('strMeasure' + str(k))

            if ingredient is None:
                combined = None
            elif isinstance(measure, str):
                combined = ingredient + " - " + measure
            else:
                # no usable measure for this slot: keep the ingredient on its own
                combined = ingredient

            drink['ingredient_and_quantity' + str(k)] = combined

In [16]:
def get_ingredients(cocktail_dict, col_name):
    """Join a drink's numbered fields (``col_name1``, ``col_name2``, ...) into one string.

    Fields are read in increasing order starting at 1 until the first missing
    or empty one. Each value is *prepended*, so the result lists the values in
    reverse slot order, and every value is followed by ``", "`` (the string
    therefore ends with a trailing ``", "``).

    Parameters
    ----------
    cocktail_dict : dict
        A single drink record.
    col_name : str
        Field prefix, e.g. ``'strIngredient'`` or ``'ingredient_and_quantity'``.

    Returns
    -------
    str
        The joined values, or ``""`` when the first field is missing or empty.
    """
    ingredient_list = ""
    i = 1

    while True:
        # .get() instead of [] so that running past the last numbered field
        # (a drink that uses every slot) ends the loop instead of raising KeyError.
        ingredient = cocktail_dict.get(col_name + str(i))
        if not ingredient:
            break

        ingredient_list = ingredient + ", " + ingredient_list
        i += 1

    return ingredient_list

In [17]:
def cocktail_data_clean(cocktail_list):
    """Flatten the raw API responses into one dictionary per cocktail.

    Parameters
    ----------
    cocktail_list : list
        Raw responses, each expected to look like ``{'drinks': [...]}`` or
        ``{'drinks': None}``.

    Returns
    -------
    list of dict
        One dict per usable drink with the keys ``'drink'``, ``'ingredients'``,
        ``'ingredients_and_quantities'`` and ``'instructions'``.
    """
    cocktails_info = []

    for letter_result in cocktail_list:
        # a first character with no matching drinks comes back as {'drinks': None}
        drinks = letter_result.get('drinks') if isinstance(letter_result, dict) else None

        for drink in drinks or []:
            try:
                cocktail = {
                    'drink': drink['strDrink'],
                    'ingredients': get_ingredients(drink, 'strIngredient'),
                    'ingredients_and_quantities': get_ingredients(drink, 'ingredient_and_quantity'),
                    'instructions': drink['strInstructions'],
                }
            except (KeyError, TypeError):
                # ignore cocktails for which there is no data; skipping per
                # drink keeps one bad record from discarding the drinks after it
                continue

            cocktails_info.append(cocktail)

    return cocktails_info

In [18]:
# Flatten the raw responses into one dict per cocktail
cocktails_info = cocktail_data_clean(cocktail_list)

# get 2 cocktails as an example
cocktails_info[2:4]

[{'drink': '110 in the shade',
  'ingredients': 'Tequila, Lager, ',
  'ingredients_and_quantities': 'Tequila - 1.5 oz , Lager - 16 oz , ',
  'instructions': 'Drop shooter in glass. Fill with beer'},
 {'drink': '151 Florida Bushwacker',
  'ingredients': 'Vanilla ice-cream, Coconut liqueur, Milk, Cointreau, Dark Creme de Cacao, 151 proof rum, Light rum, Malibu rum, ',
  'ingredients_and_quantities': 'Vanilla ice-cream - 1 cup , Coconut liqueur - 1 oz , Milk - 3 oz , Cointreau - 1 oz , Dark Creme de Cacao - 1 oz , 151 proof rum - 1/2 oz Bacardi , Light rum - 1/2 oz , Malibu rum - 1/2 oz , ',
  'instructions': 'Combine all ingredients. Blend until smooth. Garnish with chocolate shavings if desired.'}]

#### Step 2: Find cocktail similarity using Tf-idf vectorizer and cosine similarity of the ingredients

In [19]:
# Build the working table from the cleaned per-cocktail dicts
cocktail_df = pd.DataFrame(cocktails_info)

In [20]:
# Drop rows that are exact duplicates across every column
cocktail_df = cocktail_df.drop_duplicates()

In [21]:
# Preview the first rows of the table
cocktail_df.head()

Unnamed: 0,drink,ingredients,ingredients_and_quantities,instructions
0,155 Belmont,"Orange juice, Vodka, Light rum, Dark rum,","Orange juice - 1 shot , Vodka - 1 shot , Light...",Blend with ice. Serve in a wine glass. Garnish...
1,1-900-FUK-MEUP,"Pineapple juice, Cranberry juice, Amaretto, Ma...","Pineapple juice - 1/4 oz , Cranberry juice - 1...",Shake ingredients in a mixing tin filled with ...
2,110 in the shade,"Tequila, Lager,","Tequila - 1.5 oz , Lager - 16 oz ,",Drop shooter in glass. Fill with beer
3,151 Florida Bushwacker,"Vanilla ice-cream, Coconut liqueur, Milk, Coin...","Vanilla ice-cream - 1 cup , Coconut liqueur - ...",Combine all ingredients. Blend until smooth. G...
4,252,"Wild Turkey, 151 proof rum,","Wild Turkey - 1/2 shot , 151 proof rum - 1/2 s...","Add both ingredients to shot glass, shoot, and..."


In [23]:
def similar_cocktail(cocktail_df, chosen_cocktail):
    """Return the name of the cocktail whose ingredients are most similar to ``chosen_cocktail``.

    Ingredient strings are turned into TF-IDF vectors and compared with cosine
    similarity. The chosen cocktail itself, and any cocktail whose similarity
    is (numerically) 1, is excluded from the candidates.

    Parameters
    ----------
    cocktail_df : pandas.DataFrame
        Must contain the columns ``'drink'`` and ``'ingredients'``.
    chosen_cocktail : str
        A value from the ``'drink'`` column.

    Returns
    -------
    The ``'drink'`` label with the highest remaining similarity.

    Raises
    ------
    KeyError
        If ``chosen_cocktail`` is not one of the drink names.

    Notes
    -----
    Side effect (kept on purpose): ``cocktail_df['ingredients']`` is overwritten
    in place with its lower-cased, punctuation-free form. The normalisation is
    idempotent, so calling the function repeatedly is safe.
    """
    # string pre-processing. regex=True is explicit because recent pandas
    # versions treat the pattern as a literal string by default, which would
    # leave the punctuation untouched.
    cocktail_df['ingredients'] = (
        cocktail_df['ingredients'].str.lower().str.replace(r'[^\w\s]', '', regex=True)
    )

    # implement tf-idf vectorizer
    tfidf_matrix = TfidfVectorizer().fit_transform(cocktail_df['ingredients'])
    scores = cosine_similarity(tfidf_matrix)

    # Exclude perfect matches. Self-similarity can come out as 0.9999999... in
    # floating point, so use a small tolerance and also clear the diagonal
    # explicitly; otherwise a cocktail could be returned as its own best match.
    scores[scores >= 1 - 1e-9] = 0
    np.fill_diagonal(scores, 0)

    similarity_table = pd.DataFrame(
        scores, columns=cocktail_df['drink'], index=cocktail_df['drink']
    )

    best_match_per_drink = similarity_table.idxmax()

    return best_match_per_drink[chosen_cocktail]

In [36]:
# test 1 cocktail: look up the closest match for the drink named 'A1'

new_cocktail = similar_cocktail(cocktail_df, 'A1')

In [35]:
# Show the recommendation returned for 'A1'
# NOTE(review): this cell's execution count (35) is lower than that of the cell assigning
# new_cocktail (36), so the saved output may come from an earlier run — re-run in order to confirm.
new_cocktail

'Munich Mule'

#### Quick exploration of the data

In [26]:
# Summary of the text columns: count, number of unique values, most frequent value and its frequency
cocktail_df.describe()

Unnamed: 0,drink,ingredients,ingredients_and_quantities,instructions
count,441,441,441,441
unique,441,437,439,406
top,Gin Rickey,champagne creme de cassis,"Olive - 1 , Dry Vermouth - 1/3 oz , Gin - 1 2/...","Shake all ingredients with ice, strain into a ..."
freq,1,2,2,12


In [27]:
# Count how often each word appears across all ingredient strings.
# The text is normalised here explicitly (lower-case, punctuation removed) so the
# counts do not depend on similar_cocktail() having already rewritten the column;
# the normalisation is idempotent, so it is harmless if that has already happened.
new_df = (
    cocktail_df['ingredients']
    .str.lower()
    .str.replace(r'[^\w\s]', '', regex=True)
    .str.split(expand=True)   # split on whitespace: multi-word ingredients become several words
    .stack()
    .value_counts()
    .reset_index()
)
new_df.columns = ['ingredient', 'frequency']

# get the 10  most commonly used ingredients
new_df.head(10)

Unnamed: 0,ingredient,frequency
0,juice,195
1,lemon,116
2,rum,96
3,sugar,95
4,gin,87
5,orange,84
6,vodka,76
7,lime,67
8,water,67
9,cream,48


#### Experimenting with pickling the model file for deployment (optional)

In [28]:
import pickle

In [29]:
# NOTE: pickle stores a plain function by reference (module + name), not its code,
# so this file can only be loaded where similar_cocktail is already defined/importable.
# The with-block makes sure the file handle is flushed and closed.
with open('similar_cocktail.pkl', 'wb') as pkl_file:
    pickle.dump(similar_cocktail, pkl_file)

In [30]:
# Persist the cocktail table; the with-block makes sure the file handle is flushed and closed.
with open('cocktail_df.pkl', 'wb') as pkl_file:
    pickle.dump(cocktail_df, pkl_file)

In [31]:
# Reload the function written above. pickle.load can execute arbitrary code,
# so only ever load pickle files that you created yourself.
with open('similar_cocktail.pkl', 'rb') as pkl_file:
    similar_cocktail = pickle.load(pkl_file)

In [32]:
# Reload the cocktail table written above. pickle.load can execute arbitrary code,
# so only ever load pickle files that you created yourself.
with open('cocktail_df.pkl', 'rb') as pkl_file:
    cocktail_df = pickle.load(pkl_file)

In [33]:
# Check that the reloaded function and table still produce a recommendation.
# NOTE(review): the saved output below ('Talos Coffee') differs from the earlier saved output
# for the same query ('Munich Mule'); the execution counts are out of order, so presumably the
# cells ran against different data — re-run top to bottom to confirm.
similar_cocktail(cocktail_df, 'A1')

'Talos Coffee'

In [53]:
def cocktail_data_clean_more_columns(cocktail_list):
    """Flatten the raw API responses into one dictionary per cocktail, with extra columns.

    Parameters
    ----------
    cocktail_list : list
        Raw responses, each expected to look like ``{'drinks': [...]}`` or
        ``{'drinks': None}``.

    Returns
    -------
    list of dict
        One dict per usable drink with the keys ``'drink'``, ``'ingredients'``,
        ``'ingredients_and_quantities'``, ``'instructions'``, ``'alcoholic'``
        and ``'glass'``.
    """
    cocktails_info = []

    for letter_result in cocktail_list:
        # a first character with no matching drinks comes back as {'drinks': None}
        drinks = letter_result.get('drinks') if isinstance(letter_result, dict) else None

        for drink in drinks or []:
            try:
                cocktail = {
                    'drink': drink['strDrink'],
                    'ingredients': get_ingredients(drink, 'strIngredient'),
                    'ingredients_and_quantities': get_ingredients(drink, 'ingredient_and_quantity'),
                    'instructions': drink['strInstructions'],
                    'alcoholic': drink['strAlcoholic'],
                    'glass': drink['strGlass'],
                }
            except (KeyError, TypeError):
                # ignore cocktails for which there is no data; skipping per
                # drink keeps one bad record from discarding the drinks after it
                continue

            cocktails_info.append(cocktail)

    return cocktails_info

In [54]:
# Build a second table that also carries the 'alcoholic' and 'glass' fields.
# Unlike cocktail_df, no drop_duplicates() is applied to this table here.
cocktail_data = pd.DataFrame(cocktail_data_clean_more_columns(cocktail_list))

In [55]:
# Display the extended table
cocktail_data

Unnamed: 0,alcoholic,drink,glass,ingredients,ingredients_and_quantities,instructions
0,Alcoholic,155 Belmont,White wine glass,"Orange juice, Vodka, Light rum, Dark rum,","Orange juice - 1 shot , Vodka - 1 shot , Light...",Blend with ice. Serve in a wine glass. Garnish...
1,Alcoholic,1-900-FUK-MEUP,Old-fashioned glass,"Pineapple juice, Cranberry juice, Amaretto, Ma...","Pineapple juice - 1/4 oz , Cranberry juice - 1...",Shake ingredients in a mixing tin filled with ...
2,Alcoholic,110 in the shade,Beer Glass,"Tequila, Lager,","Tequila - 1.5 oz , Lager - 16 oz ,",Drop shooter in glass. Fill with beer
3,Alcoholic,151 Florida Bushwacker,Beer mug,"Vanilla ice-cream, Coconut liqueur, Milk, Coin...","Vanilla ice-cream - 1 cup , Coconut liqueur - ...",Combine all ingredients. Blend until smooth. G...
4,Alcoholic,252,Shot glass,"Wild Turkey, 151 proof rum,","Wild Turkey - 1/2 shot , 151 proof rum - 1/2 s...","Add both ingredients to shot glass, shoot, and..."
5,Alcoholic,24k nightmare,Shot glass,"151 proof rum, Rumple Minze, Jägermeister, Gol...","151 proof rum - 1/2 oz Bacardi , Rumple Minze ...","Add over ice,shake and pour."
6,Alcoholic,3 Wise Men,Collins glass,"Jim Beam, Johnnie Walker, Jack Daniels,","Jim Beam - 1/3 oz , Johnnie Walker - 1/3 oz , ...",put them them in a glass... and slam it to tha...
7,Alcoholic,3-Mile Long Island Iced Tea,Collins Glass,"Lemon, Bitters, Sweet and sour, Coca-Cola, Vod...","Lemon - Garnish with, Bitters - 1 wedge , Swee...",Fill 14oz glass with ice and alcohol. Fill 2/3...
8,Alcoholic,410 Gone,Collins Glass,"Coca-Cola, Peach Vodka,","Coca-Cola, Peach Vodka - 2-3 oz,",
9,Alcoholic,50/50,Collins Glass,"Orange juice, Grand Marnier, Vanilla vodka,","Orange juice - Fill with , Grand Marnier - 1 s...",fill glass with crushed ice. Add vodka. Add a ...


In [60]:
# List the distinct drink names, i.e. the values that can be passed to similar_cocktail
cocktail_df.drink.unique()

array(['155 Belmont', '1-900-FUK-MEUP', '110 in the shade',
       '151 Florida Bushwacker', '252', '24k nightmare', '3 Wise Men',
       '3-Mile Long Island Iced Tea', '410 Gone', '50/50', '501 Blue',
       '57 Chevy with a White License Plate', '69 Special', '747',
       '747 Drink', '9 1/2 Weeks', 'A1', 'ABC', 'Ace', 'ACID', 'Adam',
       'AT&T', 'A. J.', 'Avalon', 'Apello', 'Affair', 'Abilene',
       'Almeria', 'Addison', 'Applecar', 'Acapulco', 'Affinity',
       'Aviation', 'After sex', 'Applejack', 'Afterglow', 'Afternoon',
       'Alexander', 'Autodafé', 'Allegheny', 'Americano', 'B-52', 'B-53',
       'Bijou', 'Boxcar', 'Big Red', 'Bellini', 'Bramble', 'Balmoral',
       'Bluebird', 'Brooklyn', 'Bora Bora', 'Boomerang', 'Barracuda',
       'Brigadier', 'Broadside', 'Buccaneer', 'Brain Fart', 'Blackthorn',
       'Bob Marley', 'Bible Belt', 'Bubble Gum', 'Bumble Bee',
       'Baby Eskimo', 'Boston Sour', 'Bahama Mama', 'Casino',
       'Cafe Savoy', 'Caipirinha', 'Cream Sod