### In this notebook I pull cocktails and their ingredients from the web and build an algorithm that finds cocktails that are similar to each other based on their ingredients. The end goal is to create a cocktail recommendation engine.

#### Step 1: Pull a list of cocktails and their characteristics from the website thecocktaildb.com. This website comes with a free API, and there are a few different ways to query it. The one that I chose to use is *List all cocktails by first letter*. Details about the API can be found at this [link](https://www.thecocktaildb.com/api.php).


In [1]:
# import necessary libraries

import pandas as pd
import numpy as np

import requests
from requests.auth import HTTPBasicAuth
import string
import warnings
import json

warnings.filterwarnings("ignore")

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

In [2]:
def get_url_list(characters=None):
    """Build one "search by first letter" URL per candidate first character.

    Parameters
    ----------
    characters : iterable of str, optional
        The first characters to query. Defaults to ``string.printable``
        (digits, lower- and uppercase letters, punctuation, whitespace),
        which is what this notebook has always used.

    Returns
    -------
    list of str
        One URL per character, in the order the characters were given.
    """
    from urllib.parse import quote  # local import: not part of the notebook's import cell

    if characters is None:
        # NOTE(review): this default queries upper- and lowercase letters
        # separately. If the API matches case-insensitively (not verifiable
        # from here) every cocktail is fetched twice; passing
        # string.digits + string.ascii_lowercase would avoid that.
        characters = string.printable

    main_url = 'https://www.thecocktaildb.com/api/json/v1/1/search.php?f='

    # Percent-encode every character: raw '#', '&', '+', '%' or whitespace
    # would otherwise end or corrupt the query string instead of being sent
    # as the value of ``f``.
    return [main_url + quote(ch, safe='') for ch in characters]

In [3]:
def scrape_cocktail_list():
    """Download the API response for every URL produced by ``get_url_list``.

    Returns
    -------
    list
        The decoded JSON payload of each successful request, in URL order.
        Requests that fail (network error, HTTP error status, body that is
        not valid JSON) are skipped, so the list may be shorter than the
        URL list.
    """
    cocktail_list = []

    for url in get_url_list():
        try:
            # TLS verification is left enabled (the previous verify=False
            # accepted any certificate), and the timeout keeps one stalled
            # request from hanging the whole notebook.
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            cocktail_list.append(response.json())
        except (requests.RequestException, ValueError) as err:
            # Best effort: keep going, but make the skipped URL visible.
            print(f"Skipping {url!r}: {err}")

    return cocktail_list

In [4]:
# Fetch all API responses once; the following cells work on this in-memory list.
cocktail_list = scrape_cocktail_list()

In [5]:
# here are all the columns that might be available for each cocktail
# (taken from the first drink of the second response; this assumes that
# response contains a non-empty 'drinks' list)

cocktail_list[1]['drinks'][0].keys()

dict_keys(['idDrink', 'strDrink', 'strDrinkAlternate', 'strTags', 'strVideo', 'strCategory', 'strIBA', 'strAlcoholic', 'strGlass', 'strInstructions', 'strInstructionsES', 'strInstructionsDE', 'strInstructionsFR', 'strInstructionsIT', 'strInstructionsZH-HANS', 'strInstructionsZH-HANT', 'strDrinkThumb', 'strIngredient1', 'strIngredient2', 'strIngredient3', 'strIngredient4', 'strIngredient5', 'strIngredient6', 'strIngredient7', 'strIngredient8', 'strIngredient9', 'strIngredient10', 'strIngredient11', 'strIngredient12', 'strIngredient13', 'strIngredient14', 'strIngredient15', 'strMeasure1', 'strMeasure2', 'strMeasure3', 'strMeasure4', 'strMeasure5', 'strMeasure6', 'strMeasure7', 'strMeasure8', 'strMeasure9', 'strMeasure10', 'strMeasure11', 'strMeasure12', 'strMeasure13', 'strMeasure14', 'strMeasure15', 'strImageSource', 'strImageAttribution', 'strCreativeCommonsConfirmed', 'dateModified'])

In [6]:
def get_ingredients(cocktail_dict):
    """Collect a drink's ingredients into a single comma-separated string.

    Reads ``strIngredient1``, ``strIngredient2``, ... from ``cocktail_dict``
    and stops at the first key that is missing or holds an empty/None value.

    Parameters
    ----------
    cocktail_dict : dict
        One drink record.

    Returns
    -------
    str
        The ingredients in reverse key order, each followed by ``", "``
        (e.g. ``'Tequila, Lager, '``). Empty string if there are none.
    """
    ingredient_list = ""
    i = 1

    while True:
        # .get() instead of [] so that a drink using every available
        # ingredient slot ends the loop instead of raising KeyError on the
        # first key past the last slot.
        ingredient = cocktail_dict.get('strIngredient' + str(i))
        if not ingredient:
            break

        # Prepending keeps the established output format (last ingredient first).
        ingredient_list = ingredient + ", " + ingredient_list
        i += 1

    return ingredient_list

In [9]:
def cocktail_data_clean(cocktail_list):
    """Flatten raw API responses into one small dict per cocktail.

    Parameters
    ----------
    cocktail_list : list
        Decoded API responses; each is expected to be a dict whose
        ``'drinks'`` entry is a list of drink records.

    Returns
    -------
    list of dict
        One dict per usable drink with the keys ``'drink'``,
        ``'ingredients'`` and ``'instructions'``.
    """
    cocktails_info = []

    for response in cocktail_list:
        # Responses without a usable list of drinks are ignored, as before.
        drinks = response.get('drinks') if isinstance(response, dict) else None
        if not isinstance(drinks, list):
            continue

        for drink in drinks:
            try:
                cocktail = {
                    'drink': drink['strDrink'],
                    'ingredients': get_ingredients(drink),
                    'instructions': drink['strInstructions'],
                }
            except (KeyError, TypeError):
                # Skip only the malformed record; previously one bad record
                # silently discarded every remaining drink of the response.
                continue

            cocktails_info.append(cocktail)

    return cocktails_info

In [10]:
# Reduce the raw API responses to name / ingredients / instructions per cocktail.
cocktails_info = cocktail_data_clean(cocktail_list)

# get 2 cocktails as an example
cocktails_info[2:4]

[{'drink': '110 in the shade',
  'ingredients': 'Tequila, Lager, ',
  'instructions': 'Drop shooter in glass. Fill with beer'},
 {'drink': '151 Florida Bushwacker',
  'ingredients': 'Vanilla ice-cream, Coconut liqueur, Milk, Cointreau, Dark Creme de Cacao, 151 proof rum, Light rum, Malibu rum, ',
  'instructions': 'Combine all ingredients. Blend until smooth. Garnish with chocolate shavings if desired.'}]

#### Step 2: Find cocktail similarity using a TF-IDF vectorizer and the cosine similarity of the ingredients

In [11]:
# One row per cocktail record, with the columns drink, ingredients and instructions.
cocktail_df = pd.DataFrame(cocktails_info)

In [12]:
# Preview the first rows of the frame.
cocktail_df.head()

Unnamed: 0,drink,ingredients,instructions
0,155 Belmont,"Orange juice, Vodka, Light rum, Dark rum,",Blend with ice. Serve in a wine glass. Garnish...
1,1-900-FUK-MEUP,"Pineapple juice, Cranberry juice, Amaretto, Ma...",Shake ingredients in a mixing tin filled with ...
2,110 in the shade,"Tequila, Lager,",Drop shooter in glass. Fill with beer
3,151 Florida Bushwacker,"Vanilla ice-cream, Coconut liqueur, Milk, Coin...",Combine all ingredients. Blend until smooth. G...
4,252,"Wild Turkey, 151 proof rum,","Add both ingredients to shot glass, shoot, and..."


In [13]:
def similar_cocktail(cocktail_df, chosen_cocktail):
    """Return the name of the cocktail whose ingredients best match the chosen one.

    Ingredients are turned into TF-IDF vectors and compared with cosine
    similarity. Candidates that match the chosen cocktail (near-)perfectly,
    i.e. the cocktail itself and exact duplicates of it, are excluded.

    Side effect: the ``drink``, ``ingredients`` and ``instructions`` columns
    of ``cocktail_df`` are lowercased and stripped of punctuation in place.
    This is kept because later cells in this notebook read the normalized
    columns.

    Parameters
    ----------
    cocktail_df : pandas.DataFrame
        Must contain the string columns ``drink``, ``ingredients`` and
        ``instructions``.
    chosen_cocktail : str
        Name of the cocktail to find a match for. It is normalized the same
        way as the ``drink`` column, so ``'A1'`` and ``'a1'`` are equivalent.

    Returns
    -------
    str
        The normalized name of the most similar cocktail.

    Raises
    ------
    KeyError
        If no cocktail with that name exists in ``cocktail_df``.
    """
    import re  # local import: not part of the notebook's import cell

    punctuation = r'[^\w\s]'

    # string pre-processing
    # regex=True is required: from pandas 2.0 on, str.replace treats the
    # pattern as a literal string by default and would strip nothing.
    for column in ('drink', 'ingredients', 'instructions'):
        cocktail_df[column] = (
            cocktail_df[column].str.lower().str.replace(punctuation, '', regex=True)
        )

    wanted = re.sub(punctuation, '', str(chosen_cocktail).lower())
    drink_names = cocktail_df['drink']
    positions = np.flatnonzero((drink_names == wanted).to_numpy())
    if positions.size == 0:
        raise KeyError(chosen_cocktail)

    # implement tf-idf vectorizer
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(cocktail_df['ingredients'])

    # Only the row of the chosen cocktail is needed, so compare that single
    # vector against all others instead of building the full n x n table.
    # If the name occurs more than once, the first occurrence is used.
    scores = cosine_similarity(X[int(positions[0])], X).ravel()

    # Drop perfect matches; the small tolerance covers self-similarities
    # that come out marginally below 1.0 due to floating-point rounding.
    scores[scores >= 1 - 1e-9] = 0

    # Return the whole name. The previous ``[...][0]`` lookup only worked
    # when the name was duplicated in the frame and otherwise returned just
    # the first character of the result.
    return drink_names.iloc[int(np.argmax(scores))]

In [20]:
# test 1 cocktail
# (similar_cocktail lowercases the drink names and strips their punctuation,
# so the name is given in that form)

similar_cocktail(cocktail_df, 'a1')

'talos coffee'

#### Quick exploration of the data

In [21]:
# Summary of the (by now normalized) text columns.
# NOTE(review): the recorded output shows 916 rows but only 440 unique drink
# names, so the frame contains duplicates — presumably because the API is
# queried with both upper- and lowercase first letters; confirm and
# de-duplicate before relying on the counts below.
cocktail_df.describe()

Unnamed: 0,drink,ingredients,instructions
count,916,916,916
unique,440,437,406
top,zorro,wild turkey 151 proof rum,shake all ingredients with ice strain into a c...
freq,4,7,24


In [22]:
# Split every ingredient string on whitespace and count how often each token occurs.
# Note that the counts are per word ("juice", "lemon"), not per full ingredient name.
new_df = cocktail_df.ingredients.str.split(expand=True).stack().value_counts().reset_index()
new_df.columns = ['ingredient', 'frequency']

# get the 10  most commonly used ingredients
new_df.head(10)

Unnamed: 0,ingredient,frequency
0,juice,400
1,lemon,236
2,rum,195
3,sugar,192
4,gin,184
5,orange,172
6,vodka,151
7,water,136
8,lime,135
9,cream,109


#### Experimenting with pickling the model file for deployment (optional)

In [24]:
import pickle

In [25]:
# NOTE: pickle stores a plain function by reference (its qualified name), not
# its code or any data, so this file can only be loaded where a function named
# similar_cocktail is already defined.
# The with-block makes sure the file is flushed and closed.
with open('similar_cocktail.pkl', 'wb') as pkl_file:
    pickle.dump(similar_cocktail, pkl_file)

In [26]:
# Only load pickle files you created yourself: unpickling untrusted data can
# execute arbitrary code. The with-block closes the file handle afterwards.
with open('similar_cocktail.pkl', 'rb') as pkl_file:
    pickled_rec = pickle.load(pkl_file)

In [29]:
# Call the unpickled function with the same arguments as the earlier test cell.
pickled_rec(cocktail_df, 'a1')

'talos coffee'