# Recommending the Ingredients

*The last ingredient of every recipe is the held-out target: we always tell our model to ignore it when building recommendations.*

In [1]:
# Mount Google Drive so the project files stored there are reachable from this Colab session.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import os
# Work from the project's code directory so the relative '../data' and '../code'
# paths used later in the notebook resolve.
# NOTE(review): this absolute path is specific to one Drive layout; adjust it when
# running from another account or machine.
os.chdir('/content/drive/My Drive/Colab Notebooks/final project/whattocook/code')
os.getcwd()

'/content/drive/My Drive/Colab Notebooks/final project/whattocook/code'

In [0]:
# Load packages
import numpy as np
import pandas as pd
import itertools
from collections import Counter
from gensim.models import Word2Vec
import sys
sys.path.append('../code/')
from text_preprocess import *

In [0]:
# Load data
# The context manager closes the file handle as soon as pandas has parsed it,
# instead of leaving it open for the rest of the kernel session.
with open('../data/train.json','r', encoding = 'utf-8') as f:
    data = pd.read_json(f)

In [0]:
# Clean the ingredient text data
# `text_preprocess` is expected to come from the star-import of the local
# text_preprocess module; presumably it returns one list of ingredient strings
# per recipe -- TODO confirm against that module.
corpus = text_preprocess(data)

In [6]:
# Number of ingredients in each cleaned recipe.
lengths = [len(recipe) for recipe in corpus]

# Size of the longest recipe and the cuisine it belongs to.
# np.argmax returns a position, so use .iloc: plain [] on a Series looks the
# value up as an index *label*, which only coincides with the position while
# `data` keeps its default 0..n-1 index.
# assumes corpus[i] corresponds to row i of `data` -- TODO confirm in text_preprocess
np.max(lengths), data['cuisine'].iloc[np.argmax(lengths)]

(65, 'italian')

In [7]:
# Flatten the corpus into one list of ingredient occurrences (duplicates kept),
# then count how many distinct ingredients it contains.
all_ingredients = list(itertools.chain.from_iterable(corpus))
len(np.unique(all_ingredients))

6687

## We will build the model evaluator below

In [0]:
import numpy as np
import random

class ModelEvaluator:
    """
    Hit-rate@n evaluator for ingredient recommenders.

    For every test recipe the last ingredient is the held-out target. A sample
    of ingredients that are absent from the recipe (50 by default in
    `evaluate_model`) is drawn as distractors, the model ranks the distractors
    together with the target, and the recipe counts as a hit when the target
    appears among the first `n` recommendations.

    Parameters
    ----------
    all_items : iterable of hashable
        Ingredient universe the distractors are drawn from (duplicates allowed).
    n : int
        Cut-off of the ranking that counts as a hit.
    """

    def __init__(self, all_items, n):
        self.all = all_items
        self.n = n
        # Distinct items in first-occurrence order, computed once. A fixed
        # order is what makes seeded sampling reproducible: iteration order of
        # a set of strings changes between interpreter runs.
        self._unique_items = list(dict.fromkeys(all_items))

    def get_missing_items(self, recipe, sample_size, seed = 208):
        """
        Draw `sample_size` distinct items that do not occur in `recipe`.

        The draw uses a private generator seeded with `seed`, so it is
        repeatable and leaves the global `random` state untouched.

        Returns
        -------
        set
            The sampled items.

        Raises
        ------
        ValueError
            If fewer than `sample_size` items are absent from the recipe.
        """
        present = set(recipe)
        pool = [item for item in self._unique_items if item not in present]

        # random.sample needs a sequence; passing a set is rejected by
        # Python 3.11+ and was order-dependent before that.
        rng = random.Random(seed)
        return set(rng.sample(pool, sample_size))

    def _if_in_topn_(self, actual_item, recommended_items, n):
        """Return 1 if `actual_item` is among the first `n` recommendations, else 0."""
        return int(actual_item in recommended_items[:n])

    def evaluate_model(self, model, X_te):
        """
        Compute the share of test recipes whose held-out ingredient is ranked
        in the model's top `n`.

        Parameters
        ----------
        model : object
            Must expose `recommend_items(recipe, candidates)` returning a
            ranked list of items, best first.
        X_te : iterable of sequences
            Test recipes; the last element of each is the held-out target.
            Empty recipes have no target and are skipped.

        Returns
        -------
        float
            Hits divided by the number of evaluated recipes.
        """
        hit_rate = []

        for i, recipe in enumerate(X_te):
            if i % 500 == 0:
                print('Iteration:', i)

            if len(recipe) == 0:
                continue  # nothing to hold out

            candidates = self.get_missing_items(recipe, 50)
            recommendation = model.recommend_items(recipe, candidates)
            hit_rate.append(self._if_in_topn_(recipe[-1], recommendation, self.n))

        return np.sum(hit_rate)/len(hit_rate)

# Shared evaluator for every model below: a hit means the held-out ingredient is
# in the top 5; candidate items are drawn from all ingredients of the full corpus.
evaluator = ModelEvaluator(all_items = all_ingredients, n = 5)

## Popularity Model (Baseline)

In [0]:
import pandas as pd
import numpy as np
from collections import Counter

class popularity:
    """
    Baseline recommender that ranks ingredients by how often they occur in the
    item list it was built from (a flat list of ingredient occurrences).

    The recipe's own contents are not used for scoring; only its last
    ingredient is added to the candidates so that it gets ranked too.
    """

    def __init__(self, all_items):
        self.all = all_items
        # Occurrence count per ingredient; unseen ingredients count as 0.
        self.popularity = Counter(all_items)

    def recommend_items(self, recipe, candidates):
        """
        Rank `candidates` plus the last ingredient of `recipe`, most frequent
        first, and return them as a list.
        """
        pool = [*candidates, recipe[-1]]
        frequencies = [self.popularity[item] for item in pool]

        # argsort is ascending, so walk it backwards for descending order
        ranking = np.argsort(frequencies)[::-1]

        return [pool[position] for position in ranking]

In [10]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the recipes (and their cuisine labels); random_state is fixed
# so the split is the same on every run.
corpus_tr, corpus_te, cuisine_tr, cuisine_te = train_test_split(corpus, data['cuisine'],
                                                                test_size = 0.2,
                                                                random_state = 0)
# Flatten each split into a single list of ingredient occurrences.
all_tr = list(itertools.chain.from_iterable(corpus_tr))
all_te = list(itertools.chain.from_iterable(corpus_te))

popular_model = popularity(all_items = all_tr) # counts come from the training split only
# Example call: popular_model.recommend_items(['salt','water'],['romaine lettuce','pepper','garlic'])

evaluator.evaluate_model(popular_model, corpus_te)


Iteration: 0
Iteration: 500
Iteration: 1000
Iteration: 1500
Iteration: 2000
Iteration: 2500
Iteration: 3000
Iteration: 3500
Iteration: 4000
Iteration: 4500
Iteration: 5000
Iteration: 5500
Iteration: 6000
Iteration: 6500
Iteration: 7000
Iteration: 7500


0.8295411690760528

## Collaborative Filtering Model

### Generate Data Matrix 

In [0]:
import numpy as np
import itertools

def corpus_to_matrix(corpus, all_ingredients = None):
    """
    One-hot encode a recipe corpus as a dense recipe-by-ingredient matrix of
    0's and 1's, to facilitate the calculation of cosine similarity.

    Parameters
    ----------
    corpus : sequence of sequences of str
        One inner sequence of ingredient names per recipe. A bare string is
        treated as a one-ingredient recipe rather than being scanned character
        by character.
    all_ingredients : sequence of str, optional
        Vocabulary that defines the columns (duplicates allowed; lists and
        NumPy arrays both work). Defaults to every ingredient in `corpus`.

    Returns
    -------
    numpy.ndarray
        Float array of shape (len(corpus), number of unique ingredients).
        Columns follow the sorted order produced by `np.unique`; entry [i, j]
        is 1 when recipe i contains ingredient j, else 0. Ingredients that are
        not part of the vocabulary are ignored.
    """
    rows = [[row] if isinstance(row, str) else row for row in corpus]

    # `is None`, not `== None`: comparing a NumPy array with == is elementwise
    # and cannot be used as a condition.
    if all_ingredients is None:
        all_ingredients = list(itertools.chain.from_iterable(rows))

    unique_ingredients = np.unique(all_ingredients)
    # Name -> column lookup, so each recipe costs O(len(recipe)) instead of a
    # scan over the whole vocabulary.
    column_of = {name: j for j, name in enumerate(unique_ingredients.tolist())}
    recipe = np.zeros([len(rows), len(unique_ingredients)])

    for i, ingredients in enumerate(rows):
        for name in ingredients:
            j = column_of.get(name)
            if j is not None:
                recipe[i][j] = 1

    return recipe

# recipe = corpus_to_matrix(corpus)

In [0]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

class collaborative:
    """
    Neighborhood-based recommender built on cosine similarity between one-hot
    recipe vectors. (Cosine similarity is computed on the raw 0/1 vectors; it
    is not mean-centered, so it is not the Pearson correlation.)

    For a new recipe, every ingredient except the last one (the held-out
    target) forms the query vector. The `n_neighbors` training recipes most
    similar to the query are selected, and the candidate items together with
    the last ingredient are ranked by how many of those neighbors contain them.

    Parameters
    ----------
    all_items : sequence of str
        Ingredient occurrences defining the vocabulary (duplicates allowed).
    corpus : list of lists of str
        Training recipes.
    n_neighbors : int, default 1000
        Neighborhood size; capped at the number of training recipes.
    """

    # NOTE: the `corpus` default is bound to the notebook-level `corpus` at the
    # moment this class is defined. Pass the training split explicitly so the
    # model is not fitted on test recipes.
    def __init__(self, all_items, corpus = corpus, n_neighbors = 1000):
        self.all = all_items
        self.unique = np.unique(all_items)
        self.n_neighbors = n_neighbors
        # Name -> column lookup matching the order of self.unique.
        self._column_of = {name: j for j, name in enumerate(self.unique.tolist())}
        # Build the matrix over the same vocabulary as self.unique so that
        # column j always refers to self.unique[j].
        self.recipe = corpus_to_matrix(corpus, all_items)

    def recommend_items(self, recipe, candidates):
        """
        Rank `candidates` plus the last ingredient of `recipe` by their
        frequency among the nearest training recipes, most frequent first.

        Items that never occur in the vocabulary get a count of 0.
        """
        candidates = list(candidates)
        candidates.append(recipe[-1])

        # Query vector from everything but the held-out last ingredient.
        query = np.zeros((1, len(self.unique)))
        for name in recipe[:-1]:
            j = self._column_of.get(name)
            if j is not None:
                query[0, j] = 1

        sim = cosine_similarity(self.recipe, query).ravel()
        k = min(self.n_neighbors, len(sim))
        nearest = np.argsort(sim)[::-1][:k]  # most similar first

        # Per-ingredient occurrence count within the neighborhood.
        neighbor_counts = self.recipe[nearest].sum(axis=0)

        counts = [
            neighbor_counts[self._column_of[name]] if name in self._column_of else 0
            for name in candidates
        ]

        ranking = np.argsort(counts)[::-1]  # descending order
        return [candidates[int(position)] for position in ranking]

In [0]:
# Build the collaborative model from the training split only.
collab_model = collaborative(all_items = all_tr, corpus = corpus_tr)
# Example call: collab_model.recommend_items(['salt','water'],['romaine lettuce','pepper','garlic'])

In [0]:
# Hit rate of the collaborative model on the held-out recipes, using the same
# evaluator as the popularity baseline.
evaluator.evaluate_model(collab_model, corpus_te)

Iteration: 0


Tradeoff: increasing the neighborhood size $\Leftrightarrow$ introducing bias to the small cuisines.