In [1]:
import math
import random
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tqdm.notebook as tqdm # progress bars
import implicit # Fast, sparse ALS implementation

from itertools import product

from scipy.sparse import csr_matrix
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Common Utilities

In [2]:
'''
Data Preprocessing
'''

def safe_train_test_split(users, items, test_size=0.25, random_state=0):
    '''
    Performs a train test split on interactions, guaranteeing that every user 
    will be represented in both the train and the test split (assuming each user appears twice).
    Note that sklearn's train_test_split function, even when stratified by user does not guarantee this,
    unless the test_size is such that every user should be in both splits (0.5 if only 2 reviews/user)
    
    users is a list of user_ids
    items is a list of items_ids
    test_size is the approximate proportion of samples to be considered "test samples"
    
    The indices of users and items should correlate to represent all positive interactions
    '''
    
    user_train = []
    user_test = []
    
    item_train = []
    item_test = []
    
    random.seed(random_state)
    
    for user_id in tqdm.tqdm(users.unique()):
        examples = list(users[users.values == user_id].index)
        random.shuffle(examples)
        
        # Not enough samples to perform proper split: pull one into test, then throw the rest in train
        if len(examples) * test_size < 1:
            user_test.append(user_id)
            item_test.append(items[examples[0]])
            
            for i in range(1, len(examples)):
                user_train.append(user_id)
                item_train.append(items[examples[i]])
            
        # Enough to perform proper split: throw into train and test according to test_size
        else:
            test_samples = int(len(examples) * test_size)
            
            for i in range(0, test_samples):
                user_test.append(user_id)
                item_test.append(items[examples[i]])
                
            for j in range(test_samples, len(examples)):
                user_train.append(user_id)
                item_train.append(items[examples[j]])
            
    return user_train, user_test, item_train, item_test

def make_recipe_ingr_xref(recipes):
    '''
    recipes is the pandas dataframe for preprceossed recipes. The ingredient_ids column should store lists of integers
    
    Returns new pandas dataframe, a cross-reference table for all recipes and ingredients
    '''
    
    recipe_ids = []
    ingr_ids = []
    
    for row in tqdm.tqdm(recipes['ingredient_ids'].index):
        for ingr_id in recipes['ingredient_ids'][row]:
            recipe_ids.append(recipes.loc[row, 'i'])
            ingr_ids.append(ingr_id)
            
    return pd.DataFrame.from_dict({'i': recipe_ids, 'ingr': ingr_ids})

In [3]:
'''
Reccomendation Generation
'''

def get_positive_interactions(interactions, user_id):
    '''
    interactions is a pandas dataframe describing all known positive interactions, for filtering purposes
    user_id is an integer specifying the user for which to pull the positive interactions
    
    returns a list of recipe_ids, specifying the recipes the given user has interacted with
    '''
    
    return interactions[interactions['u'] == user_id]['i'].values

def get_reccomendations(predictions, k, filtered_recipe_ids=[], filter_val=-100):
    '''
    General purpose function for pulling reccomendations based on a value for each recipe
    
    predictions is a numpy array of length num_recipes, that assigns a value to each recipe
    k is an integer, the number of reccomendations to generate
    filtered_recipe_ids is a list of recipe_ids, specifying which recipes should not be included in the reccomendations
    filter_val is the value that replaces filtered recipe_ids for filtering purposes
    
    Returns recipe_ids, a list of the top k reccomended recipes, with index 0 being the most reccomended
    '''
    
    # Replace all recipes to be filtered out with the filter value, which should be a lower value
    predictions[filtered_recipe_ids] = filter_val
    
    # Get the top k reccomendations for the user (NOTE: these are not in order)
    unsorted_recs = np.argpartition(predictions, -k)[-k:]
    
    # Sort the reccomendations by their predictions values, in descending order
    sorted_recs = sorted(unsorted_recs, key=lambda rec: predictions[rec], reverse=True)

    return sorted_recs

def collab_filter_reccomendations(item_embedding, user_embedding, user_id, k, interactions=None, filter_recs=False):
    '''
    item embedding is of shape (items, features) describing the features for each item
    user_embedding is of shape (users, features) describing the features for each user
    user_id is an integer specifying the user to get reccomendations for
    k is an integer specifying the number of reccomendations to pull
    interactions is a pandas dataframe describing all known positive interactions, for filtering purposes
        Note: interactions is only required if filter_recs is true
    filter_recs is a boolean, specifying whether or not to filter known positive interactions specified in interactions
    
    Returns a list of recipe_ids similar to the general-purpose get_reccomendations function
    '''
    
    predictions = item_embedding @ user_embedding[user_id, :]
    
    filtered_recipe_ids = get_positive_interactions(interactions, user_id) if filter_recs else []

    return get_reccomendations(predictions, k, filtered_recipe_ids)

def collab_filter_train_test_matrix(item_embedding, user_embedding, k, interactions):
    '''
    parameters are defined as for collab_filter_reccomendations
    
    Returns a tuple (train_rec_matrix, test_rec_matrix) where both are matrices of size (num_users, k) providing
        the top k reccomendations for each user (most reccomended at index 0). train_rec_matrix does not filter known
        positive interactions, while test_rec_matrix does filter known positive interactions
    '''
    
    train_rec_matrix = []
    test_rec_matrix = []
    
    for user_id in tqdm.tqdm(range(user_embedding.shape[0])):
        predictions = item_embedding @ user_embedding[user_id, :]
        filtered_recipe_ids = get_positive_interactions(interactions, user_id)
        
        train_recs = get_reccomendations(predictions, k)
        test_recs = get_reccomendations(predictions, k, filtered_recipe_ids)

        train_rec_matrix.append(train_recs)
        test_rec_matrix.append(test_recs)

    return np.array(train_rec_matrix), np.array(test_rec_matrix)

def similar_recipes(embeddings, recipe_id, k):
    '''
    Embeddings is a matrix of shape (num_recipes, num_feature)
    recipe_id is an integer, the recipe to find similar recipes to
    
    Note that embeddings could be tf-idf embeddings to produce content-based reccomendations ("similar recipes")
        OR embeddings could be item_embeddings generated from collaborative filtering ("customers who liked this recipe also liked")
    
    Returns a list of recipe_ids, the most similar recipes to the given recipe
    '''
    
    # Values for each recipe based on similarity to given recipe
    similarities = cosine_similar_predictions(embeddings, embeddings[recipe_id])
    
    # Get reccomendations, filtering out the queries recipe
    return get_reccomendations(similarities, k, filtered_recipe_ids=[recipe_id])

def mean_profile_reccomendations(embeddings, user_id, k, interactions, filter_recs=False):
    '''
    Parameters defined as above
    
    Returns a list of recipe_ids, reccomended based on the mean embedding of all recipes the user has interacted with
    '''
    
    interacted_ids = get_positive_interactions(interactions, user_id)
    
    interacted_embeddings = embeddings[interacted_ids]

    average_embedding = np.mean(interacted_embeddings, axis=0)

    similarities = cosine_similar_predictions(embeddings, average_embedding)

    return get_reccomendations(similarities, k, filtered_recipe_ids=interacted_ids if filter_recs else [])

def mean_profile_train_test_matrix(embeddings, k, interactions):
    '''
    Produces a matrix of size (num_users, k) providing the top k reccomendations for each user
    
    This takes like 12 minutes to run with tf_idf embeddings, I think because of the ~8000 features
    This takes 6 minutes for collab_filter embeddings, with only 8 features. Probably longer b/c it normalizes similarities
    '''
    
    train_rec_matrix = []
    test_rec_matrix = []
    
    for user_id in tqdm.tqdm(range(max(interactions['u']) + 1)):
    #for user_id in tqdm.tqdm(range(10)):
        interacted_ids = get_positive_interactions(interactions, user_id)
        interacted_embeddings = embeddings[interacted_ids]

        average_embedding = np.mean(interacted_embeddings, axis=0)
        similarities = cosine_similar_predictions(embeddings, average_embedding)

        train_recs = get_reccomendations(similarities, k)
        test_recs = get_reccomendations(similarities, k, filtered_recipe_ids=interacted_ids)

        train_rec_matrix.append(train_recs)
        test_rec_matrix.append(test_recs)
        
    return np.array(train_rec_matrix), np.array(test_rec_matrix)

def random_rec_matrix(num_users, num_recipes, k):
    '''
    Function to create a dummy reccomendation matrix, as a baseline for other models
    '''
    
    rec_matrix = []
    
    for user_id in tqdm.tqdm(range(num_users)):
        rec_matrix.append([random.randint(0, num_recipes - 1) for i in range(k)])
        
    return np.array(rec_matrix)

In [4]:
#print(mean_profile_reccomendations(tf_idf_embeddings, 0, 20, train_interactions))
#print(mean_profile_reccomendations(model.item_factors, 0, 20, train_interactions))
#print(collab_filter_reccomendations(model.item_factors, model.user_factors, 0, 20, train_interactions))
#print(train_interactions[train_interactions['u'] == 0]['i'].values)

# Performance suggests this method is poor at generalizing to new recipes...
'''
train_rec_matrix, test_rec_matrix = mean_profile_train_test_matrix(tf_idf_embeddings, 10, train_interactions)
print(mean_precision_at_k(train_matrix, positive_train_interactions, train_rec_matrix, adjusted=True)) # 0.597
print(mean_precision_at_k(test_matrix, positive_test_interactions, test_rec_matrix, adjusted=True)) # 0.001286
'''

# Performance also suggests poor generalization, but better generalization than tf_idf reccomendations
'''
train_rec_matrix, test_rec_matrix = mean_profile_train_test_matrix(model.item_factors, 10, train_interactions)
print(mean_precision_at_k(train_matrix, positive_train_interactions, train_rec_matrix, adjusted=True)) # 0.222729
print(mean_precision_at_k(test_matrix, positive_test_interactions, test_rec_matrix, adjusted=True)) # 0.00817
'''

'\ntrain_rec_matrix, test_rec_matrix = mean_profile_train_test_matrix(model.item_factors, 10, train_interactions)\nprint(mean_precision_at_k(train_matrix, positive_train_interactions, train_rec_matrix, adjusted=True)) # 0.222729\nprint(mean_precision_at_k(test_matrix, positive_test_interactions, test_rec_matrix, adjusted=True)) # 0.00817\n'

# TF-IDF Similarity Measurements (Content-Based Filtering)

In [5]:
'''
Content-based filtering utilities
'''

def get_tf_idf_embeddings(recipe_ingr_xref):
    '''
    recipe_ingr_xref is a pandas Dataframe, like the one returned from makr_recipe_ingr_xref
    
    returns a sparse matrix of tf-idf embeddings for each recipe
    ''' 
    
    # Create the document frequency matrix
    ingr_freq = csr_matrix((np.ones(len(recipe_ingr_xref)), (recipe_ingr_xref['i'], recipe_ingr_xref['ingr'])))
    
    # Generate the tf-idf embeddings from the document frequency matrix
    tf = TfidfTransformer()
    tf_idf_embeddings = tf.fit_transform(ingr_freq) 
    
    return tf_idf_embeddings

def cosine_similar_predictions(embeddings, query):
    '''
    embeddings is a matrix of shape (num_recipes, num_features)
    query is a list of length num_features
    '''
    
    return cosine_similarity(embeddings, query.reshape(1, -1)).squeeze()
    

In [6]:
# Load the data
recipes = pd.read_csv(r'E:\TFRS\food.com recipes and interactions\PP_recipes.csv')
ingr_map = pd.read_pickle(r'E:\TFRS\food.com recipes and interactions\ingr_map.pkl')
raw_recipes = pd.read_csv(r'E:\TFRS\food.com recipes and interactions\RAW_recipes.csv')

# Transform all the ingredient id strings into lits of integers
recipes['ingredient_ids'] = recipes['ingredient_ids'].map(lambda str: [int(ingr_id) for ingr_id in str[1:-1].split(', ')])

# Create a cross-reference tables for all recipes and ingredients
recipe_ingr_xref = make_recipe_ingr_xref(recipes)

# Get the tf-idf embeddings
tf_idf_embeddings = get_tf_idf_embeddings(recipe_ingr_xref)

# Merge preprocessed and raw recipe tables
full_recipes = recipes.merge(raw_recipes, left_on='id', right_on='id')

  0%|          | 0/178265 [00:00<?, ?it/s]

In [8]:
recipe_id = 13
k = 10

content_recs = similar_recipes(tf_idf_embeddings, recipe_id, k)
collab_recs = similar_recipes(model.item_factors, recipe_id, k)

print(f'If you liked {full_recipes[full_recipes["i"] == recipe_id].iloc[0]["name"]}, you might also like:')
for i, rec in enumerate(content_recs):
    print(full_recipes[full_recipes['i'] == rec].iloc[0]['name'])
    
print()    

print(f'Customers who liked {full_recipes[full_recipes["i"] == recipe_id].iloc[0]["name"]} also liked:')
for i, rec in enumerate(collab_recs):
    print(full_recipes[full_recipes['i'] == rec].iloc[0]['name'])

NameError: name 'model' is not defined

In [None]:
# Old-style content-based reccomendations

# Pull so many more similar recipes to a given recipe
# We can't compute the entire similarity matrix: it's too much data
# Seems to work reasonably well, though, based on recipe names!
'''
TEST_RECIPE = 0
NUM_RECIPES = 10 # (the first one is the recipe itself)

similarities = cosine_similarity(embeddings, embeddings[TEST_RECIPE])
most_similar = np.argsort(similarities.squeeze())[-NUM_RECIPES:][::-1]
print(most_similar)
raw_recipes[raw_recipes['id'].isin(recipes[recipes.index.isin(most_similar)]['id'].values)]
'''

# Collaborative Filtering Model (Matrix Completion)

A parameter search across iterations revealed that a model with 8 factors, 8 alpha performed much better at 30 iterations than at 15 (relatively speaking)

### Run the following 2 cells to get all functions for collaborative filtering

In [9]:

'''
Various metrics for evaluating collaborative filtering models

Metric implementations based in part from: https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/average_precision.py
Formulas from: http://sdsawtelle.github.io/blog/output/mean-average-precision-MAP-for-recommender-systems.html
'''

# Individual user metrics
###########################

def precision_at_k(interaction_matrix, positive_interactions, user_id, recs):
    '''
    interaction_matrix is a matrix of shape (items, users) that you wish to evaluate reccomendations on
    positive_interactions should be a Pandas Series mapping user_id to the number positive interactions they have in the given matrix
    user_id is the integer of the user to evaluate
    recs is a list of k recipes ids that are being reccomended, in order
    
    Returns the precision@k: the proportion of reccomendations made that are relevant out of total reccomendations
    '''
    
    relevant_reccomendations = interaction_matrix[recs, user_id].sum()
    
    return relevant_reccomendations / len(recs)

def adjusted_precision_at_k(interaction_matrix, positive_interactions, user_id, recs):
    '''
    Similar to precision_at_k, but will adjust the precision if there are fewer relevant items than reccomendations made
    '''

    relevant_reccomendations = interaction_matrix[recs, user_id].sum()
    max_possible_relevant = min(len(recs), positive_interactions[user_id])
    
    return relevant_reccomendations / max_possible_relevant
    
def average_precision(interaction_matrix, positive_interactions, user_id, recs, adjusted=False):
    '''
    Parameters defined as above
    
    adjusted determines whether or not to use adjust precision@k when computing average precision
    '''
    
    precision_func = adjusted_precision_at_k if adjusted else precision_at_k
    
    precisions = [precision_func(interaction_matrix, positive_interactions, user_id, recs[:k+1]) for k in range(len(recs)) if interaction_matrix[recs[k], user_id] == 1]
    
    return (1 / positive_interactions[user_id]) * sum(precisions)

# Aggregate metrics
######################

def mean_precision_at_k(interaction_matrix, positive_interactions, rec_matrix, adjusted=False):
    '''
    rec_matrix is a matrix of shape (users, reccomendations) with item_ids recomended for each user
    
    Computes the mean precision at k across all users
    '''
    
    precision_func = adjusted_precision_at_k if adjusted else precision_at_k
    
    total = 0
    
    for user_id in tqdm.tqdm(range(interaction_matrix.shape[1])):
        total += precision_func(interaction_matrix, positive_interactions, user_id, rec_matrix[user_id])
    
    return total / interaction_matrix.shape[1]

def mean_average_precision(interaction_matrix, positive_interactions, rec_matrix, adjusted=False):
    '''
    Computes the mean average precision across all users (excuse the double "mean")
    '''

    total = 0
    
    for user_id in tqdm.tqdm(range(interaction_matrix.shape[1])):
        total += average_precision(interaction_matrix, positive_interactions, user_id, rec_matrix[user_id], adjusted=adjusted)
    
    return total / interaction_matrix.shape[1]
    
'''
interaction_matrix = np.array([[0, 0, 1, 0, 0, 1, 1, 0, 0], [0, 0, 0, 1, 0, 0, 1, 0, 0]]).T
positive_interactions = interaction_matrix.sum(axis=0)

user_id = 0
recs = [2, 3, 4, 5, 6]

average_precision(interaction_matrix, positive_interactions, user_id, recs) # Should be 0.7 if everything is correct
'''

'\ninteraction_matrix = np.array([[0, 0, 1, 0, 0, 1, 1, 0, 0], [0, 0, 0, 1, 0, 0, 1, 0, 0]]).T\npositive_interactions = interaction_matrix.sum(axis=0)\n\nuser_id = 0\nrecs = [2, 3, 4, 5, 6]\n\naverage_precision(interaction_matrix, positive_interactions, user_id, recs) # Should be 0.7 if everything is correct\n'

In [11]:
'''
Data loading and splitting
'''

# NOTE: these splits do not all represent all users. Training data does, but test and validation are missing some users
# Also note there is a very large range in interaction number: between 2 and 6000 reviews per person (25% are just 2)
df1 = pd.read_csv(r'E:\TFRS\food.com recipes and interactions\interactions_train.csv')
df2 = pd.read_csv(r'E:\TFRS\food.com recipes and interactions\interactions_test.csv')
df3 = pd.read_csv(r'E:\TFRS\food.com recipes and interactions\interactions_validation.csv')

# Combine the interaction data together, so that we can create our own splits
# 718379 interactions (raw)
df = pd.concat((df1, df2, df3), ignore_index=True)

# Dropping all the interactions that are less than 3
# This complicates things, because we need to reset the user ids
# I think this is something to do in the future, but will require some more work
'''
# Dropping all ratings that are less than 3 (consider those negative interations)
# 689321 interactions (losing 29,000 interactions)
df.drop(df[df['rating'] < 3].index, inplace=True)

# Dropping all users that now have less than 2 interactions (lose 552 users, 552 interactions)
# 688769 interactions
too_few = df.groupby('u')['rating'].count() < 2 # Dataframe storing True/False for each user having too few ratings
too_few_users = too_few.index[too_few] # Stores a list of user ids with too few ratings

df.drop(df[df['u'].isin(too_few_users)].index, inplace=True)

# Resulting interaction matrix is 24392 x 173600 (99.98% sparse, which does seem high...)
# Can consider eliminating some recipes with fewer than 1 rating (but that is 80601 recipes, nearly 50%)
'''

# In theory, we could do k-fold cross validation, but then we'd need to have k interactions per user
# So I think deciding to change the minimum number of reviews is a precondition to even attempting k-fold cross validation

user_train, user_test, item_train, item_test, =\
    safe_train_test_split(df['u'], df['i'], test_size=0.25, random_state=1)

# Total users: 25075
# Total items: 178264

# Build the new dataframes from the train-test split
train_interactions = pd.DataFrame({'u': user_train, 'i': item_train, 'rating': 1})
test_interactions = pd.DataFrame({'u': user_test, 'i': item_test, 'rating': 1})

# Store the number of positive interactions associated with each user to avoid recomputing for precision@k metrics
positive_train_interactions = train_interactions.groupby('u')['rating'].count()
positive_test_interactions = test_interactions.groupby('u')['rating'].count()

# Supplement both with a 0 rating at the max user and recipe id to make matrices the same size
# This is a dumb solution to the problem of some recipes only appearing once
# I can only stratify across one thing anyways, so I think we're going to have problems with some recipes
# having a learned embedding. The good news is the metrics will be more user-focused, so it should be OK

# Basically, this is a hacky solution, but I think it's the best way to do it (without eliminating massive
# swathes of data and creating a complicated dual-class stratification algorithm which probably won't even
# end up working because users and recipes probably aren't distributed nicely)

train_interactions = train_interactions.append({'u': max(df['u']), 'i': max(df['i']), 'rating': 0}, ignore_index=True)
test_interactions = test_interactions.append({'u': max(df['u']), 'i': max(df['i']), 'rating': 0}, ignore_index=True)

train_matrix = csr_matrix((np.ones(len(train_interactions)), (train_interactions['i'], train_interactions['u'])))
test_matrix = csr_matrix((np.ones(len(test_interactions)), (test_interactions['i'], test_interactions['u'])))
complete_matrix = train_matrix + test_matrix

  0%|          | 0/25076 [00:00<?, ?it/s]

AttributeError: 'DataFrame' object has no attribute 'append'

In [None]:
# Note: mean precision takes a lot longer when you include 100 reccomendations
'''
train_p_at_k = mean_precision_at_k(train_matrix, positive_train_interactions, train_rec_matrix[:, :10], adjusted=False)
test_p_at_k = mean_precision_at_k(test_matrix, positive_test_interactions, test_rec_matrix[:, :10], adjusted=False)

train_adj_p_at_k = mean_precision_at_k(train_matrix, positive_train_interactions, train_rec_matrix[:, :10], adjusted=True)
test_adj_p_at_k = mean_precision_at_k(test_matrix, positive_test_interactions, test_rec_matrix[:, :10], adjusted=True)

train_mean_p = mean_average_precision(train_matrix, positive_train_interactions, train_rec_matrix[:, :10], adjusted=False)
test_mean_p = mean_average_precision(test_matrix, positive_test_interactions, test_rec_matrix[:, :10], adjusted=False)

train_mean_p_adj = mean_average_precision(train_matrix, positive_train_interactions, train_rec_matrix[:, :10], adjusted=True)
test_mean_p_adj = mean_average_precision(test_matrix, positive_test_interactions, test_rec_matrix[:, :10], adjusted=True)

rand_p_at_k = mean_precision_at_k(test_matrix, positive_test_interactions, random_matrix[:, :10], adjusted=False)
rand_adj_p_at_k = mean_precision_at_k(test_matrix, positive_test_interactions, random_matrix[:, :10], adjusted=True)
rand_mean_p = mean_average_precision(test_matrix, positive_test_interactions, random_matrix[:, :10], adjusted=False)
rand_mean_p_adj = mean_average_precision(test_matrix, positive_test_interactions, random_matrix[:, :10], adjusted=True)
'''

In [None]:
'''
train_rec_matrix, test_rec_matrix = train_test_rec_matrix(model.item_factors, model.user_factors, 100, train_interactions)
random_matrix = random_rec_matrix(train_matrix, 100)

assert mean_precision_at_k(train_matrix, positive_train_interactions, test_rec_matrix, adjusted=False) == 0
'''

In [None]:
'''
print(train_p_at_k, test_p_at_k, rand_p_at_k)
print(train_adj_p_at_k,test_adj_p_at_k, rand_adj_p_at_k)
print(train_mean_p, test_mean_p, rand_mean_p)
print(train_mean_p_adj, test_mean_p_adj, rand_mean_p_adj)
'''
'''
Pre proper filtering (10 recs):
0.09332828202262554 0.006751475514435921 2.7915137980539158e-05
0.24018909499248356 0.024444925521652214 4.320199925559632e-05
0.1406715379458244 0.00753182893600831 5.866304081570052e-06
0.17384917444761513 0.0185085742058478 7.638693794620157e-06

Proper filtering @10 recs
0.09339208805229554 0.00802759610783195 4.386664539799011e-05
0.2378505058906556 0.02794409756245771 7.975753708725475e-05
0.13841172806680735 0.009886528168548337 2.1529499979817866e-05
0.17088549890951885 0.021114400425549178 4.146888425163155e-05

Proper filtering @100 recs
0.02811652576167531 0.003499361939703625 4.426543308342641e-05
0.3901281604724576 0.09905876639766126 0.0007262624622543001
0.15152008275914267 0.012511155940690546 3.548401140167055e-05
0.25815126573941893 0.07473983983342303 0.0004636069582234532


'''

In [None]:
def make_results_dict():
    return {
        'factors': [],
        'regularization': [],
        'alpha': [],
        'iterations': [],
        'train_p_at_k': [],
        'test_p_at_k': [],
        'train_mean_p': [],
        'test_mean_p': []
    }

def compute_metrics(model, step, k, train_interactions, factors, regularization, alpha, results):
    def compute(iteration, time):
        if (iteration + 1) % step == 0:
            train_rec_matrix, test_rec_matrix = collab_filter_train_test_matrix(model.item_factors, model.user_factors, k, train_interactions)
            
            train_adj_p_at_k = mean_precision_at_k(train_matrix, positive_train_interactions, train_rec_matrix, adjusted=True)
            test_adj_p_at_k = mean_precision_at_k(test_matrix, positive_test_interactions, test_rec_matrix, adjusted=True)

            train_mean_p_adj = mean_average_precision(train_matrix, positive_train_interactions, train_rec_matrix, adjusted=True)
            test_mean_p_adj = mean_average_precision(test_matrix, positive_test_interactions, test_rec_matrix, adjusted=True)
            
            results['factors'].append(factors)
            results['regularization'].append(regularization)
            results['alpha'].append(alpha)
            results['iterations'].append(iteration+1)
            
            results['train_p_at_k'].append(train_adj_p_at_k)
            results['test_p_at_k'].append(test_adj_p_at_k)
            results['train_mean_p'].append(train_mean_p_adj)
            results['test_mean_p'].append(test_mean_p_adj)
            
            print(f'Iteration: {iteration + 1}')
            print(f'Adjusted precision@k: Train: {train_adj_p_at_k}, Test: {test_adj_p_at_k}')
            print(f'Adjusted mean precision: Train: {train_mean_p_adj}, Test: {test_mean_p_adj}')
            print()
    
    return compute

def train_model(factors, regularization, alpha, iterations, metric_steps, train_interactions, results, k=10, random_state=0):
    model = implicit.als.AlternatingLeastSquares(factors=factors, iterations=iterations, regularization=regularization, random_state=random_state)
    
    model.fit_callback = compute_metrics(model, metric_steps, k, train_interactions, factors, regularization, alpha, results)
    
    model.fit(alpha * train_matrix)
    
    return model

In [None]:
factors = [8]
regularizations = [0.1]
alphas = [64]
iteration_step = 30
max_iterations = 30

k = 10

results = make_results_dict()

for factor, regularization, alpha in product(factors, regularizations, alphas):
    print(f'Factors: {factor}, regularization: {regularization}, alpha: {alpha}')
    
    model = train_model(factor, regularization, alpha, max_iterations, iteration_step, train_interactions, results)

In [None]:
'''
A bunch of functions that I don't think I need anymore, but am keeping just in case

Implementations for collaborative filtering reccomendation generation that compute things in blocks.
This was an attempt to speed up reccomendation generation b/c it allows for faster matrix multiplication.
However, getting the top k elements takes far longer, so the improvement is margianl on CPU, and non-existent on GPU

Also, some older implemenetations of reccomendation generation
'''

def get_block_predictions(item_matrix, user_matrix, start_user_id, step):
    return (item_matrix @ user_matrix[start_user_id:start_user_id + step, :].T).T

def get_block_reccomendations(predictions, k):
    # Get the top k reccoemndations for each user (row) in the prediction matrix provided
    unsorted_recs = np.argpartition(predictions, -k)[:, -k:]
    
    # Pull the predicted values for those reccomendations
    rec_values = np.take_along_axis(predictions, unsorted_recs, axis=1)
    
    # Get the indices to sort the reccoemndation by predicted values
    rec_order = np.argsort(rec_values)[:, ::-1]
    
    # Use the sorted indices to get the reccomendations in sorted order
    sorted_recs = np.take_along_axis(unsorted_recs, rec_order, axis=1)
    
    return sorted_recs

def get_block_rec_matrix(item_matrix, user_matrix, k, block_size=4):
    rec_matrix = []
    
    for start_user_id in tqdm.tqdm(range(0, user_matrix.shape[0], block_size)):
        predictions = get_block_predictions(item_matrix, user_matrix, start_user_id, block_size)
        recs = get_block_reccomendations(predictions, k, train_interactions, filter_recs)
        
        rec_matrix.append(recs)

    return np.vstack(rec_matrix)

def train_test_block_rec_matrix(item_matrix, user_matrix, k, train_interactions, block_size=4):
    train_rec_matrix = []
    test_rec_matrix = []
    
    for start_user_id in tqdm.tqdm(range(0, user_matrix.shape[0], block_size)):
        # Get the predictions without filtering out known-positives from the test set
        predictions = get_block_predictions(item_matrix, user_matrix, start_user_id, block_size)
        train_recs = get_block_reccomendations(predictions, k)
        
        # Get the ids of the interactions to filter out
        filter_idxs = np.array([train_interactions[train_interactions['u'] == user_id]['i'].values for user_id in range(start_user_id, start_user_id+block_size)])
        print(filter_idxs)
        
        # Give the reccomendations from the train set low values to prevent them from being reccomended
        zeros = np.array([np.full_like(f, 0) for f in filter_idxs])
        test_predictions = np.put_along_axis(predictions, filter_idxs, zeros, axis=1)
        
        # Regenerate reccomendations with the filtered predictions
        test_recs = get_block_reccomendations(test_predictions, k)
        
        train_rec_matrix.append(train_recs)
        test_rec_matrix.append(test_recs)

    return np.vstack(train_rec_matrix), np.vtack(test_rec_matrix)

def custom_create_rec_matrix(item_matrix, user_matrix, k, train_interactions=None, filter_recs=False):
    '''
    Returns a numpy array of size (users, k) with the top k reccomendations for each user
    '''

    rec_matrix = []
    
    for user_id in tqdm.tqdm(range(user_matrix.shape[0])):
        recs = get_reccomendations(item_matrix, user_matrix, user_id, k, train_interactions, filter_recs)[0]
        
        rec_matrix.append(recs)

    return np.array(rec_matrix)

def create_rec_matrix(model, train_matrix, k, test_mode=False):
    '''
    model is implicit reccomendation model
    train_matrix is the interaction matrix the model trained on
    k is the number of reccomendations to generate per user
    test mode: if true, will filter out recomendations from the train matrix
    '''
    
    rec_matrix = []
    
    for user_id in tqdm.tqdm(range(train_matrix.shape[1])):
        recs = [rec[0] for rec in model.recommend(user_id, train_matrix.T, k, filter_already_liked_items=test_mode)]
        
        rec_matrix.append(recs)

    return np.array(rec_matrix)

def get_reccomendations(item_matrix, user_matrix, user_id, k, train_interactions=None, filter_recs=False):
    '''
    item matrix is of shape (items, features) describing the features for each item
    user_matrix is of shape (users, features) describing the features for each user
    user_id is an integer, the user to get reccomendations for
    train_interactions is a pandas dataframe describing all positive interactions the model trained on (faster than interaction matrix)
    filter determines whether or not to eliminate reccomendations that have a positive interaction in the train_matrix
    
    Returns a tuple, (reccomendations, values) where reccomendations is a list of length k with reccomendation indices, and values gives 
    
    Note: this was created because implicit's built in reccomend function is slow, and doesn't filter out the training reccomendations
    '''
    
    # Get all predicted values for the user
    predictions = item_matrix @ user_matrix[user_id, :]
    
    if filter_recs:
        #trained_recs_a = np.nonzero(train_matrix[:, user_id])[0]
        trained_recs = train_interactions[train_interactions['u'] == user_id]['i'].values
        
        predictions[trained_recs] = 0
    
    # Get the top k reccomendations for the user(NOTE: this function does not return them in sorted order)
    unsorted_recs = np.argpartition(predictions, -k)[-k:]
    
    # Sort the reccomendations produced by their values, in descending order (that's how the metrics expect them)
    sorted_recs = sorted(unsorted_recs, key=lambda rec: predictions[rec], reverse=True)
    
    # Get the values associated with these reccomendations
    sorted_values = predictions[sorted_recs]
    
    return sorted_recs, sorted_values
    
def train_test_reccomendations(item_matrix, user_matrix, user_id, k, train_interactions):
    '''
    Produces both train and test reccomendations for a user at once, which should be more efficient
    '''
    
    # Get all predicted values for the user
    predictions = item_matrix @ user_matrix[user_id, :]
    
    # Get unfiltered reccomendations
    unsorted_recs = np.argpartition(predictions, -k)[-k:]
    sorted_recs = sorted(unsorted_recs, key=lambda rec: predictions[rec], reverse=True)
    
    # Filter the predictions
    trained_recs = train_interactions[train_interactions['u'] == user_id]['i'].values
    predictions[trained_recs] = 0
        
    # Get the filtered predictions
    unsorted_filtered_recs = np.argpartition(predictions, -k)[-k:]
    sorted_filtered_recs = sorted(unsorted_filtered_recs, key=lambda rec: predictions[rec], reverse=True)
    
    return sorted_recs, sorted_filtered_recs
    
def train_test_rec_matrix(item_matrix, user_matrix, k, train_interactions):
    '''
    Produces a train and test reccomendation matrix
    
    This is about 30% faster than generating them seperately
    '''
    
    train_rec_matrix = []
    test_rec_matrix = []
    
    for user_id in tqdm.tqdm(range(user_matrix.shape[0])):
        train_recs, test_recs = train_test_reccomendations(item_matrix, user_matrix, user_id, k, train_interactions)
        
        train_rec_matrix.append(train_recs)
        test_rec_matrix.append(test_recs)

    return np.array(train_rec_matrix), np.array(test_rec_matrix)


In [None]:
df1 = pd.DataFrame(results)
#df2.groupby(['factors', 'regularization', 'alpha']).max()
df = pd.concat((df, df1))
df.to_csv('param_search1.csv', index=False)

In [None]:
df.groupby(['factors', 'regularization', 'alpha']).max()

In [None]:
# Non-negative matrix factorization
# I don't NMF is what we want. The matrix is far too sparse, and this was too slow

from sklearn.decomposition import NMF

# Ratings matrix
# 16 components: 257 iterations, 3790.7093189610523 error, 0.0275 reconstructed
# 32 components: 87 iterations, 3707.651615877227 error, 0.0917 reconstructed
# 64 components: 172 iterations, 3583.3304672483646 error, 0.1284 reconstructed
# 128 components: 137 iterations, 3404.455889599025 error, 0.1835 reconstructed, found a new reccoemendation
#      (very slow, 20+ minutes, violation started going up at about 100 iterations in)
# 256 components: 237 iterations, 3167 error, 0.3119 reconstructed
#      (very, very slow, 3+ hours, violation also went up at about 100 iterations)

model = NMF(n_components=256, random_state=0, max_iter=500, verbose=1)

W = model.fit_transform(ratings_matrix)

H = model.components_

print(model.n_iter_)
print(model.reconstruction_err_)

good = 0
total = 0
for i in range(0, 100):
    for j in range(0, 100):
        if ratings_matrix[i, j] != 0:
            total += 1
            print(i, j, ratings_matrix[i, j], W[i] @ H[:, j])
            if W[i] @ H[:, j] > 1:
                good += 1
        elif W[i] @ H[:, j] > 1:
            print('NEW ONE!')
print(good / total)
print(good, total)

In [None]:
# Matrix Factorization in Keras: requires sampling from negative interactions, handling balance
# Overfit like crazy with the metrics I had selected (unsure how that compares to other metrics)
# Does allow for custom, tunable objective functions
# Is the direction we may go in with neural collaborative filtering...

def mf_model(num_users, num_recipes, rank):
    user_input = keras.Input((1, ))
    recipe_input = keras.Input((1, ))
    
    user_embedding = layers.Embedding(num_users, rank)(user_input)
    recipe_embedding = layers.Embedding(num_recipes, rank)(recipe_input)
    
    dot_product = layers.Dot(axes=(2))([user_embedding, recipe_embedding])
    
    return keras.Model(inputs=[user_input, recipe_input], outputs=dot_product)

# A rank of 5 stopped the validation loss from going below 19.5421 (which is really bad)
# Rank 32: increase trainable paramters to 6 million. Validation loss is still awful
# Increasing batch size to 512 sped things up, unsure on performance differences

# Rank 16, still overfitting very strongly

model = mf_model(max(df['u'] + 1), max(df['i'] + 1), 16)
model.summary()

model.compile(loss='mean_squared_error', optimizer=keras.optimizers.Adam(learning_rate=0.001))

history = model.fit(x=X_train, y=y_train, batch_size=512, epochs=10, validation_data=(X_test, y_test))

In [None]:
# Parameter search for implicit ALS weighted regularized matrix factorization (WRMF)
# This method is the most hopeful of any of the methods explores thus far

import itertools

def get_rmse(y_true, y_w, y_h, block_size=4):
    total_rmse = 0
    
    for i in tqdm.tqdm(range(0, y_true.shape[0], block_size)):
        total_rmse += mean_squared_error(y_true[:, i:i + block_size].toarray(), y_w @ y_h[i:i + block_size].T, squared=False)

    total_rmse /= (binary_matrix.shape[0] / block_size)
    
    return total_rmse
    
alphas = [32, 64, 128]
factors = [32, 64, 128]
regularizations = [0.1, 1, 10]
iterations = 15

combos = itertools.product(alphas, factors, regularizations)
mse_scores = {}
hit_rate_scores = {}

for alpha, factor, regularization in combos:
    # initialize a model
    model = implicit.als.AlternatingLeastSquares(factors=factor, random_state=0, calculate_training_loss=True, iterations=iterations, regularization=regularization)

    # train the model on a sparse matrix of item/user/confidence weights
    model.fit(alpha * binary_matrix)

    mse = get_mse(binary_matrix, model.item_factors, model.user_factors)
    
    W = model.item_factors
    H = model.user_factors

    good = 0
    total = 0
    for i in range(0, 100):
        for j in range(0, 100):
            if binary_matrix[i, j] != 0:
                total += 1
                if W[i] @ H[j] > 0.5:
                    good += 1
                    
    hit_rate = good / total
    
    print(alpha, factor, regularization, mse, hit_rate)
    
    mse_scores[(alpha, factor, regularization)] = mse
    hit_rate_scores[(alpha, factor, regularization)] = hit_rate

# Doc2Vec Embeddings
(Didn't seem to work very well, on hold until later)
Note: may not have worked well due to swapping the i column, and the index of the recipes

In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Following documentation here: https://radimrehurek.com/gensim/models/doc2vec.html

In [None]:
ingr_map

In [None]:
simple_ingr_map = ingr_map[['id', 'replaced']].set_index('id').drop_duplicates().to_dict()['replaced']
recipes['ingredients'] = recipes['ingredient_ids'].map(lambda ingredient_ids: [simple_ingr_map[ingregient_id] for ingregient_id in ingredient_ids])
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(recipes['ingredients'])]

In [None]:
model = Doc2Vec(vector_size=5, window=2, min_count=1, workers=4)
model.build_vocab(documents, progress_per=1000)

In [None]:
model.train(documents=documents, total_examples=model.corpus_count, epochs=30)

In [None]:
TEST_RECIPE = 2
NUM_RECIPES = 10 # (the first one is the recipe itself)

similarities = model.docvecs.distances(TEST_RECIPE)
most_similar = np.argsort(similarities.squeeze())[-NUM_RECIPES:]
raw_recipes[raw_recipes['id'].isin(recipes[recipes.index.isin(most_similar)]['id'].values)]

In [None]:
#reads in all of the datasets 
recipes = pd.read_csv(r'E:\TFRS\food.com recipes and interactions\PP_recipes.csv')
raw_recipes = pd.read_csv(r'E:\TFRS\food.com recipes and interactions\RAW_recipes.csv')
raw_interactions = pd.read_csv(r'E:\TFRS\food.com recipes and interactions\RAW_interactions.csv')

#combines the raw recipes and numbered recipes into one table
#merges them by the recipe id number
recipes_info = recipes.merge(raw_recipes, left_on='id', right_on='id')

#merges all of the recipe information with the ratings
recipes_interact = recipes_info.merge(raw_interactions, left_on='id', right_on='recipe_id')

#can easily change which columns we want to output by updating this 
keep_cols = ['name','id','minutes', 'contributor_id', 'description', 'submitted', 'tags', 'nutrition', 'n_steps', 'ingredient_tokens', 'ingredient_ids', 'steps','steps_tokens', 'user_id', 'date', 'rating', 'review']

#prints out the merged table with only the desired columns
new_recipes_info = recipes_interact[keep_cols]
new_recipes_info