## Project utility functions

### Springboard Capstone 2 project: building a recommendation engine
### John Burt


### Purpose of this notebook:

Collect several functions that get used by more than one notebook. 


## Alternating Least Squares method

About this code: I originally used an ALS algorithm written by my friend and colleague Matt Borthwick, but then I discovered the [OSS project implicit](https://implicit.readthedocs.io/en/latest/als.html), which implements a much faster ALS algorithm. I've kept Matt's code here because it might come in handy sometime, but for the project I now always use implicit. The entry function, `do_ALS_df`, lets you specify which version of the algorithm to use.

For more about the optimizations used by implicit's ALS method, see [Applications of the Conjugate Gradient Method for Implicit Feedback Collaborative Filtering](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.379.6473&rep=rep1&type=pdf).

In [1]:
# Matt Borthwick's implementation of Alternating Least Squares
import pandas as pd
import numpy as np

from numpy import eye
from numpy.linalg import solve
from numpy.random import rand
from scipy import sparse

#*****************************************************
# ALS calculation using Matt Borthwick's implementation
def do_ALS_MB(iu_mx_sparse, 
              n_factors=4, 
              n_iterations=10, 
              regularization=0.01, 
              weighted=True, 
              verbose=True ):
    """Factorize a sparse ratings matrix with alternating least squares.

    Unrated cells must be stored as zero. ``iu_mx_sparse`` must provide an
    element-wise ``multiply`` method, an element-wise ``minimum`` method and
    a ``shape`` attribute, as scipy.sparse matrices do.

    Parameters
    ----------
    iu_mx_sparse : sparse matrix
        Ratings matrix; one factor vector is fit per row and per column.
    n_factors : int
        Length of each latent factor vector.
    n_iterations : int
        Number of alternating passes (all rows, then all columns).
    regularization : float
        Value placed on the diagonal that is added to every least-squares
        system. A falsy value (0, None) leaves the systems unregularized.
    weighted : bool
        Not used by this implementation.
    verbose : bool
        When True, print a one-line progress trace.

    Returns
    -------
    tuple of ndarray
        ``(row_factors, col_factors)`` with shapes
        ``(n_rows, n_factors)`` and ``(n_cols, n_factors)``.
    """
    if verbose: 
        print("setting up matrices, %d iterations: "%(n_iterations),end='')

    n_rows, n_cols = iu_mx_sparse.shape

    # random starting point: row factors are drawn first, then column factors
    row_factors = rand(n_rows, n_factors)
    col_factors = rand(n_cols, n_factors)

    # element-wise min(rating, 1): zero for unrated cells, so only rated
    # cells contribute to the normal equations below
    rated = iu_mx_sparse.minimum(1)

    # turn the scalar penalty into a diagonal matrix (left as-is when falsy)
    ridge = regularization * eye(n_factors) if regularization else regularization

    for pass_no in range(1, n_iterations + 1):
        if verbose:
            print("%d,"%(pass_no),end='')

        # column factors held fixed: solve one small system per row
        for row in range(n_rows):
            gram = rated[row].multiply(col_factors.T) @ col_factors
            rhs = iu_mx_sparse[row] @ col_factors
            row_factors[row] = solve(gram + ridge, rhs[0])

        # row factors held fixed: solve one small system per column
        for col in range(n_cols):
            gram = (rated[:, col].multiply(row_factors)).T @ row_factors
            rhs = iu_mx_sparse[:, col].T @ row_factors
            col_factors[col] = solve(gram + ridge, rhs[0])

    return row_factors, col_factors


#*****************************************************
# ALS calculation using the implicit ALS package
import os  
import implicit
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import bm25_weight

def do_ALS_implicit(iu_mx_sparse, 
                        n_factors=4, 
                        n_iterations=10, 
                        regularization=0.01, 
                        weighted=True, 
                        verbose=True ,
                        bm25_K1=10,
                        bm25_B=0.8,
                        use_native=True, 
                        use_cg=True, 
                        use_gpu=False, 
                   ):
    """Fit implicit's AlternatingLeastSquares model to a ratings matrix.

    Side effect: sets the MKL_NUM_THREADS environment variable to '1'.

    Parameters
    ----------
    iu_mx_sparse : matrix-like
        Ratings matrix; passed through np.nan_to_num and converted to a
        scipy CSR matrix before fitting.
    n_factors, n_iterations, regularization
        Forwarded to AlternatingLeastSquares as ``factors``,
        ``iterations`` and ``regularization``.
    weighted : bool
        When True the matrix is passed through ``bm25_weight`` (with
        ``K1=bm25_K1`` and ``B=bm25_B``) before fitting.
    verbose : bool
        Print a start message and forward as ``show_progress`` to ``fit``.
    bm25_K1, bm25_B
        BM25 weighting parameters, only used when ``weighted`` is True.
    use_native, use_cg, use_gpu
        Forwarded unchanged to AlternatingLeastSquares.

    Returns
    -------
    tuple
        ``(als.item_factors, als.user_factors)``, in that order.
    """
    os.environ['MKL_NUM_THREADS'] = '1'

    # NOTE(review): num_threads=0 presumably lets implicit pick the thread
    # count itself -- confirm against the installed implicit version.
    model_options = dict(
        factors=n_factors,
        regularization=regularization,
        use_native=use_native,
        use_cg=use_cg,
        use_gpu=use_gpu,
        iterations=n_iterations,
        calculate_training_loss=True,
        num_threads=0,
    )
    als = AlternatingLeastSquares(**model_options)

    if verbose:
        print('fitting ALS model')

    # build the CSR input once, then optionally apply BM25 weighting
    ratings = sparse.csr_matrix(np.nan_to_num(iu_mx_sparse))
    if weighted:
        ratings = bm25_weight(ratings, K1=bm25_K1, B=bm25_B)
    als.fit(ratings, show_progress=verbose)

    return als.item_factors, als.user_factors

#*****************************************************
def scale_to_mx(mx1, mx2):
    """Scale mx2 to mx1's mean and std.

    Means and standard deviations are computed over all cells, ignoring
    NaNs (np.nanmean / np.nanstd). NaN cells in mx2 stay NaN in the result.

    Parameters
    ----------
    mx1 : array-like
        Reference values supplying the target mean and std.
    mx2 : ndarray
        Values to be shifted and rescaled.

    Returns
    -------
    ndarray
        mx2 centered on its own mean, rescaled by std(mx1)/std(mx2) and
        shifted to mx1's mean. When mx2 has zero spread (all non-NaN
        cells equal) every non-NaN cell is set to mx1's mean.
    """
    target_mean = np.nanmean(mx1)
    source_std = np.nanstd(mx2)

    # A constant mx2 has std == 0; dividing by it would give inf and then
    # 0 * inf = NaN for every cell. With no spread to rescale, use a zero
    # stretch so the output sits at the target mean.
    if source_std == 0:
        stretch = 0.0
    else:
        stretch = np.nanstd(mx1) / source_std

    return (mx2 - np.nanmean(mx2)) * stretch + target_mean
    
#*****************************************************
def do_ALS_df(df, ALS_method='implicit', scale=True, return_utilmx=True, **args):
    """Run ALS on a ratings DataFrame and optionally return a filled-in copy.

    NaN cells of df are replaced with zero, the values are converted to a
    scipy CSR matrix and handed to the selected ALS routine.

    Parameters
    ----------
    df : DataFrame
        Ratings matrix with NaN for missing ratings.
    ALS_method : str
        'implicit' selects do_ALS_implicit; any other value selects
        do_ALS_MB.
    scale : bool
        When True, the estimated matrix is rescaled with scale_to_mx to
        the mean and std of df's values.
    return_utilmx : bool
        When False only the two factor matrices are returned.
    **args
        Forwarded unchanged to the selected ALS routine.

    Returns
    -------
    tuple
        ``(item_factors, user_factors)`` when return_utilmx is False,
        otherwise ``(estimate_df, item_factors, user_factors)`` where
        estimate_df is ``item_factors . user_factors.T`` carrying df's
        index and columns.
    """
    solver = do_ALS_implicit if ALS_method == 'implicit' else do_ALS_MB
    item_factors, user_factors = solver(
        sparse.csr_matrix(np.nan_to_num(df.values)), **args)

    if not return_utilmx:
        return item_factors, user_factors

    # dense estimate of every cell, rated or not
    estimate = np.dot(item_factors, user_factors.T)
    if scale:
        estimate = scale_to_mx(df.values, estimate)

    filled = pd.DataFrame(estimate, index=df.index, columns=df.columns)
    return filled, item_factors, user_factors


## Funcs to generate train / test datasets for models

### Split test into X ('liked' games) and y (target)

- From n_liked+n_recs top rated games, randomly select n_liked as "liked games" to use as model input (X values).
- Remaining n_recs top rated games assigned as holdouts to test for recommendations (y values).


In [1]:
def get_test_Xy(test, n_liked = 10, n_recs = 10 ):
    """Split each user's top rated items into X and y item ID arrays.

    For every row of ``test`` the ``n_liked + n_recs`` highest rated
    columns are selected (NaN ratings sort last), shuffled with
    np.random.shuffle, and divided into ``n_liked`` "liked" IDs and
    ``n_recs`` held-out IDs.

    Parameters
    ----------
    test : DataFrame
        Ratings, one row per user and one column per item; the column
        labels are used as item IDs.
    n_liked : int
        Number of item IDs per user placed in X.
    n_recs : int
        Number of item IDs per user placed in y.

    Returns
    -------
    tuple of DataFrame
        ``(test_X, test_y)`` indexed like ``test``; the IDs are stored in
        float arrays.
    """
    n_users = test.shape[0]
    item_ids = test.columns.values
    n_top = n_liked + n_recs

    # one row of item IDs per user
    liked_ids = np.zeros([n_users, n_liked])
    holdout_ids = np.zeros([n_users, n_recs])

    for user_pos in range(n_users):
        ratings = test.iloc[user_pos, :].values
        # negate so an ascending argsort ranks best first; NaNs end up last
        best_first = np.argsort(-ratings)
        picked = item_ids[best_first[:n_top]]
        # random order decides which of the top items land in X vs y
        np.random.shuffle(picked)
        liked_ids[user_pos, :], holdout_ids[user_pos, :] = np.split(picked, [n_liked])

    test_X = pd.DataFrame(liked_ids, index=test.index)
    test_y = pd.DataFrame(holdout_ids, index=test.index)
    return (test_X, test_y)


### Split data into train/test sets

Split by user, so that all ratings by a given user are either in the train or test set. 

Test sets are further split into X and y sets of top rated game IDs

NOTE: the df passed must be in format users (rows) x items (cols)

In [3]:
from sklearn.model_selection import train_test_split

def train_test_split_utilitymx(df, 
                                test_size = 0.1,
                                n_liked = 10,
                                n_recs = 10 ):
    """Split a utility matrix DataFrame into train and test sets by row.

    df is split row-wise with sklearn's train_test_split, so each row
    (one user's ratings) lands entirely in train or in test. Both parts
    are then re-indexed by their 'userID' column, which df must contain.

    Parameters
    ----------
    df : DataFrame
        One row per user, with a 'userID' column and rating columns.
    test_size
        Forwarded to train_test_split.
    n_liked, n_recs : int
        Forwarded to get_test_Xy.

    Returns
    -------
    tuple
        ``(train, test, test_X, test_y)``:
        train and test are the two parts indexed by userID;
        test_X holds n_liked top rated item IDs per test user;
        test_y holds n_recs top rated item IDs per test user.
    """
    # split the rows, then make userID the index of both parts
    train, test = (
        part.set_index('userID')
        for part in train_test_split(df, test_size=test_size)
    )

    # turn the test ratings into liked / held-out item ID tables
    test_X, test_y = get_test_Xy(test, n_liked=n_liked, n_recs=n_recs)

    return train, test, test_X, test_y