## Board game recommendation engine for new users
### Method 2b: use existing users in filled utility matrix as proxies to new user
### Model and data save

#### John Burt


### Purpose of this notebook:

Generate the data required by the web app that uses Model 2b. 


## Load data from file

- Set up environment.
- Read unfilled boardgame rating utility matrix


In [1]:
# remove warnings
import warnings
warnings.filterwarnings('ignore')
# ---

%matplotlib inline
import pandas as pd
pd.options.display.max_columns = 100
from matplotlib import pyplot as plt
import matplotlib
matplotlib.style.use('ggplot')
import numpy as np

from datetime import datetime

pd.options.display.max_rows = 100

# directory holding this notebook's input and output data files
srcdir = './data/'

# inputpath: HDF5 file with the unfilled rating utility matrix (read below)
# outputpath: compressed pickle written by the save step later in the notebook
inputpath = srcdir+'bgg_game_mx_unfilled_v2.h5'
outputpath = srcdir+'bgg_pu_data.pkl'

# load the unfilled item-user utility matrix, transpose to user-item matrix
ui_mx = pd.read_hdf(inputpath, 'mx').T

# after the transpose, rows are users and columns are games
print('original: #users X #games:', ui_mx.shape)


original: #users X #games: (65168, 12120)


## Recommender as sklearn estimator

This model object was used to tune the model hyperparameters, and it generates the data I will need for my recommender app.

In [2]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.decomposition import TruncatedSVD, PCA
from scipy.spatial.distance import cdist
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

from numpy.random import shuffle

# import utility functions
import sys
sys.path.append('./')
import recsys_utilities
import importlib
importlib.reload(recsys_utilities)
from recsys_utilities import do_ALS_df

class Recommender_ALS_proxy_users(BaseEstimator, ClassifierMixin):
    """Board game recommender for new users, packaged as an sklearn-style estimator.

    ``fit`` factorizes a user x item rating matrix with ALS (through
    ``do_ALS_df``) and records, for every existing user, the column indices of
    that user's highest-rated items.  To recommend for a new user, the existing
    users whose top-rated items overlap most with the new user's liked games
    are picked as proxies, and the mean of the proxies' predicted ratings is
    used to rank the games.
    """

    def __init__(self, 
                regularization=0.1, 
                n_factors=100,
                n_iterations=8,
                scale=True,
                weighted=True,
                bm25_K1=10,
                bm25_B=0.8,
                n_top_items=10,
                n_proxy_users=10,
                verbose=False
                ):
        """Store the hyperparameters; no work is done until ``fit``.

        regularization, n_factors, n_iterations, scale, weighted, bm25_K1,
        bm25_B, verbose:
            forwarded unchanged to ``do_ALS_df`` in ``fit``
            (bm25_* are presumably BM25 weighting parameters -- see
            recsys_utilities for their exact meaning).
        n_top_items:
            number of top-rated item columns remembered per existing user.
        n_proxy_users:
            number of best-matching existing users whose predicted ratings
            are averaged when recommending. ``None`` averages over all users.
        """
        # model parameters
        self.regularization = regularization
        self.n_factors = n_factors
        self.n_iterations = n_iterations
        self.scale = scale
        self.weighted = weighted
        self.bm25_K1 = bm25_K1
        self.bm25_B = bm25_B
        
        self.n_top_items = n_top_items
        self.n_proxy_users = n_proxy_users
        
        self.verbose = verbose
        
        # internal data used for making recommendations (populated by fit)
        self.user_factors = None
        self.item_factors = None
        self.item_IDs = None
        self.top_rated = None
        self.item_dict = None

    # ******************************************************************
    def set_params(self, **params):
        """Set attributes from keyword arguments and return ``self``.

        Returning the estimator follows the scikit-learn estimator contract,
        which allows chaining and is relied on by tools such as ``clone`` and
        the search estimators.
        """
        self.__dict__.update(params)
        return self

    # ******************************************************************
    def get_top_rated_cols(self, mx, n_top):
        """Return, per row of ``mx``, the column indices of the ``n_top`` largest values.

        mx: 2-D numpy array (rows = users, columns = items).
        Returns an int64 array of shape (n_rows, n_top), best column first.

        NaN cells sort last, so they are only selected when a row has fewer
        than ``n_top`` non-NaN values.
        """
        top = np.zeros((mx.shape[0],n_top), dtype=np.int64)
        # row-by-row to avoid materialising a full (n_rows, n_cols) index array
        for i in range(mx.shape[0]):
            idx = np.argsort(-mx[i,:])
            top[i,:] = idx[:n_top]
        return top

    # ******************************************************************
    def fit(self, X, y=None):
        """Train the recommender.

        X = pd DataFrame user x item utility matrix whose column labels are
            game IDs (they are cast to int).
        y is ignored; it exists for sklearn API compatibility.

        ``do_ALS_df`` is always called to obtain the user and item factor
        matrices; any special handling of an already-filled matrix would have
        to happen inside that helper.

        Returns self.
        """
        # factorize the utility matrix into user and item latent factors
        self.user_factors, self.item_factors = do_ALS_df(
            X, ALS_method='implicit', return_utilmx=False,
            n_iterations=self.n_iterations, 
            regularization=self.regularization, 
            n_factors=self.n_factors, 
            verbose=self.verbose,
            scale=self.scale,
            weighted=self.weighted,
            bm25_K1=self.bm25_K1,
            bm25_B=self.bm25_B,
            use_native=True,
            use_cg=True,
            use_gpu=False,
            )   
        
        # create array of game IDs (one per column of X)
        self.item_IDs = X.columns.values.astype(int)
        
        # column indices of each user's highest-rated items
        self.top_rated = self.get_top_rated_cols(X.values, self.n_top_items)

        # create gameID - column index mapping
        self.item_dict = {key: value for (key, value) in zip(self.item_IDs, range(len(self.item_IDs)))}

        return self
    
    # ******************************************************************
    def get_sorted_proxy_index(self, user_liked):
        """Rank existing users by how well they match a new user's liked games.

        user_liked: iterable of game IDs; every ID must be a key of
            ``self.item_dict`` (KeyError otherwise).
        Returns an array of row indices into ``self.top_rated``, ordered from
        the largest overlap between the liked games and the user's top-rated
        items to the smallest.  Ties keep ascending row order (stable sort) so
        the result is deterministic.
        """
        liked_idx = np.array(
            list({self.item_dict[gameID] for gameID in user_liked}),
            dtype=np.int64)
        # each row of top_rated holds distinct column indices, so counting
        # membership hits equals the size of the set intersection
        overlap = np.isin(self.top_rated, liked_idx).sum(axis=1)
        return np.argsort(-overlap, kind='stable')

    # ******************************************************************
    def ratings_from_factors(self, row_index):
        """Predicted ratings (user factors . item factors^T) for the given user rows."""
        return (np.dot(self.user_factors[row_index,:], self.item_factors.T))
    
    # ******************************************************************
    def recommend_games_by_proxy(self, user_liked, num2rec=10): 
        """Recommend ``num2rec`` game IDs for one new user.

        user_liked: iterable of game IDs the new user likes.
        Returns an array of game IDs, best first.
        """
        # indices of the n_proxy_users best-matching existing users;
        # without this cut every user would be averaged and the ranking
        # of proxies would have no effect on the result
        proxy_idx = self.get_sorted_proxy_index(user_liked)[:self.n_proxy_users]

        # mean predicted rating for every game among the proxies
        ratings = np.mean(self.ratings_from_factors(proxy_idx), axis=0)

        # desc sort ratings
        game_idx = np.argsort(-ratings)

        # select num2rec top rated game IDs        
        return self.item_IDs[game_idx[:num2rec]]
        
    # ******************************************************************
    def predict(self, X, y=None, num2rec=10):
        """predict == recommend game IDs = y

        X = 2-D array, one row of liked game IDs per user
        Returns an array of shape [n_users, num2rec] of recommended game IDs
        (stored in a float array, as before).
        """
        X = np.asarray(X)

        # recommend game IDs for each row of liked games in X
        y = np.zeros([X.shape[0], num2rec])
        for i in range(X.shape[0]):
            y[i,:] = self.recommend_games_by_proxy(X[i,:], num2rec=num2rec)

        return y

    # ******************************************************************
    def score(self, y_true, y_pred):
        """Mean fraction of recommendations that appear in the true game IDs.

        Both arrays are shape [num_users, num_gameIDs].  For each user the
        number of IDs common to the y_true row and the y_pred row is divided
        by the number of recommended IDs (columns of y_pred); the mean over
        users is returned.

        Note: this takes (y_true, y_pred), not sklearn's usual (X, y).
        """
        n_recced = y_pred.shape[1]
        
        result = np.zeros([y_true.shape[0]])
        for i,(x,y) in enumerate(zip(y_true, y_pred)):
            result[i] = len(set(x).intersection(y))/n_recced        
        
        return np.mean(result)
    

## Fit the model to generate recommender data

In [3]:
from time import time

# hyperparameter values used to build the recommender for the web app
defaults = dict(
    n_factors=100,
    weighted=False,
    bm25_K1=4,
    bm25_B=0.5,
    n_iterations=20,
    verbose=True,
    scale=True,
    regularization=0.3,
    n_top_items=20,
    n_proxy_users=100,
)
rec = Recommender_ALS_proxy_users(**defaults)

# fit on the user-item matrix and report the wall-clock duration in seconds
start = time()
rec.fit(ui_mx)
elapsed = time()-start
print('fit time:',elapsed)


fitting ALS model


HBox(children=(IntProgress(value=0, max=20), HTML(value='')))


fit time: 242.4343454837799


## Save model data for the recommender web app to use

I split the data into two dataframes: user data and item (boardgame) data, and then save both together in a single bz2-compressed pickle file. This file will be read by the recommender object used by my web app.

In [4]:
import bz2
import pickle

# load game info file
item_data = pd.read_hdf(srcdir+'bgg_game_data_big_v2.h5', key='gamedata')

print('item_data',item_data.shape)

# The item factors are attached to the game table purely by row position, so
# the game table must list the games in exactly the same order as the columns
# of the matrix the model was fitted on. Fail loudly here instead of silently
# saving factors against the wrong games.
assert np.array_equal(item_data['id'].values.astype(int), rec.item_IDs), \
    'item_data rows are not in the same game-ID order as the fitted utility matrix columns'

# combine data for games (items): game info columns followed by factor columns
item_factor_cols = ['factor_%d'%(i) for i in range(rec.item_factors.shape[1])]
item_data = pd.concat([item_data, 
           pd.DataFrame(rec.item_factors, index=item_data.index, columns=item_factor_cols)],
          axis=1)
# rec.item_dict (the gameID -> column index mapping) is deliberately not saved;
# it is meant to be created at runtime by the consumer of this file

# combine data for users: top-rated item column indices followed by user factors
# (np.hstack upcasts the integer indices to float alongside the factors)
user_top_rated = ['top_%d'%(i) for i in range(rec.top_rated.shape[1])]
user_factor_cols = ['factor_%d'%(i) for i in range(rec.user_factors.shape[1])]
user_mx = np.hstack([rec.top_rated, rec.user_factors])
user_data = pd.DataFrame(user_mx, columns=user_top_rated+user_factor_cols)

# save user and item data together as one list in a bz2-compressed pickle
with bz2.BZ2File(outputpath, 'wb') as pickle_out:
    pickle.dump([user_data, item_data], pickle_out)


item_data (12120, 97)


## Read the data file and check to be sure everything is OK

In [5]:
# reload the file that was just written and unpack the two saved frames
with bz2.BZ2File(outputpath, 'rb') as pickle_in:
    reloaded = pickle.load(pickle_in)
user_data2, item_data2 = reloaded

# shapes should match the frames built in the save step
print(user_data2.shape, item_data2.shape)

(65168, 120) (12120, 197)


In [6]:
# preview the reloaded user table: top-rated item column indices, then user factors
user_data2.head()

Unnamed: 0,top_0,top_1,top_2,top_3,top_4,top_5,top_6,top_7,top_8,top_9,top_10,top_11,top_12,top_13,top_14,top_15,top_16,top_17,top_18,top_19,factor_0,factor_1,factor_2,factor_3,factor_4,factor_5,factor_6,factor_7,factor_8,factor_9,factor_10,factor_11,factor_12,factor_13,factor_14,factor_15,factor_16,factor_17,factor_18,factor_19,factor_20,factor_21,factor_22,factor_23,factor_24,factor_25,factor_26,factor_27,factor_28,factor_29,...,factor_50,factor_51,factor_52,factor_53,factor_54,factor_55,factor_56,factor_57,factor_58,factor_59,factor_60,factor_61,factor_62,factor_63,factor_64,factor_65,factor_66,factor_67,factor_68,factor_69,factor_70,factor_71,factor_72,factor_73,factor_74,factor_75,factor_76,factor_77,factor_78,factor_79,factor_80,factor_81,factor_82,factor_83,factor_84,factor_85,factor_86,factor_87,factor_88,factor_89,factor_90,factor_91,factor_92,factor_93,factor_94,factor_95,factor_96,factor_97,factor_98,factor_99
0,3118.0,3131.0,3939.0,11084.0,10491.0,5791.0,361.0,2967.0,9375.0,3441.0,10081.0,2960.0,8000.0,3887.0,926.0,7442.0,5826.0,4813.0,1356.0,6677.0,0.023696,-0.02967,0.023242,0.019633,0.004823,0.011019,0.011215,-0.017083,0.004597,0.031957,-0.007063,0.020294,0.010455,-0.017938,-0.031931,-0.041231,0.042373,0.026827,0.001124,0.029544,0.015678,-0.002664,-0.018747,0.003638,-0.01973,-0.000167,0.014731,-0.000233,0.013295,0.019796,...,-0.017438,-0.029551,0.002173,0.020962,0.041823,0.05765,0.016788,0.016725,-0.011302,0.040024,-0.004026,0.024266,0.016099,-0.038507,0.051362,0.006909,-0.003483,0.021699,0.045065,0.025485,-0.026146,-0.039294,-0.022407,-0.052466,-0.009208,-0.001864,0.020473,-0.026006,0.01064,0.016793,-0.011084,0.01542,0.024743,0.029231,-0.009518,0.000122,0.024512,0.04029,-0.002616,0.000242,0.027601,0.002297,0.024338,0.006643,0.03696,-0.003498,0.013157,0.035412,0.017838,0.006095
1,2432.0,361.0,6570.0,1068.0,3118.0,35.0,38.0,4859.0,3590.0,2928.0,6098.0,2813.0,4752.0,3641.0,6503.0,890.0,3002.0,77.0,1720.0,5071.0,0.046416,0.015882,0.010615,-0.008456,0.010715,-0.008144,-0.006815,0.060591,0.01859,0.010745,0.008359,-0.018226,-0.012888,0.021524,-0.017059,0.009903,-0.007216,0.040845,-0.037797,0.006236,0.010826,-0.004285,-0.02618,-0.039756,-0.036775,-0.002875,0.018887,0.022078,0.02891,-0.029093,...,-0.020805,0.007129,0.02727,0.012115,0.048497,0.018701,0.006347,0.012726,0.007,-0.02372,0.040976,-0.002408,-0.002856,0.016417,0.001098,-0.037159,0.051445,-0.032401,0.014714,0.015535,-0.003065,-0.011442,-0.028922,-0.004751,0.019132,0.021509,0.021846,0.011233,0.024193,-0.004827,0.027418,-0.004363,0.024325,0.046667,-0.040085,0.038747,0.012596,0.015159,0.00822,0.016487,-0.006898,0.010072,0.005497,-0.004321,0.04227,-0.003262,-0.016759,0.037889,0.003724,0.002653
2,3469.0,3887.0,1356.0,2953.0,2027.0,561.0,689.0,2656.0,1093.0,2888.0,5681.0,3176.0,221.0,3046.0,639.0,361.0,94.0,3399.0,4455.0,1308.0,0.010644,-0.010583,-0.004711,-0.018043,-0.00943,-0.008507,0.010808,0.02314,-0.01561,0.054263,-0.010362,0.004776,0.007174,0.016485,-0.021392,-0.006607,-0.00666,0.015703,0.013855,0.019107,0.029251,0.014008,0.020599,-0.013189,-0.024138,-0.000135,0.01703,-0.029738,0.016662,-0.028611,...,-0.009754,-0.042924,0.012371,0.029256,0.018396,0.005504,0.001468,0.059884,-0.014659,0.020064,0.025019,0.021504,0.016704,0.003432,0.036493,-0.015598,0.003808,-0.02658,0.000995,0.024486,0.001706,-0.02298,0.017286,0.015527,0.013258,0.003799,-0.001113,-0.012487,0.004419,0.050129,-0.013064,0.026863,0.001402,-0.014252,0.000172,-0.036304,0.004938,-0.009365,-0.009885,0.011208,-0.038684,0.020613,0.000706,-0.009174,0.01577,-0.000911,0.033726,0.053988,0.003661,0.005352
3,589.0,4802.0,5721.0,7057.0,4281.0,6547.0,4813.0,6560.0,7899.0,5680.0,6477.0,6471.0,7392.0,7028.0,7093.0,5791.0,7076.0,1416.0,5397.0,4506.0,-0.000238,-0.008035,-0.003374,0.011638,0.015678,0.029368,-0.007788,0.002804,0.098841,-0.011876,0.007123,0.062746,0.010721,0.019687,0.00579,0.022373,-0.016297,0.01639,-0.004997,0.014155,-0.009845,-0.01899,-0.04067,-0.048383,-0.018477,0.017455,0.009415,-0.009638,0.022755,-0.003275,...,-0.043914,0.026418,-0.028086,0.018827,0.026337,0.005193,0.03321,0.029096,0.000318,0.019936,0.032484,0.008596,0.025138,-0.018627,0.034205,0.037294,0.036009,-0.005903,0.038307,0.00289,0.043462,0.053891,-0.012169,-0.014281,-0.044484,0.017362,-0.021101,0.020803,0.028332,0.044759,0.022127,-0.022567,-0.018102,0.0202,-0.053236,0.025347,-0.009715,0.014858,-0.071032,0.010185,0.032757,-0.024396,0.01738,0.041643,0.003167,0.018209,-0.029818,0.002321,0.003942,-0.004255
4,5151.0,577.0,6894.0,16.0,3163.0,3586.0,674.0,4458.0,4506.0,5118.0,2164.0,7027.0,3462.0,6927.0,2.0,8973.0,10762.0,1560.0,4172.0,77.0,0.013762,-0.01135,0.043482,0.013309,0.032187,-0.004644,0.042616,0.045788,0.012559,-0.015457,-0.004083,0.002136,-0.01472,0.064345,0.02185,-0.009831,-0.007667,0.004507,0.013439,-0.035172,0.01793,0.016422,0.02779,0.053618,-0.020977,0.009973,0.008651,0.025997,0.024266,-0.013644,...,0.013162,-0.011085,0.004065,0.034137,-0.01104,0.007576,0.002476,0.006872,-0.015723,-0.000269,0.033838,0.003964,0.020906,-0.003446,-0.02845,0.007462,-0.005143,0.031694,-0.023956,0.041996,0.037913,0.000664,-0.047788,0.01223,0.016653,-0.024421,-0.040722,0.01185,-0.023841,0.013628,-0.013579,-0.023501,-0.011847,0.060571,-0.016757,-0.007754,-0.003628,-0.013914,-0.011766,-0.008676,0.032206,-0.007839,-0.037766,0.014352,0.019315,0.048079,0.037119,-0.004639,0.011499,0.037202


In [7]:
# preview the reloaded game table: game info columns followed by the item factors
item_data2.head()

Unnamed: 0,id,name,nrate,pic_url,nrating_pages,minplayers,maxplayers,minage,mean_rating,weight,categories,mechanics,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,f_10,f_11,f_12,f_13,f_14,f_15,f_16,f_17,f_18,f_19,f_20,f_21,f_22,f_23,f_24,f_25,f_26,f_27,f_28,f_29,f_30,f_31,f_32,f_33,f_34,f_35,f_36,f_37,...,factor_50,factor_51,factor_52,factor_53,factor_54,factor_55,factor_56,factor_57,factor_58,factor_59,factor_60,factor_61,factor_62,factor_63,factor_64,factor_65,factor_66,factor_67,factor_68,factor_69,factor_70,factor_71,factor_72,factor_73,factor_74,factor_75,factor_76,factor_77,factor_78,factor_79,factor_80,factor_81,factor_82,factor_83,factor_84,factor_85,factor_86,factor_87,factor_88,factor_89,factor_90,factor_91,factor_92,factor_93,factor_94,factor_95,factor_96,factor_97,factor_98,factor_99
0,1,Die Macher,4847,https://cf.geekdo-images.com/micro/img/PIQmNBM...,48,3,5,14,7.62458,4.3438,"Economic,Negotiation,Political","Area Majority / Influence,Auction/Bidding,Dice...",1.911611,-4.495354,-0.108733,1.119163,2.791414,-2.567496,0.719896,1.285631,-2.851566,2.365947,3.705543,-0.63545,1.078332,-0.207218,0.729938,2.054715,-1.294588,-3.193749,0.001678,3.281759,-1.426101,0.997327,-0.79134,0.232995,-0.59063,1.214,-0.043406,-0.786426,-0.140232,-1.472948,0.714334,-3.109634,1.017507,2.286563,1.146317,0.952942,-0.837478,-1.96978,...,-0.049066,0.059288,2.532311,-0.019876,-0.253356,0.195501,0.535128,0.936334,0.827018,-1.054472,-0.099941,-0.504634,-0.620528,1.238232,-0.041657,0.087655,2.305157,0.868975,1.749902,0.57682,0.56725,1.322432,-1.450188,0.32908,1.604602,-0.459977,-0.146853,0.305205,0.311168,0.460974,0.554325,0.999497,-0.370996,2.038579,0.989693,0.107411,0.847713,0.296982,0.224763,-0.414266,0.064943,1.289571,-0.222316,0.165279,1.04076,0.434383,-0.732859,0.622113,0.92985,0.766291
1,2,Dragonmaster,527,https://cf.geekdo-images.com/micro/img/GjYccOg...,5,3,4,12,6.60073,1.963,"Card Game,Fantasy",Trick-taking,-0.153944,-0.777451,0.322471,-1.040266,-0.745753,-0.966518,0.812564,0.433441,-0.790137,0.692419,-0.934725,-0.434717,-0.765145,0.732197,0.009502,-1.032225,0.426779,-0.082929,-0.716631,-1.734721,-0.720104,0.442501,-0.589254,0.002103,1.169967,0.976414,-0.3235,-1.009982,0.581649,0.861082,-0.313428,0.575894,-0.921301,0.069271,-0.779044,1.847705,-0.652857,-0.061337,...,0.308105,0.420574,-0.512846,0.242747,-0.045152,0.171633,-0.246921,0.56654,0.366719,-0.430717,0.012908,-0.129611,-0.421716,-0.310927,0.486917,-0.058075,0.33887,0.074619,0.174698,0.277425,0.250542,-0.180377,0.581856,0.592388,0.050016,0.52745,0.544281,-0.064469,0.261319,-0.204791,0.004854,0.271544,-0.497663,0.605392,0.554117,-0.282685,-0.133512,-0.331854,0.231047,0.366618,0.189361,0.182131,0.471249,-0.346022,0.770613,0.129419,-0.053476,0.760185,0.211024,0.104652
2,3,Samurai,13705,https://cf.geekdo-images.com/micro/img/4XUy5Qx...,137,2,4,10,7.44046,2.4986,"Abstract Strategy,Medieval","Area Majority / Influence,Hand Management,Set ...",4.470921,-4.154788,-0.067312,2.843255,0.917307,-1.710353,-0.45187,-0.30183,-0.629635,-1.810433,4.908455,-0.893578,-3.49924,-3.082759,-2.23368,-4.027125,-3.840475,-0.69884,-3.521832,0.412432,1.761121,-0.916375,-0.444081,1.273712,0.667228,0.792622,1.476094,2.662929,2.982897,-0.680837,0.580444,-0.956434,1.806713,-0.013796,4.465508,2.414053,-1.729162,2.318742,...,-1.194237,0.150401,2.286968,-1.826846,0.517335,0.207002,1.72331,1.41838,1.459092,1.403288,1.8052,1.811844,1.32959,0.542388,0.74841,3.047418,1.301613,1.395887,-0.307171,0.017298,1.605599,1.865651,-1.916614,-0.905131,3.49862,0.638633,3.120589,0.168294,0.547564,1.670428,0.392156,1.712405,-0.14363,2.851227,1.13654,0.07093,1.158137,0.453415,0.846737,2.728018,0.477708,0.814559,1.542229,2.422384,2.785595,2.442645,0.98806,1.412753,2.444647,-1.073471
3,4,Tal der Könige,324,https://cf.geekdo-images.com/micro/img/x2IMJSP...,3,2,4,12,6.60991,2.6667,Ancient,"Action Points,Area Majority / Influence,Auctio...",-0.303761,-0.972163,-0.438319,-0.310437,-0.117369,-0.780264,-0.577689,-0.421791,-0.626641,1.867386,-0.98462,1.559616,0.353567,0.570663,0.707809,-0.600833,0.249802,0.568009,-0.392118,0.304263,0.574162,0.389692,0.30252,-1.080803,0.607084,0.900921,-0.065958,-0.166042,-0.628068,0.96359,0.382888,-0.156586,-0.912466,-0.626424,0.954035,0.242849,0.223642,0.396412,...,-0.091799,1.217172,0.350066,-0.15978,0.19499,-0.094127,-0.099927,0.069933,-0.042978,0.355667,-0.298893,-0.082158,0.157586,-0.411629,-0.310548,-0.189352,-0.254253,0.646098,0.478772,0.03607,-0.07455,0.209956,-0.305511,0.317254,-0.253118,0.42277,-0.02041,-0.115981,-0.646802,-0.107361,0.22387,0.610173,-0.163878,0.954901,-0.06192,-0.006878,-0.265367,-0.424513,0.248579,0.32423,-0.406088,0.359627,0.248646,-0.032668,0.332351,-0.152854,-0.271311,0.598035,0.12329,-0.128971
4,5,Acquire,17228,https://cf.geekdo-images.com/micro/img/SR4x4Yj...,172,2,6,12,7.34838,2.5061,Economic,"Hand Management,Investment,Market,Ownership,St...",4.909727,-3.912008,2.16426,2.244061,-1.352101,-3.085392,1.833186,4.169309,-0.438964,1.690578,2.966778,-2.330718,-1.893568,-0.640086,-2.42955,-2.476554,-1.696573,-4.607919,-0.581532,4.979954,1.668029,-0.998889,-0.014329,-0.598639,3.112369,1.232262,1.313164,1.514037,-0.414605,-0.201231,0.973001,-6.796674,-1.002924,0.211945,-3.120346,-1.791853,1.030825,5.827307,...,-0.033114,1.811418,1.542718,0.295156,1.095523,-1.212827,1.128566,-0.644129,0.588613,-2.728676,1.365646,1.714968,1.398179,2.182101,-0.751642,-0.499359,2.279562,0.132453,1.865424,0.406883,2.082913,0.57958,0.009297,-0.736354,2.278294,2.559249,3.103773,0.261339,1.520507,-0.024503,1.848472,2.277542,1.498346,3.269897,0.100034,1.228838,3.002239,1.762821,0.279569,0.9839,1.350413,1.699222,0.248485,0.9685,3.9908,-0.636696,0.74223,0.911554,0.187574,1.144274
