### Matrix Factorization with Alternating Least Squares

### Springboard Capstone 2 project: building a recommendation engine
### John Burt


### Purpose of this notebook:

Generate an ALS-filled ratings matrix of games (rows) x users (cols).


#### The method:

- Load the game title list from the provided csv file, and the unfilled ratings matrix from an HDF5 file, into pandas dataframes.

- Use pivot to convert the data into a games(rows) X users(cols) rating matrix, with NaNs where users haven't rated games (majority of cells).

- Drop users who rated too few games, or gave outlier ratings.

- Fit ALS to fill in ratings for all users

- Save ALS filled array



## Load input data



In [1]:
# Silence every Python warning for the rest of the session. The filter is
# installed before the library imports below, so it also applies to any
# warnings those imports emit while loading.
import warnings
warnings.filterwarnings('ignore')
# ---

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import pandas as pd
# Show up to 100 columns when a DataFrame is displayed.
pd.options.display.max_columns = 100
from matplotlib import pyplot as plt
import matplotlib
# Use the 'ggplot' style sheet for any figures.
matplotlib.style.use('ggplot')
import numpy as np

# NOTE(review): plt, np and datetime are not referenced in the cells visible
# here -- confirm whether they are still needed.
from datetime import datetime

# Show up to 100 rows when a DataFrame is displayed.
pd.options.display.max_rows = 100

# Directory that contains the input data files.
srcdir = './data/'

# Board game title table: expose the 'id' column as 'gameID', then move the
# existing row index into a regular column via reset_index().
titledata = (
    pd.read_csv(srcdir + 'bgg_gamelist.csv')
    .rename(columns={'id': 'gameID'})
    .reset_index()
)

# Unfilled ratings matrix, stored in the HDF5 file under the key 'mx'.
# gameID is expected to already be its index, so no set_index() call is made.
utility_df = pd.read_hdf(srcdir + 'bgg_game_mx_unfilled.h5', key='mx')

# Report both shapes (one per line), then preview the first rows of the matrix.
print(titledata.shape, utility_df.shape, sep='\n')
utility_df.head()

(12600, 7)
(12600, 69910)


Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,...,69861,69862,69863,69864,69865,69866,69867,69868,69869,69870,69871,69872,69873,69874,69875,69876,69877,69878,69879,69880,69881,69882,69883,69884,69885,69886,69887,69888,69889,69890,69891,69892,69893,69894,69895,69896,69897,69898,69899,69900,69901,69902,69903,69904,69905,69906,69907,69908,69909,69910
gameID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1
1,,5.0,,,,9.0,,,,,,7.7,,9.0,,,,,,10.0,,,,8.0,,,,,,,6.0,,,,,,,,,,,,,,,,,,,,...,,,,,,9.0,,,,,,,,,,8.0,,,8.0,,,,,,,,,,,,,,,,,9.4,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,,4.0,,8.0,9.0,6.0,,,,,,,,,,,,7.0,,7.9,,,,,,,,,8.0,,,7.0,,,,,,,,,,,8.0,,,,8.0,8.0,7.0,,...,8.0,9.0,6.5,8.5,,,,,,,,,,,,,,,,,,,,,,,,,8.0,8.0,,,,,,8.0,8.5,,,,7.5,,9.0,,,,,,,
4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5.0,,,,,,,,,,,,,,,,,,,,
5,,7.0,,,,,,,,,,,,,,,,,,,,,,6.0,,,,9.5,8.0,,6.0,,,,,,,,,,,,8.0,,,,,8.0,,,...,7.0,,,7.5,,,,,,,,6.0,,,,8.0,,,,6.0,8.0,,,,,,,,,9.0,,,,,,7.8,8.69,,,7.0,7.3,,9.0,,,6.0,7.0,,,


## Alternating Least Squares matrix factorization

### Use ALS to compute the missing ratings.


[Here is a useful description of this method.](https://bugra.github.io/work/notes/2014-04-19/alternating-least-squares-method-for-collaborative-filtering/)

Notes: 

- The ALS algorithm I'm using comes from the open-source package `implicit`. It is wrapped in a utility function, `do_ALS_df()` in recsys_utilities.py, which takes a dataframe as input and returns an ALS-filled ratings dataframe with the same index and column names, along with the item factors and user factors.


- The parameters used for the ALS were determined using hyperparameter tuning in a separate notebook.

In [2]:
# Make the local helper module importable, and reload it so that recent edits
# to recsys_utilities.py are picked up without restarting the kernel.
import importlib
import sys

sys.path.append('./')
import recsys_utilities

importlib.reload(recsys_utilities)
from recsys_utilities import do_ALS_df

# ALS settings (defaults taken from the hyperparameter tuning runs).
params = dict(
    regularization=0.3,
    n_factors=2500,
    n_iterations=20,
    weighted=False,
    bm25_K1=4,
    bm25_B=0.5,
    verbose=True,
    scale=True,
    use_native=True,
    use_cg=True,
    use_gpu=False,
)

# Run ALS on the unfilled ratings matrix using the 'implicit' method and
# unpack the three values that do_ALS_df returns.
utility_df_filled, item_factors, user_factors = do_ALS_df(
    utility_df,
    ALS_method='implicit',
    **params
)


fitting ALS model


HBox(children=(IntProgress(value=0, max=20), HTML(value='')))




## Save the ALS filled matrix to an HDF5 file

In [3]:
# Write the ALS-filled matrix to an HDF5 file under the key 'mx'
# (the earlier CSV export is no longer used).
utility_df_filled.to_hdf(
    srcdir + 'bgg_game_mx_filled_v2.h5',
    key='mx',
)


In [4]:
# Sanity check: show the (rows, columns) shape of the ALS-filled matrix.
utility_df_filled.shape

(12600, 69910)

In [5]:
# Preview the last rows of the ALS-filled matrix.
# NOTE(review): the displayed output contains values well above the input
# ratings shown earlier (e.g. 18.596) -- confirm whether predictions should
# be clipped to the valid rating range before use.
utility_df_filled.tail()

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,...,69861,69862,69863,69864,69865,69866,69867,69868,69869,69870,69871,69872,69873,69874,69875,69876,69877,69878,69879,69880,69881,69882,69883,69884,69885,69886,69887,69888,69889,69890,69891,69892,69893,69894,69895,69896,69897,69898,69899,69900,69901,69902,69903,69904,69905,69906,69907,69908,69909,69910
gameID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1
280896,6.513273,7.773731,6.421499,7.438921,6.403575,6.295657,6.349405,6.376484,7.424681,6.133234,6.459544,5.842012,5.733686,8.057378,7.849255,6.417606,6.833074,7.772849,7.179876,5.846406,6.439947,7.177695,6.699678,7.514334,7.618284,6.507744,6.63647,6.870068,5.809164,6.473354,7.222481,7.311959,6.929796,6.636192,6.157415,7.059566,7.595337,5.973001,6.540643,6.374126,6.424639,6.606966,6.230058,6.079908,6.96012,7.139957,6.044809,6.496947,6.436804,6.766707,...,6.969313,7.052066,6.546884,7.527596,6.925769,6.385993,6.375522,6.626757,6.159528,6.306253,5.971025,7.252043,6.2716,6.589958,7.105066,6.58282,7.258923,6.988389,6.848467,6.591293,6.491876,6.261829,6.830843,6.301555,6.065643,6.495884,6.312449,8.619366,6.619871,7.187155,6.192543,6.678814,6.414287,5.588275,5.651472,5.962449,5.003896,6.497991,5.913747,6.605277,6.442219,6.471323,6.523296,6.839727,6.560006,7.378529,6.338491,7.04493,6.645025,6.62942
282524,7.444178,6.178032,7.058345,7.19921,5.472513,6.236969,6.847563,6.854704,7.532144,5.719596,6.836666,7.307568,7.732418,6.758873,6.550468,6.647713,6.333265,8.011545,5.265086,5.839897,6.389369,7.079901,5.428813,6.63201,7.986572,6.382566,7.494252,6.542061,6.210764,6.972356,7.01232,7.25059,8.768015,7.289477,5.975432,7.396895,7.601387,7.436098,6.120398,5.9767,6.900632,6.057969,7.033801,5.217354,7.816957,7.990566,6.249875,6.254347,7.24592,5.860622,...,6.160862,5.956359,7.003799,4.732624,6.468446,5.706438,7.38617,5.743571,5.44718,5.814926,7.859742,6.828037,6.064782,6.840959,6.260266,6.768538,6.775764,7.631747,6.645567,5.405219,6.097203,6.432796,7.680037,7.416716,7.936163,5.829932,6.988635,10.405951,5.211066,7.149088,5.700087,6.009498,6.424799,5.119444,5.745913,5.612118,6.187667,7.921329,6.933319,6.572538,7.064493,7.013351,6.16263,6.013021,5.308948,6.772508,7.030873,5.189259,6.604479,6.971354
283355,6.911289,5.908933,5.807483,6.514535,6.459871,7.69271,6.365845,6.423894,6.550807,6.10789,6.546838,6.459908,6.267465,6.55448,7.059748,6.883205,5.784601,7.371205,5.71896,7.478695,6.661923,6.383657,6.199766,7.56272,6.762201,6.185794,6.011409,5.523315,7.261415,6.831782,7.157633,6.782115,7.182593,5.300737,6.060097,7.051616,6.816155,6.543633,6.244187,6.624313,6.686586,6.664356,6.718791,7.227031,6.401342,6.52813,6.829483,6.653029,6.980946,6.046051,...,7.260801,7.036358,6.588491,8.434208,6.339489,6.194489,6.342604,5.941347,6.333204,6.597045,6.415401,6.675786,6.837813,6.53936,7.003824,6.895262,7.392916,6.400503,5.414898,6.259892,6.831536,6.664467,6.669391,7.169945,6.440711,7.144681,6.742985,6.83292,7.239484,7.473683,5.622365,5.763676,6.706966,6.481196,6.389326,6.750229,7.054677,6.894876,6.775034,7.099454,6.673627,6.290803,6.470307,6.666278,6.490932,7.261771,6.277239,7.262832,6.792021,6.826407
284760,7.138937,6.875751,6.776287,7.584996,6.807449,7.165861,6.356086,6.833722,6.313588,6.665407,7.395056,6.552906,6.41491,5.942933,7.43023,7.025352,7.080956,6.784418,6.305712,6.909737,6.411743,6.495892,6.207894,6.720389,3.983816,6.762424,7.007031,6.963014,5.672598,6.352135,6.740142,6.718269,6.358982,7.739693,6.985604,6.152702,7.025621,6.241451,6.711318,6.230655,6.642029,6.492932,6.20116,6.67696,6.596388,6.742161,5.835269,6.746365,6.540086,6.420158,...,6.999706,6.607535,6.471834,10.022818,7.325597,6.525484,5.80636,6.729268,6.066555,6.575966,6.287017,7.105315,6.138462,7.114128,7.143313,6.26384,6.83511,6.505881,6.818374,6.875206,6.188312,6.315405,6.665418,6.859398,6.940042,7.182323,6.539141,7.122956,6.20444,8.310125,6.102972,6.505765,6.76587,6.193514,6.495183,6.756928,5.867887,6.08767,6.53541,6.674549,6.329407,6.74164,6.050118,6.210576,6.873276,7.907024,6.491417,5.974857,6.022381,7.012793
286096,7.956596,6.862447,6.496655,6.863875,9.962277,6.857886,6.028924,6.334569,5.740877,6.880276,6.582042,5.999849,7.699244,6.093499,6.407662,6.775441,6.051523,6.558802,6.737882,6.335572,7.548537,5.5888,18.596024,7.682185,2.589918,7.265867,6.737799,7.189208,6.639896,7.321706,9.584621,7.386384,6.832199,7.619538,5.432071,5.997519,8.683411,3.607346,7.155419,5.514832,5.850385,6.741884,6.987037,7.580676,5.911497,6.960097,7.57036,6.041366,7.09441,6.064926,...,5.898318,5.809406,6.914448,9.76148,5.461325,6.811809,7.960059,4.826707,5.944077,6.266754,9.581848,7.197141,7.877887,7.058689,7.032727,5.730985,6.660791,4.866525,7.332869,6.324274,7.246052,7.071561,4.810308,7.581676,7.581501,6.08294,6.608224,9.753757,7.309408,6.561294,10.483264,7.074687,7.370327,8.00618,7.431762,7.597032,9.168505,5.636894,7.223545,6.848269,6.585668,6.606049,4.947955,6.050713,6.200478,9.376577,6.702415,6.502357,6.560186,6.093457
