### Matrix Factorization with Alternating Least Squares

### Springboard Capstone 2 project: building a recommendation engine
### John Burt


### Purpose of this notebook:

Generate an ALS-filled ratings matrix of games (rows) x users (cols).


#### The method:

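- Load the board game title list from a CSV file and the unfilled ratings matrix from an HDF5 file into pandas dataframes.

- Use pivot to convert the rating data into a games (rows) x users (cols) rating matrix, with NaNs where users haven't rated games (the majority of cells). The cells below do not pivot anything; they load a matrix that is already in this form.

- Drop users who rated too few games, or gave outlier ratings. This filtering is likewise not performed in the cells below.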

- Fit ALS to fill in ratings for all users

- Save ALS filled array



## Load input data



In [3]:
# Silence every Python warning for the rest of the session.
# NOTE(review): a blanket 'ignore' hides all warnings, including deprecation
# notices, so problems in later cells will not be reported.
import warnings
warnings.filterwarnings('ignore')
# ---

# IPython magic: draw matplotlib figures inline in the notebook output.
%matplotlib inline
import pandas as pd
# Show up to 100 columns when a dataframe is displayed.
pd.options.display.max_columns = 100
from matplotlib import pyplot as plt
import matplotlib
# Apply the 'ggplot' stylesheet to matplotlib figures.
matplotlib.style.use('ggplot')
import numpy as np

# NOTE(review): datetime is not referenced in the cells visible here.
from datetime import datetime

# Show up to 100 rows when a dataframe is displayed.
pd.options.display.max_rows = 100

# Folder that holds the input files read by this notebook.
srcdir = './data/'

# Board game titles: expose the 'id' column as 'gameID', then move the
# index created by read_csv into an ordinary column.
titledata = (
    pd.read_csv(srcdir+'bgg_gamelist.csv')
    .rename(columns={"id": 'gameID'})
    .reset_index()
)

# Unfilled utility matrix, stored under key 'mx'. gameID is expected to
# already be the index of the stored frame, so no set_index call is made.
utility_df = pd.read_hdf(srcdir+'bgg_game_mx_unfilled_v2.h5', 'mx')

# Report both shapes (one per line), then preview the top of the matrix.
print(titledata.shape, utility_df.shape, sep='\n')
utility_df.head()

(12600, 7)
(12120, 65168)


Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,...,65119,65120,65121,65122,65123,65124,65125,65126,65127,65128,65129,65130,65131,65132,65133,65134,65135,65136,65137,65138,65139,65140,65141,65142,65143,65144,65145,65146,65147,65148,65149,65150,65151,65152,65153,65154,65155,65156,65157,65158,65159,65160,65161,65162,65163,65164,65165,65166,65167,65168
gameID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1
1,,5.0,,,,9.0,,,,,,7.7,,9.0,,,,,10.0,,,8.0,,,,,,,6.0,,,,,,,,,,,,,,,,,,,9.0,6.0,,...,,,,,,,,,9.0,,,,,,,,,,8.0,,,8.0,,,,,,,,,,,,,,,9.4,,,,,,,,,,,,,
2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,,4.0,,8.0,9.0,6.0,,,,,,,,,,,7.0,,7.9,,,,,,,,8.0,,,7.0,,,,,,,,8.0,,,,8.0,8.0,7.0,,,,7.0,,,...,,,,8.0,9.0,6.5,8.5,,,,,,,,,,,,,,,,,,,,,,,8.0,8.0,,,,,,8.0,8.5,,,,7.5,,9.0,,,,,,
4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5.0,,,,,,,,,,,,,,,,,,,
5,,7.0,,,,,,,,,,,,,,,,,,,,6.0,,,,9.5,8.0,,6.0,,,,,,,,,8.0,,,,,8.0,,,,,8.0,,,...,,,,7.0,,,7.5,,,,,,,,6.0,,,,8.0,,,,6.0,8.0,,,,,,,9.0,,,,,,7.8,8.69,,,7.0,7.3,,9.0,,,6.0,7.0,,


## Alternating Least Squares matrix factorization

### Use ALS to compute the missing ratings.


[Here is a useful description of this method.](https://bugra.github.io/work/notes/2014-04-19/alternating-least-squares-method-for-collaborative-filtering/)

Notes: 

- The ALS implementation I'm using comes from an open-source package called implicit. It is wrapped in a utility function in recsys_utilities.py, do_ALS_df(), which takes a dataframe as input and returns an ALS-filled ratings dataframe with the same index and column names, together with the item and user factors.


- The parameters used for the ALS were determined using hyperparameter tuning in a separate notebook.

In [4]:
# Make the notebook folder importable so the project helper module is found.
import importlib
import sys
sys.path.append('./')

# Import the helper module and reload it, so edits made to
# recsys_utilities.py are picked up without restarting the kernel.
import recsys_utilities
importlib.reload(recsys_utilities)
from recsys_utilities import do_ALS_df

# ALS settings (from HP tuning runs)
params = dict(
    regularization=0.3,
    n_factors=25,
    n_iterations=20,
    weighted=False,
    bm25_K1=4,
    bm25_B=0.5,
    verbose=True,
    scale=True,
    use_native=True,
    use_cg=True,
    use_gpu=False,
)

# Fit ALS on the unfilled matrix; the helper returns the filled dataframe
# plus the item and user factors.
utility_df_filled, item_factors, user_factors = do_ALS_df(
    utility_df,
    ALS_method='implicit',
    **params,
)


fitting ALS model


HBox(children=(IntProgress(value=0, max=20), HTML(value='')))




## Save the ALS filled matrix to an HDF5 file

In [5]:
# Write the ALS-filled matrix to an HDF5 file in the data folder, under key 'mx'.
utility_df_filled.to_hdf(
    path_or_buf=srcdir+'bgg_game_mx_filled_v2.h5',
    key='mx',
)


In [6]:
# Shape check: in the saved run the filled matrix reports (12120, 65168),
# the same shape that was printed for the unfilled input matrix.
utility_df_filled.shape

(12120, 65168)

In [7]:
# Preview the last rows of the filled matrix; in the saved output every
# displayed cell holds a numeric value rather than NaN.
utility_df_filled.tail()

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,...,65119,65120,65121,65122,65123,65124,65125,65126,65127,65128,65129,65130,65131,65132,65133,65134,65135,65136,65137,65138,65139,65140,65141,65142,65143,65144,65145,65146,65147,65148,65149,65150,65151,65152,65153,65154,65155,65156,65157,65158,65159,65160,65161,65162,65163,65164,65165,65166,65167,65168
gameID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1
276925,6.386015,6.207924,6.414303,6.157567,6.25817,6.351043,6.160162,6.075673,5.949022,6.152587,6.151018,6.211317,6.037933,6.462116,6.277407,5.959018,6.484363,5.871089,6.081823,6.055026,6.198666,6.548743,6.227896,5.919703,5.996846,6.212459,6.242943,6.240223,5.854299,6.328776,6.422137,5.923799,6.332831,6.220666,6.239967,6.222564,6.220427,6.138691,5.794073,6.249036,6.137474,5.937656,5.970498,6.147401,6.265683,6.006303,6.165143,6.542853,6.131458,6.464466,...,6.270455,6.049587,6.141913,6.441819,6.426351,6.126408,7.062617,6.116514,6.178725,6.174303,6.293387,6.375491,5.996103,6.130337,6.312321,5.961359,5.96936,6.779797,6.216163,6.608896,5.955475,6.425298,6.240215,6.229632,6.491464,6.031808,6.044845,6.235034,6.254427,6.030169,6.172724,6.247353,5.995852,5.972504,6.692282,5.874828,7.078767,6.288208,6.147497,6.576926,6.079972,6.047498,6.392118,6.143959,6.162734,6.34046,6.28392,6.056695,6.194838,6.186833
277721,6.538056,5.872683,6.256649,6.154757,6.401428,6.255696,6.286851,6.161515,6.055374,6.107759,6.155159,6.202503,6.04483,6.31635,6.476088,6.302219,6.560421,6.059654,6.191854,6.016574,6.182189,6.388591,6.04514,5.982573,6.142472,6.238828,5.930999,6.181767,5.578951,6.224669,6.489919,5.996889,6.044029,6.271877,7.180823,6.139794,6.271113,6.157863,6.041736,6.281436,6.001177,5.704138,6.071423,6.125511,6.077414,6.009338,6.190165,6.183463,5.867213,6.654366,...,6.268713,6.216753,6.375752,6.163857,6.066635,6.150838,7.401213,6.0753,6.207426,6.216413,6.398992,6.32048,6.040918,6.294312,6.376506,6.423843,6.449741,6.293853,6.387267,6.769162,6.321056,6.220976,6.17215,6.202209,6.272469,6.00609,6.109859,6.400149,6.305483,5.805383,6.025389,6.31261,5.922369,6.42854,6.755386,6.265232,7.217903,6.202028,6.34165,6.187694,6.080297,6.040864,6.387999,6.16889,6.067099,6.311953,5.84757,6.186909,6.293118,5.982375
278751,6.105055,5.944846,5.913023,6.134682,6.05788,6.201626,6.319663,6.03142,6.180435,6.195074,6.108746,6.075449,6.250846,6.547694,6.276275,5.825398,6.554298,5.998857,6.401646,6.118531,6.113026,6.49492,6.891368,6.261802,6.157175,6.111492,6.293982,6.083853,6.147938,6.131085,5.940538,6.380602,6.194409,5.849782,6.208471,6.220195,6.13261,6.36027,6.267388,6.383636,6.268568,6.551623,6.232018,6.269504,6.408775,6.211515,5.875174,6.439242,6.852717,6.340019,...,6.152043,6.185471,6.194512,6.402278,6.203137,6.169764,7.52742,6.288509,6.285461,6.171806,6.187458,6.029394,6.103232,6.102353,5.937624,6.146598,5.679831,6.445822,6.231091,5.8718,6.581343,6.050949,6.137367,6.349404,6.049704,6.138336,6.22866,6.519111,6.739342,6.526852,5.826472,6.579005,6.060813,6.023544,6.128255,6.108235,6.524807,6.421551,6.047328,6.149913,5.980931,6.163893,6.206664,6.231037,6.322564,6.081064,6.478848,6.078439,6.352653,6.404754
280794,6.250512,6.011319,6.224489,6.156438,6.285044,6.245353,6.263029,6.142721,6.086185,6.156697,6.203772,6.18341,6.127605,6.201605,6.269678,6.184437,6.435411,5.988534,6.196775,6.138637,6.220085,6.332741,6.168015,6.163512,6.174522,6.264639,6.057923,6.167525,6.028551,6.263186,6.335258,6.033931,6.18975,6.169065,6.271966,6.113391,6.194785,6.068131,6.080978,6.118313,6.005008,6.109583,6.142079,6.175528,6.173938,6.175476,6.169623,6.230261,6.172174,6.188756,...,6.195108,6.127889,6.238698,6.315942,6.220745,6.208078,6.592873,6.096695,6.132159,6.17304,6.274726,6.127547,6.100805,6.105029,6.181401,6.112187,6.09646,6.236363,6.256319,6.208776,6.001191,6.076492,6.152996,6.240991,6.250348,6.239757,6.144095,6.307824,6.321018,6.136808,6.064432,6.255153,6.12012,6.071082,6.281846,6.094061,6.467022,6.143243,6.107682,6.287824,6.140574,6.12276,6.236846,6.105278,6.162431,6.186157,6.081925,6.225931,6.198174,6.171805
280896,6.401639,5.970799,5.969642,5.869514,5.901485,6.238485,6.02733,6.033891,6.121655,6.155506,6.081281,5.98443,6.272601,6.056727,6.664954,6.382201,6.378403,5.914056,6.33621,6.083011,5.872501,6.712572,7.251623,6.15836,5.956416,6.515329,6.306452,6.051872,5.891873,6.094973,6.060827,6.211916,6.20312,5.755943,6.231871,6.273307,6.225264,6.034153,6.259037,6.348979,6.022755,6.188836,6.344465,6.135086,6.301696,6.212772,6.046572,6.071476,5.958835,6.839176,...,6.240094,6.267001,6.056714,6.356914,6.058758,5.959655,7.823705,5.940091,6.064079,6.162294,6.201148,6.168333,6.115637,6.150583,6.089833,6.201796,5.808485,6.506844,6.204061,5.920644,6.616078,6.003362,6.232439,6.307116,6.157723,6.238787,6.404603,6.218391,7.048825,6.181205,5.636815,6.737434,5.670337,6.215665,6.278471,6.045606,6.393456,6.230094,5.944324,6.028279,6.240737,5.913371,6.254384,6.024954,6.131099,6.30863,6.047384,6.039902,6.472989,6.331643
