In [1]:
import pandas as pd
import numpy as np
import pyspark
from pyspark.ml.recommendation import ALS

spark = (pyspark.sql.SparkSession.builder
    .master("local")
    .getOrCreate())

For this example, first just a quick visual of a sparse user-item matrix, commonly what is used in recommendation systems.  This is a completely fictious one to recommend cheese to Muppets. Use whatever two things come to mind first! As you can see all users have rated something, but we have many unknowns.

In [14]:
muppet_cheese = [
{'muppet': 'Fozzie','gouda': 5, 'cheddar': 4,
 'brie': '-', 'swiss': 1, 'roquefort': '-'},
{'muppet': 'Kermit the Frog', 'gouda': '-', 'cheddar': 3,
 'brie': 4, 'swiss': 5, 'roquefort': '-'},
{'muppet': 'Rowlf the Dog', 'gouda': 2, 'cheddar': 4,
 'brie': 3, 'swiss': '-', 'roquefort': '-'},
{'muppet': 'Gonzo', 'gouda': '-', 'cheddar': '-',
 'brie': '-', 'swiss': 3, 'roquefort': 4},
{'muppet': 'Miss Piggy', 'gouda': 5, 'cheddar': 1,
 'brie': 3, 'swiss': '-', 'roquefort': 5}
        ]

mc = pd.DataFrame(muppet_cheese)

In [6]:
mc[['muppet', 'brie', 'cheddar', 'gouda', 'roquefort', 'swiss']]

Unnamed: 0,muppet,brie,cheddar,gouda,roquefort,swiss
0,Fozzie,-,4,5,-,1
1,Kermit the Frog,4,3,-,-,5
2,Rowlf the Dog,3,4,2,-,-
3,Gonzo,-,-,-,4,3
4,Miss Piggy,3,1,5,5,-


For creating the ALS model in Spark, data would typically more resemble the following format, each row a record of a user-item interaction.  For Spark, you also need to make sure the user and item are numeric, they cannot be text. 

In [12]:
muppet_cheese_als = [
    {'muppet': 1,'item':101, 'score': 5},
    {'muppet': 1,'item': 201, 'score': 4},
    {'muppet': 1,'item':401, 'score': 1},
    {'muppet': 2,'item': 201, 'score': 3},
    {'muppet': 2,'item': 301, 'score': 4},
    {'muppet': 2,'item':401, 'score': 5},
    {'muppet': 3,'item':101, 'score': 2},
    {'muppet': 3,'item': 201, 'score': 4},
    {'muppet': 3,'item': 301, 'score': 3},
    {'muppet': 4,'item':401, 'score': 3},
    {'muppet': 4,'item': 501, 'score': 4},
    {'muppet': 5,'item':101, 'score': 5},
    {'muppet': 5,'item': 201, 'score': 1},
    {'muppet': 5,'item': 301, 'score': 3},
    {'muppet': 5,'item': 501, 'score': 5},
        ]
muppet_cheese_als_df = pd.DataFrame(muppet_cheese_als)

In [13]:
muppets = spark.createDataFrame(muppet_cheese_als_df)

Typically, you would create train-test split to validate the results, but this is a small silly Muppet example, so we throw it all into our model:

In [16]:
als = ALS(rank=10, regParam=.1, maxIter=20,
          userCol='muppet', itemCol='item', 
          ratingCol='score', nonnegative=True)

als_model = als.fit(muppets)

Separating the items and user features to show what they look like:

In [19]:
items = als_model.itemFactors.toPandas()
users = als_model.userFactors.toPandas()

In [20]:
# Item Features
for i in range(10):
    items[str(i + 1)] = items['features'].apply(lambda x: x[i])
items['cheese'] = ['gouda', 'cheddar', 'brie', 'swiss', 'roquefort']
#users['muppet'] = ['Fozzie', 'Kermit the Frog', 'Rowlf the Dog', 'Gonzo', 'Miss Piggy']
items[['cheese', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10']]

Unnamed: 0,cheese,1,2,3,4,5,6,7,8,9,10
0,gouda,0.0,1.83355,0.311505,0.304448,1.437197,0.770893,0.0,0.0,0.53874,0.0
1,cheddar,0.520016,0.0,0.08339,0.0,1.16973,0.416471,0.693999,0.0,1.449831,0.0
2,brie,0.59094,0.989227,0.486651,0.246948,0.750923,0.340505,0.857241,0.0,0.771576,0.0
3,swiss,0.970373,0.903034,0.799617,0.340711,0.0,0.0,1.426155,0.0,0.33852,0.0
4,roquefort,0.284505,2.062847,0.674279,0.453092,0.688684,0.452625,0.494089,0.0,0.083563,0.0


In [22]:
# User Features
for i in range(10):
    users[str(i + 1)] = users['features'].apply(lambda x: x[i])
users['muppet'] = ['Fozzie', 'Kermit the Frog', 'Rowlf the Dog', 'Gonzo', 'Miss Piggy']
users[['muppet', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10']]

Unnamed: 0,muppet,1,2,3,4,5,6,7,8,9,10
0,Fozzie,0.016609,0.749777,0.0,0.061349,1.581885,0.711459,0.0,0.0,1.137922,0.0
1,Kermit the Frog,0.984344,0.910427,0.753244,0.3218,0.325052,0.128301,1.432978,0.0,0.653109,0.0
2,Rowlf the Dog,0.633099,0.0,0.195157,0.031192,0.798141,0.249949,0.875324,0.0,1.238229,0.0
3,Gonzo,0.379911,1.275077,0.538478,0.314408,0.352955,0.231973,0.597281,0.0,0.124494,0.0
4,Miss Piggy,0.066662,1.771649,0.460284,0.351409,0.674145,0.4458,0.143247,0.0,0.0,0.0


In [24]:
# Combining User & Item features:
for idx, cheese in enumerate(items['cheese'].tolist()):
    users[cheese] = users['features'].apply(lambda x: np.dot(x, items['features'][idx]))
users['muppet'] = ['Fozzie', 'Kermit the Frog', 'Rowlf the Dog', 'Gonzo', 'Miss Piggy']
users[['muppet', 'brie', 'cheddar', 'gouda', 'swiss', 'roquefort']]

Unnamed: 0,muppet,brie,cheddar,gouda,swiss,roquefort
0,Fozzie,3.074786,3.805113,4.828416,1.099304,3.085729
1,Kermit the Frog,3.94845,2.949727,2.919851,4.754015,3.856349
2,Rowlf the Dog,2.867002,3.785899,2.077143,2.448535,1.524602
3,Gonzo,2.777639,1.346943,3.354538,2.95175,3.897499
4,Miss Piggy,2.883558,1.146691,4.811316,2.356618,4.880015


We can now compare this matrix with the original to see what some of our best recommendations are:

In [26]:
mc[['muppet', 'brie', 'cheddar', 'gouda', 'swiss', 'roquefort']]

Unnamed: 0,muppet,brie,cheddar,gouda,swiss,roquefort
0,Fozzie,-,4,5,1,-
1,Kermit the Frog,4,3,-,5,-
2,Rowlf the Dog,3,4,2,-,-
3,Gonzo,-,-,-,3,4
4,Miss Piggy,3,1,5,-,5


Next, we can walk through using these features to get recommendations for a new user without re-running the model. First, we get the item features for the items we have ratings for (in this scenario, gouda & cheddar):

In [27]:
select_cheese = items.iloc[0:2, 2:12].to_numpy()

In [29]:
select_cheese.shape

(2, 10)

The legendary Muppet Dr. Teeth has rated both of these cheeses very highly. We put those score in an array of the corresponding amount of rows:

In [30]:
dr_teeth = np.array([[5],[5]])
dr_teeth.shape

(2, 1)

We then do the linear algebra to take our known ratings and know item features to return a full set of user features:

In [31]:
dr_teeth_features = np.linalg.lstsq(select_cheese, dr_teeth, rcond=None)
dr_teeth_features = dr_teeth_features[0].reshape((10,))
dr_teeth_features

array([0.46305335, 0.70459949, 0.19396104, 0.1169939 , 1.5938869 ,
       0.66709078, 0.61797834, 0.        , 1.49804521, 0.        ])

Tada! Now we can use these features to get a full set of recommendations by matrix multiplication with all the item features:

In [32]:
dr_teeth_scores = [np.dot(dr_teeth_features, row) for row in items.iloc[:, 2:12].to_numpy()]
for row in zip(items['cheese'], dr_teeth_scores):
    print(row)

('gouda', 5.0000000000000036)
('cheddar', 5.000000000000001)
('brie', 4.203573486490703)
('swiss', 2.669019200661358)
('roquefort', 3.599158692123502)
