In [1]:
import pandas as pd
import numpy as np
import pyspark
from pyspark.ml.recommendation import ALS

spark = (pyspark.sql.SparkSession.builder
    .master("local")
    .getOrCreate())

For this example, first just a quick visual of a sparse user-item matrix, commonly what is used in recommendation systems.  This is a completely fictious one to recommend cheese to Muppets. Use whatever two things come to mind first! As you can see all users have rated something, but we have many unknowns.

In [14]:
muppet_cheese = [
{'muppet': 'Fozzie','gouda': 5, 'cheddar': 4,
 'brie': '-', 'swiss': 1, 'roquefort': '-'},
{'muppet': 'Kermit the Frog', 'gouda': '-', 'cheddar': 3,
 'brie': 4, 'swiss': 5, 'roquefort': '-'},
{'muppet': 'Rowlf the Dog', 'gouda': 2, 'cheddar': 4,
 'brie': 3, 'swiss': '-', 'roquefort': '-'},
{'muppet': 'Gonzo', 'gouda': '-', 'cheddar': '-',
 'brie': '-', 'swiss': 3, 'roquefort': 4},
{'muppet': 'Miss Piggy', 'gouda': 5, 'cheddar': 1,
 'brie': 3, 'swiss': '-', 'roquefort': 5}
        ]

mc = pd.DataFrame(muppet_cheese)

In [6]:
mc[['muppet', 'brie', 'cheddar', 'gouda', 'roquefort', 'swiss']]

Unnamed: 0,muppet,brie,cheddar,gouda,roquefort,swiss
0,Fozzie,-,4,5,-,1
1,Kermit the Frog,4,3,-,-,5
2,Rowlf the Dog,3,4,2,-,-
3,Gonzo,-,-,-,4,3
4,Miss Piggy,3,1,5,5,-


For creating the ALS model in Spark, data would typically more resemble the following format, each row a record of a user-item interaction.  For Spark, you also need to make sure the user and item are numeric, they cannot be text. 

In [12]:
muppet_cheese_als = [
    {'muppet': 1,'item':101, 'score': 5},
    {'muppet': 1,'item': 201, 'score': 4},
    {'muppet': 1,'item':401, 'score': 1},
    {'muppet': 2,'item': 201, 'score': 3},
    {'muppet': 2,'item': 301, 'score': 4},
    {'muppet': 2,'item':401, 'score': 5},
    {'muppet': 3,'item':101, 'score': 2},
    {'muppet': 3,'item': 201, 'score': 4},
    {'muppet': 3,'item': 301, 'score': 3},
    {'muppet': 4,'item':401, 'score': 3},
    {'muppet': 4,'item': 501, 'score': 4},
    {'muppet': 5,'item':101, 'score': 5},
    {'muppet': 5,'item': 201, 'score': 1},
    {'muppet': 5,'item': 301, 'score': 3},
    {'muppet': 5,'item': 501, 'score': 5},
        ]
muppet_cheese_als_df = pd.DataFrame(muppet_cheese_als)

In [13]:
muppets = spark.createDataFrame(muppet_cheese_als_df)

Typically, you would create train-test split to validate the results, but this is a small silly Muppet example, so we throw it all into our model:

In [14]:
als = ALS(rank=10, regParam=.1, maxIter=20,
          userCol='muppet', itemCol='item', 
          ratingCol='score', nonnegative=True)

als_model = als.fit(muppets)

In [15]:
items = als_model.itemFactors.toPandas()
users = als_model.userFactors.toPandas()

In [49]:
for i in range(10):
    items[str(i + 1)] = items['features'].apply(lambda x: x[i])
items['cheese'] = ['gouda', 'cheddar', 'brie', 'swiss', 'roquefort']
#users['muppet'] = ['Fozzie', 'Kermit the Frog', 'Rowlf the Dog', 'Gonzo', 'Miss Piggy']
items[['cheese', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10']]

Unnamed: 0,cheese,1,2,3,4,5,6,7,8,9,10
0,gouda,0.564988,0.0,0.0,2.074283,0.538793,0.0,0.0,1.091593,0.329288,0.50032
1,cheddar,1.923183,0.0,0.0,0.774878,0.0,0.0,0.0,0.0,0.269862,0.0
2,brie,1.184818,0.0,0.0,0.824768,0.59275,0.0,0.0,0.546756,0.727644,0.550722
3,swiss,0.734064,0.0,0.0,0.0,1.056171,0.0,0.0,0.398587,1.350816,0.98168
4,roquefort,0.146882,0.0,0.0,1.346822,0.997866,0.0,0.0,1.197902,0.833252,0.926784


In [50]:
users['gouda'] = users['features'].apply(lambda x: np.dot(x, items['features'][0]))
users['cheddar'] = users['features'].apply(lambda x: np.dot(x, items['features'][1]))
users['brie'] = users['features'].apply(lambda x: np.dot(x, items['features'][2]))
users['swiss'] = users['features'].apply(lambda x: np.dot(x, items['features'][3]))
users['roquefort'] = users['features'].apply(lambda x: np.dot(x, items['features'][4]))
users['muppet'] = ['Fozzie', 'Kermit the Frog', 'Rowlf the Dog', 'Gonzo', 'Miss Piggy']
users[['muppet', 'brie', 'cheddar', 'gouda', 'swiss', 'roquefort']]

Unnamed: 0,muppet,brie,cheddar,gouda,swiss,roquefort
0,Fozzie,3.201114,3.803025,4.823604,1.128277,3.080742
1,Kermit the Frog,3.952644,2.934336,3.180585,4.74905,3.989108
2,Rowlf the Dog,2.862721,3.788768,2.077029,2.1394,1.395473
3,Gonzo,2.644571,1.21161,3.414823,2.956365,3.893735
4,Miss Piggy,2.876855,1.165907,4.81478,2.607894,4.873067


In [60]:
select_cheese = items.iloc[0:2, 2:12].to_numpy()

In [55]:
cheese_1.shape

(2, 10)

In [58]:
dr_teeth = np.array([[5],[5]])
dr_teeth.shape

(2, 1)

In [62]:
dr_teeth_features = np.linalg.lstsq(select_cheese, dr_teeth, rcond=None)
dr_teeth_features = dr_teeth_features[0].reshape((10,))
dr_teeth_features

array([1.94412615, 0.        , 0.        , 1.49873073, 0.20873594,
       0.        , 0.        , 0.42289857, 0.36965752, 0.19383103])

In [76]:
dr_teeth_scores = [np.dot(dr_teeth_features, row) for row in items.iloc[:, 2:12].to_numpy()]
for row in zip(items['cheese'], dr_teeth_scores):
    print(row)

('gouda', 4.9999999999999964)
('cheddar', 5.000000000000001)
('brie', 4.270218133363084)
('swiss', 2.505755675697055)
('roquefort', 3.5066194383500755)
