### ALS

In [1]:
import findspark
findspark.init()

In [2]:
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('MovieLens').getOrCreate()

In [3]:
artists = spark.read.csv("hetrec2011-lastfm-2k/artists.dat",sep="\t",header=True)
user_artists = spark.read.csv("hetrec2011-lastfm-2k/user_artists.dat",sep="\t", header=True)

In [4]:
user_artists = user_artists.withColumn("weight", user_artists.weight.cast('int') )
user_artists = user_artists.withColumn("userID", user_artists.userID.cast('int') )
user_artists = user_artists.withColumn("artistID", user_artists.artistID.cast('int') )

In [5]:
user_artists.printSchema()

root
 |-- userID: integer (nullable = true)
 |-- artistID: integer (nullable = true)
 |-- weight: integer (nullable = true)



In [6]:
import pandas as pd

In [5]:
user_artists_pan=user_artists.toPandas()
user_artists_pan["play"]=1

In [6]:
count_artist=user_artists_pan.groupby(by='artistID').count()
count_artist=count_artist.rename(columns={"play": "count"})
count_artist=count_artist.drop(columns=['userID','weight'])
count_artist=count_artist.reset_index()

In [7]:
type(count_artist)

pandas.core.frame.DataFrame

In [8]:
user_artists_pan=user_artists_pan.join(count_artist.set_index('artistID'), on="artistID")

In [9]:
user_artists_pan

Unnamed: 0,userID,artistID,weight,play,count
0,2,51,13883,1,111
1,2,52,11690,1,23
2,2,53,11351,1,75
3,2,54,10300,1,18
4,2,55,8983,1,298
...,...,...,...,...,...
92829,2100,18726,337,1,1
92830,2100,18727,297,1,1
92831,2100,18728,281,1,1
92832,2100,18729,280,1,1


In [10]:
user_artists_pan=user_artists_pan[user_artists_pan["count"]>10]

In [11]:
user_artists_pan

Unnamed: 0,userID,artistID,weight,play,count
0,2,51,13883,1,111
1,2,52,11690,1,23
2,2,53,11351,1,75
3,2,54,10300,1,18
4,2,55,8983,1,298
...,...,...,...,...,...
92794,2100,1260,1459,1,11
92795,2100,1276,1133,1,13
92796,2100,1281,573,1,20
92798,2100,2765,457,1,11


In [12]:
us_art = spark.createDataFrame(user_artists_pan)
us_art

DataFrame[userID: bigint, artistID: bigint, weight: bigint, play: bigint, count: bigint]

In [13]:
from pyspark.sql.types import *
from pyspark.sql.functions import col

us_art = us_art.withColumn("userId", col("userId").cast(IntegerType()))\
                        .withColumn("artistId", col("artistId").cast(IntegerType()))\
                        .withColumn("weight", col("weight").cast(DoubleType()))


In [14]:
from pyspark import SparkConf, SparkContext
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating

In [15]:
# dzielimy zbiór na treningowy i testowy (0.7 i 0.3)
(ratings_train, ratings_test) = us_art.randomSplit([0.7,0.3], seed = 1000)

print(' training: {0}\n test: {1}\n'\
  .format(ratings_train.count(),
          ratings_test.count()
         ))

 training: 43111
 test: 18618



In [16]:
from pyspark.ml.recommendation import ALS
#https://spark.apache.org/docs/2.1.0/ml-collaborative-filtering.html
ratings_als = ALS(userCol = "userId",
                  itemCol = "artistId",
                  ratingCol = "play",
                  rank = 10, #latent components
                  maxIter = 10,
                  regParam = 0.1, #regularization
                  implicitPrefs = False, 
                  coldStartStrategy = "drop", 
                  )

In [17]:
#predykcje
from pyspark.ml.evaluation import RegressionEvaluator
ratings_model = ratings_als.fit(ratings_train)

ratings_pred = ratings_model.transform(ratings_test)
ratings_pred.show(10)

+------+--------+------+----+-----+----------+
|userId|artistId|weight|play|count|prediction|
+------+--------+------+----+-----+----------+
|   126|    1580| 267.0|   1|   43|0.89959943|
|   210|     463| 510.0|   1|   17| 0.8997955|
|    27|     463| 159.0|   1|   17| 0.8997495|
|   271|    1580| 900.0|   1|   43|0.89971703|
|   333|    1580| 295.0|   1|   43| 0.8996826|
|   319|    1645|2067.0|   1|   21| 0.8997618|
|   479|     463|  23.0|   1|   17|0.89975446|
|   432|    1645| 223.0|   1|   21|0.89979494|
|   306|     463| 452.0|   1|   17|0.89963704|
|   189|     496| 241.0|   1|   11| 0.8997361|
+------+--------+------+----+-----+----------+
only showing top 10 rows



In [19]:
#ocena jakości
from pyspark.ml.evaluation import RegressionEvaluator
ratings_eval = RegressionEvaluator(metricName = "rmse", 
                                    labelCol = "play",
                                    predictionCol = "prediction")
ratings_rmse = ratings_eval.evaluate(ratings_pred)
print("RMSE = " + str(ratings_rmse))

RMSE = 0.10024020695887441


In [20]:
ratings_pred.show()

+------+--------+------+----+-----+----------+
|userId|artistId|weight|play|count|prediction|
+------+--------+------+----+-----+----------+
|   126|    1580| 267.0|   1|   43|0.89959943|
|   210|     463| 510.0|   1|   17| 0.8997955|
|    27|     463| 159.0|   1|   17| 0.8997495|
|   271|    1580| 900.0|   1|   43|0.89971703|
|   333|    1580| 295.0|   1|   43| 0.8996826|
|   319|    1645|2067.0|   1|   21| 0.8997618|
|   479|     463|  23.0|   1|   17|0.89975446|
|   432|    1645| 223.0|   1|   21|0.89979494|
|   306|     463| 452.0|   1|   17|0.89963704|
|   189|     496| 241.0|   1|   11| 0.8997361|
|   129|    1580| 101.0|   1|   43| 0.8996829|
|   256|    4818| 503.0|   1|   12|0.89978206|
|    56|    1645|1491.0|   1|   21|0.89977145|
|   487|     463| 170.0|   1|   17| 0.8997858|
|   147|    1580|2294.0|   1|   43| 0.8996853|
|   198|     496| 496.0|   1|   11|0.89972764|
|    30|     471|  89.0|   1|   42|0.89980364|
|   174|     463|  93.0|   1|   17|0.89975214|
|   399|    1

In [21]:
# rekomendacja 
user_recs=ratings_model.recommendForAllUsers(5).show(truncate=False)

+------+--------------------------------------------------------------------------------------------------+
|userId|recommendations                                                                                   |
+------+--------------------------------------------------------------------------------------------------+
|12    |[{835, 0.89990866}, {1253, 0.89989144}, {3053, 0.89988256}, {592, 0.89988226}, {1066, 0.899878}]  |
|22    |[{835, 0.8998583}, {2746, 0.89985424}, {4156, 0.8998464}, {917, 0.8998402}, {1274, 0.8998393}]    |
|26    |[{2746, 0.8998568}, {5844, 0.8998458}, {4271, 0.89984554}, {917, 0.89984393}, {1369, 0.89984286}] |
|27    |[{1253, 0.8999237}, {835, 0.8999113}, {3053, 0.89990604}, {175, 0.8999033}, {592, 0.8999014}]     |
|28    |[{592, 0.8998627}, {5416, 0.8998564}, {4069, 0.8998532}, {572, 0.89985126}, {5844, 0.8998509}]    |
|31    |[{1253, 0.89992654}, {175, 0.8999092}, {3053, 0.89990216}, {592, 0.8998977}, {1066, 0.899896}]    |
|34    |[{592, 0.89983886}, 