# Download and Unzip Dataset

In [1]:
from urllib.request import urlretrieve
urlretrieve("http://files.grouplens.org/datasets/movielens/ml-100k.zip", "ml-100k.zip")

('ml-100k.zip', <http.client.HTTPMessage at 0x28db50f2dd8>)

In [2]:
from zipfile import ZipFile
ZipFile('ml-100k.zip').extractall()

# Env Setup

In [1]:
import findspark
findspark.init("C:\\Users\\Owner\\scoop\\apps\\spark\\current",)

from pyspark.sql import SparkSession

SparkSession is more welcomed after Spark 2.0.0

In [2]:
spark = SparkSession.builder.getOrCreate()

You can easily go back to sparkContext

In [3]:
sc = spark.sparkContext

In [4]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.types import IntegerType, FloatType

# Explore Dataset

In [5]:
movielens = sc.textFile("ml-100k/u.data")

In [6]:
mls = movielens.map(lambda l: l.split('\t')).toDF()
mls = mls.withColumnRenamed("_3", "rating").withColumnRenamed(
    "_1", "userId").withColumnRenamed("_2", "movieId")
mls = mls.withColumn("userId", mls["userId"].cast(IntegerType())).withColumn(
    "movieId", mls["movieId"].cast(IntegerType())).withColumn("rating", mls["rating"].cast(FloatType()))

In [7]:
mls = mls.select(["userId", 'movieId', 'rating'])

In [8]:
mls.show(5)

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|   196|    242|   3.0|
|   186|    302|   3.0|
|    22|    377|   1.0|
|   244|     51|   2.0|
|   166|    346|   1.0|
+------+-------+------+
only showing top 5 rows



In [9]:
mls.describe().show()

+-------+------------------+-----------------+------------------+
|summary|            userId|          movieId|            rating|
+-------+------------------+-----------------+------------------+
|  count|            100000|           100000|            100000|
|   mean|         462.48475|        425.53013|           3.52986|
| stddev|266.61442012750945|330.7983563255858|1.1256735991443179|
|    min|                 1|                1|               1.0|
|    max|               943|             1682|               5.0|
+-------+------------------+-----------------+------------------+



# Movie Recommendation

In [10]:
train, test = mls.randomSplit([0.7, 0.3], 363)

In [11]:
train.count()

69893

In [12]:
als = ALS(userCol='userId', itemCol='movieId', ratingCol='rating')

model = als.fit(train)

predictions = model.transform(test)

In [13]:
predictions.describe().show()

+-------+-----------------+------------------+-----------------+----------+
|summary|           userId|           movieId|           rating|prediction|
+-------+-----------------+------------------+-----------------+----------+
|  count|            30107|             30107|            30107|     30107|
|   mean|460.5582422692397|425.93765569468894|3.529677483641678|       NaN|
| stddev|266.2311117623146| 330.3187214122093|1.122862210258293|       NaN|
|    min|                1|                 1|              1.0|-0.5226904|
|    max|              943|              1680|              5.0|       NaN|
+-------+-----------------+------------------+-----------------+----------+



In [14]:
predictions = predictions.na.drop()
predictions.describe().show()

+-------+------------------+------------------+-----------------+------------------+
|summary|            userId|           movieId|           rating|        prediction|
+-------+------------------+------------------+-----------------+------------------+
|  count|             30038|             30038|            30038|             30038|
|   mean| 460.6113922365004|423.61312337705573|3.532325720753712|3.3958862769979956|
| stddev|266.24432834107756|326.91610055410786|1.120946333236851|0.7169788128622192|
|    min|                 1|                 1|              1.0|        -0.5226904|
|    max|               943|              1664|              5.0|          5.609894|
+-------+------------------+------------------+-----------------+------------------+



In [15]:
evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating')
evaluator.evaluate(predictions)

0.9291548287137773

In [6]:
ratings.timestamp = ratings.timestamp.apply(datetime.fromtimestamp).apply(lambda x: x.strftime('%Y'))