# Jonathan Halverson
# Monday, December 19, 2016
# Dating Recommender

### This code uses alternating least squares to make recommendations for individuals on a dating site

In [1]:
from __future__ import print_function

import sys
import random
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

In [2]:
APP_NAME = "Dating Recommender"

# Obtain the session for this application via getOrCreate(), then report
# which Spark version it is running on.
spark = (
    SparkSession.builder
    .appName(APP_NAME)
    .getOrCreate()
)
print("Spark version: %s" % spark.sparkContext.version)

Spark version: 2.0.2


In [3]:
def parse_rating(line, sep=','):
    """Convert one delimited text line into a rating ``Row``.

    The line is stripped of surrounding whitespace and split on ``sep``.
    The first two fields are read as integers (user ID and profile ID) and
    the third as a float (the rating). Any further fields are ignored.
    """
    fields = line.strip().split(sep)
    user_id = int(fields[0])
    profile_id = int(fields[1])
    score = float(fields[2])
    return Row(userID=user_id, profileID=profile_id, rating=score)

In [4]:
def parse_user(line, sep=','):
    """Convert one delimited text line into a ``(user_id, gender)`` tuple.

    The line is stripped of surrounding whitespace and split on ``sep``.
    The first field is read as an integer user ID; the second field is
    returned unchanged as the gender string. Any further fields are ignored.
    """
    parts = line.strip().split(sep)
    return int(parts[0]), parts[1]

Load the ratings data and convert to a DataFrame:

In [5]:
# Read the raw ratings file as an RDD of text lines, turn each line into a
# Row with parse_rating, and display the first parsed record as a sanity check.
lines = spark.sparkContext.textFile('ratings.dat')
ratingsRDD = lines.map(parse_rating)
ratingsRDD.first()

Row(profileID=8305, rating=10.0, userID=1)

In [6]:
# Build a DataFrame from the RDD of Rows and print the resulting schema.
ratings = spark.createDataFrame(ratingsRDD)
ratings.printSchema()

root
 |-- profileID: long (nullable = true)
 |-- rating: double (nullable = true)
 |-- userID: long (nullable = true)



In [7]:
# Preview the first five rows of the ratings DataFrame.
ratings.show(n=5)

+---------+------+------+
|profileID|rating|userID|
+---------+------+------+
|     8305|  10.0|     1|
|    15530|   6.0|     1|
|    22319|  10.0|     1|
|    32136|   9.0|     1|
|    38868|   7.0|     1|
+---------+------+------+
only showing top 5 rows



Which users have provided the fewest ratings?

In [8]:
from pyspark.sql import functions as F
# Count how many ratings each user has submitted, then list the five users
# with the smallest counts.
ratings_by_user = ratings.groupBy('userID')
rating_counts = ratings_by_user.agg(F.count('*').alias('count'))
rating_counts.orderBy('count', ascending=True).show(n=5)

+------+-----+
|userID|count|
+------+-----+
|  4872|    1|
|  9954|    1|
|  5107|    1|
|  1921|    1|
|  6225|    1|
+------+-----+
only showing top 5 rows



What percentage of users have given fewer than 20 ratings?

In [9]:
# Percentage of users whose rating count is below 20. The final bare
# expression is what the notebook displays as the cell result.
num_low_activity_users = rating_counts.filter('count < 20').count()
num_users = rating_counts.count()
100.0 * num_low_activity_users / num_users

94.31364002393634

Create a list of users that have at least 20 ratings:

In [10]:
# Keep only the users with at least 20 ratings and project out their IDs.
# Rows of rating_counts are (userID, count), so index 1 is the count and
# index 0 is the user ID.
active_rows = rating_counts.rdd.filter(lambda row: row[1] >= 20)
active_users_ids = active_rows.map(lambda row: row[0])
active_users_ids.take(10)

[2214, 2509, 7747, 10422, 16530, 17979, 18628, 19979, 22201, 25084]

Split the ratings data into a training and test set:

In [11]:
# Randomly split the ratings 60/40 into training and test sets. The seed is
# fixed so that repeated runs over the same data produce the same split,
# which keeps the counts and error metrics below comparable between runs.
training, test = ratings.randomSplit([0.6, 0.4], seed=42)

In [12]:
# Report the number of rows in each split. Note that the "Validation" label
# refers to the `test` DataFrame produced by the split above.
print('Training: %d' % training.count())
print('Validation: %d' % test.count())

Training: 651287
Validation: 433672


Set parameters for ALS:

In [13]:
# Hyperparameters handed to ALS below.
rank_ = 8            # passed as `rank`
num_iterations = 8   # passed as `maxIter`
lambda_ = 0.1        # passed as `regParam`

In [14]:
# Configure ALS with the hyperparameters chosen above and tell it which
# DataFrame columns hold the user, the item (profile) and the rating.
als = ALS(
    rank=rank_,
    maxIter=num_iterations,
    regParam=lambda_,
    userCol="userID",
    itemCol="profileID",
    ratingCol="rating",
)

# Fit the model on the training split only.
model = als.fit(training)

In [15]:
# Score the held-out rows; transform() adds a `prediction` column.
predictions = model.transform(test)

# RMSE between the true `rating` column and the model's `prediction` column.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction",
)

Let's examine the NaN elements. They arise when the user or the profile being predicted does not appear in the training set.

In [16]:
# Display the scored rows whose prediction is NaN (show() prints the first 20).
prediction_is_nan = predictions['prediction'] == float('NaN')
predictions.where(prediction_is_nan).show()

+---------+------+------+----------+
|profileID|rating|userID|prediction|
+---------+------+------+----------+
|     7240|   3.0| 85290|       NaN|
|     7993|  10.0| 95479|       NaN|
|    13832|   7.0| 33639|       NaN|
|    13840|   4.0| 86208|       NaN|
|    14450|   2.0|135264|       NaN|
|    15447|  10.0| 47694|       NaN|
|    15846|   7.0| 33827|       NaN|
|    16861|   3.0| 53467|       NaN|
|    16861|   5.0|108503|       NaN|
|    17420|  10.0|  4627|       NaN|
|    20497|   5.0| 45123|       NaN|
|    20497|   6.0| 84942|       NaN|
|    24171|   1.0| 15196|       NaN|
|    25462|   1.0|108348|       NaN|
|    26087|   2.0| 44957|       NaN|
|    26087|   6.0| 54594|       NaN|
|    26708|  10.0| 13816|       NaN|
|    27484|   1.0| 69509|       NaN|
|    27484|   1.0| 72563|       NaN|
|    29719|   2.0|132207|       NaN|
+---------+------+------+----------+
only showing top 20 rows



Let's drop the NaN's so that we can compute the RMSE:

In [17]:
# Discard every row that has a missing value in any column (how='any');
# this is the step that removes the NaN predictions before computing the RMSE.
predictions = predictions.dropna(how='any')

The prediction is NaN when the user or the profile in a test row was not seen during training. Among the remaining predictions there are also negative values and values greater than 10, as the summary below shows.

In [18]:
# Summary statistics for the true ratings and the model's predictions, to
# compare their ranges side by side.
predictions.describe(['rating', 'prediction']).show()

+-------+-----------------+------------------+
|summary|           rating|        prediction|
+-------+-----------------+------------------+
|  count|           396812|            396812|
|   mean|5.947468826547584|3.5264682021156197|
| stddev|3.103842902712731| 3.532751096253791|
|    min|              1.0|        -27.878664|
|    max|             10.0|         28.086868|
+-------+-----------------+------------------+



In [19]:
# Evaluate the RMSE on the cleaned predictions (NaN rows already removed)
# and print it.
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 4.65859018648


Let's find the top 10 female matches for userID 209, who is male:

In [20]:
# ID of the user to recommend matches for; it is used as the userID of every
# candidate row built below.
matchseekerID = 209

In [21]:
# Read the gender file and keep only the users marked 'F'.
lines = spark.sparkContext.textFile('gender.dat')
female_users = lines.map(parse_user).filter(lambda user: user[1] == 'F')

# Pair the match seeker with every female user so that the model can score
# each (userID, profileID) combination.
usersRDD = female_users.map(
    lambda user: Row(userID=matchseekerID, profileID=user[0])
)
users = spark.createDataFrame(usersRDD)
users.show()

+---------+------+
|profileID|userID|
+---------+------+
|        1|   209|
|        2|   209|
|        4|   209|
|        5|   209|
|        6|   209|
|        7|   209|
|       11|   209|
|       19|   209|
|       24|   209|
|       25|   209|
|       26|   209|
|       27|   209|
|       28|   209|
|       31|   209|
|       35|   209|
|       36|   209|
|       38|   209|
|       39|   209|
|       40|   209|
|       42|   209|
+---------+------+
only showing top 20 rows



In [22]:
# Score every candidate pair and discard rows with a missing value.
scored_candidates = model.transform(users)
predictions_matchseekerID = scored_candidates.dropna(how='any')

# Show the ten candidates with the highest predicted rating.
ranked_candidates = predictions_matchseekerID.orderBy('prediction', ascending=False)
ranked_candidates.show(n=10)

+---------+------+----------+
|profileID|userID|prediction|
+---------+------+----------+
|   128085|   209| 1.8326111|
|    94750|   209| 1.7952724|
|    51525|   209| 1.7804391|
|    83952|   209| 1.7723638|
|    29160|   209| 1.7341098|
|    52622|   209| 1.6950321|
|   117877|   209| 1.6577109|
|    79726|   209| 1.6481326|
|     8451|   209| 1.6433898|
|   107691|   209| 1.6325084|
+---------+------+----------+
only showing top 10 rows



Many of the results in the notebook deserve more attention. Their poor quality is thought to be due to so many users having only a few ratings.