# Jonathan Halverson
# Monday, December 19, 2016
# Dating Recommender

### This code uses alternating least squares to make recommendations for individuals on a dating site

In [1]:
from __future__ import print_function

import sys
import random
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

In [2]:
# Application name passed to the Spark session builder.
APP_NAME = "Dating Recommender"

# getOrCreate() hands back an already-running session when there is one,
# so re-running this cell does not start a second session.
session_builder = SparkSession.builder.appName(APP_NAME)
spark = session_builder.getOrCreate()
print("Spark version: %s" % spark.sparkContext.version)

Spark version: 2.0.2


In [3]:
def parse_rating(line, sep=','):
    """Convert one line of the ratings file into a Row.

    The line is stripped of surrounding whitespace and split on ``sep``.
    The first three fields are read, in order, as the rating user's ID,
    the rated profile's ID and the rating itself.

    Args:
        line: A single text line holding at least three ``sep``-separated fields.
        sep: Field delimiter (a comma by default).

    Returns:
        Row(userID=<int>, profileID=<int>, rating=<float>).

    Raises:
        IndexError: If the line has fewer than three fields.
        ValueError: If a field cannot be converted to a number.
    """
    fields = line.strip().split(sep)
    user_id = int(fields[0])
    profile_id = int(fields[1])
    score = float(fields[2])
    return Row(userID=user_id, profileID=profile_id, rating=score)

In [4]:
def parse_user(line, sep=','):
    """Convert one line of the gender file into a ``(user_id, gender)`` pair.

    The line is stripped of surrounding whitespace and split on ``sep``.
    Only the first two fields are used; any further fields are ignored.

    Args:
        line: A single text line holding at least two ``sep``-separated fields.
        sep: Field delimiter (a comma by default).

    Returns:
        A tuple of the user ID as an int and the gender field as a string,
        exactly as it appears in the line.

    Raises:
        ValueError: If the first field is not an integer.
        IndexError: If the line has fewer than two fields.
    """
    parts = line.strip().split(sep)
    # The ID is converted before the gender is read, so a malformed ID is
    # reported even when the gender field is missing as well.
    return (int(parts[0]), parts[1])

Load the ratings data and convert to a DataFrame:

In [5]:
lines = spark.sparkContext.textFile('ratings.dat')
ratingsRDD = lines.map(parse_rating)
ratingsRDD.first()

Row(profileID=8305, rating=10.0, userID=1)

In [6]:
ratings = spark.createDataFrame(ratingsRDD)
ratings.printSchema()

root
 |-- profileID: long (nullable = true)
 |-- rating: double (nullable = true)
 |-- userID: long (nullable = true)



In [7]:
ratings.show(n=5)

+---------+------+------+
|profileID|rating|userID|
+---------+------+------+
|     8305|  10.0|     1|
|    15530|   6.0|     1|
|    22319|  10.0|     1|
|    32136|   9.0|     1|
|    38868|   7.0|     1|
+---------+------+------+
only showing top 5 rows



Which users have provided the fewest ratings:

In [8]:
from pyspark.sql import functions as F

# One row per user with the number of ratings that user has submitted,
# stored in a column named 'count'.
rating_counts = ratings.groupBy('userID').count()

# Least active users first.
rating_counts.orderBy('count', ascending=True).show(n=5)

+------+-----+
|userID|count|
+------+-----+
| 10620|    1|
| 15799|    1|
| 10632|    1|
|  1671|    1|
| 11574|    1|
+------+-----+
only showing top 5 rows



What percentage of users have given fewer than 20 ratings?

In [9]:
# Bracket access is required here: rating_counts.count is the DataFrame
# method, not the 'count' column.
num_sparse_users = rating_counts.filter(rating_counts['count'] < 20).count()
num_users = rating_counts.count()
100.0 * num_sparse_users / num_users

94.31364002393634

Create a list of users that have at least 20 ratings:

In [10]:
# Keep only users with 20 or more ratings, then project down to the bare IDs.
# Fields are looked up by name with brackets because Row.count is the
# tuple method rather than the 'count' column.
active_rows = rating_counts.rdd.filter(lambda row: row['count'] >= 20)
active_users_ids = active_rows.map(lambda row: row['userID'])
active_users_ids.take(10)

[2214, 2509, 7747, 10422, 16530, 17979, 18628, 19979, 22201, 25084]

Split the ratings data into a training and test set:

In [11]:
# Randomly split the ratings with weights 0.6 (training) and 0.4 (test).
# No seed is passed, so the row counts can differ from run to run.
splits = ratings.randomSplit([0.6, 0.4])

# Both parts are cached because later cells reuse them
# (counting, fitting, and predicting).
training = splits[0].cache()
test = splits[1].cache()
test

DataFrame[profileID: bigint, rating: double, userID: bigint]

In [12]:
print('Training: %d' % training.count())
print('Validation: %d' % test.count())

Training: 650812
Validation: 434147


Set parameters for ALS:

In [13]:
rank_ = 8
num_iterations = 8
lambda_ = 0.1

In [14]:
# Configure the ALS estimator with the hyperparameters from the previous cell
# and tell it which DataFrame columns hold the user, the item and the rating.
als = ALS(
    rank=rank_,
    maxIter=num_iterations,
    regParam=lambda_,
    userCol="userID",
    itemCol="profileID",
    ratingCol="rating"
)

# Learn the user and profile factors from the training split only.
model = als.fit(training)

In [15]:
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")

Let's examine the NaN elements. They arise when the user or the profile of a test case does not appear in the training set.

In [16]:
from pyspark.sql.functions import isnan

# Show the test rows for which the model produced no usable prediction.
# isnan() states the intent explicitly; `== float('NaN')` only matches
# because Spark SQL treats NaN as equal to itself, unlike ordinary
# floating-point comparison. The select('*') was a no-op and is dropped.
predictions.filter(isnan(predictions.prediction)).show()

+---------+------+------+----------+
|profileID|rating|userID|prediction|
+---------+------+------+----------+
|     1591|   8.0| 65105|       NaN|
|     1591|   3.0| 95357|       NaN|
|     2122|   8.0| 96125|       NaN|
|     2122|   6.0| 18332|       NaN|
|     2122|   5.0| 68011|       NaN|
|     6466|  10.0| 40184|       NaN|
|     8592|   8.0|103050|       NaN|
|     8592|  10.0| 47092|       NaN|
|    11858|   5.0| 90280|       NaN|
|    11858|   9.0| 93019|       NaN|
|    13832|   7.0| 33639|       NaN|
|    14450|   2.0|135264|       NaN|
|    14450|   2.0|112534|       NaN|
|    15957|  10.0|108671|       NaN|
|    18944|   7.0|  2561|       NaN|
|    20135|   8.0| 26239|       NaN|
|    20497|   5.0| 98815|       NaN|
|    20497|   2.0| 77048|       NaN|
|    20497|   4.0| 94853|       NaN|
|    20497|   3.0|  1637|       NaN|
+---------+------+------+----------+
only showing top 20 rows



Let's drop the NaN's so that we can compute the RMSE:

In [17]:
predictions = predictions.dropna(how='any')

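The prediction is NaN when the user or the profile of a test rating was not seen during training, so the model has no factors for it. Among the remaining predictions there are also negative values and values greater than 10, both outside the 1-10 rating scale.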

In [18]:
predictions.describe(['rating', 'prediction']).show()

+-------+-----------------+------------------+
|summary|           rating|        prediction|
+-------+-----------------+------------------+
|  count|           397403|            397403|
|   mean|5.946074891231319|2.8578997503196395|
| stddev|3.106971833920752|3.8439783369475093|
|    min|              1.0|        -34.912716|
|    max|             10.0|         32.818954|
+-------+-----------------+------------------+



In [19]:
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 5.39798749093


Let's find the top 10 female matches for userID 209 who is a male:

In [20]:
matchseekerID = 209

In [21]:
# Read the gender file and keep the IDs of every profile marked 'F'.
lines = spark.sparkContext.textFile('gender.dat')
parsed_users = lines.map(parse_user)
female_ids = parsed_users.filter(lambda user: user[1] == 'F').keys()

# Pair the match seeker with every female profile so the model can score
# each (userID, profileID) combination.
usersRDD = female_ids.map(lambda pid: Row(userID=matchseekerID, profileID=pid))
users = spark.createDataFrame(usersRDD)
users.show()

+---------+------+
|profileID|userID|
+---------+------+
|        1|   209|
|        2|   209|
|        4|   209|
|        5|   209|
|        6|   209|
|        7|   209|
|       11|   209|
|       19|   209|
|       24|   209|
|       25|   209|
|       26|   209|
|       27|   209|
|       28|   209|
|       31|   209|
|       35|   209|
|       36|   209|
|       38|   209|
|       39|   209|
|       40|   209|
|       42|   209|
+---------+------+
only showing top 20 rows



In [22]:
# Score every candidate pair, then discard rows with a null or NaN value
# so they cannot appear in the ranking.
scored_candidates = model.transform(users)
predictions_matchseekerID = scored_candidates.dropna(how='any')

# Highest predicted rating first; show the ten best matches.
predictions_matchseekerID.orderBy('prediction', ascending=False).show(n=10)

+---------+------+----------+
|profileID|userID|prediction|
+---------+------+----------+
|    24443|   209| 1.8492653|
|    91260|   209| 1.8375853|
|    94648|   209| 1.7534904|
|    54181|   209| 1.7020448|
|     8246|   209| 1.6635846|
|    22958|   209| 1.6568315|
|    24697|   209| 1.6375257|
|    54808|   209| 1.6334382|
|   124865|   209| 1.6285427|
|    91757|   209| 1.6153195|
+---------+------+----------+
only showing top 10 rows



Many of the results in this notebook deserve more attention. Their poor quality is thought to be because so many users have only a few ratings.