# Jonathan Halverson
# Monday, December 19, 2016
# Dating Recommender

### This code uses alternating least squares to make recommendations for individuals on a dating site

In [1]:
from __future__ import print_function

import sys
import random
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

In [2]:
APP_NAME = "Dating Recommender"
spark = SparkSession.builder.appName(APP_NAME).getOrCreate()
print("Spark version: %s" % spark.sparkContext.version)

Spark version: 2.0.2


In [3]:
def parse_rating(line, sep=','):
  u = line.strip().split(sep)
  return Row(userID=int(u[0]), profileID=int(u[1]), rating=float(u[2]))

In [4]:
def parse_user(line, sep=','):
  fields = line.strip().split(sep)
  user_id = int(fields[0])
  gender = fields[1]
  return (user_id, gender)

Load the ratings data and convert to a DataFrame:

In [5]:
lines = spark.sparkContext.textFile('ratings.dat')
ratingsRDD = lines.map(parse_rating)
ratingsRDD.first()

Row(profileID=8305, rating=10.0, userID=1)

In [6]:
ratings = spark.createDataFrame(ratingsRDD)
ratings.printSchema()

root
 |-- profileID: long (nullable = true)
 |-- rating: double (nullable = true)
 |-- userID: long (nullable = true)



In [7]:
ratings.show(n=5)

+---------+------+------+
|profileID|rating|userID|
+---------+------+------+
|     8305|  10.0|     1|
|    15530|   6.0|     1|
|    22319|  10.0|     1|
|    32136|   9.0|     1|
|    38868|   7.0|     1|
+---------+------+------+
only showing top 5 rows



Which users have provided the fewest ratings:

In [8]:
from pyspark.sql import functions as F
rating_counts = ratings.groupBy('userID').agg(F.count('*').alias('count'))
rating_counts.sort('count', ascending=True).show(n=5)

+------+-----+
|userID|count|
+------+-----+
|  8075|    1|
| 16896|    1|
|  9458|    1|
|  9978|    1|
| 13723|    1|
+------+-----+
only showing top 5 rows



In [9]:
rating_counts.select('*').where('count >= 20').count()

7697

Filter out the users that have fewer than 20 ratings:

In [10]:
active_users_ids = rating_counts.rdd.map(lambda x: (x[0], x[1])).filter(lambda x: x[1] >= 20).keys()
active_users_ids.take(10)

[2214, 2509, 7747, 10422, 16530, 17979, 18628, 19979, 22201, 25084]

Split the ratings data into a training and test set:

In [11]:
training, test = ratings.randomSplit([0.6, 0.4])

In [12]:
print('Training: %d' % training.count())
print('Validation: %d' % test.count())

Training: 651366
Validation: 433593


Set parameters for ALS:

In [13]:
rank = 15
num_iterations = 10
lambda_ = 0.01

In [14]:
als = ALS(rank=rank, maxIter=num_iterations, regParam=lambda_, userCol="userID", itemCol="profileID", ratingCol="rating")
model = als.fit(training)

In [15]:
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")

In [23]:
predictions.show(n=50)

+---------+------+------+-----------+
|profileID|rating|userID| prediction|
+---------+------+------+-----------+
|      496|   6.0| 29441|        NaN|
|     1238|   1.0| 50243|  0.4668392|
|     1238|   1.0|  3953|   5.972074|
|     1238|   5.0| 10645| -3.4227247|
|     1591|   8.0| 65105| 0.51276386|
|     1829|   4.0| 81297| -6.1413608|
|     1959|  10.0|122203| -1.4482899|
|     2122|   6.0| 18332| 0.03760426|
|     2866|   7.0| 29822|  1.1421095|
|     3175|   5.0| 63111|-0.21619962|
|     4101|   2.0| 82282|-0.47315684|
|     4900|  10.0| 30185|        NaN|
|     5518|   5.0|112042| -0.4043395|
|     5518|   8.0| 57245| -0.7950466|
|     6336|   1.0| 82272|  -3.063899|
|     6336|   4.0| 69799|   9.631379|
|     6336|   3.0|  8867|  -4.333564|
|     6336|   2.0| 86088| -10.321013|
|     6336|   5.0| 92889|  -8.203182|
|     6336|   5.0|103232|  2.7293777|
|     6336|   3.0| 92757|  3.9227939|
|     6658|   6.0| 81011|  1.2529993|
|     6658|   8.0| 72802|  1.3647562|
|     6658| 

In [17]:
predictions.describe(['rating', 'prediction']).show()

+-------+-----------------+----------+
|summary|           rating|prediction|
+-------+-----------------+----------+
|  count|           433593|    433593|
|   mean|5.946558177830362|       NaN|
| stddev| 3.11120204146382|       NaN|
|    min|              1.0|-86.023155|
|    max|             10.0|       NaN|
+-------+-----------------+----------+



In [18]:
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = nan


Let's find the top 10 female matches for userID 209 who is a male:

In [19]:
matchseekerID = 209

In [20]:
lines = spark.sparkContext.textFile('gender.dat')
usersRDD = lines.map(parse_user).filter(lambda x: x[1] == 'F').map(lambda x: Row(userID=matchseekerID, profileID=x[0]))
users = spark.createDataFrame(usersRDD)
users.show()

+---------+------+
|profileID|userID|
+---------+------+
|        1|   209|
|        2|   209|
|        4|   209|
|        5|   209|
|        6|   209|
|        7|   209|
|       11|   209|
|       19|   209|
|       24|   209|
|       25|   209|
|       26|   209|
|       27|   209|
|       28|   209|
|       31|   209|
|       35|   209|
|       36|   209|
|       38|   209|
|       39|   209|
|       40|   209|
|       42|   209|
+---------+------+
only showing top 20 rows



In [21]:
predictions_matchseekerID = model.transform(users)
predictions_matchseekerID.sort('prediction', ascending=False).show(n=10)

+---------+------+----------+
|profileID|userID|prediction|
+---------+------+----------+
|       31|   209|       NaN|
|      516|   209|       NaN|
|     1139|   209|       NaN|
|     1143|   209|       NaN|
|     1270|   209|       NaN|
|     1322|   209|       NaN|
|     1618|   209|       NaN|
|     1699|   209|       NaN|
|     1903|   209|       NaN|
|     2393|   209|       NaN|
+---------+------+----------+
only showing top 10 rows



In [22]:
#spark.stop()