# Jonathan Halverson
# Monday, December 19, 2016
# Dating Recommender

### This code uses alternating least squares to make recommendations for individuals on a dating site

In [1]:
from __future__ import print_function

import sys
import random
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

In [2]:
APP_NAME = "Dating Recommender"
spark = SparkSession.builder.appName(APP_NAME).getOrCreate()
print("Spark version: %s" % spark.sparkContext.version)

Spark version: 2.0.2


In [3]:
def parse_rating(line, sep=','):
  u = line.strip().split(sep)
  return Row(userID=int(u[0]), profileID=int(u[1]), rating=float(u[2]))

In [4]:
def parse_user(line, sep=','):
  fields = line.strip().split(sep)
  user_id = int(fields[0])
  gender = fields[1]
  return (user_id, gender)

Set the id and gender filter for the person to make recommendations for:

In [5]:
matchseekerID = 101
gender_filter = 'M'

Load the ratings data and convert to a DataFrame:

In [6]:
lines = spark.sparkContext.textFile('ratings.dat')
ratingsRDD = lines.map(parse_rating)
print(ratingsRDD.first())

Row(profileID=8305, rating=10.0, userID=1)


In [7]:
ratings = spark.createDataFrame(ratingsRDD)
ratings.printSchema()

root
 |-- profileID: long (nullable = true)
 |-- rating: double (nullable = true)
 |-- userID: long (nullable = true)



In [8]:
ratings.show(n=5)

+---------+------+------+
|profileID|rating|userID|
+---------+------+------+
|     8305|  10.0|     1|
|    15530|   6.0|     1|
|    22319|  10.0|     1|
|    32136|   9.0|     1|
|    38868|   7.0|     1|
+---------+------+------+
only showing top 5 rows



Load the gender info:

In [9]:
lines = spark.sparkContext.textFile('gender.dat')
users = dict(lines.map(parse_user).collect())

Split the ratings data into a training and test set:

In [10]:
training, test = ratings.randomSplit([0.8, 0.2])

In [11]:
print('Training: %d' % training.count())
print('Validation: %d' % test.count())

Training: 867559
Validation: 217400


Set parameters for ALS:

In [12]:
rank = 5
num_iterations = 10
lambda_ = 0.01

In [13]:
als = ALS(rank=rank, maxIter=num_iterations, regParam=lambda_, userCol="userID", itemCol="profileID", ratingCol="rating")
model = als.fit(training)

In [14]:
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")

In [15]:
predictions.show(n=5)

+---------+------+------+----------+
|profileID|rating|userID|prediction|
+---------+------+------+----------+
|      496|   6.0| 29441|       NaN|
|     1238|   8.0|100936|  5.692427|
|     1238|   1.0| 25280|  6.687668|
|     1238|   1.0| 50243|-0.4080252|
|     1238|   4.0|104557|  6.746895|
+---------+------+------+----------+
only showing top 5 rows



In [16]:
predictions.describe(['rating', 'prediction']).show()

+-------+------------------+----------+
|summary|            rating|prediction|
+-------+------------------+----------+
|  count|            217400|    217400|
|   mean| 5.934185832566698|       NaN|
| stddev|3.1139845619158297|       NaN|
|    min|               1.0|-390.58282|
|    max|              10.0|       NaN|
+-------+------------------+----------+



In [17]:
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = nan


In [18]:
#spark.stop()