# Jonathan Halverson
# Monday, December 19, 2016
# Dating Recommender

### This code uses alternating least squares to make recommendations for individuals on a dating site

In [23]:
from __future__ import print_function

import sys
import random
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

In [24]:
APP_NAME = "Dating Recommender"
spark = SparkSession.builder.appName(APP_NAME).getOrCreate()
print("Spark version: %s" % spark.sparkContext.version)

Spark version: 2.0.2


In [25]:
def parse_rating(line, sep=','):
  u = line.strip().split(sep)
  return Row(userID=int(u[0]), profileID=int(u[1]), rating=float(u[2]))

In [26]:
def parse_user(line, sep=','):
  fields = line.strip().split(sep)
  user_id = int(fields[0])
  gender = fields[1]
  return (user_id, gender)

Load the ratings data and convert to a DataFrame:

In [27]:
lines = spark.sparkContext.textFile('ratings.dat')
ratingsRDD = lines.map(parse_rating)
ratingsRDD.first()

Row(profileID=8305, rating=10.0, userID=1)

In [28]:
ratings = spark.createDataFrame(ratingsRDD)
ratings.printSchema()

root
 |-- profileID: long (nullable = true)
 |-- rating: double (nullable = true)
 |-- userID: long (nullable = true)



In [29]:
ratings.show(n=5)

+---------+------+------+
|profileID|rating|userID|
+---------+------+------+
|     8305|  10.0|     1|
|    15530|   6.0|     1|
|    22319|  10.0|     1|
|    32136|   9.0|     1|
|    38868|   7.0|     1|
+---------+------+------+
only showing top 5 rows



Which users have provided the fewest ratings:

In [30]:
from pyspark.sql import functions as F
rating_counts = ratings.groupBy('userID').agg(F.count('*').alias('count'))
rating_counts.sort('count', ascending=True).show(n=5)

+------+-----+
|userID|count|
+------+-----+
| 10620|    1|
| 15799|    1|
| 10632|    1|
|  1671|    1|
| 11574|    1|
+------+-----+
only showing top 5 rows



In [31]:
rating_counts.select('*').where('count >= 20').count()

7697

Filter out the users that have fewer than 20 ratings:

In [32]:
active_users_ids = rating_counts.rdd.map(lambda x: (x[0], x[1])).filter(lambda x: x[1] >= 20).keys()
active_users_ids.take(10)

[2214, 2509, 7747, 10422, 16530, 17979, 18628, 19979, 22201, 25084]

Split the ratings data into a training and test set:

In [33]:
training, test = ratings.randomSplit([0.9, 0.1])

In [34]:
print('Training: %d' % training.count())
print('Validation: %d' % test.count())

Training: 976635
Validation: 108324


Set parameters for ALS:

In [35]:
rank_ = 15
num_iterations = 20
lambda_ = 0.01

In [36]:
als = ALS(rank=rank_, maxIter=num_iterations, regParam=lambda_, userCol="userID", itemCol="profileID", ratingCol="rating")
model = als.fit(training)

In [46]:
predictions = model.transform(test).dropna(how='any')
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")

In [82]:
import numpy as np
predictions.select('*').filter(predictions.prediction == float('nan')).show()

+---------+------+------+----------+
|profileID|rating|userID|prediction|
+---------+------+------+----------+
+---------+------+------+----------+



In [84]:
predictions.select('*').filter(predictions.userID == 209).show()

+---------+------+------+----------+
|profileID|rating|userID|prediction|
+---------+------+------+----------+
+---------+------+------+----------+



In [81]:
x = float('nan')
print(x)
print(type(x))

nan
<type 'float'>


In [77]:
predictions.rdd.take(20)

[Row(profileID=1238, rating=1.0, userID=81505, prediction=-0.8608876466751099),
 Row(profileID=1645, rating=1.0, userID=40326, prediction=2.617866277694702),
 Row(profileID=2366, rating=5.0, userID=24646, prediction=1.4188374280929565),
 Row(profileID=5518, rating=2.0, userID=104057, prediction=3.555800676345825),
 Row(profileID=5518, rating=1.0, userID=79331, prediction=1.2028555870056152),
 Row(profileID=5518, rating=3.0, userID=54180, prediction=2.5360350608825684),
 Row(profileID=6336, rating=1.0, userID=82272, prediction=0.768162727355957),
 Row(profileID=6336, rating=3.0, userID=8867, prediction=0.8140339255332947),
 Row(profileID=6336, rating=5.0, userID=92889, prediction=5.4729695320129395),
 Row(profileID=6336, rating=3.0, userID=92757, prediction=-2.0251524448394775),
 Row(profileID=6658, rating=7.0, userID=5294, prediction=3.5344796180725098),
 Row(profileID=6658, rating=6.0, userID=95776, prediction=5.444518089294434),
 Row(profileID=6658, rating=7.0, userID=86010, predicti

In [65]:
df = spark.createDataFrame([(1.0, float('nan')), (float('nan'), 2.0)], ("a", "b"))
df.show()

+---+---+
|  a|  b|
+---+---+
|1.0|NaN|
|NaN|2.0|
+---+---+



In [64]:
df.select(F.isnan("a").alias("r1"), F.isnan(df.a).alias("r2")).collect()

[Row(r1=False, r2=False), Row(r1=True, r2=True)]

The prediction is nan when the user has made a rating for the given profile. There are also negative values and maybe values greater than 10.

In [48]:
predictions.describe(['rating', 'prediction']).show()

+-------+------------------+------------------+
|summary|            rating|        prediction|
+-------+------------------+------------------+
|  count|            103077|            103077|
|   mean| 5.935727659904732|3.7773589493108948|
| stddev|3.1045389499140006|3.9257219345143737|
|    min|               1.0|        -27.810637|
|    max|              10.0|         38.102245|
+-------+------------------+------------------+



In [49]:
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 4.7603219941


Let's find the top 10 female matches for userID 209 who is a male:

In [50]:
matchseekerID = 209

In [51]:
lines = spark.sparkContext.textFile('gender.dat')
usersRDD = lines.map(parse_user).filter(lambda x: x[1] == 'F').map(lambda x: Row(userID=matchseekerID, profileID=x[0]))
users = spark.createDataFrame(usersRDD)
users.show()

+---------+------+
|profileID|userID|
+---------+------+
|        1|   209|
|        2|   209|
|        4|   209|
|        5|   209|
|        6|   209|
|        7|   209|
|       11|   209|
|       19|   209|
|       24|   209|
|       25|   209|
|       26|   209|
|       27|   209|
|       28|   209|
|       31|   209|
|       35|   209|
|       36|   209|
|       38|   209|
|       39|   209|
|       40|   209|
|       42|   209|
+---------+------+
only showing top 20 rows



In [53]:
predictions_matchseekerID = model.transform(users).dropna(how='any')
predictions_matchseekerID.sort('prediction', ascending=False).show(n=10)

+---------+------+----------+
|profileID|userID|prediction|
+---------+------+----------+
|    88841|   209| 1.9209691|
|    98161|   209| 1.5344874|
|    55190|   209| 1.5338956|
|    49976|   209| 1.4907684|
|   110953|   209| 1.4170587|
|    13250|   209|   1.36831|
|    45285|   209| 1.3639637|
|    63534|   209| 1.3294585|
|   130656|   209| 1.2801224|
|   208024|   209| 1.2697482|
+---------+------+----------+
only showing top 10 rows

