In [1]:
import pyspark.conf
import pyspark.sql
SparkConf = pyspark.conf.SparkConf
SparkSession = pyspark.sql.SparkSession
# Create (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("Intro")
    .config('spark.executor.memory', '2g')
    .config('spark.driver.memory', '8g')
    .config("spark.sql.crossJoin.enabled", "true")
    .getOrCreate()
)

In [2]:
from pyspark.sql.types import IntegerType
from pyspark.sql import functions as F
from pyspark.sql import Row
from pyspark.sql.types import IntegerType

In [17]:
from pyspark.ml.recommendation import ALS
import random

In [3]:
# Directory prefix that the raw input file names are appended to when loading.
# NOTE(review): absolute, machine-specific path — adjust for your environment.
datadir = '/home/ubuntu/profiledata_06-May-2005/'

In [4]:
# Read the three raw text files, in a fixed order, as line-oriented DataFrames.
base = datadir
rawUserArtistData, rawArtistData, rawArtistAlias = (
    spark.read.text(base + fileName)
    for fileName in ("user_artist_data.txt", "artist_data.txt", "artist_alias.txt")
)

In [None]:

        # Outline of the four stages (preparation, model, evaluate, recommend).
        # NOTE(review): RunRecommender is not defined anywhere in the visible notebook and
        # this cell is indented at top level, so it cannot run as written — presumably a
        # leftover sketch of the driver being ported; confirm before executing.
        runRecommender = RunRecommender(spark)
        runRecommender.preparation(rawUserArtistData, rawArtistData, rawArtistAlias)
        runRecommender.model(rawUserArtistData, rawArtistData, rawArtistAlias)
        runRecommender.evaluate(rawUserArtistData, rawArtistAlias)
        runRecommender.recommend(rawUserArtistData, rawArtistData, rawArtistAlias)

### preparation

In [5]:
# Peek at the first five raw rows.
for sampleRow in rawUserArtistData.take(5):
    print(sampleRow)

Row(value='1000002 1 55')
Row(value='1000002 1000006 33')
Row(value='1000002 1000007 8')
Row(value='1000002 1000009 144')
Row(value='1000002 1000010 314')


In [6]:
def fx(row):
    """Parse one raw row into a ``(user, artist)`` pair of ints.

    ``row.value`` is split on single spaces; only the first two fields are
    used and any further fields are ignored. Raises ``ValueError`` when fewer
    than two fields are present or a field is not an integer.
    """
    userField, artistField = row.value.split(' ')[:2]
    return int(userField), int(artistField)
# end def
    
# Parse every raw row with fx and name the two resulting columns.
userArtistDF = rawUserArtistData.rdd.map(fx).toDF(["user", "artist"])

In [7]:
# Show the smallest and largest user and artist IDs in the parsed data.
userArtistDF.agg(F.min("user"), F.max("user"), F.min("artist"), F.max("artist")).show()

+---------+---------+-----------+-----------+
|min(user)|max(user)|min(artist)|max(artist)|
+---------+---------+-----------+-----------+
|       90|  2443548|          1|   10794401|
+---------+---------+-----------+-----------+



In [8]:
def buildArtistByID(rawArtistData):
    """Build an ``(id, name)`` DataFrame from the raw artist lines.

    Each row's ``value`` is expected to be ``"<id>\\t<name>"``. A line is
    skipped (mapped to ``(None, None)`` and removed by ``na.drop()``) when it
    does not split into exactly two tab-separated fields, when the name is
    blank after stripping, or when the id is not an integer.

    Parameters
    ----------
    rawArtistData : object exposing ``.rdd.map(...)`` over rows with a
        ``value`` string attribute (a text DataFrame in this notebook).

    Returns
    -------
    DataFrame with columns ``id`` and ``name`` (name stripped of surrounding
    whitespace), with unparseable lines removed.
    """
    def func(row):
        # Only ValueError is expected here (wrong field count or non-integer
        # id); anything else is a real error and must not be swallowed.
        try:
            (_id, name) = row.value.split('\t')
            name = name.strip()
            if name == '':
                return None, None
            # end if
            return int(_id), name
        except ValueError:
            return None, None
        # end try
    # end def
    return rawArtistData.rdd.map(func).toDF(["id", "name"]).na.drop()
# end def

In [10]:
def buildArtistAlias(rawArtistAlias):
    """Build a ``{artistID: aliasID}`` dict from the raw alias lines.

    Each row's ``value`` is expected to be ``"<artist>\\t<alias>"`` with both
    fields integers. Lines that do not split into exactly two fields, have a
    blank first field, or contain a non-integer field are skipped, matching
    how the other parsers in this notebook treat malformed input.

    Parameters
    ----------
    rawArtistAlias : object exposing ``.rdd.map(...).collect()`` over rows
        with a ``value`` string attribute.

    Returns
    -------
    dict mapping int to int. Skipped lines contribute nothing (in particular
    there is never a ``None`` key).
    """
    def func(row):
        try:
            artist, alias = row.value.split('\t')
            if (artist.strip() == ''):
                return None, None
            # end if
            return int(artist), int(alias)
        except ValueError:
            return None, None
        # end try
    # end def
    pairs = rawArtistAlias.rdd.map(func).collect()
    # Skipped lines come back as (None, None); keep them out of the mapping.
    return {artist: alias for artist, alias in pairs if artist is not None}
# end def

In [11]:
# Build the id -> name DataFrame and the artist alias dict from the raw inputs.
artistByID = buildArtistByID(rawArtistData)
artistAlias = buildArtistAlias(rawArtistAlias)

In [12]:
# Take the first (key, value) entry of the alias dict as a sample pair.
# Presumably the key is the non-canonical ID and the value the canonical one — TODO confirm.
(badID, goodID) = next(iter(artistAlias.items()))

In [13]:
# Show the names recorded for both IDs of the sample alias pair.
artistByID.filter(F.col('id').isin([badID, goodID])).show()

+-------+------------------+
|     id|              name|
+-------+------------------+
|1109457|             P.O.S|
|2097152|DJ Tiesto -  P.O.S|
+-------+------------------+



### model

In [14]:
# Rebuild the alias dict and broadcast it; buildCounts reads it through .value.
bArtistAlias = spark.sparkContext.broadcast(buildArtistAlias(rawArtistAlias))

In [15]:
def buildCounts(rawUserArtistData, bArtistAlias):
    """Parse raw play-count lines into a ``(user, artist, count)`` DataFrame.

    Every row's ``value`` must hold exactly three space-separated integers.
    The artist ID is replaced by its entry in ``bArtistAlias.value`` when one
    exists, otherwise kept as is. Rows that fail to parse become all-``None``
    tuples and are removed by ``na.drop()``.
    """
    def func(row):
        try:
            userID, rawArtistID, count = [int(field) for field in row.value.split(' ')]
            canonicalID = bArtistAlias.value.get(rawArtistID, rawArtistID)
        except ValueError:
            return None, None, None
        # end try
        return (userID, canonicalID, count)
    # end def
    parsed = rawUserArtistData.rdd.map(func)
    return parsed.toDF(["user", "artist", "count"]).na.drop()
# end def
# Parse all play counts (artist IDs resolved through the broadcast alias dict) and cache them.
trainData = buildCounts(rawUserArtistData, bArtistAlias).cache()

In [18]:
# Fit an implicit-feedback ALS model on the cached counts; the seed is drawn at random per run.
model = (
    ALS()
    .setSeed(random.randrange(0, 10000000))
    .setImplicitPrefs(True)
    .setRank(10)
    .setRegParam(0.01)
    .setAlpha(1.0)
    .setMaxIter(5)
    .setUserCol("user")
    .setItemCol("artist")
    .setRatingCol("count")
    .setPredictionCol("prediction")
    .fit(trainData)
)

In [19]:
# Release the cached training data now that the model has been fit.
trainData.unpersist()

DataFrame[user: bigint, artist: bigint, count: bigint]

In [20]:
# Display the learned user factor vectors without truncating the column.
model.userFactors.select("features").show(truncate = False)

+---------------------------------------------------------------------------------------------------------------------------------------------+
|features                                                                                                                                     |
+---------------------------------------------------------------------------------------------------------------------------------------------+
|[0.21385032, 1.0595834, 0.8638984, -0.58076483, 0.003490461, 0.28563088, -0.6426925, -0.25011587, 0.19714808, 0.056397416]                   |
|[0.025967527, 0.16122174, -0.007507139, 0.21961178, 0.35733238, 0.17316116, 0.078774504, -0.23080605, -0.049477864, -0.13456783]             |
|[0.002412245, 0.0018115287, -0.00166127, -0.008169363, 0.0018560075, -0.0043602716, -0.0012213151, 1.02754086E-4, -0.004237815, 0.0029334316]|
|[0.93689215, 2.0572243, 0.62198925, -0.083344735, 0.2554755, 0.23550883, -0.148994, -0.45758742, 0.50745463, -0.35692817]              

In [21]:
userID = 2093760

# Artist IDs this user has play counts for, collected to the driver as ints.
existingArtistIDs = (
    trainData
    .filter(F.col("user") == userID)
    .rdd
    .map(lambda row: int(row.artist))
    .collect()
)

In [22]:
# Rebuild the id -> name DataFrame (same call as made earlier in the notebook).
artistByID = buildArtistByID(rawArtistData)

In [23]:
# Show the names of the artists this user already has play counts for.
artistByID.filter(F.col("id").isin(existingArtistIDs)).show()

+-------+---------------+
|     id|           name|
+-------+---------------+
|   1180|     David Gray|
|    378|  Blackalicious|
|    813|     Jurassic 5|
|1255340|The Saw Doctors|
|    942|         Xzibit|
+-------+---------------+



In [24]:
# Candidate rows: every artist ID in the model's item factors, paired with userID.
toRecommend = (
    model.itemFactors
    .select(F.col("id").alias("artist"))
    .withColumn("user", F.lit(userID))
)

In [25]:
def makeRecommendations(model, userID, howMany):
    """Return the ``howMany`` highest-scoring artists for ``userID``.

    Every artist ID in ``model.itemFactors`` is paired with ``userID``, scored
    with ``model.transform``, sorted by ``prediction`` descending and
    truncated. The result is a DataFrame with columns ``artist`` and
    ``prediction``.
    """
    candidates = (
        model.itemFactors
        .select(F.col("id").alias("artist"))
        .withColumn("user", F.lit(userID))
    )
    scored = model.transform(candidates).select(["artist", "prediction"])
    ranked = scored.orderBy(F.col("prediction"), ascending=False)
    return ranked.limit(howMany)
# end def

In [26]:
# Score all artists for the chosen user and display the five highest predictions.
topRecommendations = makeRecommendations(model, userID, 5)
topRecommendations.show()

+-------+-----------+
| artist| prediction|
+-------+-----------+
|   2814|0.033237856|
|1300642|0.032410886|
|1001819| 0.03231231|
|   4605|0.031960785|
|   1811| 0.03170042|
+-------+-----------+



In [27]:
# Pull the recommended artist IDs back to the driver as plain ints.
recommendedArtistIDs = (
    topRecommendations
    .select("artist")
    .rdd
    .map(lambda row: int(row['artist']))
    .collect()
)

In [28]:
# Show the names of the recommended artists.
artistByID.filter(F.col("id").isin(recommendedArtistIDs)).show()

+-------+----------+
|     id|      name|
+-------+----------+
|   2814|   50 Cent|
|   4605|Snoop Dogg|
|   1811|   Dr. Dre|
|1001819|      2Pac|
|1300642|  The Game|
+-------+----------+



In [29]:
# Release the model's user and item factor DataFrames.
model.userFactors.unpersist()
model.itemFactors.unpersist()

DataFrame[id: int, features: array<float>]

### evaluate

In [30]:
# Rebuild and re-broadcast the alias dict for the evaluation section.
bArtistAlias = spark.sparkContext.broadcast(buildArtistAlias(rawArtistAlias))

In [31]:
# Parse all play counts again; unlike the earlier trainData, this DataFrame is not cached here.
allData = buildCounts(rawUserArtistData, bArtistAlias)

In [32]:
# Split into training and held-out sets with weights 0.9 / 0.1.
# No seed is passed to randomSplit, so the split differs from run to run.
trainData, cvData = allData.randomSplit([0.9, 0.1])

In [33]:
# Cache both splits.
trainData.cache()
cvData.cache()

DataFrame[user: bigint, artist: bigint, count: bigint]

In [34]:
# Collect the distinct artist IDs to the driver as a list of Python ints.
allArtistIDs = allData.select("artist").rdd.map(lambda row: int(row["artist"])).distinct().collect()

In [35]:
# Broadcast the artist ID list; the negative-sampling code reads it through .value.
bAllArtistIDs = spark.sparkContext.broadcast(allArtistIDs)

In [36]:
# Use the held-out split as the "positive" examples for the AUC computation.
positiveData = cvData

In [37]:
def predictMostListened(train):
    """Build a baseline predictor that scores artists by total play count.

    The per-artist sum of ``count`` over ``train`` is computed once and named
    ``prediction``. The returned function left-outer-joins that table onto any
    DataFrame with ``user`` and ``artist`` columns and returns the columns
    ``user``, ``artist`` and ``prediction``.
    """
    totalPlays = F.sum("count").alias("prediction")
    listenCounts = train.groupBy("artist").agg(totalPlays).select(["artist", "prediction"])

    def func(allData):
        joined = allData.join(listenCounts, ["artist"], "left_outer")
        return joined.select(["user", "artist", "prediction"])
    # end def
    return func
# end def
# Baseline predictor built from the training split.
predictFunction = predictMostListened(trainData)

In [38]:
# Score the held-out (user, artist) pairs and rename the score column.
positivePredictions = (
    predictFunction(positiveData.select(["user", "artist"]))
    .withColumnRenamed("prediction", "positivePrediction")
)

In [45]:
# Held-out pairs as an RDD of (user, artist) int tuples.
negativeData = (
    positiveData
    .select(["user", "artist"])
    .rdd
    .map(lambda row: (int(row['user']), int(row['artist'])))
)

In [47]:
# Group the artist IDs by user.
negativeData = negativeData.groupByKey()

In [49]:
# Mark the grouped RDD for caching.
negativeData.cache()

PythonRDD[299] at RDD at PythonRDD.scala:53

In [None]:


# BinaryClassificationMetrics.areaUnderROC is not used here since there are really lots of
# small AUC problems, and it would be inefficient, when a direct computation is available.

# Create a set of "negative" products for each user. These are randomly chosen
# from among all of the other artists, excluding those that are "positive" for the user.
def func(item):
    """Sample "negative" artists for one user.

    Parameters
    ----------
    item : tuple ``(userID, (userID, posArtistIDs))`` where ``posArtistIDs``
        is an iterable of the user's held-out ("positive") artist IDs.

    Returns
    -------
    list of ``(userID, artistID)`` pairs, one per sampled negative artist.
    At most as many negatives as there are distinct positives are drawn, and
    the list may be shorter (or empty) if the draw budget runs out. The same
    negative artist may appear more than once.
    """
    userID, userIDAndPosArtistIDs = item
    # A set gives constant-time membership tests and collapses duplicate positives.
    posItemIDSet = set(userIDAndPosArtistIDs[1])
    negative = []
    allArtistIDs = bAllArtistIDs.value
    i = 0
    # Draw at most len(allArtistIDs) times to avoid an infinite loop.
    # Also stop when number of negative equals positive set size
    while (i < len(allArtistIDs) and len(negative) < len(posItemIDSet)):
        artistID = random.choice(allArtistIDs)
        # Only keep IDs that are not positive for this user
        if (artistID not in posItemIDSet):
            negative.append(artistID)
        # end if
        i += 1
    # end while
    # Return every sampled negative with the user ID added back
    return [(userID, artistID) for artistID in negative]
# end def

negativeData = positiveData.select(["user", "artist"]).rdd.\
    map(lambda row: (int(row['user']), int(row['artist'])))
negativeData = negativeData.groupByKey()
negativeData = negativeData.map(lambda item: (item[0], (item[0], item[1])) )
# func yields a list of pairs per user, so flatten it into one row per pair.
negativeData = negativeData.flatMap(func)
negativeData = negativeData.toDF(["user", "artist"])



In [106]:
# Score the sampled negative pairs with the same predictor.
negativePredictions = (
    predictFunction(negativeData)
    .withColumnRenamed("prediction", "negativePrediction")
)

# Pair up positive and negative predictions on user only, which yields one row
# per (positive, negative) combination within each user.
joinedPredictions = (
    positivePredictions
    .join(negativePredictions, "user")
    .select(["user", "positivePrediction", "negativePrediction"])
    .cache()
)

# Per user: how many pairs there are in total.
allCounts = (
    joinedPredictions
    .groupBy("user")
    .agg(F.count(F.lit("1")).alias("total"))
    .select(["user", "total"])
)
# Per user: how many pairs rank the positive above the negative.
correctCounts = (
    joinedPredictions
    .filter(F.col("positivePrediction") > F.col("negativePrediction"))
    .groupBy("user")
    .agg(F.count("user").alias("correct"))
    .select(["user", "correct"])
)

# Ratio of correct to total per user (0 correct when a user has none), averaged over users.
meanAUC = (
    allCounts
    .join(correctCounts, ["user"], "left_outer")
    .select("user", (F.coalesce(F.col("correct"), F.lit(0)) / F.col("total")).alias("auc"))
    .agg(F.mean("auc"))
)


# joinedPredictions.unpersist()


In [None]:
# Run the aggregation and fetch its result rows to the driver.
meanAUC.collect()

In [None]:
def areaUnderCurve(positiveData, bAllArtistIDs, predictFunction):
    """Compute the mean per-user AUC of ``predictFunction`` on held-out data.

    What this actually computes is AUC, per user. The result is actually
    something that might be called "mean AUC".

    Parameters
    ----------
    positiveData : DataFrame with at least ``user`` and ``artist`` columns;
        the held-out pairs, treated as the "positive" examples.
    bAllArtistIDs : broadcast variable whose ``.value`` is a sequence of all
        artist IDs to sample "negative" artists from.
    predictFunction : callable taking a DataFrame with ``user`` and ``artist``
        columns and returning a DataFrame that also has a ``prediction``
        column (e.g. ``model.transform`` or the result of
        ``predictMostListened``).

    Returns
    -------
    The mean, over users, of the fraction of (positive, negative) pairs in
    which the positive prediction is strictly greater than the negative one.
    ``None`` when there are no pairs at all.
    """
    # Take held-out data as the "positive".
    # Make predictions for each of them, including a numeric score
    positivePredictions = predictFunction(positiveData.select(["user", "artist"])).\
        withColumnRenamed("prediction", "positivePrediction")

    # BinaryClassificationMetrics.areaUnderROC is not used here since there are really lots of
    # small AUC problems, and it would be inefficient, when a direct computation is available.

    # Create a set of "negative" products for each user. These are randomly chosen
    # from among all of the other artists, excluding those that are "positive" for the user.
    def sampleNegatives(item):
        userID, posArtistIDs = item
        posItemIDSet = set(posArtistIDs)
        negative = []
        allArtistIDs = bAllArtistIDs.value
        i = 0
        # Draw at most len(allArtistIDs) times to avoid an infinite loop.
        # Also stop when number of negative equals positive set size
        while (i < len(allArtistIDs) and len(negative) < len(posItemIDSet)):
            artistID = random.choice(allArtistIDs)
            # Only keep IDs that are not positive for this user
            if (artistID not in posItemIDSet):
                negative.append(artistID)
            # end if
            i += 1
        # end while
        # Return the sampled negatives with user ID added back
        return [(userID, artistID) for artistID in negative]
    # end def

    negativeData = positiveData.select(["user", "artist"]).rdd.\
        map(lambda row: (int(row["user"]), int(row["artist"]))).\
        groupByKey().\
        flatMap(sampleNegatives).\
        toDF(["user", "artist"])

    # Make predictions on the rest:
    negativePredictions = predictFunction(negativeData).\
        withColumnRenamed("prediction", "negativePrediction")

    # Join positive predictions to negative predictions by user, only.
    # This will result in a row for every possible pairing of positive and negative
    # predictions within each user.
    joinedPredictions = positivePredictions.join(negativePredictions, "user").\
        select("user", "positivePrediction", "negativePrediction").cache()

    # Count the number of pairs per user
    allCounts = joinedPredictions.\
        groupBy("user").agg(F.count(F.lit("1")).alias("total")).\
        select("user", "total")
    # Count the number of correctly ordered pairs per user
    correctCounts = joinedPredictions.\
        filter(F.col("positivePrediction") > F.col("negativePrediction")).\
        groupBy("user").agg(F.count("user").alias("correct")).\
        select("user", "correct")

    # Combine these, compute their ratio, and average over all users.
    # first() forces evaluation, so the cache can be released right after.
    meanAUC = allCounts.join(correctCounts, ["user"], "left_outer").\
        select("user", (F.coalesce(F.col("correct"), F.lit(0)) / F.col("total")).alias("auc")).\
        agg(F.mean("auc")).\
        first()[0]

    joinedPredictions.unpersist()

    return meanAUC
# end def

In [None]:
# Baseline: score artists by total play count in the training split.
mostListenedAUC = areaUnderCurve(cvData, bAllArtistIDs, predictMostListened(trainData))
print(mostListenedAUC)

# Grid search over rank, regularization and alpha; each entry of `evaluations`
# is (auc, (rank, regParam, alpha)).
evaluations = []
for rank in (5, 30):
    for regParam in (1.0, 0.0001):
        for alpha in (1.0, 40.0):
            model = ALS().\
                setSeed(random.randrange(0,10000000)).\
                setImplicitPrefs(True).\
                setRank(rank).setRegParam(regParam).\
                setAlpha(alpha).setMaxIter(20).\
                setUserCol("user").setItemCol("artist").\
                setRatingCol("count").setPredictionCol("prediction").\
                fit(trainData)

            auc = areaUnderCurve(cvData, bAllArtistIDs, model.transform)

            # Release this model's factors before fitting the next one.
            model.userFactors.unpersist()
            model.itemFactors.unpersist()

            evaluations.append((auc, (rank, regParam, alpha)))
        # end for
    # end for
# end for

# Print the results, highest AUC first.
for evaluation in sorted(evaluations, reverse=True):
    print(evaluation)
# end for

trainData.unpersist()
cvData.unpersist()

In [269]:
def recommend(rawUserArtistData, rawArtistData, rawArtistAlias, userID=2093760, howMany=5):
    """Fit ALS on all the data and print the top recommended artist names.

    Parameters
    ----------
    rawUserArtistData, rawArtistData, rawArtistAlias : the raw text
        DataFrames, as accepted by ``buildCounts``, ``buildArtistByID`` and
        ``buildArtistAlias`` respectively.
    userID : user to recommend for (defaults to the user examined earlier in
        the notebook).
    howMany : number of recommendations to show.

    Returns
    -------
    None. The recommended artist names are displayed with ``show()``.

    Uses the notebook-level ``spark`` session.
    """
    bArtistAlias = spark.sparkContext.broadcast(buildArtistAlias(rawArtistAlias))
    allData = buildCounts(rawUserArtistData, bArtistAlias).cache()
    model = ALS().\
        setSeed(random.randrange(0,10000000)).\
        setImplicitPrefs(True).\
        setRank(10).setRegParam(1.0).setAlpha(40.0).setMaxIter(20).\
        setUserCol("user").setItemCol("artist").\
        setRatingCol("count").setPredictionCol("prediction").\
        fit(allData)
    allData.unpersist()

    topRecommendations = makeRecommendations(model, userID, howMany)

    # DataFrames have no .map in PySpark; go through the underlying RDD.
    recommendedArtistIDs = topRecommendations.select("artist").rdd.\
        map(lambda row: int(row["artist"])).collect()
    artistByID = buildArtistByID(rawArtistData)
    artistByID.filter(F.col("id").isin(recommendedArtistIDs)).\
        select("name").show()

    model.userFactors.unpersist()
    model.itemFactors.unpersist()
# end def