In [1]:
import os

import pyspark.conf
import pyspark.sql
SparkConf = pyspark.conf.SparkConf
SparkSession = pyspark.sql.SparkSession
# Create (or reuse) the Spark session for this notebook, with explicit
# executor and driver memory settings.
spark = (
    SparkSession.builder
    .appName("Intro")
    .config('spark.executor.memory', '2g')
    .config('spark.driver.memory', '8g')
    .getOrCreate()
)
# Allow cross joins in Spark SQL for this session.
spark.conf.set("spark.sql.crossJoin.enabled", "true")

In [2]:
# Directory that holds user_artist_data.txt, artist_data.txt and
# artist_alias.txt. Set the PROFILEDATA_DIR environment variable to point the
# notebook at another location; the original hard-coded path stays the default.
# os.path.join(..., '') guarantees the trailing separator that the later
# `base + "<file name>"` concatenations rely on.
datadir = os.path.join(
    os.environ.get('PROFILEDATA_DIR', '/home/ubuntu/profiledata_06-May-2005/'),
    '')

In [3]:
base = datadir
# Load each of the three input files as a single-column text DataFrame.
rawUserArtistData, rawArtistData, rawArtistAlias = [
    spark.read.text(base + fileName)
    for fileName in ("user_artist_data.txt", "artist_data.txt", "artist_alias.txt")
]

In [None]:

        # Outline of the four stages this notebook walks through: preparation,
        # model, evaluate, recommend.
        # NOTE(review): RunRecommender is not defined in the visible part of
        # this notebook, so this cell presumably cannot run as-is and is kept
        # only as a roadmap — confirm before executing.
        runRecommender = RunRecommender(spark)
        runRecommender.preparation(rawUserArtistData, rawArtistData, rawArtistAlias)
        runRecommender.model(rawUserArtistData, rawArtistData, rawArtistAlias)
        runRecommender.evaluate(rawUserArtistData, rawArtistAlias)
        runRecommender.recommend(rawUserArtistData, rawArtistData, rawArtistAlias)

### preparation

In [4]:
# Peek at the first five raw lines. The loop variable is deliberately not
# named `_`: assigning to `_` shadows IPython's "last output" variable for the
# rest of the session.
for row in rawUserArtistData.take(5):
    print(row)

Row(value='1000002 1 55')
Row(value='1000002 1000006 33')
Row(value='1000002 1000007 8')
Row(value='1000002 1000009 144')
Row(value='1000002 1000010 314')


In [71]:
from pyspark.sql.types import IntegerType
from pyspark.sql import functions as F
from pyspark.sql import Row

In [8]:
def fx(row):
    """Parse one raw text row into a (user, artist) pair of ints.

    ``row.value`` is split on single spaces; only the first two fields are
    used and anything after them is ignored. Raises ValueError when fewer
    than two fields are present or a field is not an integer.
    """
    userField, artistField = row.value.split(' ')[:2]
    return int(userField), int(artistField)
# end def
    
# Parse every raw line with fx and name the two resulting columns.
userArtistDF = rawUserArtistData.rdd.map(fx).toDF(["user", "artist"])

In [13]:
# Show the smallest and largest user and artist IDs in a single aggregate row.
userArtistDF.agg(F.min("user"), F.max("user"), F.min("artist"), F.max("artist")).show()

+---------+---------+-----------+-----------+
|min(user)|max(user)|min(artist)|max(artist)|
+---------+---------+-----------+-----------+
|       90|  2443548|          1|   10794401|
+---------+---------+-----------+-----------+



In [122]:
def buildArtistByID(rawArtistData):
    """Build an (id, name) DataFrame from the raw artist text rows.

    Each row's ``value`` is expected to be ``"<id>\\t<name>"``. A row is
    dropped when it does not split into exactly two tab-separated fields,
    when the name is blank after stripping, or when the id is not an integer.

    :param rawArtistData: object exposing ``.rdd`` whose rows have a ``value``
        string attribute (here: the text DataFrame of artist_data.txt).
    :returns: DataFrame with columns ``id`` and ``name``, rejected rows removed.
    """
    def func(row):
        try:
            _id, name = row.value.split('\t')
            name = name.strip()
            if name == '':
                return None, None
            # end if
            return int(_id), name
        except ValueError:
            # Wrong field count or non-numeric id. Only ValueError is caught
            # so that unrelated failures (and interrupts) are not swallowed.
            return None, None
        # end try
    # end def
    # Rejected rows are emitted as (None, None) and removed by na.drop().
    return rawArtistData.rdd.map(func).toDF(["id", "name"]).na.drop()
# end def

In [127]:
# Build the artist id -> name lookup DataFrame from the raw artist rows.
artistByID = buildArtistByID(rawArtistData)

In [129]:
def buildArtistAlias(rawArtistAlias):
    """Collect the artist alias file into a ``{artistID: aliasID}`` dict.

    Each row's ``value`` is expected to be ``"<artist>\\t<alias>"``. Lines that
    do not split into exactly two tab-separated fields, or whose fields are
    blank or not integers, are skipped.

    :param rawArtistAlias: object exposing ``.rdd`` whose rows have a ``value``
        string attribute (here: the text DataFrame of artist_alias.txt).
    :returns: plain dict on the driver mapping int artist id to int alias id.
    """
    def func(row):
        try:
            artist, alias = row.value.split('\t')
            # int('') and int('   ') raise ValueError, so blank ids are
            # rejected by the same handler as malformed lines.
            return int(artist), int(alias)
        except ValueError:
            return None
        # end try
    # end def
    parsed = rawArtistAlias.rdd.map(func).collect()
    # Skip rejected lines so the mapping never gains a {None: None} entry.
    return dict(pair for pair in parsed if pair is not None)
# end def

In [130]:
# Collect the artist -> alias mapping to the driver as a plain dict.
artistAlias = buildArtistAlias(rawArtistAlias)

In [131]:
# Take the first (key, value) entry of the alias dict as a sample pair:
# the key is bound to badID and the value it maps to is bound to goodID.
(badID, goodID) = next(iter(artistAlias.items()))

In [133]:
# Show the names recorded for both IDs of the sampled alias pair.
artistByID.filter(F.col('id').isin([badID, goodID])).show()

+-------+------------------+
|     id|              name|
+-------+------------------+
|1109457|             P.O.S|
|2097152|DJ Tiesto -  P.O.S|
+-------+------------------+



### model

In [163]:
# Rebuild the alias dict and wrap it in a Spark broadcast variable; the parsing
# code reads it back through bArtistAlias.value.
bArtistAlias = spark.sparkContext.broadcast(buildArtistAlias(rawArtistAlias))

In [171]:
def buildCounts(rawUserArtistData, bArtistAlias):
    """Parse raw "user artist count" rows into a (user, artist, count) DataFrame.

    Artist IDs found in ``bArtistAlias.value`` are replaced by the value they
    map to; all other artist IDs are kept unchanged. Rows that do not consist
    of exactly three space-separated integers are dropped.

    :param rawUserArtistData: object exposing ``.rdd`` whose rows have a
        ``value`` string attribute.
    :param bArtistAlias: object whose ``.value`` is a dict of artist aliases.
    :returns: DataFrame with columns ``user``, ``artist`` and ``count``.
    """
    def parseLine(record):
        fields = record.value.split(' ')
        try:
            user, artist, plays = [int(field) for field in fields]
        except ValueError:
            # Wrong number of fields or a non-integer field.
            return None, None, None
        # end try
        return user, bArtistAlias.value.get(artist, artist), plays
    # end def
    parsed = rawUserArtistData.rdd.map(parseLine)
    # Rejected rows are all-None tuples and are removed by na.drop().
    return parsed.toDF(["user", "artist", "count"]).na.drop()
# end def
# Parse the play counts with aliases resolved and mark the result for caching.
trainData = buildCounts(rawUserArtistData, bArtistAlias).cache()

In [176]:
from pyspark.ml.recommendation import ALS
import random

In [182]:
# Fit an implicit-feedback ALS model on the (user, artist, count) training rows.
# The seed is drawn at random, so repeated runs can produce different factors.
model = (
    ALS()
    .setSeed(random.randrange(0,10000000))
    .setImplicitPrefs(True)
    .setRank(10)
    .setRegParam(0.01)
    .setAlpha(1.0)
    .setMaxIter(5)
    .setUserCol("user")
    .setItemCol("artist")
    .setRatingCol("count")
    .setPredictionCol("prediction")
    .fit(trainData)
)

In [183]:
# Drop trainData from the cache now that the model has been fitted.
trainData.unpersist()

DataFrame[user: bigint, artist: bigint, count: bigint]

In [184]:
# Show the "features" column of the fitted model's user factors without truncation.
model.userFactors.select("features").show(truncate = False)

+-----------------------------------------------------------------------------------------------------------------------------------------------+
|features                                                                                                                                       |
+-----------------------------------------------------------------------------------------------------------------------------------------------+
|[-0.026651615, 0.7032719, 0.2558946, -0.40509853, 0.11468698, 0.38856968, -0.55922914, 0.61060536, -0.62590265, -0.83069855]                   |
|[0.07361874, 0.07843246, 0.17066856, 0.18403913, 0.20880459, -0.21302597, 0.17672752, 0.10975869, -0.17468008, -0.12103835]                    |
|[0.0072502233, 0.0022113367, 0.0023733207, -0.0058102906, 0.0015046557, 0.0062324544, -0.0021433341, 0.0028819693, 1.2387775E-4, -1.4924115E-4]|
|[-0.09531816, 0.69944614, 0.2610822, 0.15259303, -0.5126287, 0.18788792, -0.3447748, 1.2580154, -1.1473079, -1.4304459]    

In [187]:
from pyspark.sql.types import IntegerType

In [192]:
# Peek at the first five rows of trainData.
trainData.take(5)

[Row(user=1000002, artist=1, count=55),
 Row(user=1000002, artist=1000006, count=33),
 Row(user=1000002, artist=1000007, count=8),
 Row(user=1000002, artist=1000009, count=144),
 Row(user=1000002, artist=1000010, count=314)]

In [199]:
# User whose listening history and recommendations are inspected below.
userID = 2093760

# Collect, as plain ints, the artist IDs this user has in trainData.
existingArtistIDs = (
    trainData
    .filter(F.col("user") == userID)
    .rdd
    .map(lambda record: int(record.artist))
    .collect()
)

In [200]:
# Build the artist id -> name lookup DataFrame.
# NOTE(review): an earlier cell already assigns artistByID with this same call,
# so this rebuild looks redundant — confirm before removing.
artistByID = buildArtistByID(rawArtistData)

In [203]:
# Show the names of the artists the user already has in trainData.
artistByID.filter(F.col("id").isin(existingArtistIDs)).show()

+-------+---------------+
|     id|           name|
+-------+---------------+
|   1180|     David Gray|
|    378|  Blackalicious|
|    813|     Jurassic 5|
|1255340|The Saw Doctors|
|    942|         Xzibit|
+-------+---------------+



In [231]:
# One candidate row per artist in the model's item factors, each tagged with
# the chosen userID, ready to be scored with model.transform.
toRecommend = model.itemFactors.\
        select(F.col("id").alias("artist")).\
        withColumn("user", F.lit(userID))

In [239]:
# Check which Python type the candidate set has.
type(toRecommend)

pyspark.sql.dataframe.DataFrame

In [232]:
# Peek at the first five candidate (artist, user) rows.
toRecommend.take(5)

[Row(artist=30, user=2093760),
 Row(artist=40, user=2093760),
 Row(artist=50, user=2093760),
 Row(artist=70, user=2093760),
 Row(artist=90, user=2093760)]

In [238]:
# Interactive lookup of the transform() documentation; it has no effect on any
# variable in the notebook.
help(model.transform)

Help on method transform in module pyspark.ml.base:

transform(dataset, params=None) method of pyspark.ml.recommendation.ALSModel instance
    Transforms the input dataset with optional parameters.
    
    :param dataset: input dataset, which is an instance of :py:class:`pyspark.sql.DataFrame`
    :param params: an optional param map that overrides embedded params.
    :returns: transformed dataset
    
    .. versionadded:: 1.3.0



In [243]:
def makeRecommendations(model, userID, howMany):
    """Return the ``howMany`` highest-scoring artists for one user.

    Every artist id present in ``model.itemFactors`` is paired with ``userID``
    and scored with ``model.transform``; the result is ordered by descending
    prediction and limited to ``howMany`` rows.

    :param model: fitted model exposing ``itemFactors`` and ``transform``.
    :param userID: id of the user to score the artists for.
    :param howMany: maximum number of rows to return.
    :returns: DataFrame with columns ``artist`` and ``prediction``.
    """
    candidates = model.itemFactors.select(F.col("id").alias("artist"))
    candidates = candidates.withColumn("user", F.lit(userID))
    scored = model.transform(candidates).select(["artist", "prediction"])
    return scored.orderBy(F.col("prediction"), ascending=False).limit(howMany)
# end def

In [245]:
# Score all artists for userID and display the five with the highest prediction.
topRecommendations = makeRecommendations(model, userID, 5)
topRecommendations.show()

+-------+-----------+
| artist| prediction|
+-------+-----------+
|1001819|0.027669843|
|   2814|0.027642187|
|1300642|0.027621742|
|   4605|0.027328338|
|1007614|0.027036585|
+-------+-----------+



In [256]:
# Collect the recommended artist IDs to the driver as plain ints.
recommendedArtistIDs = topRecommendations.select("artist").rdd.map(lambda row: int(row['artist'])).collect()

In [258]:
# Show the names of the recommended artists.
artistByID.filter(F.col("id").isin(recommendedArtistIDs)).show()

+-------+----------+
|     id|      name|
+-------+----------+
|   2814|   50 Cent|
|   4605|Snoop Dogg|
|1007614|     Jay-Z|
|1001819|      2Pac|
|1300642|  The Game|
+-------+----------+



In [259]:
# Unpersist both factor DataFrames of the fitted model.
model.userFactors.unpersist()
model.itemFactors.unpersist()

DataFrame[id: int, features: array<float>]

### evaluate

In [None]:
# Evaluate: hold out 10% of the data, score a "most listened" baseline, then
# grid-search ALS hyperparameters and print the results by descending mean AUC.
# This cell calls areaUnderCurve and predictMostListened, which are defined in
# later cells of this notebook and must be run first.
bArtistAlias = spark.sparkContext.broadcast(buildArtistAlias(rawArtistAlias))

allData = buildCounts(rawUserArtistData, bArtistAlias)
trainData, cvData = allData.randomSplit([0.9, 0.1])
trainData.cache()
cvData.cache()

# Every distinct artist ID, broadcast so negative examples can be sampled from it.
allArtistIDs = [int(row.artist) for row in allData.select("artist").distinct().collect()]
bAllArtistIDs = spark.sparkContext.broadcast(allArtistIDs)

mostListenedAUC = areaUnderCurve(cvData, bAllArtistIDs, predictMostListened(trainData))
print(mostListenedAUC)

# One (auc, (rank, regParam, alpha)) entry per hyperparameter combination.
evaluations = []
for rank in (5, 30):
    for regParam in (1.0, 0.0001):
        for alpha in (1.0, 40.0):
            model = ALS().\
                setSeed(random.randrange(0,10000000)).\
                setImplicitPrefs(True).\
                setRank(rank).setRegParam(regParam).\
                setAlpha(alpha).setMaxIter(20).\
                setUserCol("user").setItemCol("artist").\
                setRatingCol("count").setPredictionCol("prediction").\
                fit(trainData)

            auc = areaUnderCurve(cvData, bAllArtistIDs, model.transform)

            model.userFactors.unpersist()
            model.itemFactors.unpersist()

            evaluations.append((auc, (rank, regParam, alpha)))
        # end for
    # end for
# end for

# Best combination first.
for evaluation in sorted(evaluations, reverse=True):
    print(evaluation)
# end for

trainData.unpersist()
cvData.unpersist()


In [None]:



    def recommend(
            rawUserArtistData,
            rawArtistData,
            rawArtistAlias):
        """Fit ALS on all data and show the top five artist names for one user.

        Uses the notebook-level ``spark`` session together with
        buildArtistAlias, buildCounts, makeRecommendations and buildArtistByID.

        :param rawUserArtistData: raw "user artist count" text rows.
        :param rawArtistData: raw "id<TAB>name" text rows.
        :param rawArtistAlias: raw "artist<TAB>alias" text rows.
        :returns: None; the recommended artist names are printed with show().
        """
        bArtistAlias = spark.sparkContext.broadcast(buildArtistAlias(rawArtistAlias))
        allData = buildCounts(rawUserArtistData, bArtistAlias).cache()
        model = ALS().\
            setSeed(random.randrange(0,10000000)).\
            setImplicitPrefs(True).\
            setRank(10).setRegParam(1.0).setAlpha(40.0).setMaxIter(20).\
            setUserCol("user").setItemCol("artist").\
            setRatingCol("count").setPredictionCol("prediction").\
            fit(allData)
        allData.unpersist()

        userID = 2093760
        topRecommendations = makeRecommendations(model, userID, 5)

        # Bring the handful of recommended IDs to the driver, then look up their names.
        recommendedArtistIDs = [
            int(row["artist"]) for row in topRecommendations.select("artist").collect()
        ]
        artistByID = buildArtistByID(rawArtistData)
        artistByID.filter(F.col("id").isin(recommendedArtistIDs)).\
            select("name").show()

        model.userFactors.unpersist()
        model.itemFactors.unpersist()
    # end def







    def predictMostListened(train):
        """Build a baseline predictor that scores artists by total play count.

        Call it in two steps, ``predictMostListened(train)(allData)``: the
        first call sums ``count`` per artist over ``train``; the function it
        returns left-joins those sums onto ``allData`` as the ``prediction``
        column. Artists absent from ``train`` get a null prediction.

        :param train: DataFrame with ``artist`` and ``count`` columns.
        :returns: function taking a DataFrame with ``user`` and ``artist``
            columns and returning (user, artist, prediction) rows.
        """
        listenCounts = train.groupBy("artist").\
            agg(F.sum("count").alias("prediction")).\
            select("artist", "prediction")

        def predict(allData):
            return allData.\
                join(listenCounts, ["artist"], "left_outer").\
                select("user", "artist", "prediction")
        # end def
        return predict
    # end def

# end class

In [None]:

def areaUnderCurve(positiveData, bAllArtistIDs, predictFunction):
    """Compute the mean per-user AUC of ``predictFunction`` on held-out data.

    What this actually computes is AUC, per user. The result is actually
    something that might be called "mean AUC".

    :param positiveData: held-out DataFrame with ``user`` and ``artist``
        columns; its rows are treated as the "positive" examples.
    :param bAllArtistIDs: broadcast variable whose ``.value`` is an indexable
        sequence of all artist IDs.
    :param predictFunction: callable taking a DataFrame with ``user`` and
        ``artist`` columns and returning it with a ``prediction`` column added.
    :returns: the mean over users of the fraction of (positive, negative)
        prediction pairs in which the positive scores strictly higher.
    """
    # Take held-out data as the "positive".
    # Make predictions for each of them, including a numeric score
    positivePredictions = predictFunction(positiveData.select("user", "artist")).\
        withColumnRenamed("prediction", "positivePrediction")

    # BinaryClassificationMetrics.areaUnderROC is not used here since there are really lots of
    # small AUC problems, and it would be inefficient, when a direct computation is available.

    def sampleNegatives(userAndPositives):
        # Pick random artists that are not in this user's positive set.
        userID, posArtistIDs = userAndPositives
        posItemIDSet = set(posArtistIDs)
        allArtistIDs = bAllArtistIDs.value
        negative = []
        i = 0
        # Make at most one pass over all artists to avoid an infinite loop.
        # Also stop when number of negative equals positive set size
        while i < len(allArtistIDs) and len(negative) < len(posItemIDSet):
            artistID = allArtistIDs[random.randrange(len(allArtistIDs))]
            # Only add IDs that are not positive for this user
            if artistID not in posItemIDSet:
                negative.append(artistID)
            # end if
            i += 1
        # end while
        # Return the sampled artists with user ID added back
        return [(userID, artistID) for artistID in negative]
    # end def

    # Create a set of "negative" products for each user. These are randomly chosen
    # from among all of the other artists, excluding those that are "positive" for the user.
    negativeData = positiveData.select("user", "artist").rdd.\
        map(lambda row: (row.user, row.artist)).\
        groupByKey().\
        flatMap(sampleNegatives).\
        toDF(["user", "artist"])

    # Make predictions on the rest:
    negativePredictions = predictFunction(negativeData).\
        withColumnRenamed("prediction", "negativePrediction")

    # Join positive predictions to negative predictions by user, only.
    # This will result in a row for every possible pairing of positive and negative
    # predictions within each user.
    joinedPredictions = positivePredictions.join(negativePredictions, "user").\
        select("user", "positivePrediction", "negativePrediction").cache()

    # Count the number of pairs per user
    allCounts = joinedPredictions.\
        groupBy("user").agg(F.count(F.lit("1")).alias("total")).\
        select("user", "total")
    # Count the number of correctly ordered pairs per user
    correctCounts = joinedPredictions.\
        filter(F.col("positivePrediction") > F.col("negativePrediction")).\
        groupBy("user").agg(F.count("user").alias("correct")).\
        select("user", "correct")

    # Combine these, compute their ratio, and average over all users.
    # Users with no correctly ordered pair have no row in correctCounts, hence
    # the left outer join and the coalesce to 0.
    meanAUC = allCounts.join(correctCounts, ["user"], "left_outer").\
        select("user", (F.coalesce(F.col("correct"), F.lit(0)) / F.col("total")).alias("auc")).\
        agg(F.mean("auc")).\
        first()[0]

    joinedPredictions.unpersist()

    return meanAUC
# end def
