# 스파크 고급 분석 3장

## 음악 추천과 Audioscrobbler 데이터셋

In [2]:
%%configure -f
{
    "proxyUser": "hduser",
    "executorMemory": "4000M", 
    "executorCores": 6,
    "conf": {"spark.jars.packages": "graphframes:graphframes:0.3.0-spark2.0-s_2.11",
             "spark.sql.crossJoin.enabled": "true"}
}

In [3]:
// Optional, but may help avoid errors due to long lineage
//spark.sparkContext.setCheckpointDir("hdfs:///tmp/")

import scala.collection.Map
import scala.collection.mutable.ArrayBuffer
import scala.util.Random
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.ml.recommendation.{ALS, ALSModel}
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
import org.apache.spark.sql.functions._

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
4,,spark,idle,,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

import scala.collection.Map
import scala.collection.mutable.ArrayBuffer
import scala.util.Random
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.ml.recommendation.{ALS, ALSModel}
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
import org.apache.spark.sql.functions._


In [4]:
// Root HDFS directory that holds the three input text files.
val base = "hdfs://localhost:54310/audioscrobbler/"
// Each file is read as a Dataset[String]; parsing happens in the helper functions.
val rawUserArtistData = spark.read.textFile(s"${base}user_artist_data.txt")
val rawArtistData = spark.read.textFile(s"${base}artist_data.txt")
val rawArtistAlias = spark.read.textFile(s"${base}artist_alias.txt")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

base: String = hdfs://localhost:54310/audioscrobbler/
rawUserArtistData: org.apache.spark.sql.Dataset[String] = [value: string]
rawArtistData: org.apache.spark.sql.Dataset[String] = [value: string]
rawArtistAlias: org.apache.spark.sql.Dataset[String] = [value: string]


In [5]:
/**
 * Builds a lookup from the first artist ID on each alias line to the second one.
 *
 * Each line is expected to hold two tab-separated integer IDs. Lines that do not have
 * exactly two fields, whose first field is empty, or whose IDs are not valid integers are
 * skipped, so a single malformed line cannot fail the whole job.
 *
 * @param rawArtistAlias lines of the artist alias file
 * @return map from the first ID to the second ID, collected to the driver
 */
def buildArtistAlias(rawArtistAlias: Dataset[String]): Map[Int,Int] = {
    rawArtistAlias.flatMap { line =>
        val fields = line.split('\t')
        // split drops trailing empty strings, so a line such as "123\t" yields one field;
        // check the length explicitly rather than destructuring, which would throw MatchError.
        if (fields.length != 2 || fields(0).isEmpty) {
            None
        } else {
            try {
                Some((fields(0).toInt, fields(1).toInt))
            } catch {
                // Same tolerance as buildArtistByID: ignore lines with non-numeric IDs.
                case _: NumberFormatException => None
            }
        }
    }.collect().toMap
}

/**
 * Parses artist lines into a DataFrame with columns "id" and "name".
 *
 * Every line is cut at its first tab: the text before it is the ID, the rest (trimmed) is
 * the name. Lines without a tab, and lines whose ID is not a valid integer, are dropped.
 *
 * @param rawArtistData lines of the artist data file
 * @return DataFrame of (id, name) rows
 */
def buildArtistByID(rawArtistData: Dataset[String]): DataFrame = {
    val idNamePairs = rawArtistData.flatMap { line =>
        // `rest` still starts with the tab itself; trim removes it along with other padding.
        val (idText, rest) = line.span(_ != '\t')
        if (rest.nonEmpty) {
            try {
                Some((idText.toInt, rest.trim))
            } catch {
                case _: NumberFormatException => None
            }
        } else {
            None
        }
    }
    idNamePairs.toDF("id", "name")
}

/**
 * Prints a few exploratory views of the raw inputs: five sample lines, the min/max of the
 * user and artist IDs, and the artist names for one entry of the alias map.
 *
 * @param rawUserArtistData lines of space-separated user/artist/count records
 * @param rawArtistData     lines of the artist data file
 * @param rawArtistAlias    lines of the artist alias file
 */
def preparation(
    rawUserArtistData: Dataset[String],
    rawArtistData: Dataset[String],
    rawArtistAlias: Dataset[String]): Unit = {
    // Peek at the first five raw records.
    for (sample <- rawUserArtistData.take(5)) println(sample)

    // Only the first two space-separated fields are needed here; the rest are ignored.
    val userArtistPairs = rawUserArtistData.map { line =>
      val Array(userField, artistField, _*) = line.split(' ')
      (userField.toInt, artistField.toInt)
    }
    val userArtistDF = userArtistPairs.toDF("user", "artist")

    // Range of the user and artist IDs.
    userArtistDF.agg(min("user"), max("user"), min("artist"), max("artist")).show()

    val artistNames = buildArtistByID(rawArtistData)
    val aliasLookup = buildArtistAlias(rawArtistAlias)

    // Show both names of one alias pair as a sanity check.
    val (badID, goodID) = aliasLookup.head
    artistNames.filter($"id".isin(badID, goodID)).show()
}

/**
 * Parses user/artist/count records and replaces each artist ID by its alias when the
 * broadcast map has one.
 *
 * @param rawUserArtistData lines holding three space-separated integers: user, artist, count
 * @param bArtistAlias      broadcast alias map used to rewrite artist IDs
 * @return DataFrame with columns "user", "artist" and "count"
 */
def buildCounts(
    rawUserArtistData: Dataset[String],
    bArtistAlias: Broadcast[Map[Int,Int]]): DataFrame = {

    val resolved = rawUserArtistData.map { record =>
        record.split(' ').map(_.toInt) match {
            case Array(user, rawArtist, plays) =>
                // Fall back to the ID itself when no alias is registered for it.
                (user, bArtistAlias.value.getOrElse(rawArtist, rawArtist), plays)
        }
    }
    resolved.toDF("user", "artist", "count")
}

/**
 * Scores every artist known to the model for one user and keeps the best ones.
 *
 * @param model   fitted ALS model
 * @param userID  user to score artists for
 * @param howMany number of rows to keep
 * @return DataFrame with columns "artist" and "prediction", highest prediction first
 */
def makeRecommendations(model: ALSModel, userID: Int, howMany: Int): DataFrame = {
    // One candidate row per artist in the item factors, all tagged with the same user.
    val candidateArtists = model.itemFactors.select($"id".as("artist"))
    val candidatesForUser = candidateArtists.withColumn("user", lit(userID))

    val scored = model.transform(candidatesForUser).select("artist", "prediction")
    scored.orderBy($"prediction".desc).limit(howMany)
}

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

buildArtistAlias: (rawArtistAlias: org.apache.spark.sql.Dataset[String])scala.collection.Map[Int,Int]
buildArtistByID: (rawArtistData: org.apache.spark.sql.Dataset[String])org.apache.spark.sql.DataFrame
preparation: (rawUserArtistData: org.apache.spark.sql.Dataset[String], rawArtistData: org.apache.spark.sql.Dataset[String], rawArtistAlias: org.apache.spark.sql.Dataset[String])Unit
buildCounts: (rawUserArtistData: org.apache.spark.sql.Dataset[String], bArtistAlias: org.apache.spark.broadcast.Broadcast[scala.collection.Map[Int,Int]])org.apache.spark.sql.DataFrame
makeRecommendations: (model: org.apache.spark.ml.recommendation.ALSModel, userID: Int, howMany: Int)org.apache.spark.sql.DataFrame


In [16]:
// Run the exploratory checks: sample raw lines, user/artist ID ranges, and one alias pair.
preparation(rawUserArtistData, rawArtistData, rawArtistAlias)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

1000002 1 55
1000002 1000006 33
1000002 1000007 8
1000002 1000009 144
1000002 1000010 314
+---------+---------+-----------+-----------+
|min(user)|max(user)|min(artist)|max(artist)|
+---------+---------+-----------+-----------+
|       90|  2443548|          1|   10794401|
+---------+---------+-----------+-----------+

+-------+----------------+
|     id|            name|
+-------+----------------+
|1208690|Collective Souls|
|1003926| Collective Soul|
+-------+----------------+



## ALS 모델 학습

In [20]:
/**
 * Trains an implicit-feedback ALS model on a random 90% split of the play counts.
 *
 * @param rawUserArtistData lines of user/artist/count records
 * @param rawArtistData     not used by this function
 * @param rawArtistAlias    lines of the artist alias file, used to rewrite artist IDs
 * @return the fitted model together with the cached training and test DataFrames
 */
def model(
    rawUserArtistData: Dataset[String],
    rawArtistData: Dataset[String],
    rawArtistAlias: Dataset[String]):(ALSModel, DataFrame, DataFrame) = {

    val aliasBroadcast = spark.sparkContext.broadcast(buildArtistAlias(rawArtistAlias))

    // 90/10 split; both halves are cached because they are handed back to the caller.
    val splits = buildCounts(rawUserArtistData, aliasBroadcast).randomSplit(Array(0.9, 0.1))
    val trainData = splits(0).cache()
    val testData = splits(1).cache()

    val estimator = new ALS().
        setSeed(Random.nextLong()).
        setImplicitPrefs(true).
        setRank(10).
        setRegParam(0.01).
        setAlpha(1.0).
        setMaxIter(5).
        setUserCol("user").
        setItemCol("artist").
        setRatingCol("count").
        setPredictionCol("prediction")
    val fitted = estimator.fit(trainData)

    (fitted, trainData, testData)
}

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

model: (rawUserArtistData: org.apache.spark.sql.Dataset[String], rawArtistData: org.apache.spark.sql.Dataset[String], rawArtistAlias: org.apache.spark.sql.Dataset[String])(org.apache.spark.ml.recommendation.ALSModel, org.apache.spark.sql.DataFrame, org.apache.spark.sql.DataFrame)


In [23]:
// Training path (disabled): fit a new model, keep its train/test split, and save the model.
// NOTE(review): while these two lines stay commented out, this cell does not define
// `trainData` or `testData`, yet later cells read both. Re-enable them (or another
// definition of those names) when running the notebook in a fresh session.
// val (trainedModel, trainData, testData) = model(rawUserArtistData, rawArtistData, rawArtistAlias)
// trainedModel.save(base + "saved_als_model")

// Load the previously saved model instead of retraining.
// NOTE(review): this val reuses the name of the `model(...)` training function defined
// earlier; presumably the later definition shadows it in the REPL session - confirm before
// calling `model(...)` again.
val model = ALSModel.load(base + "saved_als_model")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

trainedModel: org.apache.spark.ml.recommendation.ALSModel = als_a7c6535d9ecf
trainData: org.apache.spark.sql.DataFrame = [user: int, artist: int ... 1 more field]
testData: org.apache.spark.sql.DataFrame = [user: int, artist: int ... 1 more field]
model: org.apache.spark.ml.recommendation.ALSModel = als_a7c6535d9ecf


## 예측

In [24]:
// Alternative source of trainData (all play counts, no split), kept for reference:
// val bArtistAlias = spark.sparkContext.broadcast(buildArtistAlias(rawArtistAlias))
// val trainData = buildCounts(rawUserArtistData, bArtistAlias).cache()

// User whose history and recommendations are inspected in the following cells.
val userID = 2093760
// Artist IDs that appear for this user in trainData, collected to the driver.
val existingArtistIDs = trainData.
    where($"user" === userID).
    select($"artist").
    as[Int].
    collect()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

userID: Int = 2093760
existingArtistIDs: Array[Int] = Array(378, 813, 1180, 1255340)


In [25]:
// (id, name) table used to turn artist IDs into readable names.
val artistByID = buildArtistByID(rawArtistData)
// Names of the artists already present for the user.
artistByID.where($"id".isin(existingArtistIDs: _*)).show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

artistByID: org.apache.spark.sql.DataFrame = [id: int, name: string]
+-------+---------------+
|     id|           name|
+-------+---------------+
|   1180|     David Gray|
|    378|  Blackalicious|
|    813|     Jurassic 5|
|1255340|The Saw Doctors|
+-------+---------------+



In [26]:
// Five highest-scoring artists for the user, with their prediction scores.
val topRecommendations = makeRecommendations(model, userID, howMany = 5)
topRecommendations.show()

// Resolve the recommended IDs to artist names.
val recommendedArtistIDs = topRecommendations.select($"artist").as[Int].collect()
artistByID.where($"id".isin(recommendedArtistIDs: _*)).show()

// model.userFactors.unpersist()
// model.itemFactors.unpersist()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

topRecommendations: org.apache.spark.sql.DataFrame = [artist: int, prediction: float]
+-------+-----------+
| artist| prediction|
+-------+-----------+
|1037970|0.013426684|
|1007614| 0.01282847|
|    250|0.012546132|
|1002061|0.012249939|
|    813|0.012233459|
+-------+-----------+

recommendedArtistIDs: Array[Int] = Array(1037970, 1007614, 250, 1002061, 813)
+-------+------------+
|     id|        name|
+-------+------------+
|1007614|       Jay-Z|
|1037970|  Kanye West|
|1002061|Jack Johnson|
|    813|  Jurassic 5|
|    250|     Outkast|
+-------+------------+



## 평가

In [27]:
/**
 * Computes a per-user AUC and returns its mean over users ("mean AUC").
 *
 * For every user in `positiveData`, artists are sampled at random from `bAllArtistIDs` as
 * negatives, both sets are scored with `predictFunction`, and the user's AUC is the
 * fraction of (positive, negative) pairs in which the positive prediction is strictly
 * greater than the negative one.
 *
 * @param positiveData    held-out data; its "user" and "artist" columns are read as Int pairs
 * @param bAllArtistIDs   broadcast array of artist IDs that negatives are drawn from
 * @param predictFunction maps a DataFrame with "user" and "artist" columns to one that also
 *                        carries a "prediction" column
 * @return the mean over users of the per-user AUC
 */
def areaUnderCurve(
    positiveData: DataFrame,
    bAllArtistIDs: Broadcast[Array[Int]],
    predictFunction: (DataFrame => DataFrame)): Double = {

    // What this actually computes is AUC, per user. The result is actually something
    // that might be called "mean AUC".

    // Take held-out data as the "positive".
    // Make predictions for each of them, including a numeric score
    val positivePredictions = predictFunction(positiveData.select("user", "artist")).
        withColumnRenamed("prediction", "positivePrediction")

    // BinaryClassificationMetrics.areaUnderROC is not used here since there are really lots of
    // small AUC problems, and it would be inefficient, when a direct computation is available.

    // Create a set of "negative" products for each user. These are randomly chosen
    // from among all of the other artists, excluding those that are "positive" for the user.
    val negativeData = positiveData.select("user", "artist").as[(Int,Int)].
      groupByKey { case (user, _) => user }.
      flatMapGroups { case (userID, userIDAndPosArtistIDs) =>
        val random = new Random()
        val posItemIDSet = userIDAndPosArtistIDs.map { case (_, artist) => artist }.toSet
        val negative = new ArrayBuffer[Int]()
        val allArtistIDs = bAllArtistIDs.value
        var i = 0
        // Draw at most allArtistIDs.length random samples so the loop always terminates.
        // Also stop when number of negative equals positive set size
        while (i < allArtistIDs.length && negative.size < posItemIDSet.size) {
          val artistID = allArtistIDs(random.nextInt(allArtistIDs.length))
          // Keep the sample only if it is not one of the user's positive artists.
          // NOTE(review): the same artist can be drawn and appended more than once;
          // duplicates among the negatives are not filtered here.
          if (!posItemIDSet.contains(artistID)) {
            negative += artistID
          }
          i += 1
        }
        // Return the sampled negatives, each paired with the user ID
        negative.map(artistID => (userID, artistID))
      }.toDF("user", "artist")

    // Make predictions on the rest:
    val negativePredictions = predictFunction(negativeData).
      withColumnRenamed("prediction", "negativePrediction")

    // Join positive predictions to negative predictions by user, only.
    // This will result in a row for every possible pairing of positive and negative
    // predictions within each user.
    // Cached because it feeds both aggregations below; released before returning.
    val joinedPredictions = positivePredictions.join(negativePredictions, "user").
      select("user", "positivePrediction", "negativePrediction").cache()

    // Count the number of pairs per user
    val allCounts = joinedPredictions.
      groupBy("user").agg(count(lit("1")).as("total")).
      select("user", "total")
    // Count the number of correctly ordered pairs per user
    val correctCounts = joinedPredictions.
      filter($"positivePrediction" > $"negativePrediction").
      groupBy("user").agg(count("user").as("correct")).
      select("user", "correct")

    // Combine these, compute their ratio, and average over all users.
    // The left outer join plus coalesce gives users with no correctly ordered pair an AUC of 0.
    val meanAUC = allCounts.join(correctCounts, Seq("user"), "left_outer").
      select($"user", (coalesce($"correct", lit(0)) / $"total").as("auc")).
      agg(mean("auc")).
      as[Double].first()

    joinedPredictions.unpersist()

    meanAUC
}

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

areaUnderCurve: (positiveData: org.apache.spark.sql.DataFrame, bAllArtistIDs: org.apache.spark.broadcast.Broadcast[Array[Int]], predictFunction: org.apache.spark.sql.DataFrame => org.apache.spark.sql.DataFrame)Double


In [30]:
// Rebuild the alias-resolved play counts for the whole data set.
val bArtistAlias = spark.sparkContext.broadcast(buildArtistAlias(rawArtistAlias))
val allData = buildCounts(rawUserArtistData, bArtistAlias)
// Distinct artist IDs, collected to the driver and broadcast as the pool for negative sampling.
val allArtistIDs = allData.select("artist").as[Int].distinct().collect()
val bAllArtistIDs = spark.sparkContext.broadcast(allArtistIDs)
// Mean AUC of the loaded model on the held-out data.
// NOTE(review): `testData` is only defined by the commented-out training line in the
// model-loading cell - confirm it is in scope before running this cell.
val auc = areaUnderCurve(testData, bAllArtistIDs, model.transform)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

bArtistAlias: org.apache.spark.broadcast.Broadcast[scala.collection.Map[Int,Int]] = Broadcast(181)
allData: org.apache.spark.sql.DataFrame = [user: int, artist: int ... 1 more field]
allArtistIDs: Array[Int] = Array(1001129, 1003373, 1007972, 1029443, 1076507, 1318111, 833, 1239413, 1000636, 1002431, 1005697, 1040360, 1043263, 1245208, 463, 1043126, 1001601, 1091589, 1004021, 1012885, 1023660, 1004666, 1004739, 1005158, 1005476, 1009031, 1004552, 1233083, 1007334, 1012609, 1348498, 7014014, 1239554, 1007290, 1007777, 1048379, 1013212, 1247803, 1281854, 2164368, 2366, 1054452, 1087384, 1261793, 4935, 1126737, 6623644, 1008081, 2132425, 1168540, 10699021, 496, 1014191, 1009575, 10725691, 1259455, 2060508, 3175, 6896492, 1012902, 1023841, 1088214, 1008233, 1024037, 10368773, 1062158, 1092115, 1236684, 2136388, 1022960, 1247265, 1006814, 1014690, 1104368, 1237375, 10057880, 1017864, 1017973, 1099354, 1329652, 2014256, 2158098, 6656832, 6696725, 1034510, 1291109, 2146392...bAllArtistIDs: or