# Big Data Exam Report @ UniBo a.y. 2023/2024

- Manuel Andruccioli
- Kelvin Olaiya

In [None]:
import org.apache.spark.SparkContext
// Spark entry point shared by every cell below: master URL "local[*]", application name "BigDataExam".
val sc = new SparkContext("local[*]", "BigDataExam")

## Data structures and definitions

### Utility function for parsing

In [None]:
/** Returns the zero-based positions at which `char` occurs in `line`, in ascending order. */
def getCharIndexes(line: String, char: Char): Seq[Int] =
  line.zipWithIndex.collect { case (current, position) if current == char => position }
/**
 * Cuts `s` at each of the given positions.
 *
 * The character found at a cut position is kept as the first character of the piece that
 * follows the cut. With no positions the result is the single-element sequence `Seq(s)`.
 *
 * @param s       string to cut
 * @param indices cut positions, expressed relative to the start of `s`
 * @return the pieces of `s`, in order (always `indices.length + 1` elements)
 */
def splitAt(s: String, indices: Seq[Int]): Seq[String] =
  if (indices.isEmpty) Seq(s)
  else {
    val cut = indices.head
    val (prefix, remainder) = s.splitAt(cut)
    // The remaining positions are re-based so that they are relative to `remainder`.
    prefix +: splitAt(remainder, indices.tail.map(_ - cut))
  }
/**
 * Splits one CSV line into its fields.
 *
 * Commas that lie strictly between a pair of double quotes are treated as part of the field
 * rather than as separators. Each field then has its leading commas and spaces removed, and one
 * double quote stripped from its start and from its end when present.
 *
 * Quotes are paired in order of appearance (1st with 2nd, 3rd with 4th, ...). A trailing
 * unmatched quote is ignored for the purpose of protecting commas, so a line with an odd
 * number of quotes is still split instead of raising a `MatchError`.
 *
 * @param l raw CSV line
 * @return the fields of the line, in order
 */
def parseCSVLine(l: String): Seq[String] = {
  // (opening, closing) positions of each quoted span; `collect` skips an incomplete last pair.
  val quotedSpans = getCharIndexes(l, '"').grouped(2).collect { case Seq(open, close) => (open, close) }.toList
  // Only commas outside every quoted span separate fields.
  val separators = getCharIndexes(l, ',').filterNot(i => quotedSpans.exists { case (open, close) => open < i && i < close })
  // Every piece after the first still starts with its separating comma: drop it, then unquote.
  splitAt(l, separators).map(_.dropWhile(c => c == ',' || c == ' ')).map(_.replaceAll("^\"|\"$", ""))
}

### Data structures

In [None]:
/**
 * One track record, as produced by `Track.fromCSVLine` from a line of `tracks.csv`.
 *
 * @param uri               track identifier; joined in this notebook against `TrackInPlaylist.track_uri`
 * @param name              track name
 * @param duration          track duration as an integer (unit not visible here; presumably milliseconds, TODO confirm)
 * @param explicit          explicit flag parsed from the CSV
 * @param artists           artist URIs, separated by `|`
 * @param available_markets markets, separated by `|`
 * @param album_uri         identifier of the track's album; joined in this notebook against `Album.uri`
 * @param popularity        integer popularity score (value range not visible here)
 */
case class Track(
  uri: String,
  name: String,
  duration: Int,
  explicit: Boolean,
  artists: String,            // List of artist URIs, separated by |
  available_markets: String,  // List of markets, separated by |
  album_uri: String,
  popularity: Int,
)

object Track {
  /**
   * Parses one CSV line into a [[Track]].
   *
   * @param line raw CSV line
   * @return `Some(track)` when the line splits into exactly eight fields and `duration`,
   *         `explicit` and `popularity` convert to their target types; `None` for any other line
   */
  def fromCSVLine(line: String): Option[Track] =
    // Try captures only non-fatal exceptions (parsing and conversion failures), so one malformed
    // line is skipped while fatal JVM errors still propagate.
    scala.util.Try {
      parseCSVLine(line) match {
        case Seq(uri, name, duration, explicit, artists, available_markets, album_uri, popularity) =>
          Some(Track(uri, name, duration.toInt, explicit.toBoolean, artists, available_markets, album_uri, popularity.toInt))
        // Unexpected number of fields: skip the line instead of failing with a MatchError.
        case _ => None
      }
    }.toOption.flatten
}

/**
 * One playlist record, as produced by `Playlist.fromCSVLine` from a line of `playlists.csv`.
 *
 * @param uri          playlist identifier (how it relates to `TrackInPlaylist.pid` is not visible here, TODO confirm)
 * @param name         playlist name
 * @param num_follower follower count of the playlist, parsed as an integer
 */
case class Playlist(
  uri: String,
  name: String,
  num_follower: Int,
)

object Playlist {
  /**
   * Parses one CSV line into a [[Playlist]].
   *
   * @param line raw CSV line
   * @return `Some(playlist)` when the line splits into exactly three fields and `num_follower`
   *         is an integer; `None` for any other line
   */
  def fromCSVLine(line: String): Option[Playlist] =
    // Try captures only non-fatal exceptions (parsing and conversion failures), so one malformed
    // line is skipped while fatal JVM errors still propagate.
    scala.util.Try {
      parseCSVLine(line) match {
        case Seq(uri, name, num_follower) => Some(Playlist(uri, name, num_follower.toInt))
        // Unexpected number of fields: skip the line instead of failing with a MatchError.
        case _ => None
      }
    }.toOption.flatten
}

/**
 * One playlist entry, as produced by `TrackInPlaylist.fromCSVLine` from a line of
 * `tracks_in_playlists.csv`.
 *
 * @param pid       integer playlist id; used in this notebook as the key for per-playlist aggregation
 * @param track_uri identifier of the track; joined in this notebook against `Track.uri`
 * @param pos       integer parsed from the third column (presumably the track's position in the playlist, TODO confirm)
 */
case class TrackInPlaylist(
  pid: Int,
  track_uri: String,
  pos: Int,
)

object TrackInPlaylist {
  /**
   * Parses one CSV line into a [[TrackInPlaylist]].
   *
   * @param line raw CSV line
   * @return `Some(entry)` when the line splits into exactly three fields and both `pid` and
   *         `pos` are integers; `None` for any other line
   */
  def fromCSVLine(line: String): Option[TrackInPlaylist] =
    // Try captures only non-fatal exceptions (parsing and conversion failures), so one malformed
    // line is skipped while fatal JVM errors still propagate.
    scala.util.Try {
      parseCSVLine(line) match {
        case Seq(pid, track_uri, pos) => Some(TrackInPlaylist(pid.toInt, track_uri, pos.toInt))
        // Unexpected number of fields: skip the line instead of failing with a MatchError.
        case _ => None
      }
    }.toOption.flatten
}

/**
 * One artist record, as produced by `Artist.fromCSVLine` from a line of `artists.csv`.
 *
 * @param uri        artist identifier; joined in this notebook against the URIs listed in `Track.artists`
 * @param name       artist name
 * @param followers  follower count, parsed as an integer
 * @param genres     genres, separated by `|`
 * @param popularity integer popularity score (value range not visible here)
 */
case class Artist(
  uri: String,
  name: String,
  followers: Int,
  genres: String,             // List of genres, separated by |
  popularity: Int,
)

object Artist {
  /**
   * Parses one CSV line into an [[Artist]].
   *
   * @param line raw CSV line
   * @return `Some(artist)` when the line splits into exactly five fields and both `followers`
   *         and `popularity` are integers; `None` for any other line
   */
  def fromCSVLine(line: String): Option[Artist] =
    // Try captures only non-fatal exceptions (parsing and conversion failures), so one malformed
    // line is skipped while fatal JVM errors still propagate.
    scala.util.Try {
      parseCSVLine(line) match {
        case Seq(uri, name, followers, genres, popularity) =>
          Some(Artist(uri, name, followers.toInt, genres, popularity.toInt))
        // Unexpected number of fields: skip the line instead of failing with a MatchError.
        case _ => None
      }
    }.toOption.flatten
}

/**
 * One album record, as produced by `Album.fromCSVLine` from a line of `albums.csv`.
 *
 * @param uri               album identifier; joined in this notebook against `Track.album_uri`
 * @param name              album name
 * @param album_type        one of: album, compilation, single
 * @param artists           artist URIs, separated by `|`
 * @param available_markets markets, separated by `|`
 * @param release_year      release year, kept as the raw CSV string; used in this notebook as a grouping key
 * @param total_tracks      integer parsed from the last column (presumably the number of tracks on the album, TODO confirm)
 */
case class Album(
  uri: String,
  name: String,
  album_type: String,         // album, compilation, single.
  artists: String,            // List of artist URIs, separated by |
  available_markets: String,  // List of markets, separated by |
  release_year: String,
  total_tracks: Int,
)

object Album {
  /**
   * Parses one CSV line into an [[Album]].
   *
   * @param line raw CSV line
   * @return `Some(album)` when the line splits into exactly seven fields and `total_tracks`
   *         is an integer; `None` for any other line
   */
  def fromCSVLine(line: String): Option[Album] =
    // Try captures only non-fatal exceptions (parsing and conversion failures), so one malformed
    // line is skipped while fatal JVM errors still propagate.
    scala.util.Try {
      parseCSVLine(line) match {
        case Seq(uri, name, album_type, artists, available_markets, release_year, total_tracks) =>
          Some(Album(uri, name, album_type, artists, available_markets, release_year, total_tracks.toInt))
        // Unexpected number of fields: skip the line instead of failing with a MatchError.
        case _ => None
      }
    }.toOption.flatten
}

/**
 * Audio-feature record, as produced by `Feature.fromCSVLine` from a line of `features.csv`.
 *
 * Apart from `key` and `mode`, every measurement is kept as the raw CSV string and is not
 * converted to a number here.
 *
 * @param uri  identifier parsed from the first column (presumably the track URI, TODO confirm)
 * @param key  integer key code (the mapping from code to musical key is not visible here)
 * @param mode `true` when the CSV `mode` column equals 1 (see `Feature.fromCSVLine`)
 */
case class Feature(
  uri: String,
  key: Int,
  loudness: String,
  tempo: String,
  mode: Boolean,
  danceability: String,
  valence: String,
  instrumentalness: String,
  liveness: String,
  acousticness: String,
  energy: String,
  speechiness: String,
)

object Feature {
  /**
   * Parses one CSV line into a [[Feature]].
   *
   * @param line raw CSV line
   * @return `Some(feature)` when the line splits into exactly twelve fields and both `key` and
   *         `mode` are integers; `None` for any other line
   */
  def fromCSVLine(line: String): Option[Feature] =
    // Try captures only non-fatal exceptions (parsing and conversion failures), so one malformed
    // line is skipped while fatal JVM errors still propagate.
    scala.util.Try {
      parseCSVLine(line) match {
        case Seq(uri, key, loudness, tempo, mode, danceability, valence, instrumentalness, liveness, acousticness, energy, speechiness) =>
          // `mode` is stored as a flag: true exactly when the column holds the integer 1.
          Some(Feature(uri, key.toInt, loudness, tempo, mode.toInt == 1, danceability, valence, instrumentalness, liveness, acousticness, energy, speechiness))
        // Unexpected number of fields: skip the line instead of failing with a MatchError.
        case _ => None
      }
    }.toOption.flatten
}

## Dataset exploration

In [None]:
// Directory (with trailing slash) that contains every CSV file of the dataset.
val datasetPath = "dataset/"

// Opens one dataset file, given its name relative to datasetPath, as an RDD of raw text lines.
def readDatasetLines(fileName: String) = sc.textFile(s"${datasetPath}${fileName}")

// One RDD per CSV file; lines for which the parser returns None are dropped by flatMap.
val albumRdd = readDatasetLines("albums.csv").flatMap(Album.fromCSVLine)
val artistRdd = readDatasetLines("artists.csv").flatMap(Artist.fromCSVLine)
val featureRdd = readDatasetLines("features.csv").flatMap(Feature.fromCSVLine)
val playlistRdd = readDatasetLines("playlists.csv").flatMap(Playlist.fromCSVLine)
val trackInPlaylistRdd = readDatasetLines("tracks_in_playlists.csv").flatMap(TrackInPlaylist.fromCSVLine)
val trackRdd = readDatasetLines("tracks.csv").flatMap(Track.fromCSVLine)

In [None]:
/*val albumRddCached = albumRdd.cache()
val artistRddCached = artistRdd.cache()
val featureRddCached = featureRdd.cache()
val playlistRddCached = playlistRdd.cache()
val trackInPlaylistRddCached = trackInPlaylistRdd.cache()
val trackRddCached = trackRdd.cache()

In [None]:
println(s"Number of Albums: ${albumRddCached.count()}")
println(s"Number of Artists: ${artistRddCached.count()}")
println(s"Number of Track's Feature: ${featureRddCached.count()}")
println(s"Number of Playlist: ${playlistRddCached.count()}")
println(s"Number of Tracks add in Playlists: ${trackInPlaylistRddCached.count()}")
println(s"Number of Tracks: ${trackRddCached.count()}")*/


- Date le seguenti metriche:
  - popolarità della traccia
  - popolarità media delle tracce pubblicate nello stesso anno
  - popolarità dell'artista (se gli artisti sono più di uno, la media delle loro popolarità)

  capire da quale delle precedenti metriche una playlist viene influenzata maggiormente, mediando i valori delle tracce di cui è composta. Inoltre, aggregare le playlist in base all'influenza così calcolata, calcolando la media del numero di follower delle playlist.
  La query permette di rispondere a una domanda del tipo:
  "una playlist influenzata maggiormente dalla popolarità delle tracce ha in media 500 follower" (e analogamente per le altre due metriche di partenza).
 
- Given the following classes: slowly danceable (tempo <= 130 BPM, danceability > 0.5), swiftly danceable (tempo > 130 BPM, danceability > 0.5), slowly undanceable (tempo <= 130 BPM, danceability <= 0.5), swiftly undanceable (tempo > 130 BPM, danceability <= 0.5); and the various keys (C, C#/Db, ...),
  for each class and (key ---OR--- range of followers) get:
    - The number of playlists.
    - Average playlist's percentage.
    - Percentage of explicit songs.
    - Average number of playlist followers.
    - Average track tempo.
    - Average track danceability.
  (The key of a playlist is the most frequent key among its tracks.)

In [None]:
// album URI -> (track URI, track popularity)
val trackWithAlbumUriAsKey = trackRdd.map(t => (t.album_uri, (t.uri, t.popularity)))

// release year -> (track URI, track popularity), for tracks whose album is present in albumRdd.
// Defined once and shared by both pipelines below instead of repeating the same join in each.
val trackWithReleaseYearAsKey = albumRdd.map(a => (a.uri, a.release_year)).
        join(trackWithAlbumUriAsKey).
        map { case (_, (releaseYear, trackUriAndPopularity)) => (releaseYear, trackUriAndPopularity) }

// release year -> average popularity of the tracks released in that year.
// The accumulator is (sum of popularities, number of tracks).
val avgPopPerYear = trackWithReleaseYearAsKey.
        aggregateByKey((0, 0))(
          { case ((acc, count), (_, popularity)) => (acc + popularity, count + 1) },
          { case ((acc1, count1), (acc2, count2)) => (acc1 + acc2, count1 + count2) }
        ).mapValues { case (acc, count) => acc.toDouble / count }

// track URI -> (track popularity, average popularity of the tracks released in the same year)
val trackWithPopularityAndAvgPopInYear = trackWithReleaseYearAsKey.
        join(avgPopPerYear).
        map { case (_, ((trackUri, popularity), avgPopularityInYear)) => (trackUri, (popularity, avgPopularityInYear)) }

In [None]:
// artist URI -> artist popularity
val artistWithPopularity = artistRdd.map(artist => (artist.uri, artist.popularity))

// track URI -> average popularity of the track's artists (only artists present in artistRdd count).
// Each track is first expanded into one (artist URI, track URI) pair per listed artist; after the
// join only the (track URI, artist popularity) values are kept and averaged per track.
val trackWithArtistPopularity = trackRdd.
        flatMap(track => track.artists.split('|').map(artistUri => (artistUri, track.uri))).
        join(artistWithPopularity).
        values.
        aggregateByKey((0, 0))(
          (sumAndCount, popularity) => (sumAndCount._1 + popularity, sumAndCount._2 + 1),
          (left, right) => (left._1 + right._1, left._2 + right._2)
        ).
        mapValues { case (popularitySum, artistCount) => popularitySum.toDouble / artistCount }


In [None]:
import org.apache.spark.HashPartitioner

// track URI -> playlist id, one pair per playlist entry
val trackWithPid = trackInPlaylistRdd.map(entry => (entry.track_uri, entry.pid))

// track URI -> (track popularity, average popularity in the release year, average artist popularity)
val popAvgArtistPop = trackWithPopularityAndAvgPopInYear.
        join(trackWithArtistPopularity).
        map { case (trackUri, ((popularity, avgPopularityInYear), avgArtistPopularity)) =>
          (trackUri, (popularity, avgPopularityInYear, avgArtistPopularity))
        }

// playlist id -> metric triple of one of its tracks.
// NOTE(review): HashPartitioner(1) puts every (track URI, pid) pair into a single partition
// before the join; confirm that this is intended.
val metricsKeyedByPid = trackWithPid.partitionBy(new HashPartitioner(1)).
        join(popAvgArtistPop).
        map { case (_, (pid, metrics)) => (pid, metrics) }

// playlist id -> per-playlist mean of each of the three metrics.
// The accumulator is (sum of metric 1, sum of metric 2, sum of metric 3, number of tracks).
val avgMetricsPerPlaylist = metricsKeyedByPid.
        aggregateByKey((0.0, 0.0, 0.0, 0))(
          (acc, m) => (acc._1 + m._1, acc._2 + m._2, acc._3 + m._3, acc._4 + 1),
          (a, b) => (a._1 + b._1, a._2 + b._2, a._3 + b._3, a._4 + b._4)
        ).
        mapValues { case (popSum, yearPopSum, artistPopSum, n) => (popSum / n, yearPopSum / n, artistPopSum / n) }

// For each playlist, the index of its largest mean (0 = track popularity, 1 = average popularity
// in the release year, 2 = artist popularity); on ties the lowest index wins. Prints five samples.
avgMetricsPerPlaylist.
        map { case (pid, (avgPop, avgPopInYear, avgArtistPop)) =>
          val highest = Math.max(avgPop, Math.max(avgPopInYear, avgArtistPop))
          (pid, Seq(avgPop, avgPopInYear, avgArtistPop).indexWhere(_ >= highest))
        }.
        take(5).
        foreach(println)