# Big Data Exam Report @ UniBo a.y. 2023/2024

- Manuel Andruccioli
- Kelvin Olaiya

## Data structures and definitions

### Utility functions for parsing

In [None]:
/** Finds every position at which `char` occurs in `line`.
  *
  * @param line the text to scan
  * @param char the character to look for
  * @return the zero-based positions of `char`, in ascending order (empty when absent)
  */
def getCharIndexes(line: String, char: Char): Seq[Int] =
  line.indices.filter(position => line.charAt(position) == char)
/** Cuts `s` at every position listed in `indices`.
  *
  * Positions are offsets into the original string. Every piece after the first
  * starts exactly at its cut position, so it still begins with the character
  * found there (nothing is removed from the input).
  *
  * @param s       the string to cut
  * @param indices cut positions, relative to the start of `s`
  * @return `indices.length + 1` pieces, in their original order
  */
def splitAt(s: String, indices: Seq[Int]): Seq[String] = {
  // Fold state: pieces produced so far, the still-uncut remainder, and the
  // previous cut position (needed to turn an absolute index into an offset
  // inside the remainder).
  val start = (Vector.empty[String], s, 0)
  val (pieces, remainder, _) = indices.foldLeft(start) {
    case ((done, rest, previous), index) =>
      val (piece, tail) = rest.splitAt(index - previous)
      (done :+ piece, tail, index)
  }
  pieces :+ remainder
}
/** Splits one CSV line into its fields, honouring double-quoted fields.
  *
  * A comma is treated as a separator only when it does not lie between a pair
  * of double quotes. Leading commas and spaces are stripped from each field,
  * one surrounding double quote is removed from each end, and fields that end
  * up empty are dropped (so a line with an empty column yields fewer fields).
  *
  * @param l the raw CSV line
  * @return the non-empty fields, in order
  */
def parseCSVLine(l: String): Seq[String] = {
  // Pair up quote positions as (opening, closing). `collect` skips a dangling
  // last quote, so a line with an odd number of quotes no longer raises a
  // MatchError.
  val quotedSpans = getCharIndexes(l, '"')
    .grouped(2)
    .collect { case Seq(opening, closing) => (opening, closing) }
    .toVector
  // Only commas outside every quoted span separate fields.
  val separators = getCharIndexes(l, ',').filterNot { i =>
    quotedSpans.exists { case (opening, closing) => opening < i && i < closing }
  }
  splitAt(l, separators)
    // Each piece after the first still starts with its separator comma.
    .map(_.dropWhile(c => c == ',' || c == ' '))
    .filter(_.nonEmpty)
    .map(_.replaceAll("^\"|\"$", ""))
}

### Data structures

In [None]:
/** One track record.
  *
  * @param uri               track URI
  * @param name              track name
  * @param duration          track duration (unit is not visible here; presumably milliseconds, TODO confirm)
  * @param explicit          explicit flag
  * @param artists           artist URIs joined with `|`
  * @param available_markets markets joined with `|`
  * @param album_uri         URI of the album
  * @param popularity        popularity score
  */
case class Track(
  uri: String,
  name: String,
  duration: Int,
  explicit: Boolean,
  artists: String,
  available_markets: String,
  album_uri: String,
  popularity: Int
)

object Track {
  /** Parses one CSV line into a [[Track]].
    *
    * @param line a raw CSV line
    * @return `Some(track)` when the line yields exactly eight fields and
    *         `duration`, `explicit` and `popularity` convert to their target
    *         types; `None` otherwise
    */
  def fromCSVLine(line: String): Option[Track] =
    parseCSVLine(line) match {
      case Seq(uri, name, duration, explicit, artists, available_markets, album_uri, popularity) =>
        // Try only traps non-fatal errors (e.g. NumberFormatException), so
        // fatal JVM errors still propagate instead of being swallowed.
        scala.util.Try(
          Track(uri, name, duration.toInt, explicit.toBoolean, artists, available_markets, album_uri, popularity.toInt)
        ).toOption
      // Any other field count is a malformed row, not a MatchError.
      case _ => None
    }
}

/** One playlist record.
  *
  * @param uri          playlist URI
  * @param name         playlist name
  * @param num_follower number of followers
  */
case class Playlist(
  uri: String,
  name: String,
  num_follower: Int
)

object Playlist {
  /** Parses one CSV line into a [[Playlist]].
    *
    * @param line a raw CSV line
    * @return `Some(playlist)` when the line yields exactly three fields and
    *         `num_follower` is a valid integer; `None` otherwise
    */
  def fromCSVLine(line: String): Option[Playlist] =
    parseCSVLine(line) match {
      case Seq(uri, name, num_follower) =>
        // Try only traps non-fatal errors (e.g. NumberFormatException), so
        // fatal JVM errors still propagate instead of being swallowed.
        scala.util.Try(Playlist(uri, name, num_follower.toInt)).toOption
      // Any other field count is a malformed row, not a MatchError.
      case _ => None
    }
}

/** Membership of a track in a playlist.
  *
  * @param pid       playlist id
  * @param track_uri URI of the track
  * @param pos       position value of the track (presumably its index inside the playlist; TODO confirm)
  */
case class TrackInPlaylist(
  pid: Int,
  track_uri: String,
  pos: Int
)

object TrackInPlaylist {
  /** Parses one CSV line into a [[TrackInPlaylist]].
    *
    * @param line a raw CSV line
    * @return `Some(entry)` when the line yields exactly three fields and
    *         `pid` and `pos` are valid integers; `None` otherwise
    */
  def fromCSVLine(line: String): Option[TrackInPlaylist] =
    parseCSVLine(line) match {
      case Seq(pid, track_uri, pos) =>
        // Try only traps non-fatal errors (e.g. NumberFormatException), so
        // fatal JVM errors still propagate instead of being swallowed.
        scala.util.Try(TrackInPlaylist(pid.toInt, track_uri, pos.toInt)).toOption
      // Any other field count is a malformed row, not a MatchError.
      case _ => None
    }
}

/** One artist record.
  *
  * @param uri        artist URI
  * @param name       artist name
  * @param followers  number of followers
  * @param genres     genres joined with `|`
  * @param popularity popularity score
  */
case class Artists(
  uri: String,
  name: String,
  followers: Int,
  genres: String,
  popularity: Int
)

object Artists {
  /** Parses one CSV line into an [[Artists]] record.
    *
    * @param line a raw CSV line
    * @return `Some(artist)` when the line yields exactly five fields and
    *         `followers` and `popularity` are valid integers; `None` otherwise
    */
  def fromCSVLine(line: String): Option[Artists] =
    parseCSVLine(line) match {
      case Seq(uri, name, followers, genres, popularity) =>
        // Try only traps non-fatal errors (e.g. NumberFormatException), so
        // fatal JVM errors still propagate instead of being swallowed.
        scala.util.Try(Artists(uri, name, followers.toInt, genres, popularity.toInt)).toOption
      // Any other field count is a malformed row, not a MatchError.
      case _ => None
    }
}

/** One album record.
  *
  * @param uri               album URI
  * @param name              album name
  * @param album_type        one of album, compilation, single
  * @param artists           artist URIs joined with `|`
  * @param available_markets markets joined with `|`
  * @param release_year      release year, kept as text
  * @param total_tracks      number of tracks
  */
case class Album(
  uri: String,
  name: String,
  album_type: String,
  artists: String,
  available_markets: String,
  release_year: String,
  total_tracks: Int
)

object Album {
  /** Parses one CSV line into an [[Album]].
    *
    * @param line a raw CSV line
    * @return `Some(album)` when the line yields exactly seven fields and
    *         `total_tracks` is a valid integer; `None` otherwise
    */
  def fromCSVLine(line: String): Option[Album] =
    parseCSVLine(line) match {
      case Seq(uri, name, album_type, artists, available_markets, release_year, total_tracks) =>
        // Try only traps non-fatal errors (e.g. NumberFormatException), so
        // fatal JVM errors still propagate instead of being swallowed.
        scala.util.Try(
          Album(uri, name, album_type, artists, available_markets, release_year, total_tracks.toInt)
        ).toOption
      // Any other field count is a malformed row, not a MatchError.
      case _ => None
    }
}

/** Audio features of one track.
  *
  * The meaning and range of each numeric feature are not visible in this
  * file; they are presumably the audio features published by the data source
  * (TODO confirm).
  *
  * @param uri              URI the features refer to
  * @param key              key value
  * @param loudness         loudness value
  * @param tempo            tempo value
  * @param mode             mode flag
  * @param danceability     danceability value
  * @param valence          valence value
  * @param instrumentalness instrumentalness value
  * @param liveness         liveness value
  * @param acousticness     acousticness value
  * @param energy           energy value
  * @param speechiness      speechiness value
  */
case class Feature(
  uri: String,
  key: Int,
  loudness: Double,
  tempo: Double,
  mode: Boolean,
  danceability: Double,
  valence: Double,
  instrumentalness: Double,
  liveness: Double,
  acousticness: Double,
  energy: Double,
  speechiness: Double
)

object Feature {
  /** Parses one CSV line into a [[Feature]].
    *
    * @param line a raw CSV line
    * @return `Some(feature)` when the line yields exactly twelve fields and
    *         every numeric/boolean column converts to its target type;
    *         `None` otherwise
    */
  def fromCSVLine(line: String): Option[Feature] =
    parseCSVLine(line) match {
      case Seq(uri, key, loudness, tempo, mode, danceability, valence, instrumentalness, liveness, acousticness, energy, speechiness) =>
        // Try only traps non-fatal errors (e.g. NumberFormatException), so
        // fatal JVM errors still propagate instead of being swallowed.
        // NOTE(review): `mode.toBoolean` accepts only "true"/"false"; if the
        // CSV encodes mode as 0/1 every row would be dropped. Confirm against the data.
        scala.util.Try(
          Feature(
            uri,
            key.toInt,
            loudness.toDouble,
            tempo.toDouble,
            mode.toBoolean,
            danceability.toDouble,
            valence.toDouble,
            instrumentalness.toDouble,
            liveness.toDouble,
            acousticness.toDouble,
            energy.toDouble,
            speechiness.toDouble
          )
        ).toOption
      // Any other field count is a malformed row, not a MatchError.
      case _ => None
    }
}

## Dataset exploration

In [None]:
// Load every CSV of the dataset as text, one value per file.
// `sc` is not defined in this file; presumably it is the SparkContext provided
// by the notebook kernel (TODO confirm).
// The paths are relative; how they resolve depends on the kernel's working
// directory / default filesystem (TODO confirm).
val albumRdd = sc.textFile("dataset/albums.csv")
val artistRdd = sc.textFile("dataset/artists.csv")
val featureRdd = sc.textFile("dataset/features.csv")
val playlistRdd = sc.textFile("dataset/playlists.csv")
val trackInPlaylistRdd = sc.textFile("dataset/tracks_in_playlists.csv")
val trackRdd = sc.textFile("dataset/tracks.csv")

In [None]:
// Quick look at the parsed data: run each file through its parser, keep only
// the lines for which the parser returns Some(...) (flatMap flattens the
// Option), and print the first five parsed records.
// The album, artist and feature checks below are currently disabled.
// albumRdd.flatMap(Album.fromCSVLine).take(5).foreach(println)
// artistRdd.flatMap(Artists.fromCSVLine).take(5).foreach(println)
// featureRdd.flatMap(Feature.fromCSVLine).take(5).foreach(println)
playlistRdd.flatMap(Playlist.fromCSVLine).take(5).foreach(println)
trackInPlaylistRdd.flatMap(TrackInPlaylist.fromCSVLine).take(5).foreach(println)
trackRdd.flatMap(Track.fromCSVLine).take(5).foreach(println)