### Import Libraries

In [30]:
// Import Libraries

import scala.io.Source
import scala.util.{Try, Using}
import java.time.LocalDate

[32mimport [39m[36mscala.io.Source
[39m
[32mimport [39m[36mscala.util.{Try, Using}
[39m
[32mimport [39m[36mjava.time.LocalDate
[39m

### Import Files

In [31]:
// Retrieving the File Path (using lazy val)

// Name of the optional config file whose first line holds the path of the CSV to load.
val configPath = "config.txt"

// Path of the CSV dataset, resolved on first access (lazy).
// - Using closes the config file handle once the first line has been read.
// - The line is trimmed so surrounding whitespace or a Windows '\r' does not corrupt the path.
// - A missing/unreadable config file, an empty file, or a blank first line all fall back
//   to the bundled default dataset name.
lazy val filePath: String =
  Using(Source.fromFile(configPath)) { config =>
    config.getLines().next().trim
  }.filter(_.nonEmpty).getOrElse("songs_2000_2020_50k.csv")


[36mconfigPath[39m: [32mString[39m = [32m"config.txt"[39m
[36mfilePath[39m: [32mString[39m = [32m<lazy>[39m

### Data Cleaning - Part 1

In [32]:
// Reading and Parsing the Data

// Load every line of the CSV into memory. Using closes the file handle; a read failure is
// reported on stderr and degrades to an empty dataset rather than being swallowed silently.
val dataRows: List[String] = Using(Source.fromFile(filePath)) { source =>
  source.getLines().toList
}.fold(
  err => {
    Console.err.println(s"Could not read '$filePath': ${err.getMessage}")
    List.empty[String]
  },
  identity
)

// Splits one CSV line into trimmed fields. The -1 limit keeps trailing empty fields, so a
// row whose last columns are blank still lines up with the header.
// NOTE: this is a plain comma split; quoted fields that contain commas are not supported.
def splitCsvLine(line: String): List[String] =
  line.split(",", -1).map(_.trim).toList

// First row is taken as headers; headOption keeps an empty or unreadable file from throwing.
val header: Array[String] =
  dataRows.headOption.map(splitCsvLine(_).toArray).getOrElse(Array.empty[String])

// Remaining rows are treated as data; blank lines are skipped so they do not become records.
val data: List[List[String]] =
  dataRows.drop(1).filter(_.trim.nonEmpty).map(splitCsvLine)


[36mdataRows[39m: [32mList[39m[[32mString[39m] = [33mList[39m(
  [32m"Title,Artist,Album,Genre,Release Date,Duration,Popularity"[39m,
  [32m"Include name this.,Patrick Anderson,Care.,R&B,2008-01-09,262,71"[39m,
  [32m"Manage west energy.,Eric Miller,Raise get.,Jazz,2011-08-20,187,37"[39m,
  [32m"Evening court painting.,Richard Curry,Sport.,Electronic,2010-05-30,212,58"[39m,
  [32m"Section turn hour.,James Smith,Full.,Hip-Hop,2014-10-12,272,59"[39m,
  [32m"Five agreement teach.,Amy Rodriguez,Eat.,Blues,2005-06-09,131,34"[39m,
  [32m"Turn child.,Jessica Martin,Cold according.,R&B,2006-09-16,207,58"[39m,
  [32m"Old.,Cheyenne Powell,Oil.,Country,2010-04-23,163,72"[39m,
  [32m"Clear fly over.,Aaron Coleman,Strategy development.,Classical,2010-02-06,183,73"[39m,
  [32m"Agency employee present.,Brandon Henderson,Might live.,Country,2020-02-18,243,69"[39m,
  [32m"Face become we.,Raymond White,Probably camera.,Blues,2011-11-07,177,55"[39m,
  [32m"Couple bank.,Paul

In [33]:
// Cleaning and Transforming Data

// Normalises every parsed row into a Map keyed by column name, substituting defaults for
// values that are missing, blank, or unparseable:
//   - text columns fall back to "Unknown <Column>"
//   - Duration / Popularity fall back to 0
//   - Release Date falls back to 2000-01-01
// All values are kept as Strings so each row has a uniform Map[String, String] shape.
val cleanedData: List[Map[String, String]] = data.map { row =>
  val rawData = header.zip(row).toMap

  // Value of a column only when it is present and not blank.
  def field(name: String): Option[String] = rawData.get(name).filter(_.trim.nonEmpty)

  // Integer column rendered back to a String; anything that is not a valid Int becomes 0.
  def intField(name: String): String =
    field(name).flatMap(_.toIntOption).getOrElse(0).toString

  // ISO-8601 date column; an invalid or missing date becomes 2000-01-01.
  def dateField(name: String): String =
    field(name)
      .flatMap(text => Try(LocalDate.parse(text)).toOption)
      .getOrElse(LocalDate.of(2000, 1, 1))
      .toString

  Map(
    "Title" -> field("Title").getOrElse("Unknown Title"),
    "Artist" -> field("Artist").getOrElse("Unknown Artist"),
    "Album" -> field("Album").getOrElse("Unknown Album"),
    "Genre" -> field("Genre").getOrElse("Unknown Genre"),
    "Duration" -> intField("Duration"),
    "Popularity" -> intField("Popularity"),
    "Release Date" -> dateField("Release Date")
  )
}

[36mcleanedData[39m: [32mList[39m[[32mMap[39m[[32mString[39m, [32mString[39m]] = [33mList[39m(
  [33mHashMap[39m(
    [32m"Title"[39m -> [32m"Include name this."[39m,
    [32m"Duration"[39m -> [32m"262"[39m,
    [32m"Popularity"[39m -> [32m"71"[39m,
    [32m"Album"[39m -> [32m"Care."[39m,
    [32m"Artist"[39m -> [32m"Patrick Anderson"[39m,
    [32m"Release Date"[39m -> [32m"2008-01-09"[39m,
    [32m"Genre"[39m -> [32m"R&B"[39m
  ),
  [33mHashMap[39m(
    [32m"Title"[39m -> [32m"Manage west energy."[39m,
    [32m"Duration"[39m -> [32m"187"[39m,
    [32m"Popularity"[39m -> [32m"37"[39m,
    [32m"Album"[39m -> [32m"Raise get."[39m,
    [32m"Artist"[39m -> [32m"Eric Miller"[39m,
    [32m"Release Date"[39m -> [32m"2011-08-20"[39m,
    [32m"Genre"[39m -> [32m"Jazz"[39m
  ),
  [33mHashMap[39m(
    [32m"Title"[39m -> [32m"Evening court painting."[39m,
    [32m"Duration"[39m -> [32m"212"[39m,
    [32m"Popularity"

In [34]:
// Printing Results

// Column names joined with " | "
println(header.mkString(" | "))

// 50-character horizontal rule between the header and the rows
println(Seq.fill(50)("-").mkString)

// Preview: the first 5 cleaned rows, cells emitted in header order
// (a column missing from a row prints as an empty cell)
for (song <- cleanedData.take(5)) {
  val cells = header.map(column => song.getOrElse(column, ""))
  println(cells.mkString(" | "))
}

Title | Artist | Album | Genre | Release Date | Duration | Popularity
--------------------------------------------------
Include name this. | Patrick Anderson | Care. | R&B | 2008-01-09 | 262 | 71
Manage west energy. | Eric Miller | Raise get. | Jazz | 2011-08-20 | 187 | 37
Evening court painting. | Richard Curry | Sport. | Electronic | 2010-05-30 | 212 | 58
Section turn hour. | James Smith | Full. | Hip-Hop | 2014-10-12 | 272 | 59
Five agreement teach. | Amy Rodriguez | Eat. | Blues | 2005-06-09 | 131 | 34


### Data Cleaning - Part 2

In [35]:
// Derive four extra columns for every cleaned row:
//   "Popularity Category" - High (>= 70), Medium (>= 40), otherwise Low
//   "Release Year"        - year component of "Release Date"
//   "Title Normalized" / "Artist Normalized" - each space-separated word capitalised
val transformedData = cleanedData.map { song =>
  // Upper-cases the first character of every space-separated word.
  def capitalizeWords(text: String): String =
    text.split(" ").map(_.capitalize).mkString(" ")

  val popularity = song("Popularity").toInt
  val popularityCategory =
    if (popularity >= 70) "High"
    else if (popularity >= 40) "Medium"
    else "Low"

  val releaseYear = LocalDate.parse(song("Release Date")).getYear.toString

  val titleNormalized = capitalizeWords(song("Title"))
  val artistNormalized = capitalizeWords(song("Artist"))

  // Original columns are kept; the derived ones are appended.
  song ++ Seq(
    "Popularity Category" -> popularityCategory,
    "Release Year" -> releaseYear,
    "Title Normalized" -> titleNormalized,
    "Artist Normalized" -> artistNormalized
  )
}

// Column order for the transformed table: the original header followed by the derived columns
val transformedHeader = header ++ List(
  "Popularity Category",
  "Release Year",
  "Title Normalized",
  "Artist Normalized"
)

// Header line, a 50-character rule, then the first 5 transformed rows
println(transformedHeader.mkString(" | "))
println(Seq.fill(50)("-").mkString)
for (song <- transformedData.take(5)) {
  val cells = transformedHeader.map(column => song.getOrElse(column, ""))
  println(cells.mkString(" | "))
}


Title | Artist | Album | Genre | Release Date | Duration | Popularity | Popularity Category | Release Year | Title Normalized | Artist Normalized
--------------------------------------------------
Include name this. | Patrick Anderson | Care. | R&B | 2008-01-09 | 262 | 71 | High | 2008 | Include Name This. | Patrick Anderson
Manage west energy. | Eric Miller | Raise get. | Jazz | 2011-08-20 | 187 | 37 | Low | 2011 | Manage West Energy. | Eric Miller
Evening court painting. | Richard Curry | Sport. | Electronic | 2010-05-30 | 212 | 58 | Medium | 2010 | Evening Court Painting. | Richard Curry
Section turn hour. | James Smith | Full. | Hip-Hop | 2014-10-12 | 272 | 59 | Medium | 2014 | Section Turn Hour. | James Smith
Five agreement teach. | Amy Rodriguez | Eat. | Blues | 2005-06-09 | 131 | 34 | Low | 2005 | Five Agreement Teach. | Amy Rodriguez


[36mtransformedData[39m: [32mList[39m[[32mMap[39m[[32mString[39m, [32mString[39m]] = [33mList[39m(
  [33mHashMap[39m(
    [32m"Title"[39m -> [32m"Include name this."[39m,
    [32m"Duration"[39m -> [32m"262"[39m,
    [32m"Title Normalized"[39m -> [32m"Include Name This."[39m,
    [32m"Popularity"[39m -> [32m"71"[39m,
    [32m"Album"[39m -> [32m"Care."[39m,
    [32m"Artist"[39m -> [32m"Patrick Anderson"[39m,
    [32m"Release Year"[39m -> [32m"2008"[39m,
    [32m"Release Date"[39m -> [32m"2008-01-09"[39m,
    [32m"Genre"[39m -> [32m"R&B"[39m,
    [32m"Artist Normalized"[39m -> [32m"Patrick Anderson"[39m,
    [32m"Popularity Category"[39m -> [32m"High"[39m
  ),
  [33mHashMap[39m(
    [32m"Title"[39m -> [32m"Manage west energy."[39m,
    [32m"Duration"[39m -> [32m"187"[39m,
    [32m"Title Normalized"[39m -> [32m"Manage West Energy."[39m,
    [32m"Popularity"[39m -> [32m"37"[39m,
    [32m"Album"[39m -> [32m"Raise 

### Data Cleaning - Part 3

In [None]:
// Group by Genre and Release Year to calculate average Popularity and song count.
// Result: (genre, year) -> Map with
//   "Average Popularity by Genre-Year" - mean popularity, rounded half-up to one decimal
//   "Song Count by Genre-Year"         - number of songs in the group
// The mean is computed in floating point: plain Int division would truncate it
// (e.g. popularities 71 and 72 would report 71 instead of 71.5).
val genreYearStats: Map[(String, String), Map[String, String]] = transformedData
  .groupMap(row => (row("Genre"), row("Release Year")))(_("Popularity").toInt)
  .map { case (genreYear, popularities) =>
    // groupMap never yields an empty group, so the division below is safe.
    val songCount = popularities.size
    val avgPopularity = BigDecimal(popularities.map(_.toLong).sum.toDouble / songCount)
      .setScale(1, BigDecimal.RoundingMode.HALF_UP)

    genreYear -> Map(
      "Average Popularity by Genre-Year" -> avgPopularity.toString,
      "Song Count by Genre-Year" -> songCount.toString
    )
  }

// Convert grouped stats to a list of flat maps (group key columns + stats) for merging
val genreYearStatsData: List[Map[String, String]] = genreYearStats.toList.map {
  case ((genre, year), stats) =>
    Map(
      "Genre" -> genre,
      "Release Year" -> year,
      "Average Popularity by Genre-Year" -> stats("Average Popularity by Genre-Year"),
      "Song Count by Genre-Year" -> stats("Song Count by Genre-Year")
    )
}

// Column order for the genre/year statistics preview
val statsHeader = Seq(
  "Genre",
  "Release Year",
  "Average Popularity by Genre-Year",
  "Song Count by Genre-Year"
)

// Header line, a 60-character rule, then the first 5 statistic rows
println(statsHeader.mkString(" | "))
println(Seq.fill(60)("-").mkString)
for (statsRow <- genreYearStatsData.take(5)) {
  val cells = statsHeader.map(column => statsRow.getOrElse(column, ""))
  println(cells.mkString(" | "))
}

Genre | Popularity Category | Average Duration | Average Popularity | Count
--------------------------------------------------
Classical | High | 208 | 85 | 1573
Rock | Medium | 211 | 54 | 1432
Blues | Low | 210 | 19 | 2060
Reggae | High | 212 | 85 | 1583
Rock | Low | 212 | 19 | 1945


[36mgenrePopularityStats[39m: [32mMap[39m[([32mString[39m, [32mString[39m), [32mMap[39m[[32mString[39m, [32mString[39m]] = [33mHashMap[39m(
  ([32m"Classical"[39m, [32m"High"[39m) -> [33mMap[39m(
    [32m"Average Duration"[39m -> [32m"208"[39m,
    [32m"Average Popularity"[39m -> [32m"85"[39m,
    [32m"Count"[39m -> [32m"1573"[39m
  ),
  ([32m"Rock"[39m, [32m"Medium"[39m) -> [33mMap[39m(
    [32m"Average Duration"[39m -> [32m"211"[39m,
    [32m"Average Popularity"[39m -> [32m"54"[39m,
    [32m"Count"[39m -> [32m"1432"[39m
  ),
  ([32m"Blues"[39m, [32m"Low"[39m) -> [33mMap[39m(
    [32m"Average Duration"[39m -> [32m"210"[39m,
    [32m"Average Popularity"[39m -> [32m"19"[39m,
    [32m"Count"[39m -> [32m"2060"[39m
  ),
  ([32m"Reggae"[39m, [32m"High"[39m) -> [33mMap[39m(
    [32m"Average Duration"[39m -> [32m"212"[39m,
    [32m"Average Popularity"[39m -> [32m"85"[39m,
    [32m"Count"[39m -> [32m"1583"[

### Data Cleaning - Part 4

In [None]:
// Index the statistic rows by their (Genre, Release Year) pair for lookup during the merge
val genreYearStatsMap = genreYearStatsData.iterator
  .map(statsRow => ((statsRow("Genre"), statsRow("Release Year")), statsRow))
  .toMap

// Attach the matching genre/year statistics to every song; a song whose group has no
// statistics gets the placeholders "N/A" (average) and "0" (count) instead
val mergedData = transformedData.map { song =>
  val genreYear = (song("Genre"), song("Release Year"))

  genreYearStatsMap.get(genreYear) match {
    case Some(stats) => song ++ stats
    case None =>
      song ++ Map(
        "Average Popularity by Genre-Year" -> "N/A",
        "Song Count by Genre-Year" -> "0"
      )
  }
}

// Column order for the merged table: transformed columns followed by the genre/year statistics
val mergedHeader = transformedHeader ++ Seq(
  "Average Popularity by Genre-Year",
  "Song Count by Genre-Year"
)

// Header line, an 80-character rule, then the first 5 merged rows
println(mergedHeader.mkString(" | "))
println(Seq.fill(80)("-").mkString)
for (song <- mergedData.take(5)) {
  val cells = mergedHeader.map(column => song.getOrElse(column, ""))
  println(cells.mkString(" | "))
}

Title | Artist | Album | Genre | Release Date | Duration | Popularity | Popularity Category | Release Year | Title Normalized | Artist Normalized | Average Duration | Average Popularity | Count
--------------------------------------------------
Include name this. | Patrick Anderson | Care. | R&B | 2008-01-09 | 262 | 71 | High | 2008 | Include Name This. | Patrick Anderson | 211 | 84 | 1552
Manage west energy. | Eric Miller | Raise get. | Jazz | 2011-08-20 | 187 | 37 | Low | 2011 | Manage West Energy. | Eric Miller | 209 | 19 | 1911
Evening court painting. | Richard Curry | Sport. | Electronic | 2010-05-30 | 212 | 58 | Medium | 2010 | Evening Court Painting. | Richard Curry | 208 | 54 | 1459
Section turn hour. | James Smith | Full. | Hip-Hop | 2014-10-12 | 272 | 59 | Medium | 2014 | Section Turn Hour. | James Smith | 208 | 54 | 1467
Five agreement teach. | Amy Rodriguez | Eat. | Blues | 2005-06-09 | 131 | 34 | Low | 2005 | Five Agreement Teach. | Amy Rodriguez | 210 | 19 | 2060


[36manalysisMap[39m: [32mMap[39m[([32mString[39m, [32mString[39m), [32mMap[39m[[32mString[39m, [32mString[39m]] = [33mHashMap[39m(
  ([32m"Classical"[39m, [32m"High"[39m) -> [33mHashMap[39m(
    [32m"Count"[39m -> [32m"1573"[39m,
    [32m"Genre"[39m -> [32m"Classical"[39m,
    [32m"Popularity Category"[39m -> [32m"High"[39m,
    [32m"Average Duration"[39m -> [32m"208"[39m,
    [32m"Average Popularity"[39m -> [32m"85"[39m
  ),
  ([32m"Rock"[39m, [32m"Medium"[39m) -> [33mHashMap[39m(
    [32m"Count"[39m -> [32m"1432"[39m,
    [32m"Genre"[39m -> [32m"Rock"[39m,
    [32m"Popularity Category"[39m -> [32m"Medium"[39m,
    [32m"Average Duration"[39m -> [32m"211"[39m,
    [32m"Average Popularity"[39m -> [32m"54"[39m
  ),
  ([32m"Blues"[39m, [32m"Low"[39m) -> [33mHashMap[39m(
    [32m"Count"[39m -> [32m"2060"[39m,
    [32m"Genre"[39m -> [32m"Blues"[39m,
    [32m"Popularity Category"[39m -> [32m"Low"[39m,
    