# MusicoSet Metadata Cleaning

Clean the tab-separated MusicoSet metadata files (artists and songs): trim surrounding whitespace and convert standalone `-` and `[]` values to NA.

In [15]:
library(tidyverse)
library(readr)

In [16]:
# Relative input and output paths used throughout the notebook.
artists_in <- "../../data/musicoset_metadata/artists.csv"
songs_in <- "../../data/musicoset_metadata/songs.csv"
artists_out <- "../../data/cleaned/musicoset_artists_cleaned.csv"
songs_out <- "../../data/cleaned/musicoset_songs_cleaned.csv"

# load artists: the file is read as tab-delimited; empty strings, "NA" and a
# bare "-" are treated as missing values.
artists <- read_delim(artists_in, delim = "\t", na = c("", "NA", "-"))

# readr only emits a generic "parsing issues" warning. Count the affected
# cells explicitly so malformed input is not carried silently into the output.
n_artist_problems <- nrow(problems(artists))
if (n_artist_problems > 0) {
  warning(
    sprintf(
      "%d parsing issue(s) in %s; inspect with problems(artists)",
      n_artist_problems, artists_in
    ),
    call. = FALSE
  )
}

sprintf("%d artists loaded", nrow(artists))

"[1m[22mOne or more parsing issues, call `problems()` on your data frame for details,
e.g.:
  dat <- vroom(...)
  problems(dat)"
[1mRows: [22m[34m11518[39m [1mColumns: [22m[34m8[39m
[36m--[39m [1mColumn specification[22m [36m--------------------------------------------------------[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (6): artist_id, name, artist_type, main_genre, genres, image_url
[32mdbl[39m (2): followers, popularity

[36mi[39m Use `spec()` to retrieve the full column specification for this data.
[36mi[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


In [17]:
# Column-wise overview of the artists table as loaded
artists %>% glimpse()

Rows: 11,518
Columns: 8
$ artist_id   <chr> "66CXWjxzNUsdJxJ2JdwvnR", "26VFTg2z8YR0cCuwLzESi2", "0Y5tJ~
$ name        <chr> "Ariana Grande", "Halsey", "Travis Scott", "Post Malone", ~
$ followers   <dbl> 34554242, 7368242, 6313709, 16737002, 483032, 15566666, 37~
$ popularity  <dbl> 96, 90, 94, 96, 89, 91, 85, 88, 81, 89, 86, 90, 88, 62, 55~
$ artist_type <chr> "singer", "singer", "rapper", "rapper", "singer", "DJ", "b~
$ main_genre  <chr> "dance pop", "dance pop", "pop", "dfw rap", "trap music", ~
$ genres    

In [18]:
# Read the songs table (tab-delimited; "", "NA" and "-" are parsed as NA)
songs <- songs_in %>%
  read_delim(delim = "\t", na = c("", "NA", "-"))
songs %>%
  nrow() %>%
  sprintf("%d songs loaded", .)

[1mRows: [22m[34m20405[39m [1mColumns: [22m[34m7[39m
[36m--[39m [1mColumn specification[22m [36m--------------------------------------------------------[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (5): song_id, song_name, billboard, artists, song_type
[32mdbl[39m (1): popularity
[33mlgl[39m (1): explicit

[36mi[39m Use `spec()` to retrieve the full column specification for this data.
[36mi[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


In [19]:
# Column-wise overview of the songs table as loaded
songs %>% glimpse()

Rows: 20,405
Columns: 7
$ song_id    <chr> "3e9HZxeyfWwjeyPAMmWSSQ", "5p7ujcrUXASCNwRaWNHR1C", "2xLMif~
$ song_name  <chr> "thank u, next", "Without Me", "SICKO MODE", "Sunflower - S~
$ billboard  <chr> "('Thank U, Next', 'Ariana Grande')", "('Without Me', 'Hals~
$ artists    <chr> "{'66CXWjxzNUsdJxJ2JdwvnR': 'Ariana Grande'}", "{'26VFTg2z8~
$ popularity <dbl> 86, 87, 85, 92, 86, 63, 52, 53, 3, 51, 81, 48, 78, 77, 86, ~
$ explicit   <lgl> TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE,~
$ song_type  [3m[90m<chr>[39m[23m "Solo"[90m, [39m"Solo"[90m, [3

In [29]:
# Number of missing values in each artists2 column, most-missing first.
# artists2 is created in the "clean artists" cell, which has to run before this.
artists2 %>%
  summarise(across(everything(), function(col) sum(is.na(col)))) %>%
  pivot_longer(
    cols = everything(),
    names_to = "name",
    values_to = "value"
  ) %>%
  arrange(desc(value))

name,value
<chr>,<int>
artist_type,4466
main_genre,3148
genres,3148
image_url,504
followers,2
artist_id,0
name,0
popularity,0


In [30]:
# verify no standalone dashes or empty brackets remain in any character column
# (%in% never yields NA, so already-missing cells cannot affect the predicate)
check <- artists2 %>%
  filter(if_any(where(is.character), ~ .x %in% c("-", "[]")))
if (nrow(check) == 0) {
  print("✓ No dashes or []")
} else {
  # Fail loudly: a verification step that stays silent on failure is easy to
  # mistake for a pass.
  stop(
    sprintf(
      "%d artist row(s) still contain a standalone '-' or '[]'",
      nrow(check)
    ),
    call. = FALSE
  )
}

[1] "<U+2713> No dashes or []"


In [20]:
# clean artists - trim and fix dashes/empty lists
#' Convert placeholder strings to NA
#'
#' Elements of a character vector that are exactly equal to one of
#' `na_tokens` become `NA_character_`; every other element is left untouched.
#' Matching is exact, so surrounding whitespace must be trimmed beforehand.
#'
#' @param x A vector. Non-character input is returned unchanged.
#' @param na_tokens Character vector of placeholder values to treat as
#'   missing. Defaults to a lone dash and an empty list literal.
#' @return `x`; if `x` is character, a character vector of the same length
#'   with placeholder elements replaced by `NA`.
fix_dash <- function(x, na_tokens = c("-", "[]")) {
  if (!is.character(x)) {
    return(x)
  }
  # Subset-assignment keeps the vector's character type. ifelse() takes its
  # result type from the logical test when that test is empty or all NA, which
  # would turn an empty or all-NA character column into a logical one.
  # %in% never returns NA, so existing NAs are simply left as they are.
  x[x %in% na_tokens] <- NA_character_
  x
}

# For every character column: strip surrounding whitespace, then blank out
# placeholder values. Trimming comes first because fix_dash only matches a
# placeholder that makes up the whole string.
artists2 <- mutate(
  artists,
  across(where(is.character), ~ fix_dash(str_trim(.x)))
)

sprintf("Cleaned %d artists", nrow(artists2))

In [21]:
# Missing values per column after cleaning (NAs from the reader plus the
# "-" / "[]" placeholders blanked by fix_dash), most-missing first
artists2 %>%
  summarise(across(everything(), function(col) sum(is.na(col)))) %>%
  pivot_longer(
    cols = everything(),
    names_to = "name",
    values_to = "value"
  ) %>%
  arrange(desc(value))

name,value
<chr>,<int>
artist_type,4466
main_genre,3148
genres,3148
image_url,504
followers,2
artist_id,0
name,0
popularity,0


In [22]:
# verify no standalone dashes or empty brackets remain in any character column
# (%in% never yields NA, so already-missing cells cannot affect the predicate)
check <- artists2 %>%
  filter(if_any(where(is.character), ~ .x %in% c("-", "[]")))
if (nrow(check) == 0) {
  print("✓ No dashes or []")
} else {
  # Fail loudly: a verification step that stays silent on failure is easy to
  # mistake for a pass.
  stop(
    sprintf(
      "%d artist row(s) still contain a standalone '-' or '[]'",
      nrow(check)
    ),
    call. = FALSE
  )
}

[1] "<U+2713> No dashes or []"


In [23]:
# Clean songs the same way as artists: per character column, trim whitespace
# and then turn placeholder values into NA
songs2 <- mutate(
  songs,
  across(where(is.character), ~ fix_dash(str_trim(.x)))
)

sprintf("Cleaned %d songs", nrow(songs2))

In [24]:
# song type breakdown: rows per song_type, largest group first
# (TRUE spelled out; `T` is an ordinary variable that can be reassigned)
songs2 %>% count(song_type, sort = TRUE)

song_type,n
<chr>,<int>
Solo,18978
Collaboration,1427


In [25]:
# export
# Derive the output directory from the output paths themselves so the two
# cannot drift apart; recursive = TRUE also creates missing parent folders,
# and showWarnings = FALSE keeps re-runs quiet when the folder already exists.
for (out_dir in unique(dirname(c(artists_out, songs_out)))) {
  dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)
}

# NA is written as an empty field; fields are quoted only when necessary.
write_csv(artists2, artists_out, na = "", quote = "needed")
write_csv(songs2, songs_out, na = "", quote = "needed")

sprintf("Wrote artists: %s", artists_out)
sprintf("Wrote songs: %s", songs_out)

In [26]:
# verify files: count the physical lines of each written CSV and report them
# next to the expected figure (one header line plus one line per data row)
a_lines <- artists_out %>%
  readLines() %>%
  length()
s_lines <- songs_out %>%
  readLines() %>%
  length()

sprintf(
  "Artists: %d lines (expected %d)",
  a_lines, nrow(artists2) + 1
)
sprintf(
  "Songs: %d lines (expected %d)",
  s_lines, nrow(songs2) + 1
)

In [27]:
# First ten cleaned artists, restricted to the headline columns
artists2 %>%
  head(n = 10) %>%
  select(name, followers, popularity, artist_type, main_genre)

name,followers,popularity,artist_type,main_genre
<chr>,<dbl>,<dbl>,<chr>,<chr>
Ariana Grande,34554242,96,singer,dance pop
Halsey,7368242,90,singer,dance pop
Travis Scott,6313709,94,rapper,pop
Post Malone,16737002,96,rapper,dfw rap
Swae Lee,483032,89,singer,trap music
Marshmello,15566666,91,DJ,brostep
Bastille,3776115,85,band,metropopolis
Panic! At The Disco,7749228,88,band,baroque pop
Mariah Carey,4682308,81,singer,dance pop
Offset,1202706,89,singer,atl hip hop


In [28]:
# First ten cleaned songs, restricted to the headline columns
songs2 %>%
  head(n = 10) %>%
  select(song_name, billboard, popularity, explicit, song_type)

song_name,billboard,popularity,explicit,song_type
<chr>,<chr>,<dbl>,<lgl>,<chr>
"thank u, next","('Thank U, Next', 'Ariana Grande')",86,True,Solo
Without Me,"('Without Me', 'Halsey')",87,True,Solo
SICKO MODE,"('Sicko Mode', 'Travis Scott')",85,True,Solo
Sunflower - Spider-Man: Into the Spider-Verse,"('Sunflower (Spider-Man: Into The Spider-Verse)', 'Post Malone & Swae Lee')",92,False,Collaboration
High Hopes,"('High Hopes', 'Panic! At The Disco')",86,False,Solo
All I Want for Christmas Is You,"('All I Want For Christmas Is You', 'Mariah Carey')",63,False,Solo
It's the Most Wonderful Time of the Year,"(""It's The Most Wonderful Time Of The Year"", 'Andy Williams')",52,False,Solo
Rockin' Around The Christmas Tree,"(""Rockin' Around The Christmas Tree"", 'Brenda Lee')",53,False,Solo
A Holly Jolly Christmas,"('A Holly Jolly Christmas', 'Burl Ives')",3,False,Solo
Jingle Bell Rock,"('Jingle Bell Rock', 'Bobby Helms')",51,False,Solo
