# Lexical Diversity Analysis - Data Prep

Transform Billboard data for lexical diversity analysis

Takes cleaned Billboard + MusicoSet data and calculates text metrics

In [None]:
library(tidyverse)
library(tidytext)
library(stringr)
library(readr)

# Heavily penalize scientific notation so numbers print in fixed notation.
options(scipen = 999)

## Load data

In [None]:
# Load the cleaned Billboard table (one row per charted song).
# `FALSE` is spelled out because `F` is an ordinary, reassignable variable.
billboard <- read_csv(
  "../../data/cleaned/billboard_24years_lyrics_spotify_bigquery.csv",
  show_col_types = FALSE
)

# Load the cleaned MusicoSet artist table; it provides the genre columns
# that are joined onto the Billboard rows.
artists <- read_csv(
  "../../data/cleaned/musicoset_artists_cleaned.csv",
  show_col_types = FALSE
)

# Report the row counts of both inputs.
sprintf("loaded %d songs, %d artists", nrow(billboard), nrow(artists))

## Join with genres

In [None]:
# Attach MusicoSet genres to every Billboard row by matching on a normalized
# (trimmed, lower-cased) artist name.
df <- billboard %>%
  mutate(artist_clean = str_to_lower(str_trim(band_singer))) %>%
  left_join(
    artists %>%
      mutate(artist_clean = str_to_lower(str_trim(name))) %>%
      select(artist_clean, main_genre, genres) %>%
      # Two artists can collapse to the same normalized name. Without
      # de-duplication the left join would emit one output row per match,
      # duplicating songs and inflating every downstream word count.
      # Rows that carry a genre are sorted first so they win the tie.
      arrange(is.na(main_genre)) %>%
      distinct(artist_clean, .keep_all = TRUE),
    by = "artist_clean"
  ) %>%
  # TRUE when a genre was found for the song's artist.
  mutate(has_genre = !is.na(main_genre))

# Share of songs that received a genre.
cat(sprintf("genre coverage: %d/%d songs (%.1f%%)\n",
            sum(df$has_genre), nrow(df), mean(df$has_genre) * 100))

# Ten most frequent micro-genres (NA = no genre match).
df %>%
  count(main_genre, sort = TRUE) %>%
  head(10)

## Map to macro genres

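There are too many micro-genres (159) to analyze individually, so group them into broader macro-genres. Songs whose genre is known but not listed in the mapping are labelled OTHER.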

In [None]:
# Lookup table from micro-genre label (matched against `main_genre`) to one of
# six macro-genres. Built one macro-genre at a time; the row order and the
# column order (micro_genre, macro_genre) are part of the exported CSV.
genre_map <- bind_rows(
  tibble(
    micro_genre = c(
      "dance pop", "pop", "canadian pop", "australian pop", "hip pop",
      "electropop", "post-teen pop", "indie poptimism", "pop r&b", "boy band"
    ),
    macro_genre = "POP"
  ),
  tibble(
    micro_genre = c(
      "atl hip hop", "hip hop", "canadian hip hop", "chicago rap",
      "detroit hip hop", "dirty south rap", "dfw rap", "pop rap", "rap",
      "trap", "southern hip hop", "east coast hip hop", "west coast rap",
      "gangster rap", "conscious hip hop", "underground hip hop"
    ),
    macro_genre = "HIP HOP"
  ),
  tibble(
    micro_genre = c(
      "contemporary country", "country", "country road",
      "modern country rock", "country pop"
    ),
    macro_genre = "COUNTRY"
  ),
  # Rock and metal share one bucket.
  tibble(
    micro_genre = c(
      "alternative metal", "rock", "indie rock", "alternative rock",
      "pop rock", "modern rock", "post-grunge", "nu metal", "hard rock",
      "punk"
    ),
    macro_genre = "ROCK"
  ),
  tibble(
    micro_genre = c(
      "canadian contemporary r&b", "neo mellow", "r&b", "soul",
      "urban contemporary", "contemporary r&b", "quiet storm"
    ),
    macro_genre = "R&B"
  ),
  tibble(
    micro_genre = c(
      "edm", "house", "electro house", "tropical house", "big room",
      "progressive house", "brostep", "complextro", "moombahton"
    ),
    macro_genre = "ELECTRONIC"
  )
)

# Attach the macro-genre for each song's main_genre.
#   - main_genre present in genre_map     -> its macro-genre
#   - main_genre known but not in the map -> "OTHER"
#   - no genre at all (has_genre FALSE)   -> NA
df <- df %>%
  left_join(genre_map, by = c("main_genre" = "micro_genre")) %>%
  mutate(
    macro_genre = if_else(is.na(macro_genre) & has_genre, "OTHER", macro_genre)
  )

# Songs per macro-genre, largest first.
df %>% count(macro_genre, sort = TRUE)

## Tokenize lyrics

In [None]:
# Split lyrics into one row per word. Songs without lyrics are dropped;
# unnest_tokens() lower-cases the text by default.
tokens <- df %>%
  filter(!is.na(lyrics)) %>%
  select(song, band_singer, year, lyrics, ranking, macro_genre, main_genre) %>%
  unnest_tokens(word, lyrics) %>%
  mutate(
    # Drop digits.
    word = str_replace_all(word, "\\d+", ""),
    # Strip a trailing possessive/contracted "'s". Lyrics often use the
    # typographic apostrophe (U+2019); matching only the ASCII one would turn
    # "it’s" into "its" while "it's" becomes "it", splitting one word into
    # two types and skewing the unique-word counts.
    word = str_replace_all(word, "['\u2019]s$", ""),
    # Keep ASCII letters only (removes remaining apostrophes and punctuation).
    word = str_replace_all(word, "[^a-z]", "")
  ) %>%
  # Tokens made up only of stripped characters are now empty.
  filter(word != "")

sprintf("%s words total", format(nrow(tokens), big.mark = ","))

In [None]:
# Tag stop words.
data(stop_words)

# The tokens have already had digits, a trailing "'s" and every non-letter
# removed, so "don't" is stored as "dont". The stop-word lexicon still holds
# the apostrophe forms, which means contractions would never match and
# lexical_density would be overstated. Normalize the lexicon with the same
# rules before matching.
stop_lookup <- stop_words$word %>%
  str_replace_all("\\d+", "") %>%
  str_replace_all("'s$", "") %>%
  str_replace_all("[^a-z]", "") %>%
  unique() %>%
  setdiff("")

tokens <- tokens %>%
  mutate(is_stop = word %in% stop_lookup)

sprintf("stop words: %.1f%%", mean(tokens$is_stop) * 100)

## Calculate metrics per song

In [None]:
# Per-song lexical metrics. A song is identified by (song, band_singer, year).
metrics <- tokens %>%
  group_by(song, band_singer, year) %>%
  summarise(
    total_words = n(),                             # token count
    unique_words = n_distinct(word),               # type count
    ttr = unique_words / total_words,              # type-token ratio
    content_words = sum(!is_stop),                 # tokens not tagged as stop words
    lexical_density = content_words / total_words,
    repetition_rate = 1 - ttr,
    avg_word_length = mean(nchar(word)),
    .groups = "drop"
  )

# Hapax legomena: words that occur exactly once within a song.
# hapax_ratio is the share of a song's distinct words that are hapaxes.
hapax <- tokens %>%
  count(song, band_singer, year, word) %>%
  group_by(song, band_singer, year) %>%
  summarise(
    hapax_count = sum(n == 1),
    hapax_ratio = mean(n == 1),
    .groups = "drop"
  )

# Join on the explicit song key rather than on whatever columns the two
# tables happen to share; this also avoids the "Joining with `by = ...`"
# message.
metrics <- metrics %>%
  left_join(hapax, by = c("song", "band_singer", "year"))

# Distribution summary of every metric column.
metrics %>%
  select(total_words:hapax_ratio) %>%
  summary()

## Add features

In [None]:
# Join the per-song metrics back onto the full table and derive chart, time
# and length features. Songs without lyrics keep NA metrics.
df_final <- df %>%
  # Explicit key: an implicit natural join would silently also join on any
  # other column name the two tables happened to share.
  left_join(metrics, by = c("song", "band_singer", "year")) %>%
  mutate(
    # Chart categories; rankings above 100 or missing stay NA.
    chart_tier = case_when(
      ranking <= 10 ~ "Top 10",
      ranking <= 25 ~ "11-25",
      ranking <= 50 ~ "26-50",
      ranking <= 75 ~ "51-75",
      ranking <= 100 ~ "76-100"
    ),
    is_top10 = ranking <= 10,
    is_top25 = ranking <= 25,
    chart_score = 101 - ranking, # higher = better chart position

    # Time periods. case_when() treats an NA condition as FALSE, so a missing
    # year would otherwise fall through to the catch-all branch and be
    # labelled "2020s" / "Late"; keep it NA instead.
    decade = case_when(
      is.na(year) ~ NA_character_,
      year < 2010 ~ "2000s",
      year < 2020 ~ "2010s",
      TRUE ~ "2020s"
    ),
    era = case_when(
      is.na(year) ~ NA_character_,
      year < 2008 ~ "Early",
      year < 2015 ~ "Middle",
      TRUE ~ "Late"
    ),
    years_since_2000 = year - 2000,

    # Length flags (NA when the song has no metrics).
    is_short = total_words < 100,
    is_long = total_words > 600,
    is_normal_length = total_words >= 100 & total_words <= 600,
    # TRUE only with a macro-genre, a normal length and a TTR value.
    has_complete_data = !is.na(macro_genre) & is_normal_length & !is.na(ttr)
  )

## Normalized scores

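Standardize each song's TTR as a z-score relative to its macro-genre mean and its chart-year mean, then add log-transformed word counts.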

In [None]:
# Mean (m) and standard deviation (s) of TTR within each macro-genre,
# using only songs that have both a macro-genre and a TTR value.
genre_stats <- df_final %>%
  filter(!is.na(macro_genre), !is.na(ttr)) %>%
  group_by(macro_genre) %>%
  summarise(m = mean(ttr), s = sd(ttr), .groups = "drop")

# z-score of TTR against the song's macro-genre; the helper columns are
# dropped in the same step.
df_final <- df_final %>%
  left_join(genre_stats, by = "macro_genre") %>%
  mutate(
    ttr_z_genre = (ttr - m) / s,
    m = NULL,
    s = NULL
  )

# Mean (m) and standard deviation (s) of TTR within each chart year.
year_stats <- df_final %>%
  filter(!is.na(ttr)) %>%
  group_by(year) %>%
  summarise(m = mean(ttr), s = sd(ttr), .groups = "drop")

# z-score of TTR against the song's year, followed by the log(x + 1)
# transforms of the word counts.
df_final <- df_final %>%
  left_join(year_stats, by = "year") %>%
  mutate(
    ttr_z_year = (ttr - m) / s,
    m = NULL,
    s = NULL,
    log_words = log(total_words + 1),
    log_unique = log(unique_words + 1)
  )

## Quality check

In [None]:
# Row count and share of rows usable for analysis.
cat(sprintf("total: %d\n", nrow(df_final)))
cat(sprintf(
  "complete data: %d (%.1f%%)\n",
  sum(df_final$has_complete_data, na.rm = TRUE),
  mean(df_final$has_complete_data, na.rm = TRUE) * 100
))

# Songs per length bucket. The flags are NA for songs without lyrics, hence
# na.rm; TRUE is spelled out because `T` is a reassignable variable.
cat("\nlength distribution:\n")
cat(sprintf("  short: %d\n", sum(df_final$is_short, na.rm = TRUE)))
cat(sprintf("  normal: %d\n", sum(df_final$is_normal_length, na.rm = TRUE)))
cat(sprintf("  long: %d\n", sum(df_final$is_long, na.rm = TRUE)))

## Export

In [None]:
# Keep only the analysis columns, in their export order.
final <- df_final %>%
  select(
    # identifiers
    song, band_singer, year,
    # genre
    main_genre, macro_genre,
    # chart performance
    ranking, chart_tier, chart_score, is_top10, is_top25,
    # time
    decade, era, years_since_2000,
    # lexical metrics
    total_words, unique_words, ttr, lexical_density, repetition_rate,
    avg_word_length, hapax_count, hapax_ratio,
    # normalized / transformed metrics
    ttr_z_genre, ttr_z_year, log_words, log_unique,
    # quality flags
    is_short, is_long, is_normal_length, has_complete_data,
    # raw text
    lyrics
  )

# Write the analysis table and the genre lookup next to the cleaned inputs.
write_csv(final, "../../data/cleaned/billboard_lexical_analysis_ready.csv")
write_csv(genre_map, "../../data/cleaned/genre_macro_mapping.csv")

cat("\nexported files:\n")
cat("  billboard_lexical_analysis_ready.csv\n")
cat("  genre_macro_mapping.csv\n")

# Headline counts for the exported table. TRUE is spelled out because `T` is
# a reassignable variable.
tibble(
  metric = c("total_songs", "with_genre", "complete_data", "years", "macro_genres"),
  value = c(
    nrow(final),
    sum(!is.na(final$macro_genre)),
    sum(final$has_complete_data, na.rm = TRUE),
    n_distinct(final$year),
    n_distinct(final$macro_genre, na.rm = TRUE)
  )
)

In [None]:
# Show the first ten songs that passed every completeness check,
# restricted to the headline columns.
final %>%
  filter(has_complete_data) %>%
  head(10) %>%
  select(song, band_singer, macro_genre, ranking, ttr, total_words)

---

Done. Ready for analysis.