# Lexical Diversity Analysis - Data Prep

Transform Billboard data for lexical diversity analysis

Takes cleaned Billboard + MusicoSet data and calculates text metrics

In [29]:
library(tidyverse)
library(tidytext)
library(stringr)
library(readr)

options(scipen = 999)

## Load data

In [30]:
# load billboard (using improved/cleaned version from bigquery cleaning)
billboard <- read_csv('../../data/cleaned/billboard_24years_lyrics_spotify_bigquery.csv',
                      show_col_types = F)

# load musicoset genres
artists <- read_csv('../../data/cleaned/musicoset_artists_cleaned.csv', 
                    show_col_types = F)

sprintf("loaded %d songs, %d artists", nrow(billboard), nrow(artists))

## Join with genres

In [31]:
# normalize names for matching
df <- billboard %>%
  mutate(artist_clean = str_to_lower(str_trim(band_singer))) %>%
  left_join(
    artists %>% 
      mutate(artist_clean = str_to_lower(str_trim(name))) %>%
      select(artist_clean, main_genre, genres),
    by = "artist_clean"
  ) %>%
  mutate(has_genre = !is.na(main_genre))

# check coverage
cat(sprintf("genre coverage: %d/%d songs (%.1f%%)\n", 
            sum(df$has_genre), nrow(df), mean(df$has_genre)*100))

# top genres
df %>% count(main_genre, sort=T) %>% head(10)

"[1m[22mDetected an unexpected many-to-many relationship between `x` and `y`.
[36mi[39m Row 573 of `x` matches multiple rows in `y`.
[36mi[39m Row 8366 of `y` matches multiple rows in `x`.
[36mi[39m If a many-to-many relationship is expected, set `relationship =


genre coverage: 2948/3399 songs (86.7%)


main_genre,n
<chr>,<int>
dance pop,982
,451
contemporary country,253
atl hip hop,250
pop,152
hip hop,97
alternative metal,76
canadian hip hop,75
canadian pop,56
chicago rap,51


In [32]:
# --- NA Diagnostic ---
cat("=== NA Analysis ===\n\n")

# Genre matching failures
no_genre <- df %>% filter(!has_genre)
cat(sprintf("Songs without genre match: %d (%.1f%%)\n", nrow(no_genre), nrow(no_genre)/nrow(df)*100))

# Sample unmatched artists
cat("\nSample unmatched artists (first 10):\n")
no_genre %>% 
  distinct(band_singer) %>% 
  head(10) %>% 
  pull(band_singer) %>% 
  paste(collapse = ", ") %>% 
  cat()

# Many-to-many duplicates
dupes <- df %>% 
  group_by(song, band_singer, year) %>% 
  filter(n() > 1) %>% 
  ungroup()
cat(sprintf("\n\nDuplicate rows from many-to-many join: %d\n", nrow(dupes)))

# Lyrics status
cat(sprintf("\nSongs with missing lyrics: %d\n", sum(is.na(df$lyrics))))

cat("\n\n=== This explains the NA sources ===\n")
cat("- main_genre NA: Artist name not found in MusicoSet\n")
cat("- Duplicates: Same artist name matches multiple MusicoSet entries\n")

=== NA Analysis ===

Songs without genre match: 451 (13.3%)

Sample unmatched artists (first 10):
The Product G&B, Sisq<U+00F3>, Pink, NSYNC, 'N Sync, Celine Dion, Sonique, Blink 182, 98 Degrees, Cheb Mami

Duplicate rows from many-to-many join: 4

Songs with missing lyrics: 0


=== This explains the NA sources ===
- main_genre NA: Artist name not found in MusicoSet
- Duplicates: Same artist name matches multiple MusicoSet entries


## Map to macro genres

Dynamically maps all ~1560 micro-genres from MusicoSet to 16 macro categories using regex pattern matching:

- **POP**: dance pop, indie pop, teen pop, k-pop, j-pop, etc.
- **HIP HOP**: trap, crunk, drill, regional rap scenes, emo rap, etc.
- **ROCK**: alternative, indie, punk, grunge, post-rock, etc.
- **METAL**: nu metal, metalcore, death metal, black metal, etc.
- **ELECTRONIC**: EDM, house, techno, dubstep, ambient, etc.
- **R&B**: soul, neo soul, funk, motown, quiet storm, etc.
- **COUNTRY**: contemporary country, americana, bluegrass, etc.
- **LATIN**: reggaeton, salsa, bachata, cumbia, urbano latino, etc.
- **JAZZ**: bebop, smooth jazz, fusion, big band, etc.
- **BLUES**: delta blues, chicago blues, electric blues, etc.
- **FOLK**: indie folk, singer-songwriter, celtic, traditional, etc.
- **CLASSICAL**: orchestra, opera, chamber, baroque, etc.
- **REGGAE**: ska, dub, dancehall, roots reggae, etc.
- **NEW AGE**: ambient, meditation, relaxation, etc.
- **AVANT-GARDE**: experimental, noise, drone, etc.
- **OTHER**: remaining niche/regional genres

In [None]:
# === DYNAMIC GENRE MAPPING ===
# Extract all unique genres from MusicoSet and classify using regex patterns

# Get all unique main_genre values from the joined data
all_genres <- df %>%
  filter(!is.na(main_genre)) %>%
  distinct(main_genre) %>%
  pull(main_genre)

cat(sprintf("Found %d unique micro-genres to classify\n\n", length(all_genres)))

# Define regex patterns for each macro genre (order matters - first match wins)
genre_patterns <- list(
  # POP - broad pop category
  "POP" = c("pop(?!.*punk)", "teen", "boy band", "girl group", "bubblegum", "candy"),
  
  # HIP HOP - rap and hip hop variants
  "HIP HOP" = c("hip hop", "\\brap\\b", "trap", "crunk", "drill", "g funk", "gangster",
                "conscious hip", "battle rap", "boom bap", "chopped and screwed",
                "dirty south", "hyphy", "phonk", "grime", "uk hip hop"),
  
  # COUNTRY - country and americana
  "COUNTRY" = c("country", "americana", "bluegrass", "honky tonk", "outlaw",
                "red dirt", "texas music", "cowboy", "nashville"),
  
  # ROCK - rock and punk variants
  "ROCK" = c("rock(?!.*opera)", "punk(?!.*funk)", "grunge", "alternative(?!.*r&b|.*hip)",
             "indie(?!.*pop|.*folk|.*soul)", "emo(?!.*rap)", "screamo", "post-",
             "new wave", "britpop", "shoegaze", "garage", "psychedelic"),
  
  # METAL - heavy metal variants
  "METAL" = c("metal", "\\bcore\\b", "metalcore", "deathcore", "grindcore",
              "thrash", "death metal", "black metal", "doom", "sludge",
              "djent", "nu metal", "groove metal", "power metal"),
  
  # R&B - rhythm and blues, soul
  "R&B" = c("r&b", "\\bsoul\\b", "neo soul", "quiet storm", "new jack swing",
            "\\bfunk\\b(?!.*punk)", "motown", "doo-wop", "urban contemporary"),
  
  # LATIN - latin american music
  "LATIN" = c("latin", "reggaeton", "salsa", "bachata", "cumbia", "merengue",
              "banda", "norteno", "corrido", "ranchera", "mariachi", "tropical",
              "urbano", "dembow", "mexican", "colombian", "puerto rican",
              "spanish(?!.*guitar)", "cubano", "bolero", "tango", "bossa nova",
              "mpb", "brazilian", "sertanejo", "forro", "axe"),
  
  # ELECTRONIC - electronic dance music
  "ELECTRONIC" = c("edm", "house", "techno", "trance", "dubstep", "drum and bass",
                   "\\bdnb\\b", "breakbeat", "jungle", "ambient", "chillout",
                   "downtempo", "electronica", "\\bdisco\\b", "eurodance",
                   "synthwave", "synthpop", "industrial(?!.*metal)", "idm",
                   "glitch", "future bass", "moombahton", "uk garage",
                   "hardstyle", "gabber", "happy hardcore", "electro(?!.*swing)"),
  
  # JAZZ - jazz variants
  "JAZZ" = c("jazz", "bebop", "swing(?!.*house)", "big band", "cool jazz",
             "fusion(?!.*metal)", "smooth jazz", "acid jazz", "free jazz",
             "hard bop", "modal jazz", "vocal jazz"),
  
  # BLUES - blues variants
  "BLUES" = c("\\bblues\\b", "delta blues", "chicago blues", "electric blues",
              "rhythm and blues", "jump blues", "blues rock"),
  
  # FOLK - folk and acoustic
  "FOLK" = c("\\bfolk\\b", "singer-songwriter", "acoustic", "celtic",
             "traditional", "appalachian", "world music", "roots"),
  
  # CLASSICAL - classical music
  "CLASSICAL" = c("classical", "orchestra", "symphony", "opera(?!.*rock|.*metal)",
                  "chamber", "baroque", "romantic era", "minimalism",
                  "contemporary classical", "choral", "cantata", "concerto"),
  
  # REGGAE - reggae and caribbean
  "REGGAE" = c("reggae", "\\bska\\b", "\\bdub\\b(?!step)", "dancehall",
               "roots reggae", "lovers rock", "rocksteady", "calypso",
               "soca", "caribbean"),
  
  # NEW AGE - relaxation and meditation
  "NEW AGE" = c("new age", "meditation", "relaxation", "healing",
                "spa", "sleep", "nature sounds"),
  
  # AVANT-GARDE - experimental
  "AVANT-GARDE" = c("experimental", "avant-garde", "noise", "\\bdrone\\b",
                    "musique concrete", "sound art", "field recordings")
)

# Function to classify a genre
classify_genre <- function(genre) {
  genre_lower <- str_to_lower(genre)
  
  for (macro in names(genre_patterns)) {
    patterns <- genre_patterns[[macro]]
    for (pattern in patterns) {
      if (str_detect(genre_lower, pattern)) {
        return(macro)
      }
    }
  }
  return("OTHER")
}

# Apply classification to all genres
genre_map <- tibble(
  micro_genre = all_genres,
  macro_genre = sapply(all_genres, classify_genre)
)

# Show distribution
cat("Macro genre distribution from dynamic mapping:\n")
genre_map %>% count(macro_genre, sort = TRUE) %>% print(n = 20)

# Apply to main dataframe
df <- df %>%
  left_join(genre_map, by = c("main_genre" = "micro_genre")) %>%
  mutate(macro_genre = if_else(is.na(macro_genre) & has_genre, "OTHER", macro_genre))

# Show final distribution in songs
cat("\n\nSongs per macro genre:\n")
df %>% count(macro_genre, sort = TRUE)

## Tokenize lyrics

In [34]:
# tokenize into words
tokens <- df %>%
  filter(!is.na(lyrics)) %>%
  select(song, band_singer, year, lyrics, ranking, macro_genre, main_genre) %>%
  unnest_tokens(word, lyrics) %>%
  mutate(
    word = str_replace_all(word, "\\d+", ""),
    word = str_replace_all(word, "'s$", ""),
    word = str_replace_all(word, "[^a-z]", "")
  ) %>%
  filter(word != "")

sprintf("%s words total", format(nrow(tokens), big.mark=","))

In [35]:
# tag stop words
data(stop_words)
tokens <- tokens %>%
  mutate(is_stop = word %in% stop_words$word)

sprintf("stop words: %.1f%%", mean(tokens$is_stop)*100)

## Calculate metrics per song

**Lexical diversity metrics:**
- TTR, lexical density, hapax ratio, repetition rate

**Repetitiveness metrics:**
- **repeated_line_ratio**: % of lines that are duplicates (choruses, hooks)
- **unique_line_ratio**: % of lines that appear only once
- **compression_ratio**: gzip compressed size / original size (like Nature paper)
  - Lower ratio = more repetitive (compresses better)
  - Higher ratio = less repetitive (less compressible)

**Similarity metrics (Jaccard):**
- **jaccard_genre**: Overlap with top 500 words in song's genre (genre-typicality)
- **jaccard_corpus**: Overlap with top 1000 words across all songs (mainstream vocabulary)
- **jaccard_common**: Overlap with 10k most common English words (everyday language)

**Vocabulary uniqueness metrics:**
- **rare_word_ratio**: % of words NOT in common 10k English words (higher = more unique)
- **vocab_uniqueness**: 1 - (common words / unique words) (higher = more creative vocabulary)

In [36]:
# basic metrics
metrics <- tokens %>%
  group_by(song, band_singer, year) %>%
  summarise(
    total_words = n(),
    unique_words = n_distinct(word),
    ttr = unique_words / total_words,
    content_words = sum(!is_stop),
    lexical_density = content_words / total_words,
    repetition_rate = 1 - ttr,
    avg_word_length = mean(nchar(word)),
    .groups = "drop"
  )

# hapax (words used once)
hapax <- tokens %>%
  count(song, band_singer, year, word) %>%
  group_by(song, band_singer, year) %>%
  summarise(
    hapax_count = sum(n == 1),
    hapax_ratio = mean(n == 1),
    .groups = "drop"
  )

metrics <- metrics %>% left_join(hapax)

# --- Repetitiveness: Line-based analysis ---
# Split lyrics into lines and count repeated lines
line_metrics <- df %>%
  filter(!is.na(lyrics)) %>%
  select(song, band_singer, year, lyrics) %>%
  mutate(
    # Split into lines, clean whitespace
    lines = str_split(lyrics, "\\s{2,}|\\n"),
  ) %>%
  unnest(lines) %>%
  mutate(lines = str_trim(str_to_lower(lines))) %>%
  filter(lines != "") %>%
  group_by(song, band_singer, year) %>%
  summarise(
    total_lines = n(),
    unique_lines = n_distinct(lines),
    repeated_lines = total_lines - unique_lines,
    repeated_line_ratio = repeated_lines / total_lines,
    unique_line_ratio = unique_lines / total_lines,
    .groups = "drop"
  )

cat(sprintf("Line-based repetitiveness calculated for %d songs\n", nrow(line_metrics)))
cat(sprintf("Mean repeated line ratio: %.1f%%\n", mean(line_metrics$repeated_line_ratio) * 100))

# --- Compression ratio (Nature paper method) ---
# Lower compression ratio = more repetitive (compresses better)
compression_metrics <- df %>%
  filter(!is.na(lyrics)) %>%
  select(song, band_singer, year, lyrics) %>%
  rowwise() %>%
  mutate(
    original_size = nchar(lyrics),
    compressed_size = length(memCompress(charToRaw(lyrics), "gzip")),
    compression_ratio = compressed_size / original_size
  ) %>%
  ungroup() %>%
  select(song, band_singer, year, compression_ratio, original_size, compressed_size)

cat(sprintf("Compression ratio calculated for %d songs\n", nrow(compression_metrics)))
cat(sprintf("Mean compression ratio: %.3f\n", mean(compression_metrics$compression_ratio)))

metrics <- metrics %>%
  left_join(line_metrics %>% select(song, band_singer, year,
                                     total_lines, unique_lines,
                                     repeated_line_ratio, unique_line_ratio),
            by = c("song", "band_singer", "year")) %>%
  left_join(compression_metrics %>% select(song, band_singer, year, compression_ratio),
            by = c("song", "band_singer", "year"))

# --- Jaccard Similarity to Genre Vocabulary ---
# Build genre vocabulary (top 500 words per genre for efficiency)
genre_vocab <- tokens %>%
  filter(!is.na(macro_genre)) %>%
  count(macro_genre, word, sort = TRUE) %>%
  group_by(macro_genre) %>%
  slice_head(n = 500) %>%
  summarise(vocab = list(unique(word)), .groups = "drop")

# Get each song's unique words
song_vocab <- tokens %>%
  group_by(song, band_singer, year, macro_genre) %>%
  summarise(words = list(unique(word)), .groups = "drop")

# Calculate Jaccard similarity: |A ∩ B| / |A ∪ B|
jaccard_calc <- song_vocab %>%
  filter(!is.na(macro_genre)) %>%
  left_join(genre_vocab, by = "macro_genre") %>%
  rowwise() %>%
  mutate(
    intersection = length(intersect(words, vocab)),
    union = length(union(words, vocab)),
    jaccard_genre = intersection / union
  ) %>%
  ungroup() %>%
  select(song, band_singer, year, jaccard_genre)

# Jaccard to corpus (all songs)
corpus_vocab <- tokens %>%
  count(word, sort = TRUE) %>%
  slice_head(n = 1000) %>%
  pull(word)

jaccard_corpus <- song_vocab %>%
  rowwise() %>%
  mutate(
    intersection = length(intersect(words, corpus_vocab)),
    union = length(union(words, corpus_vocab)),
    jaccard_corpus = intersection / union
  ) %>%
  ungroup() %>%
  select(song, band_singer, year, jaccard_corpus)

# --- Vocabulary Uniqueness (vs 10k Common English Words) ---
# Load 10k most common English words
common_words <- read_csv('../../data/cleaned/common_english_words_10k.csv', show_col_types = F)
common_word_set <- common_words$word

cat(sprintf("Loaded %d common English words\n", length(common_word_set)))

# Calculate uniqueness metrics
vocab_uniqueness <- song_vocab %>%
  rowwise() %>%
  mutate(
    # How many words are in common 10k list?
    common_count = sum(words %in% common_word_set),
    # How many are NOT in common list (rare/unique)?
    rare_count = sum(!words %in% common_word_set),
    # Percentage of rare words
    rare_word_ratio = rare_count / length(words),
    # Jaccard similarity to common words (lower = more unique)
    intersection_common = length(intersect(words, common_word_set)),
    union_common = length(union(words, common_word_set)),
    jaccard_common = intersection_common / union_common,
    # Vocabulary uniqueness score (inverse of commonality)
    vocab_uniqueness = 1 - (common_count / length(words))
  ) %>%
  ungroup() %>%
  select(song, band_singer, year, rare_word_ratio, jaccard_common, vocab_uniqueness, rare_count, common_count)

# Join all metrics
metrics <- metrics %>%
  left_join(jaccard_calc, by = c("song", "band_singer", "year")) %>%
  left_join(jaccard_corpus, by = c("song", "band_singer", "year")) %>%
  left_join(vocab_uniqueness, by = c("song", "band_singer", "year"))

cat("\nMetrics summary:\n")
metrics %>% 
  select(total_words:hapax_ratio, repeated_line_ratio, unique_line_ratio, compression_ratio,
         jaccard_genre, jaccard_corpus, rare_word_ratio, jaccard_common, vocab_uniqueness) %>% 
  summary()

[1m[22mJoining with `by = join_by(song, band_singer, year)`


Line-based repetitiveness calculated for 3397 songs
Mean repeated line ratio: 0.0%
Compression ratio calculated for 3399 songs
Mean compression ratio: 0.331
Loaded 10000 common English words


"[1m[22mDetected an unexpected many-to-many relationship between `x` and `y`.
[36mi[39m Row 1649 of `x` matches multiple rows in `y`.
[36mi[39m Row 1649 of `y` matches multiple rows in `x`.
[36mi[39m If a many-to-many relationship is expected, set `relationship =
"[1m[22mDetected an unexpected many-to-many relationship between `x` and `y`.
[36mi[39m Row 1649 of `x` matches multiple rows in `y`.
[36mi[39m Row 1649 of `y` matches multiple rows in `x`.
[36mi[39m If a many-to-many relationship is expected, set `relationship =



Metrics summary:


  total_words       unique_words         ttr          content_words   
 Min.   :    7.0   Min.   :   6.0   Min.   :0.07059   Min.   :   4.0  
 1st Qu.:  354.0   1st Qu.: 106.0   1st Qu.:0.25528   1st Qu.: 120.0  
 Median :  488.0   Median : 139.0   Median :0.30656   Median : 180.0  
 Mean   :  522.7   Mean   : 158.9   Mean   :0.31374   Mean   : 201.9  
 3rd Qu.:  647.0   3rd Qu.: 196.0   3rd Qu.:0.35894   3rd Qu.: 259.0  
 Max.   :10475.0   Max.   :2606.0   Max.   :0.91071   Max.   :3993.0  
                                                                      
 lexical_density   repetition_rate   avg_word_length  hapax_count     
 Min.   :0.07129   Min.   :0.08929   Min.   :2.626   Min.   :   0.00  
 1st Qu.:0.31043   1st Qu.:0.64106   1st Qu.:3.518   1st Qu.:  48.00  
 Median :0.36632   Median :0.69344   Median :3.653   Median :  73.00  
 Mean   :0.38070   Mean   :0.68626   Mean   :3.676   Mean   :  88.59  
 3rd Qu.:0.42917   3rd Qu.:0.74472   3rd Qu.:3.806   3rd Qu.: 114.00  
 Max. 

## Add features

In [37]:
# join back with main data
df_final <- df %>%
  left_join(metrics) %>%
  mutate(
    # chart categories
    chart_tier = case_when(
      ranking <= 10 ~ "Top 10",
      ranking <= 25 ~ "11-25",
      ranking <= 50 ~ "26-50",
      ranking <= 75 ~ "51-75",
      ranking <= 100 ~ "76-100"
    ),
    is_top10 = ranking <= 10,
    is_top25 = ranking <= 25,
    chart_score = 101 - ranking,
    
    # time periods
    decade = case_when(
      year < 2010 ~ "2000s",
      year < 2020 ~ "2010s",
      TRUE ~ "2020s"
    ),
    era = case_when(
      year < 2008 ~ "Early",
      year < 2015 ~ "Middle",
      TRUE ~ "Late"
    ),
    years_since_2000 = year - 2000,
    
    # flags
    is_short = total_words < 100,
    is_long = total_words > 900,
    is_normal_length = total_words >= 100 & total_words <= 900,
    has_complete_data = !is.na(macro_genre) & is_normal_length & !is.na(ttr)
  )

[1m[22mJoining with `by = join_by(song, band_singer, year)`
"[1m[22mDetected an unexpected many-to-many relationship between `x` and `y`.
[36mi[39m Row 573 of `x` matches multiple rows in `y`.
[36mi[39m Row 1649 of `y` matches multiple rows in `x`.
[36mi[39m If a many-to-many relationship is expected, set `relationship =


## Normalized scores

Compare to genre/year averages

In [38]:
# genre norms
genre_stats <- df_final %>%
  filter(!is.na(macro_genre) & !is.na(ttr)) %>%
  group_by(macro_genre) %>%
  summarise(m = mean(ttr), s = sd(ttr))

df_final <- df_final %>%
  left_join(genre_stats, by = "macro_genre") %>%
  mutate(ttr_z_genre = (ttr - m) / s) %>%
  select(-m, -s)

# year norms
year_stats <- df_final %>%
  filter(!is.na(ttr)) %>%
  group_by(year) %>%
  summarise(m = mean(ttr), s = sd(ttr))

df_final <- df_final %>%
  left_join(year_stats, by = "year") %>%
  mutate(ttr_z_year = (ttr - m) / s) %>%
  select(-m, -s)

# log transforms
df_final <- df_final %>%
  mutate(
    log_words = log(total_words + 1),
    log_unique = log(unique_words + 1)
  )

## Quality check

In [39]:
cat(sprintf("total: %d\n", nrow(df_final)))
cat(sprintf("complete data: %d (%.1f%%)\n", 
            sum(df_final$has_complete_data, na.rm=T),
            mean(df_final$has_complete_data, na.rm=T)*100))

cat("\nlength distribution:\n")
cat(sprintf("  short: %d\n", sum(df_final$is_short, na.rm=T)))
cat(sprintf("  normal: %d\n", sum(df_final$is_normal_length, na.rm=T)))
cat(sprintf("  long: %d\n", sum(df_final$is_long, na.rm=T)))

total: 3427
complete data: 2822 (82.3%)

length distribution:
  short: 12
  normal: 3257
  long: 158


## Export

In [40]:
# select final columns (including repetitiveness and vocabulary uniqueness metrics)
final <- df_final %>%
  select(
    song, band_singer, year,
    main_genre, macro_genre,
    ranking, chart_tier, chart_score, is_top10, is_top25,
    decade, era, years_since_2000,
    total_words, unique_words, ttr, lexical_density, repetition_rate, avg_word_length,
    hapax_count, hapax_ratio,
    total_lines, unique_lines, repeated_line_ratio, unique_line_ratio, compression_ratio,
    jaccard_genre, jaccard_corpus, jaccard_common,
    rare_word_ratio, vocab_uniqueness, rare_count, common_count,
    ttr_z_genre, ttr_z_year, log_words, log_unique,
    is_short, is_long, is_normal_length, has_complete_data,
    lyrics
  )

write_csv(final, '../../data/cleaned/billboard_lexical_analysis_ready.csv')
write_csv(genre_map, '../../data/cleaned/genre_macro_mapping.csv')

cat("\nexported files:\n")
cat("  billboard_lexical_analysis_ready.csv\n")
cat("  genre_macro_mapping.csv\n")

# summary stats
tibble(
  metric = c("total_songs", "with_genre", "complete_data", "years", "macro_genres"),
  value = c(
    nrow(final),
    sum(!is.na(final$macro_genre)),
    sum(final$has_complete_data, na.rm=T),
    n_distinct(final$year),
    n_distinct(final$macro_genre, na.rm=T)
  )
)


exported files:
  billboard_lexical_analysis_ready.csv
  genre_macro_mapping.csv


metric,value
<chr>,<int>
total_songs,3427
with_genre,2962
complete_data,2822
years,24
macro_genres,8


In [41]:
# preview
final %>% 
  filter(has_complete_data) %>%
  select(song, band_singer, macro_genre, ranking, ttr, total_words) %>%
  head(10)

song,band_singer,macro_genre,ranking,ttr,total_words
<chr>,<chr>,<chr>,<dbl>,<dbl>,<int>
Breathe,Faith Hill,COUNTRY,1,0.3769231,260
Smooth,Santana,ROCK,2,0.3169399,366
Smooth,Rob Thomas,POP,2,0.3169399,366
Maria Maria,Santana,ROCK,3,0.3488943,407
I Wanna Know,Joe,POP,4,0.2319688,513
Everything You Want,Vertical Horizon,R&B,5,0.3224852,338
I Knew I Loved You,Savage Garden,POP,7,0.2453416,322
Amazed,Lonestar,COUNTRY,8,0.3473282,262
Bent,Matchbox Twenty,R&B,9,0.3741497,294
He Wasn't Man Enough,Toni Braxton,POP,10,0.2168508,724


---

Done. Ready for analysis.