Install and load packages

In [None]:
# Packages used by this notebook. The original list left out tidyverse,
# sandwich, lmtest and car even though the next cell attaches them with
# library(), so that cell failed on a machine where they were never installed.
# 'ordinal' is kept from the original list although nothing in the visible
# cells loads it.
required_packages <- c(
  "readxl", "dplyr", "ggplot2", "stargazer", "purrr", "stringr", "lessR",
  "fixest", "ordinal", "tidyverse", "sandwich", "lmtest", "car"
)

# Install only what is missing, so re-running the notebook does not download
# and reinstall every package each time.
missing_packages <- setdiff(required_packages, rownames(installed.packages()))
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}


In [None]:
library(readxl)
library(dplyr)
library(ggplot2)
library(stats)
library(stargazer)
library(purrr)
library(stringr)
library(tidyverse)
library(lessR)
library(fixest)
library(sandwich)
library(lmtest)
library(car)


Load df

In [None]:
# Playlist manifest. The three vectors below are parallel: position i of each
# one describes the same Excel export.
file_names <- c("df_rap_final.xlsx", "df_pop_final.xlsx", "df_variete_final.xlsx", "df_ete_final.xlsx",
                "df_latino_final.xlsx", "df_ambiance_final.xlsx", "df_electro_final.xlsx", "df_rock_final.xlsx",
                "df_indie_final.xlsx", "df_sport_final.xlsx", "df_rnb_final.xlsx", "df_afro_final.xlsx",
                "df_kpop_final.xlsx", "df_detente_final.xlsx", "df_dormir_final.xlsx", "df_soiree_final.xlsx",
                "df_zouk_final.xlsx", "df_reggae_final.xlsx", "df_maison_final.xlsx", "df_decennies_final.xlsx",
                "df_alternatif_final.xlsx", "df_amour_final.xlsx", "df_metal_final.xlsx", "df_jazz_final.xlsx",
                "df_tendances_final.xlsx", "df_equal_final.xlsx", "df_classique_final.xlsx", "df_folk_final.xlsx",
                "df_focus_final.xlsx", "df_soul_final.xlsx", "df_netflix_final.xlsx", "df_enfants_final.xlsx",
                "df_etudiants_final.xlsx", "df_bienetre_final.xlsx", "df_punk_final.xlsx", "df_ambient_final.xlsx",
                "df_country_final.xlsx", "df_blues_final.xlsx", "df_arabe_final.xlsx", "df_voyage_final.xlsx",
                "df_frequency_final.xlsx", "df_tops_final.xlsx"
                )

# Value written to the is_editorial column of each file's rows
# (only the last file, df_tops_final.xlsx, is 0).
is_editorial_values <- c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0
                         )

# Value written to the type column of each file's rows.
# NOTE(review): the original vector had 41 entries for 42 files, so
# df_tops_final.xlsx silently received type = NA through out-of-range indexing.
# Its rows are therefore dropped from every model that uses `type`. The NA is
# now written out explicitly so the behaviour is unchanged but visible;
# replace it with the intended label if the omission was accidental.
type_values <- c('genre', 'genre', 'genre', 'mood', 'genre', 'mood', 'genre', 'genre', 'genre', 'mood', 'genre',
                 'genre', 'genre', 'mood', 'mood', 'mood', 'genre', 'genre', 'mood', 'trend', 'genre', 'mood',
                 'genre', 'genre', 'trend', 'trend', 'genre', 'genre', 'mood', 'genre', 'trend', 'mood', 'mood',
                 'mood', 'genre', 'genre', 'genre', 'genre', 'genre', 'mood', 'trend',
                 NA_character_
                 )

# Fail loudly if the parallel vectors ever drift out of sync again.
stopifnot(
  length(is_editorial_values) == length(file_names),
  length(type_values) == length(file_names)
)

# Read every file and tag its rows with the matching is_editorial / type value.
df_list <- lapply(seq_along(file_names), function(i) {
  playlist_df <- read_xlsx(file_names[i])
  playlist_df$is_editorial <- is_editorial_values[i]
  playlist_df$type <- type_values[i]
  playlist_df
})

# Combine all files into one data frame. merge(all = TRUE) is a full outer
# join on every column name the two inputs share, so rows that are identical
# across two files are collapsed into one and the result is re-sorted.
df <- Reduce(function(x, y) merge(x, y, all = TRUE), df_list)


Data management

In [None]:
# Extract numeric values: every character that is not a digit, "." or "-" is
# removed from the raw stirling columns, and the remainder is converted to
# numeric.
# NOTE(review): a value written in scientific notation (e.g. "1e-05") would
# lose its "e" and turn into NA — confirm the raw columns never use it.
df$diversity_clean <- as.numeric(gsub("[^[:digit:].-]", "", df$stirling))
df$diversity_clean_2 <- as.numeric(gsub("[^[:digit:].-]", "", df$stirling_2))

# Print numeric values
df$diversity_clean
df$diversity_clean_2

# Rescale the cleaned diversity columns with kind='z'.
# NOTE(review): the earlier comment here said "scale values from 0 to 1", but
# the calls request kind='z' (presumably z-score standardisation in
# lessR::rescale), whereas the covariates in the next cell use kind='0to1'.
# Confirm which scaling is intended for the dependent variables.
df$diversity_norm <- rescale(diversity_clean, df, kind='z')
df$diversity_norm_2 <- rescale(diversity_clean_2, df, kind='z')

# Print scaled values
df$diversity_norm
df$diversity_norm_2


In [None]:
# Scale covariates and log followers
# Natural log of the follower count; a count of 0 is mapped to 0 so that
# log(0) = -Inf never enters the column. log(1) is 0 as well, so playlists
# with 0 and with 1 follower end up with the same value.
df$log_followers <- ifelse(df$playlist_followers == 0, 0, log(df$playlist_followers))
df$log_followers

# The remaining covariates are passed to rescale() as unquoted expressions
# together with df, each requesting kind='0to1'. Every result is printed
# right after it is stored.
df$track_pop_norm <- rescale(as.numeric(track_pop), df, kind='0to1')
df$track_pop_norm

df$artist_pop_norm <- rescale(as.numeric(artist_pop), df, kind='0to1')
df$artist_pop_norm

# nb_tracks goes through two passes: kind='z' first, then kind='0to1' on the
# intermediate nb_tracks_res column.
# NOTE(review): the other covariates use a single '0to1' pass — confirm the
# extra 'z' step is intentional.
df$nb_tracks_res <- rescale(as.numeric(nb_tracks), df, kind='z')
df$nb_tracks_norm <- rescale(nb_tracks_res, df, kind='0to1')
df$nb_tracks_norm

df$nb_artists_norm <- rescale(as.numeric(nb_artists), df, kind='0to1')
df$nb_artists_norm

df$nb_clusters_norm <- rescale(as.numeric(nb_clusters), df, kind='0to1')
df$nb_clusters_norm

df$nb_clusters_2_norm <- rescale(as.numeric(nb_clusters_2), df, kind='0to1')
df$nb_clusters_2_norm


Descriptive stats

Summary stats

In [None]:
# Row count of the combined data frame
dim(df)[1L]
# Total of the nb_tracks column over all rows
sum(df[["nb_tracks"]])


In [None]:
# Summaries of the two cleaned diversity columns
summary(df[["diversity_clean"]])
summary(df[["diversity_clean_2"]])


In [None]:
# Summaries of the cluster-count, hhi and mean-distance columns,
# each in its base and "_2" variant
summary(df[["nb_clusters"]])
summary(df[["nb_clusters_2"]])
summary(df[["hhi"]])
summary(df[["hhi_2"]])
summary(df[["mean_distance"]])
summary(df[["mean_distance_2"]])


In [None]:
# Row counts per is_editorial value
table(df[["is_editorial"]])


In [None]:
# Mean of each measure within every is_editorial group.
# mean() is called without na.rm, so a missing value in a group makes that
# group's mean NA.
tapply(X = df[["nb_clusters"]], INDEX = df[["is_editorial"]], FUN = mean)
tapply(X = df[["nb_clusters_2"]], INDEX = df[["is_editorial"]], FUN = mean)
tapply(X = df[["hhi"]], INDEX = df[["is_editorial"]], FUN = mean)
tapply(X = df[["hhi_2"]], INDEX = df[["is_editorial"]], FUN = mean)
tapply(X = df[["mean_distance"]], INDEX = df[["is_editorial"]], FUN = mean)
tapply(X = df[["mean_distance_2"]], INDEX = df[["is_editorial"]], FUN = mean)
tapply(X = df[["diversity_clean"]], INDEX = df[["is_editorial"]], FUN = mean)
tapply(X = df[["diversity_clean_2"]], INDEX = df[["is_editorial"]], FUN = mean)


In [None]:
# Row counts per playlist type (rows whose type is NA are not counted)
table(df[["type"]])


In [None]:
# Mean of each measure within every playlist type.
# mean() is called without na.rm, so a missing value in a group makes that
# group's mean NA.
tapply(X = df[["nb_clusters"]], INDEX = df[["type"]], FUN = mean)
tapply(X = df[["nb_clusters_2"]], INDEX = df[["type"]], FUN = mean)
tapply(X = df[["hhi"]], INDEX = df[["type"]], FUN = mean)
tapply(X = df[["hhi_2"]], INDEX = df[["type"]], FUN = mean)
tapply(X = df[["mean_distance"]], INDEX = df[["type"]], FUN = mean)
tapply(X = df[["mean_distance_2"]], INDEX = df[["type"]], FUN = mean)
tapply(X = df[["diversity_clean"]], INDEX = df[["type"]], FUN = mean)
tapply(X = df[["diversity_clean_2"]], INDEX = df[["type"]], FUN = mean)


In [None]:
# Summary statistics plus standard deviation for the four covariates, one
# column per covariate.
# Missing values are removed before summarising: otherwise summary() adds an
# "NA's" entry only for the columns that have them (sapply() then returns a
# ragged list instead of a matrix) and sd() returns NA. The standard deviation
# is labelled "SD" so its row in the output is no longer unnamed.
sapply(
  df[, c('nb_tracks', 'playlist_followers', 'track_pop', 'artist_pop')],
  function(x) c(summary(x[!is.na(x)]), SD = sd(x, na.rm = TRUE))
)


Histograms

In [None]:
# Histogram of diversity_clean: bins of width 0.025, x axis limited to [0, 0.5],
# dotted horizontal major grid lines only.
# NOTE(review): with scale limits, values outside [0, 0.5] and bins reaching
# past the limits are presumably dropped by ggplot2 — check the warnings.
ggplot(data = df, mapping = aes(diversity_clean)) +
  geom_histogram(fill = "#a7a7f9", colour = "black", binwidth = 0.025) +
  scale_x_continuous(limits = c(0, 0.5)) +
  xlab('k* Rao-Stirling') +
  ylab('Frequency') +
  theme_bw() +
  theme(aspect.ratio = 0.8,
        panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank(),
        panel.grid.major.y = element_line(linetype = "dotted"),
        panel.grid.minor.y = element_blank())


In [None]:
# Histogram of diversity_clean_2: bins of width 0.025, x axis limited to
# [0, 0.7], dotted horizontal major grid lines only.
# NOTE(review): with scale limits, values outside [0, 0.7] and bins reaching
# past the limits are presumably dropped by ggplot2 — check the warnings.
ggplot(data = df, mapping = aes(diversity_clean_2)) +
  geom_histogram(fill = "#a7a7f9", colour = "black", binwidth = 0.025) +
  scale_x_continuous(limits = c(0, 0.7)) +
  xlab('k** Rao-Stirling') +
  ylab('Frequency') +
  theme_bw() +
  theme(aspect.ratio = 0.8,
        panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank(),
        panel.grid.major.y = element_line(linetype = "dotted"),
        panel.grid.minor.y = element_blank())


Radar charts

In [None]:
# Adjust options to display all columns of the data frame
options(width = 10000)

# Convert columns 15 and 16 to numeric.
# NOTE(review): df_3 is not created in any of the visible cells — confirm
# where it comes from and which columns positions 15:16 and 7:16 refer to.
df_3[, 15:16] <- lapply(df_3[, 15:16], as.numeric)

# Create a new data frame to store scaled values (df_3 keeps the raw ones)
scaled_df <- data.frame(df_3)

# Min-max rescale loudness, tempo and duration_ms to the range 0-1.
# Plain arithmetic replaces scale(): scale() returns a one-column matrix with
# "scaled:center"/"scaled:scale" attributes, which would be stored as matrix
# columns inside the data frame. na.rm = TRUE keeps a single missing value
# from turning the whole column into NA.
for (feature in c("loudness", "tempo", "duration_ms")) {
  feature_range <- range(scaled_df[[feature]], na.rm = TRUE)
  scaled_df[[feature]] <-
    (scaled_df[[feature]] - feature_range[1]) / (feature_range[2] - feature_range[1])
}

# Subset the scaled data frame to the tracks to compare
selected_tracks <- scaled_df[scaled_df$track_name %in% c("I Feel It Coming", "November Rain"), ]

# Print columns 7 to 16 for each selected track
for (current_track in unique(selected_tracks$track_name)) {
  track_data <- selected_tracks[selected_tracks$track_name == current_track, 7:16]
  cat("Track:", current_track, "\n")
  print(track_data)
  cat("\n")
}


OLS models

Simple OLS

In [None]:
# Perform OLS k*: diversity_norm regressed on followers, track count, track
# popularity and playlist type.
# NOTE(review): the regressors are the raw columns (playlist_followers,
# nb_tracks, track_pop), not the logged / rescaled versions created in the
# data-management cells (log_followers, nb_tracks_norm, track_pop_norm) —
# confirm this is intended. Rows whose type is NA are dropped by lm().
lm_1 <- lm(diversity_norm ~ playlist_followers + nb_tracks + track_pop + type, data = df)
summary(lm_1)

# Perform OLS k**: same specification with diversity_norm_2 as the outcome
lm_2 <- lm(diversity_norm_2 ~ playlist_followers + nb_tracks + track_pop + type, data = df)
summary(lm_2)


In [None]:
# Perform OLS k* with the number of clusters added as a regressor
# (same specification as lm_1 plus as.numeric(nb_clusters))
lm_3 <- lm(diversity_norm ~ playlist_followers + nb_tracks + track_pop + type + as.numeric(nb_clusters), data = df)
summary(lm_3)

# Perform OLS k** with the matching cluster count
# (same specification as lm_2 plus as.numeric(nb_clusters_2))
lm_4 <- lm(diversity_norm_2 ~ playlist_followers + nb_tracks + track_pop + type + as.numeric(nb_clusters_2), data = df)
summary(lm_4)


In [None]:
# Regression table for the four OLS models, ordered k* (lm_1, lm_3) then
# k** (lm_2, lm_4); stargazer's default output type is used
stargazer::stargazer(lm_1, lm_3, lm_2, lm_4)


Robust OLS

In [None]:
# Coefficient tests for lm_1 and lm_2 with a clustered covariance matrix
# (vcovCL, HC1 adjustment, clusters given by `type`); the models are not refit.
# NOTE(review): type_values only contains three labels (genre, mood, trend)
# and `type` is also a regressor; df = 2 presumably reflects clusters - 1 —
# confirm that inference with so few clusters is acceptable.
lm1_robust_clustered <- lmtest::coeftest(
  lm_1,
  vcov. = sandwich::vcovCL,
  df = 2,
  cluster = ~type,
  type = "HC1"
)
print(lm1_robust_clustered)

lm2_robust_clustered <- lmtest::coeftest(
  lm_2,
  vcov. = sandwich::vcovCL,
  df = 2,
  cluster = ~type,
  type = "HC1"
)
print(lm2_robust_clustered)


In [None]:
# Same clustered coefficient tests for the models that include the cluster
# counts (lm_3, lm_4)
lm3_robust_clustered <- lmtest::coeftest(
  lm_3,
  vcov. = sandwich::vcovCL,
  df = 2,
  cluster = ~type,
  type = "HC1"
)
lm4_robust_clustered <- lmtest::coeftest(
  lm_4,
  vcov. = sandwich::vcovCL,
  df = 2,
  cluster = ~type,
  type = "HC1"
)

# Table of the four clustered coefficient tests (stargazer default output)
stargazer::stargazer(
  lm1_robust_clustered,
  lm2_robust_clustered,
  lm3_robust_clustered,
  lm4_robust_clustered
)


Robustness checks

Testing residuals

In [None]:
# Shapiro-Wilk normality test on the residuals of lm_1 and lm_2
# NOTE(review): shapiro.test() only accepts between 3 and 5000 values —
# confirm the sample size stays within that range.
residuals_1 <- stats::residuals(lm_1)
residuals_2 <- stats::residuals(lm_2)
stats::shapiro.test(residuals_1)
stats::shapiro.test(residuals_2)

# Durbin-Watson test for autocorrelation in the residuals
# NOTE(review): the result depends on row order, which here is whatever
# order merge() produced when the files were combined.
lmtest::dwtest(lm_1)
lmtest::dwtest(lm_2)

# Breusch-Pagan test for heteroskedasticity
lmtest::bptest(lm_1)
lmtest::bptest(lm_2)


Multicollinearity

In [None]:
# Variance inflation factors for the regressors of the two simple OLS models
car::vif(lm_1)
car::vif(lm_2)
