# Data analysis

## Setup

In [None]:
# Load packages
library(readxl)
library(dplyr)
library(ggplot2)
library(stats)
library(stargazer)
library(purrr)
library(stringr)
library(tidyverse)
library(lessR)
library(fixest)
library(sandwich)
library(lmtest)
library(car)
library(effsize)


In [None]:
# Import the data
# Reads one Excel workbook per playlist category, tags every row with the
# category's playlist type and curator, combines everything into a single
# data frame and writes it to "df_final.xlsx" in the working directory.

# Set the base directory path
base_directory <- "/Users/julienmbarki/Documents/Doctorat/Publications/Article 2/Data/Code/data_management/" #nolint

# Create a list of file names
# type_values and curators below are positional: entry i describes
# file_names[i], so the three vectors must stay in the same order.
file_names <- c(
    "df_afro_final.xlsx", "df_alternative_final.xlsx", "df_ambiance_final.xlsx",
    "df_ambient_final.xlsx", "df_amour_final.xlsx", "df_arabe_final.xlsx",
    "df_bienetre_final.xlsx", "df_blues_final.xlsx", "df_caraibes_final.xlsx",
    "df_classique_final.xlsx", "df_country_final.xlsx", "df_cuisine_final.xlsx",
    "df_dance_final.xlsx", "df_dancehall_final.xlsx", "df_decennies_final.xlsx",
    "df_detente_final.xlsx", "df_dormir_final.xlsx", "df_enfants_final.xlsx",
    "df_equal_final.xlsx", "df_etudiants_final.xlsx", "df_focus_final.xlsx",
    "df_folk_final.xlsx", "df_frequency_final.xlsx", "df_freshfinds_final.xlsx",
    "df_funk_final.xlsx", "df_gaming_final.xlsx", "df_glow_final.xlsx",
    "df_hiphop_final.xlsx", "df_indie_final.xlsx", "df_influenceurs_final.xlsx",
    "df_instrumental_final.xlsx", "df_jazz_final.xlsx", "df_kpop_final.xlsx",
    "df_latino_final.xlsx", "df_maison_final.xlsx", "df_metal_final.xlsx",
    "df_nature_final.xlsx", "df_netflix_final.xlsx", "df_pop_final.xlsx",
    "df_punk_final.xlsx", "df_radar_final.xlsx", "df_reggae_final.xlsx",
    "df_rnb_final.xlsx", "df_rock_final.xlsx", "df_soiree_final.xlsx",
    "df_soul_final.xlsx", "df_sport_final.xlsx", "df_spotsingles_final.xlsx",
    "df_tendances_final.xlsx", "df_tops_final.xlsx", "df_tvcine_final.xlsx",
    "df_variete_final.xlsx", "df_voiture_final.xlsx", "df_voyage_final.xlsx"
)

# Create a list of playlist types
type_values <- c(
    "genre", "genre", "mood", "genre", "mood", "genre", "mood", "genre",
    "genre", "genre", "genre", "mood", "genre", "genre", "mood", "mood",
    "mood", "mood", "mood", "mood", "mood", "genre", "mood", "mood",
    "genre", "mood", "mood", "genre", "genre", "mood", "genre", "genre",
    "genre", "genre", "mood", "genre", "mood", "mood", "genre", "genre",
    "mood", "genre", "genre", "genre", "mood", "genre", "mood", "mood",
    "mood", "top", "mood", "genre", "mood", "mood"
)

# Create a list of playlists curators
curators <- c(
    "spotify", "spotify", "spotify", "spotify", "spotify", "spotify", "spotify",
    "spotify", "spotify", "spotify", "spotify", "spotify", "spotify", "spotify",
    "spotify", "spotify", "spotify", "spotify", "spotify", "spotify", "spotify",
    "spotify", "spotify", "spotify", "spotify", "spotify", "spotify", "spotify",
    "spotify", "external", "spotify", "spotify", "spotify", "spotify",
    "spotify", "spotify", "spotify", "external", "spotify", "spotify",
    "spotify", "spotify", "spotify", "spotify", "spotify", "spotify", "spotify",
    "spotify", "spotify", "spotify", "external", "spotify", "spotify",
    "spotify"
)

# Fail fast if the parallel vectors drift out of sync: indexing past the end
# of a shorter vector returns NA, which would silently tag every row of the
# affected files with a missing type or curator.
stopifnot(
    length(type_values) == length(file_names),
    length(curators) == length(file_names)
)

# Modify the data frame
# Read each workbook and attach its type and curator as constant columns.
# lapply() sizes the result list up front instead of growing it in a loop.
df_list <- lapply(seq_along(file_names), function(i) {
    playlist_df <- read_xlsx(file.path(base_directory, file_names[i]))
    playlist_df$type <- type_values[i]
    playlist_df$curator <- curators[i]
    playlist_df
})

# Combine all data frames
# merge() without `by` joins on every column the two frames share;
# all = TRUE keeps the rows of both sides (full outer join).
df <- Reduce(function(x, y) merge(x, y, all = TRUE), df_list)

# Export to Excel
writexl::write_xlsx(df, "df_final.xlsx")


## Data management

In [None]:
# Load data
# Re-read the combined workbook from the working directory into `df`
df <- read_xlsx(path = "df_final.xlsx")


### Main measure

In [None]:
# Extract numeric values
# Strip every character that is not a digit, a dot or a minus sign, then
# convert what is left to numeric
df[["diversity_clean"]] <- df$stirling_index %>%
    gsub("[^[:digit:].-]", "", .) %>%
    as.numeric()
df$diversity_clean

df[["diversity_clean_2"]] <- df$stirling_index_2 %>%
    gsub("[^[:digit:].-]", "", .) %>%
    as.numeric()
df$diversity_clean_2

# Standardise the cleaned indices with lessR's rescale(); kind = "z"
# requests z-scores, not a 0-to-1 range
# NOTE(review): confirm z-scores (rather than kind = "0to1") are intended
df[["diversity_norm"]] <- rescale(diversity_clean, df, kind = "z")
df$diversity_norm

df[["diversity_norm_2"]] <- rescale(diversity_clean_2, df, kind = "z")
df$diversity_norm_2


### Secondary measures

In [None]:
# HH-Index
# Standardise both HH-index columns with lessR's rescale(); kind = "z"
# requests z-scores, not a 0-to-1 range
# NOTE(review): confirm z-scores (rather than kind = "0to1") are intended
df[["hhi_norm"]] <- rescale(hh_index, df, kind = "z")
df$hhi_norm

df[["hhi_norm_2"]] <- rescale(hh_index_2, df, kind = "z")
df$hhi_norm_2


In [None]:
# Distances
# Standardise the three distance columns with lessR's rescale(); kind = "z"
# requests z-scores, not a 0-to-1 range
# NOTE(review): confirm z-scores (rather than kind = "0to1") are intended
df[["dist_norm"]] <- rescale(distances, df, kind = "z")
df$dist_norm

df[["dist_norm_2"]] <- rescale(distances_2, df, kind = "z")
df$dist_norm_2

df[["dist_norm_3"]] <- rescale(distances_3, df, kind = "z")
df$dist_norm_3


### Covariates

In [None]:
# Relevel factors
# Convert to factors and put the reference level first: "genre" for the
# playlist type, "spotify" for the curator
df$type <- df$type %>%
    as.factor() %>%
    relevel(ref = "genre")
table(df$type)

df$curator <- df$curator %>%
    as.factor() %>%
    relevel(ref = "spotify")
table(df$curator)


In [None]:
# Log Followers
# Natural log of the follower count
# NOTE(review): log() yields -Inf for a follower count of 0 — confirm none exist
df$log_followers <- log(df$playlist_followers)
df$log_followers

# Followers class
summary(df$playlist_followers)

# Upper bounds of the "low", "mid_low" and "mid_high" classes
# NOTE(review): presumably the quartiles printed by summary() above — verify
# they still match if the data change
follower_cutoffs <- c(81624, 238625, 755174)
follower_labels <- c("low", "mid_low", "mid_high", "high")

# left.open = TRUE makes each bin closed on the right, so a count equal to a
# cutoff falls in the lower class; missing counts stay NA
df$followers_class <- follower_labels[
    findInterval(df$playlist_followers, follower_cutoffs, left.open = TRUE) + 1
]
table(df$followers_class)


## Descriptive stats

### Stats

In [None]:
# Number of tracks per playlist
# Count the rows of df for each playlist_name, then summarise those counts
df %>%
  dplyr::count(playlist_name, name = "nb_tracks") %>%
  summary()


In [None]:
# Number of clusters per playlist
# Keep one row per (playlist, cluster) pair, count the pairs per playlist,
# then summarise those counts
df %>%
  distinct(playlist_name, cluster) %>%
  dplyr::count(playlist_name, name = "nb_clusters") %>%
  summary()

# Same computation for the second clustering (cluster_2)
df %>%
  distinct(playlist_name, cluster_2) %>%
  dplyr::count(playlist_name, name = "nb_clusters_2") %>%
  summary()


In [None]:
# HH-Index
# One row per distinct (playlist_name, hh_index) pair, then summary statistics
# of the index
playlist_hh_index <- df %>%
  select(playlist_name, hh_index) %>%
  distinct()
summary(playlist_hh_index[["hh_index"]])

# Same for the second HH-index column
playlist_hh_index_2 <- df %>%
  select(playlist_name, hh_index_2) %>%
  distinct()
summary(playlist_hh_index_2[["hh_index_2"]])


In [None]:
# Mean of mean_distance within each playlist type, computed over the rows of
# df as they are (no collapsing to one row per playlist first)
with(df, tapply(mean_distance, type, mean))


### Plots

In [None]:
# Extract unique diversity values per playlist
# One row per distinct (playlist_name, diversity_clean) pair
playlist_diversity <- df %>%
  select(playlist_name, diversity_clean) %>%
  distinct()

# Plot the histogram using one observation per playlist
# NOTE(review): with x limits of [0, 1], bins that straddle a limit may be
# dropped by ggplot2 — check the warnings when rendering
playlist_diversity %>%
  ggplot(aes(x = diversity_clean)) +
  geom_histogram(binwidth = 0.025, color = "black", fill = "#a7a7f9") +
  scale_x_continuous(limits = c(0, 1)) +
  theme_bw() +
  theme(
    # Only dotted horizontal major grid lines are kept
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    panel.grid.major.y = element_line(linetype = "dotted"),
    panel.grid.minor.y = element_blank(),
    aspect.ratio = 0.8
  ) +
  labs(
    x = expression(k*alpha*" Rao-Stirling"), #nolint
    y = "Frequency"
  )


In [None]:
# Extract unique diversity values per playlist
# One row per distinct (playlist_name, diversity_clean_2) pair
playlist_diversity <- df %>%
  select(playlist_name, diversity_clean_2) %>%
  distinct()

# Plot the histogram using one observation per playlist
# NOTE(review): with x limits of [0, 1], bins that straddle a limit may be
# dropped by ggplot2 — check the warnings when rendering
playlist_diversity %>%
  ggplot(aes(x = diversity_clean_2)) +
  geom_histogram(binwidth = 0.025, color = "black", fill = "#a7a7f9") +
  scale_x_continuous(limits = c(0, 1)) +
  theme_bw() +
  theme(
    # Only dotted horizontal major grid lines are kept
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    panel.grid.major.y = element_line(linetype = "dotted"),
    panel.grid.minor.y = element_blank(),
    aspect.ratio = 0.8
  ) +
  labs(
    x = expression(k*beta*" Rao-Stirling"), #nolint
    y = "Frequency"
  )


## Models

### OLS models

In [None]:
# Collapse data to playlist level
# Group the rows of df by playlist_name and reduce each group to summary
# values; this is the data set used by the lm() models.
# NOTE(review): unique() returns more than one value if a column is not
# constant within a playlist_name, which would yield several rows for that
# playlist — confirm each playlist ends up with exactly one row.
playlist_level_data <- df %>%
  group_by(playlist_name) %>%
  summarise(
    # Track-level variables: mean over the playlist's rows, ignoring NAs
    avg_track_popularity = mean(track__popularity, na.rm = TRUE),
    avg_artist_popularity = mean(artist_popularity, na.rm = TRUE),

    # Playlist-level variables: distinct value(s) found in the group
    diversity_norm = unique(diversity_norm),
    diversity_norm_2 = unique(diversity_norm_2),
    dist_norm_3 = unique(dist_norm_3),
    log_followers = unique(log_followers),
    type = unique(type),
    curator = unique(curator),

    # Number of rows of df in the group
    nb_tracks = n(),
    .groups = "drop"
  )


In [None]:
# Robust OLS model diversity 1
# Playlist-level regression of diversity_norm on followers, playlist size,
# average popularity and playlist type
model <- lm(
    diversity_norm ~ log_followers + nb_tracks + avg_track_popularity +
        avg_artist_popularity + type,
    data = playlist_level_data
)

# Re-test the coefficients with vcovCL clustered on `type` (HC1 adjustment)
# and 2 degrees of freedom for the t tests
model_robust <- coeftest(
    model,
    vcov. = vcovCL,
    cluster = ~type,
    type = "HC1",
    df = 2
)

# Plain and cluster-robust results side by side
stargazer(model, model_robust, type = "text")


In [None]:
# Robust OLS model diversity 2
# Playlist-level regression of diversity_norm_2 on followers, playlist size,
# average popularity and playlist type
model <- lm(
    diversity_norm_2 ~ log_followers + nb_tracks + avg_track_popularity +
        avg_artist_popularity + type,
    data = playlist_level_data
)

# Re-test the coefficients with vcovCL clustered on `type` (HC1 adjustment)
# and 2 degrees of freedom for the t tests
model_robust <- coeftest(
    model,
    vcov. = vcovCL,
    cluster = ~type,
    type = "HC1",
    df = 2
)

# Plain and cluster-robust results side by side
stargazer(model, model_robust, type = "text")


In [None]:
# Robust OLS model distance
# Playlist-level regression of dist_norm_3 on followers, playlist size,
# average popularity and playlist type
model <- lm(
    dist_norm_3 ~ log_followers + nb_tracks + avg_track_popularity +
        avg_artist_popularity + type,
    data = playlist_level_data
)

# Re-test the coefficients with vcovCL clustered on `type` (HC1 adjustment)
# and 2 degrees of freedom for the t tests
model_robust <- coeftest(
    model,
    vcov. = vcovCL,
    cluster = ~type,
    type = "HC1",
    df = 2
)

# Plain and cluster-robust results side by side
stargazer(model, model_robust, type = "text")


### Fixed effect models

In [None]:
# Fixed effects model playlist type and diversity 1
# Keep only the columns handed to the model
# NOTE(review): playlist_name is kept here but not used by feols() below —
# confirm whether it was meant for clustering
df_subset <- df[
  c("diversity_norm", "type", "log_followers", "track_name", "playlist_name")
]

# Row-level model on df; track_name (after `|`) enters as a fixed effect
model <- feols(
  fml = diversity_norm ~ type + log_followers | track_name,
  data = df_subset
)

# Display the summary of the model
summary(model)


In [None]:
# Fixed effects model playlist type and diversity 2
# Keep only the columns handed to the model
# NOTE(review): playlist_name is kept here but not used by feols() below —
# confirm whether it was meant for clustering
df_subset <- df[
  c("diversity_norm_2", "type", "log_followers", "track_name", "playlist_name")
]

# Row-level model on df; track_name (after `|`) enters as a fixed effect
model <- feols(
  fml = diversity_norm_2 ~ type + log_followers | track_name,
  data = df_subset
)

# Display the summary of the model
summary(model)


In [None]:
# Fixed effects model playlist type and distance
# Keep only the columns handed to the model
# NOTE(review): playlist_name is kept here but not used by feols() below —
# confirm whether it was meant for clustering
df_subset <- df[
  c("dist_norm_3", "type", "log_followers", "track_name", "playlist_name")
]

# Row-level model on df; track_name (after `|`) enters as a fixed effect
model <- feols(
  fml = dist_norm_3 ~ type + log_followers | track_name,
  data = df_subset
)

# Display the summary of the model
summary(model)


### Secondary models

In [None]:
# OLS model playlist curator and diversity 1
# Playlist-level regression of diversity_norm on followers, playlist size,
# average popularity and curator
model <- lm(
    diversity_norm ~ log_followers + nb_tracks + avg_track_popularity +
        avg_artist_popularity + curator,
    data = playlist_level_data
)

# Re-test the coefficients with vcovCL clustered on `curator` (HC1 adjustment)
# NOTE(review): df = 2 is the value used in the type models; curator only
# takes two values in this file ("spotify", "external") — confirm the
# intended degrees of freedom
model_robust <- coeftest(
    model,
    vcov. = vcovCL,
    cluster = ~curator,
    type = "HC1",
    df = 2
)

# Plain and cluster-robust results side by side
stargazer(model, model_robust, type = "text")


In [None]:
# OLS model playlist curator and diversity 2
# Playlist-level regression of diversity_norm_2 on followers, playlist size,
# average popularity and curator
model <- lm(
    diversity_norm_2 ~ log_followers + nb_tracks + avg_track_popularity +
        avg_artist_popularity + curator,
    data = playlist_level_data
)

# Re-test the coefficients with vcovCL clustered on `curator` (HC1 adjustment)
# NOTE(review): df = 2 is the value used in the type models; curator only
# takes two values in this file ("spotify", "external") — confirm the
# intended degrees of freedom
model_robust <- coeftest(
    model,
    vcov. = vcovCL,
    cluster = ~curator,
    type = "HC1",
    df = 2
)

# Plain and cluster-robust results side by side
stargazer(model, model_robust, type = "text")


In [None]:
# OLS model playlist curator and distance
# Playlist-level regression of dist_norm_3 on followers, playlist size,
# average popularity and curator
model <- lm(
    dist_norm_3 ~ log_followers + nb_tracks + avg_track_popularity +
        avg_artist_popularity + curator,
    data = playlist_level_data
)

# Re-test the coefficients with vcovCL clustered on `curator` (HC1 adjustment)
# NOTE(review): df = 2 is the value used in the type models; curator only
# takes two values in this file ("spotify", "external") — confirm the
# intended degrees of freedom
model_robust <- coeftest(
    model,
    vcov. = vcovCL,
    cluster = ~curator,
    type = "HC1",
    df = 2
)

# Plain and cluster-robust results side by side
stargazer(model, model_robust, type = "text")


In [None]:
# Fixed effects model playlist curator and diversity 1
# Keep only the columns handed to the model
# NOTE(review): playlist_name is kept here but not used by feols() below —
# confirm whether it was meant for clustering
df_subset <- df[
  c("diversity_norm", "curator", "log_followers", "track_name", "playlist_name")
]

# Row-level model on df; track_name (after `|`) enters as a fixed effect
model <- feols(
  fml = diversity_norm ~ curator + log_followers | track_name,
  data = df_subset
)

# Display the summary of the model
summary(model)


In [None]:
# Fixed effects model playlist curator and diversity 2
# Keep only the columns handed to the model
# NOTE(review): playlist_name is kept here but not used by feols() below —
# confirm whether it was meant for clustering
df_subset <- df[
  c(
    "diversity_norm_2", "curator", "log_followers", "track_name",
    "playlist_name"
  )
]

# Row-level model on df; track_name (after `|`) enters as a fixed effect
model <- feols(
  fml = diversity_norm_2 ~ curator + log_followers | track_name,
  data = df_subset
)

# Display the summary of the model
summary(model)
