# Data analysis

## Setup

In [None]:
# Load packages
library(readxl)
library(dplyr)
library(ggplot2)
library(stats)
library(stargazer)
library(purrr)
library(stringr)
library(tidyverse)
library(lessR)
library(fixest)
library(sandwich)
library(lmtest)
library(car)
library(effsize)


In [None]:
# Playlist workbooks to load, one file per playlist category.
file_names <- c(
  "df_rap_final_2.xlsx", "df_pop_final_2.xlsx", "df_variete_final_2.xlsx",
  "df_ete_final_2.xlsx", "df_latino_final_2.xlsx", "df_ambiance_final_2.xlsx",
  "df_electro_final_2.xlsx", "df_rock_final_2.xlsx", "df_indie_final_2.xlsx",
  "df_sport_final_2.xlsx", "df_rnb_final_2.xlsx", "df_afro_final_2.xlsx",
  "df_kpop_final_2.xlsx", "df_detente_final_2.xlsx", "df_dormir_final_2.xlsx",
  "df_soiree_final_2.xlsx", "df_zouk_final_2.xlsx", "df_reggae_final_2.xlsx",
  "df_maison_final_2.xlsx", "df_decennies_final_2.xlsx",
  "df_alternatif_final_2.xlsx", "df_amour_final_2.xlsx",
  "df_metal_final_2.xlsx", "df_jazz_final_2.xlsx", "df_tendances_final_2.xlsx",
  "df_equal_final_2.xlsx", "df_classique_final_2.xlsx", "df_folk_final_2.xlsx",
  "df_focus_final_2.xlsx", "df_soul_final_2.xlsx", "df_netflix_final_2.xlsx",
  "df_enfants_final_2.xlsx", "df_etudiants_final_2.xlsx",
  "df_bienetre_final_2.xlsx", "df_punk_final_2.xlsx", "df_ambient_final_2.xlsx",
  "df_country_final_2.xlsx", "df_blues_final_2.xlsx", "df_arabe_final_2.xlsx",
  "df_voyage_final_2.xlsx", "df_frequency_final_2.xlsx"
)

# Category type ("genre", "mood" or "trend") of each workbook, aligned by
# position with `file_names`.
type_values <- c(
  "genre", "genre", "genre", "mood", "genre", "mood", "genre", "genre",
  "genre", "mood", "genre", "genre", "genre", "mood", "mood", "mood",
  "genre", "genre", "mood", "trend", "genre", "mood", "genre", "genre",
  "trend", "trend", "genre", "genre", "mood", "genre", "trend", "mood",
  "mood", "mood", "genre", "genre", "genre", "genre", "genre", "mood",
  "trend"
)

# The two vectors are paired by index. Without this guard a length mismatch
# would tag the trailing workbooks with NA (or silently drop extra labels).
stopifnot(length(file_names) == length(type_values))

# Read every workbook and tag each of its rows with the workbook's type.
# A function-local name is used so no half-built data frame leaks into `df`.
df_list <- lapply(seq_along(file_names), function(i) {
  sheet <- read_xlsx(file_names[i])
  sheet$type <- type_values[i]
  sheet
})

# Combine all workbooks. merge() without `by` joins on every column the two
# inputs share, and all = TRUE keeps rows from both sides (full outer join).
df <- Reduce(function(x, y) merge(x, y, all = TRUE), df_list)


In [None]:
# Persist the merged data set so later cells can reload it from disk.
writexl::write_xlsx(x = df, path = "df_final_2.xlsx")


Data management

In [None]:
# Reload the merged data set written by the previous cell.
df <- read_xlsx(path = "df_final_2.xlsx")

# Store `type` as a factor whose reference (first) level is "genre", so model
# coefficients for the other types are contrasts against "genre".
df[["type"]] <- relevel(as.factor(df[["type"]]), ref = "genre")


Dist

In [None]:
# Standardise mean_distance with rescale(kind = "z") and store it as `dist`.
df[["dist"]] <- rescale(mean_distance, df, kind = "z")

# Show the standardised values
df[["dist"]]


Covariates

In [None]:
# Rescale the popularity covariates with kind = "0to1" after coercing them to
# numeric, and display each result.
df[["track_pop_norm"]] <- rescale(as.numeric(track_pop), df, kind = "0to1")
df[["track_pop_norm"]]

df[["artist_pop_norm"]] <- rescale(as.numeric(artist_pop), df, kind = "0to1")
df[["artist_pop_norm"]]

# Number of tracks: standardised first (kind = "z"), then the standardised
# column is rescaled again with kind = "0to1".
df[["nb_tracks_res"]] <- rescale(as.numeric(nb_tracks), df, kind = "z")
df[["nb_tracks_norm"]] <- rescale(nb_tracks_res, df, kind = "0to1")
df[["nb_tracks_norm"]]


In [None]:
# Inspect the follower distribution.
# NOTE(review): the thresholds below look like the quartiles reported by this
# summary -- confirm they still match if the data changes.
summary(df[["playlist_followers"]])

# Bin playlists into four follower classes. case_when() checks conditions in
# order, so every branch after the first only needs its upper bound.
df <- dplyr::mutate(
  df,
  followers_class = dplyr::case_when(
    playlist_followers < 44010 ~ "low",
    playlist_followers < 173911 ~ "mid_low",
    playlist_followers < 561786 ~ "mid_high",
    playlist_followers >= 561786 ~ "high"
  )
)

# Class sizes
table(df[["followers_class"]])


Descriptive stats

In [None]:
# Number of rows in the data set
nrow(df)
# Total of the nb_tracks column across all rows
sum(df[["nb_tracks"]])


In [None]:
# Five-number summary and mean of the raw (unscaled) mean distance
summary(df[["mean_distance"]])


In [None]:
# Number of rows per playlist type
table(df[["type"]])


In [None]:
# Average raw mean distance within each playlist type
with(df, tapply(mean_distance, type, mean))


In [None]:
# Columns to describe
summary_cols <- c("nb_tracks", "playlist_followers", "track_pop", "artist_pop")

# summary() statistics followed by the standard deviation (last, unnamed row)
describe_column <- function(x) c(summary(x), sd(x))

sapply(df[, summary_cols], describe_column)


Histogram

In [None]:
# Histogram of the raw mean distance.
# `dist` is the z-scored version of mean_distance (centred on 0), so the
# 0.5-1.4 axis window and the "Mean distance" label only fit the unscaled
# `mean_distance` column; plotting `dist` here would drop most observations.
ggplot(df, aes(x = mean_distance)) +
  geom_histogram(binwidth = 0.025, color = "black", fill = "#a7a7f9") +
  xlim(0.5, 1.4) +
  theme_bw() +
  # Keep only dotted horizontal major grid lines.
  theme(
    panel.grid.major.y = element_line(linetype = "dotted"),
    panel.grid.minor.y = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    aspect.ratio = 0.8
  ) +
  labs(x = "Mean distance", y = "Frequency")


OLS models

Robust OLS

In [None]:
# OLS fits: full specification with controls, and the type-only specification.
lm_1 <- lm(dist ~ followers_class + nb_tracks + track_pop + type, data = df)
lm_1_1 <- lm(dist ~ type, data = df)
summary(lm_1)

# Cluster-robust covariance matrices (HC1 adjustment), clustered on `type`.
vcov_lm_1 <- vcovCL(lm_1, cluster = ~type, type = "HC1")
vcov_lm_1_1 <- vcovCL(lm_1_1, cluster = ~type, type = "HC1")

# Coefficient tests using the clustered covariance and t reference with df = 2.
# NOTE(review): df = 2 presumably is (number of type clusters - 1) -- confirm.
lm1_robust_clustered <- coeftest(lm_1, vcov. = vcov_lm_1, df = 2)
lm1_1_robust_clustered <- coeftest(lm_1_1, vcov. = vcov_lm_1_1, df = 2)
print(lm1_robust_clustered)


In [None]:
# Regression table for the type-only and full OLS fits (uses the plain lm
# objects, i.e. not the clustered coefficient tests).
stargazer::stargazer(lm_1_1, lm_1)


Control for clusters

In [None]:
# First diversity measure: same controls as before plus the number of clusters.
lm_1_2 <- lm(
  diversity_norm ~ followers_class + nb_tracks + track_pop + type +
    as.numeric(nb_clusters),
  data = df
)
vcov_lm_1_2 <- vcovCL(lm_1_2, cluster = ~type, type = "HC1")
lm1_2_robust_clustered <- coeftest(lm_1_2, vcov. = vcov_lm_1_2, df = 2)
summary(lm_1_2)

# Second diversity measure with its own cluster count.
lm_2_2 <- lm(
  diversity_norm_2 ~ followers_class + nb_tracks + track_pop + type +
    as.numeric(nb_clusters_2),
  data = df
)
vcov_lm_2_2 <- vcovCL(lm_2_2, cluster = ~type, type = "HC1")
lm2_2_robust_clustered <- coeftest(lm_2_2, vcov. = vcov_lm_2_2, df = 2)
summary(lm_2_2)

# Table of all clustered coefficient tests.
# NOTE(review): lm2_1_robust_clustered and lm2_robust_clustered are not created
# in the cells shown here -- confirm they exist before running this call.
stargazer(
  lm1_1_robust_clustered, lm1_robust_clustered, lm1_2_robust_clustered,
  lm2_1_robust_clustered, lm2_robust_clustered, lm2_2_robust_clustered
)


In [None]:
# Regression table for all six plain lm fits.
# NOTE(review): lm_2_1 and lm_2 are not created in the cells shown here --
# confirm they exist before running this call.
stargazer::stargazer(
  lm_1_1, lm_1, lm_1_2,
  lm_2_1, lm_2, lm_2_2
)


Multicollinearity

In [None]:
# Variance inflation factors for the full OLS specification
car::vif(lm_1)


Test on silhouette scores

In [None]:
# Workbooks holding the silhouette scores used in the tests below.
file_names <- c(
  "robust_afro.xlsx", "robust_alternatif.xlsx", "robust_ambiance.xlsx",
  "robust_ambient.xlsx", "robust_amour.xlsx", "robust_arabe.xlsx",
  "robust_bienetre.xlsx", "robust_blues.xlsx", "robust_classique.xlsx",
  "robust_country.xlsx", "robust_decennies.xlsx", "robust_detente.xlsx",
  "robust_dormir.xlsx", "robust_electro.xlsx", "robust_enfants.xlsx",
  "robust_equal.xlsx", "robust_ete.xlsx", "robust_etudiants.xlsx"
)

# Read each workbook into its own list element. The reader runs inside a
# function so the main analysis data frame `df` is not overwritten by the
# last workbook, which a top-level loop assigning to `df` would do.
df_list <- lapply(file_names, function(path) read_xlsx(path))

# Combine all workbooks. merge() without `by` joins on every column the two
# inputs share, and all = TRUE keeps rows from both sides (full outer join).
df_robust <- Reduce(function(x, y) merge(x, y, all = TRUE), df_list)


In [None]:
# Paired comparisons of silhouette scores: 5 versus 6 and 5 versus 4, for the
# base score columns and for the "_2" score columns.
# The effect size is requested with paired = TRUE so that it matches the
# paired design of the accompanying t-test; hedges.correction = TRUE applies
# the Hedges small-sample correction.

# 5 vs 6
t_test_5_6 <- t.test(
  df_robust$silhouette_score_5, df_robust$silhouette_score_6,
  paired = TRUE
)
cohen_d_5_6 <- cohen.d(
  df_robust$silhouette_score_5, df_robust$silhouette_score_6,
  paired = TRUE, hedges.correction = TRUE
)
t_test_5_6
cohen_d_5_6

# 5 vs 4
t_test_5_4 <- t.test(
  df_robust$silhouette_score_5, df_robust$silhouette_score_4,
  paired = TRUE
)
cohen_d_5_4 <- cohen.d(
  df_robust$silhouette_score_5, df_robust$silhouette_score_4,
  paired = TRUE, hedges.correction = TRUE
)
t_test_5_4
cohen_d_5_4

# 5 vs 6, "_2" score columns
t_test_5_6_2 <- t.test(
  df_robust$silhouette_score_5_2, df_robust$silhouette_score_6_2,
  paired = TRUE
)
cohen_d_5_6_2 <- cohen.d(
  df_robust$silhouette_score_5_2, df_robust$silhouette_score_6_2,
  paired = TRUE, hedges.correction = TRUE
)
t_test_5_6_2
cohen_d_5_6_2

# 5 vs 4, "_2" score columns
t_test_5_4_2 <- t.test(
  df_robust$silhouette_score_5_2, df_robust$silhouette_score_4_2,
  paired = TRUE
)
cohen_d_5_4_2 <- cohen.d(
  df_robust$silhouette_score_5_2, df_robust$silhouette_score_4_2,
  paired = TRUE, hedges.correction = TRUE
)
t_test_5_4_2
cohen_d_5_4_2
