# Data analysis

## Setup

In [None]:
# Load packages
library(readxl)
library(dplyr)
library(ggplot2)
library(stats)
library(stargazer)
library(purrr)
library(stringr)
library(tidyverse)
library(lubridate)
library(lessR)
library(fixest)
library(sandwich)
library(lmtest)
library(car)
library(effsize)


In [None]:
# Combine the editorial and major-label playlist exports into a single file
# Folder that holds the per-season CSV exports
base_directory <- "/Users/julienmbarki/Documents/Doctorat/Publications/Article 2/Data/Code/data_management/" #nolint

# Per-season exports to combine
file_names <- c(
    "editorial_playlists_23-24_final.csv",
    "editorial_playlists_22-23_final.csv",
    "editorial_playlists_21-22_final.csv",
    "major_playlists_23-24_final.csv",
    "major_playlists_22-23_final.csv",
    "major_playlists_21-22_final.csv"
)

# Read every export; the list keeps the order of `file_names`
df_list <- lapply(
    file_names,
    function(csv_name) read_csv(file.path(base_directory, csv_name))
)

# Successive full outer merges (all = TRUE) on the columns the exports share
df <- Reduce(function(left, right) merge(left, right, all = TRUE), df_list)

# Export to CSV (written to the current working directory)
write.csv(df, file = "df_final.csv")


In [None]:
# Combine the charts playlist exports into a single file
# Folder that holds the per-season CSV exports
base_directory <- "/Users/julienmbarki/Documents/Doctorat/Publications/Article 2/Data/Code/data_management/" #nolint

# Per-season charts exports to combine
file_names <- c(
    "charts_playlists_23-24_final.csv",
    "charts_playlists_22-23_final.csv",
    "charts_playlists_21-22_final.csv"
)

# One pre-sized slot per export, filled in the order of `file_names`
df_list <- vector("list", length(file_names))
for (i in seq_along(file_names)) {
    df_list[[i]] <- read_csv(file.path(base_directory, file_names[i]))
}

# Successive full outer merges (all = TRUE) on the columns the exports share
df <- Reduce(function(left, right) merge(left, right, all = TRUE), df_list)

# Export to CSV (written to the current working directory)
write.csv(df, file = "charts_final.csv")


## Data management

In [4]:
# Load data
# Both files are read from the current working directory
df <- read.csv(file = "df_final_ter.csv")
df_charts <- read.csv(file = "charts_final_bis.csv")

# Stack the two data frames row-wise; the combined sample is used by the
# curator models further down
df_combined <- list(df, df_charts) %>%
    bind_rows()


### Main measure

In [None]:
# Extract numeric values
# Drop every character that is not a digit, a dot or a minus sign, then
# parse what is left as a number
df[["diversity_clean"]] <- df$stirling_index %>%
    gsub("[^[:digit:].-]", "", .) %>%
    as.numeric()
df[["diversity_clean"]]

df[["diversity_clean_2"]] <- df$stirling_index_2 %>%
    gsub("[^[:digit:].-]", "", .) %>%
    as.numeric()
df[["diversity_clean_2"]]

# Scale values (lessR rescale with kind = "z")
df[["diversity_norm"]] <- rescale(diversity_clean, df, kind = "z")
df[["diversity_norm"]]

df[["diversity_norm_2"]] <- rescale(diversity_clean_2, df, kind = "z")
df[["diversity_norm_2"]]

# Log values (shifted by one before taking the log)
df[["diversity_log"]] <- log(df[["diversity_clean"]] + 1)
df[["diversity_log"]]

df[["diversity_log_2"]] <- log(df[["diversity_clean_2"]] + 1)
df[["diversity_log_2"]]


### Secondary measures

In [None]:
# HH-Index
# Rescale both HH-index variants (lessR rescale with kind = "z") and print
# the resulting columns
df[["hhi_norm"]] <- rescale(hh_index, df, kind = "z")
df[["hhi_norm"]]

df[["hhi_norm_2"]] <- rescale(hh_index_2, df, kind = "z")
df[["hhi_norm_2"]]


In [None]:
# Distances
# Rescale the three distance variants (lessR rescale with kind = "z") and
# print the resulting columns
df[["dist_norm"]] <- rescale(distances, df, kind = "z")
df[["dist_norm"]]

df[["dist_norm_2"]] <- rescale(distances_2, df, kind = "z")
df[["dist_norm_2"]]

df[["dist_norm_3"]] <- rescale(distances_3, df, kind = "z")
df[["dist_norm_3"]]


### Covariates

In [None]:
# Relevel factors
# "genre" becomes the reference level of the editorial type
df$editorial_type <- df$editorial_type %>%
    as.factor() %>%
    relevel(ref = "genre")
table(df$editorial_type)

# `curator` is the playlist type with "Editorial" as the reference level
df$curator <- df$playlist_type %>%
    as.factor() %>%
    relevel(ref = "Editorial")
table(df$curator)


In [None]:
# Log Followers
df[["log_followers"]] <- log(df[["playlist_followers"]])
df[["log_followers"]]

# Followers class
# The cut points below are hard-coded; presumably they are the quartiles
# printed by summary() -- confirm if the data change.
summary(df[["playlist_followers"]])
df <- df %>%
    mutate(
        # Branches are evaluated in order, so each one only needs its upper
        # bound; missing follower counts stay NA.
        followers_class = case_when(
            playlist_followers <= 25341 ~ "low",
            playlist_followers <= 98114 ~ "mid_low",
            playlist_followers <= 260541 ~ "mid_high",
            playlist_followers > 260541 ~ "high"
        )
    )
table(df[["followers_class"]])


In [None]:
# Playlist dates
# A playlist is "frontline" when its mean track date falls within the 1.5
# years before the collection date, and "backline" otherwise. Rows where the
# comparison is NA (e.g. a missing date) fall through to "backline".
df <- df %>%
    mutate(
        collection_date = as.Date(collection_date),
        mean_track_date = as.Date(mean_track_date)
    ) %>%
    mutate(
        playlist_date = case_when(
            # Subtracting a fractional-day duration from a Date can return a
            # date-time rather than a Date; convert the cutoff back to Date
            # so both sides of the comparison share one class.
            mean_track_date >
                as.Date(collection_date - dyears(1.5)) ~ "frontline",
            TRUE ~ "backline"
        )
    )

# "frontline" is the reference level in the models
df$playlist_date <- relevel(as.factor(df$playlist_date), ref = "frontline")
table(df$playlist_date)


## Descriptive stats

### Stats

In [None]:
# Number of unique playlists and per playlist type
length(unique(df$playlist_id))

df %>%
    group_by(editorial_type) %>%
    summarize(n_distinct(playlist_id))

# Number of tracks per playlist and by playlist type
summary(df$nb_tracks)
# na.rm = TRUE keeps this consistent with the grouped statistics below, which
# all ignore missing values
sd(df$nb_tracks, na.rm = TRUE)

df %>%
    group_by(playlist_type) %>%
    summarise(
        count = n(),
        mean_tracks = mean(nb_tracks, na.rm = TRUE),
        median_tracks = median(nb_tracks, na.rm = TRUE),
        min_tracks = min(nb_tracks, na.rm = TRUE),
        max_tracks = max(nb_tracks, na.rm = TRUE),
        sd_tracks = sd(nb_tracks, na.rm = TRUE)
    ) %>%
    arrange(desc(mean_tracks))


In [None]:
# Followers
summary(df$playlist_followers)
# na.rm = TRUE keeps this consistent with the grouped statistics below, which
# all ignore missing values
sd(df$playlist_followers, na.rm = TRUE)

# Followers per playlist type, sorted by decreasing mean
df %>%
    group_by(playlist_type) %>%
    summarise(
        count = n(),
        mean_followers = mean(playlist_followers, na.rm = TRUE),
        median_followers = median(playlist_followers, na.rm = TRUE),
        min_followers = min(playlist_followers, na.rm = TRUE),
        max_followers = max(playlist_followers, na.rm = TRUE),
        sd_followers = sd(playlist_followers, na.rm = TRUE)
    ) %>%
    arrange(desc(mean_followers))


In [None]:
# Number of clusters per playlist
# Overall distribution of both cluster counts
summary(df[["nb_clusters"]])
summary(df[["nb_clusters_2"]])

# Row count and mean cluster counts for each editorial type
summarise(
    group_by(df, editorial_type),
    count = n(),
    mean_clusters_1 = mean(nb_clusters, na.rm = TRUE),
    mean_clusters_2 = mean(nb_clusters_2, na.rm = TRUE)
)


In [None]:
# HH-Index
# Overall distribution of both HH-index variants
summary(df[["hh_index"]])
summary(df[["hh_index_2"]])

# Row count and mean HH-index for each editorial type
summarise(
    group_by(df, editorial_type),
    count = n(),
    mean_hhi_1 = mean(hh_index, na.rm = TRUE),
    mean_hhi_2 = mean(hh_index_2, na.rm = TRUE)
)


In [None]:
# Distances index
# Overall distribution of the first two distance variants
summary(df[["distances"]])
summary(df[["distances_2"]])

# Row count and mean distances for each editorial type
summarise(
    group_by(df, editorial_type),
    count = n(),
    mean_dist_1 = mean(distances, na.rm = TRUE),
    mean_dist_2 = mean(distances_2, na.rm = TRUE)
)


In [None]:
# Stirling index
# Overall distribution of both cleaned Stirling measures
summary(df[["diversity_clean"]])
summary(df[["diversity_clean_2"]])

# Row count and mean Stirling measures for each editorial type
summarise(
    group_by(df, editorial_type),
    count = n(),
    mean_div_1 = mean(diversity_clean, na.rm = TRUE),
    mean_div_2 = mean(diversity_clean_2, na.rm = TRUE)
)


### Plots

In [None]:
# Histogram of the first cleaned Stirling measure (k alpha)
hist_alpha <- ggplot(df, aes(x = diversity_clean)) +
  geom_histogram(bins = 25, fill = "lightblue", color = "black") +
  labs(
    x = expression(k*alpha*" Rao-Stirling"), #nolint
    y = "Frequency"
  ) +
  theme_minimal() +
  theme(
    # Keep only the horizontal major grid lines
    panel.grid.major.y = element_line(color = "gray80"),
    panel.grid.minor.y = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    # Solid black axis lines
    axis.line.x = element_line(linewidth = 1, color = "black"),
    axis.line.y = element_line(linewidth = 1, color = "black")
  )
hist_alpha

# Save the plot
ggsave(
  "div_plot_1.png",
  plot = hist_alpha,
  width = 8,
  height = 8,
  dpi = 300
)


In [None]:
# Histogram of the second cleaned Stirling measure (k beta)
hist_beta <- ggplot(df, aes(x = diversity_clean_2)) +
  geom_histogram(bins = 25, fill = "lightblue", color = "black") +
  labs(
    x = expression(k*beta*" Rao-Stirling"), #nolint
    y = "Frequency"
  ) +
  theme_minimal() +
  theme(
    # Keep only the horizontal major grid lines
    panel.grid.major.y = element_line(color = "gray80"),
    panel.grid.minor.y = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    # Solid black axis lines
    axis.line.x = element_line(linewidth = 1, color = "black"),
    axis.line.y = element_line(linewidth = 1, color = "black")
  )
hist_beta

# Save the plot
ggsave(
  "div_plot_2.png",
  plot = hist_beta,
  width = 8,
  height = 8,
  dpi = 300
)


In [None]:
# Compute mean Stirling Index (k alpha) per collection date, ignoring
# missing values
df_mean <- df %>%
  group_by(collection_date) %>%
  summarize(mean_stirling = mean(diversity_clean, na.rm = TRUE))

# Time series plot (blue) with a linear trend line (red)
ggplot(df_mean, aes(x = collection_date, y = mean_stirling)) +
  geom_line(color = "blue", linewidth = 0.5) +
  geom_smooth(method = "lm", se = FALSE, color = "red", linewidth = 1) +
  # No trailing empty argument in labs()/theme(): an empty last argument is
  # not a valid call in every function that collects `...`
  labs(
    x = "Collection Date",
    y = expression(k*alpha*" Rao-Stirling") #nolint
  ) +
  theme_minimal() +
  theme(
    panel.background = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    panel.grid.major.y = element_line(color = "gray80"),
    panel.grid.minor.y = element_blank(),
    axis.title.y = element_text(vjust = +2, size = 12),
    axis.title.x = element_text(vjust = 0.5, size = 12),
    axis.line.x = element_line(linewidth = 1, color = "black"),
    axis.line.y = element_line(linewidth = 1, color = "black"),
    axis.text.y = element_text(
      angle = 90,
      color = "black",
      size = 11,
      face = 1,
      hjust = 0.5
    ),
    aspect.ratio = 0.6
  )

# Save the plot
ggsave("div_time_plot_1.png", width = 8, height = 5, dpi = 300)


In [None]:
# Time series plot of the mean k beta Stirling measure vs collection_date
# Missing values are ignored, as in the k alpha version of this plot; without
# na.rm a single NA would turn that date's mean into NA.
df_mean <- df %>%
  group_by(collection_date) %>%
  summarize(mean_stirling = mean(diversity_clean_2, na.rm = TRUE))

ggplot(
  df_mean,
  aes(
    x = collection_date,
    y = mean_stirling
  )
  ) +
  geom_line(color = "blue", linewidth = 0.5) +  # Original time series
  geom_smooth(method = "lm", se = FALSE, color = "red", linewidth = 1) +
  # No trailing empty argument in labs()/theme(): an empty last argument is
  # not a valid call in every function that collects `...`
  labs(
    x = "Collection Date",
    y = expression(k*beta*" Rao-Stirling") #nolint
  ) +
  theme_minimal() +
  theme(
    panel.background = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    panel.grid.major.y = element_line(color = "gray80"),
    panel.grid.minor.y = element_blank(),
    axis.title.y = element_text(vjust = +2, size = 12),
    axis.title.x = element_text(vjust = 0.5, size = 12),
    axis.line.x = element_line(linewidth = 1, color = "black"),
    axis.line.y = element_line(linewidth = 1, color = "black"),
    axis.text.y = element_text(
      angle = 90,
      color = "black",
      size = 11,
      face = 1,
      hjust = 0.5
    ),
    aspect.ratio = 0.6
  )

# Save the plot
ggsave(
    "div_time_plot_2.png",
    width = 8,
    height = 5,
    dpi = 300
)


#### Editorial type

In [None]:
# Time series plot of the mean k alpha Stirling measure vs collection_date,
# one line per editorial type
# na.rm = TRUE matches the other summaries in this notebook (a single NA
# would otherwise turn a group's mean into NA); .groups = "drop" returns an
# ungrouped table.
df_mean <- df %>%
  group_by(collection_date, editorial_type) %>%
  summarize(
    mean_stirling = mean(diversity_clean, na.rm = TRUE),
    .groups = "drop"
  )

ggplot(
  df_mean,
  aes(
    x = collection_date,
    y = mean_stirling,
    color = editorial_type
  )
  ) +
  geom_line(linewidth = 0.5) +
  # Linear trend per editorial type
  geom_smooth(method = "lm", se = FALSE, linewidth = 1) +
  labs(
    x = "Collection Date",
    y = expression(k*alpha*" Rao-Stirling") #nolint
  ) +
  theme(
    panel.background = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.major.y = element_line(color = "gray80"),
    axis.title.y = element_text(vjust = +2, size = 12),
    axis.title.x = element_text(vjust = 0.5, size = 12),
    axis.line.x = element_line(linewidth = 1, color = "black"),
    axis.line.y = element_line(linewidth = 1, color = "black"),
    axis.text.y = element_text(
      angle = 90,
      color = "black",
      size = 11,
      face = 1,
      hjust = 0.5
    ),
    aspect.ratio = 0.6,
    legend.position = "bottom"
  )

# Save the plot
ggsave(
    "div_1_time_plot.png",
    width = 8,
    height = 5,
    dpi = 300
)


In [None]:
# Time series plot of the mean k beta Stirling measure vs collection_date,
# one line per editorial type
# na.rm = TRUE matches the other summaries in this notebook (a single NA
# would otherwise turn a group's mean into NA); .groups = "drop" returns an
# ungrouped table.
df_mean <- df %>%
  group_by(collection_date, editorial_type) %>%
  summarize(
    mean_stirling = mean(diversity_clean_2, na.rm = TRUE),
    .groups = "drop"
  )

ggplot(
  df_mean,
  aes(
    x = collection_date,
    y = mean_stirling,
    color = editorial_type
  )
  ) +
  geom_line(linewidth = 0.5) +
  # Linear trend per editorial type
  geom_smooth(method = "lm", se = FALSE, linewidth = 1) +
  labs(
    x = "Collection Date",
    y = expression(k*beta*" Rao-Stirling") #nolint
  ) +
  theme(
    panel.background = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.major.y = element_line(color = "gray80"),
    axis.title.y = element_text(vjust = +2, size = 12),
    axis.title.x = element_text(vjust = 0.5, size = 12),
    axis.line.x = element_line(linewidth = 1, color = "black"),
    axis.line.y = element_line(linewidth = 1, color = "black"),
    axis.text.y = element_text(
      angle = 90,
      color = "black",
      size = 11,
      face = 1,
      hjust = 0.5
    ),
    aspect.ratio = 0.6,
    legend.position = "bottom"
  )

# Save the plot
ggsave(
    "div_2_time_plot.png",
    width = 8,
    height = 5,
    dpi = 300
)


In [None]:
# Time series plot of the mean of distances_3 vs collection_date, one line
# per editorial type
# na.rm = TRUE matches the other summaries in this notebook (a single NA
# would otherwise turn a group's mean into NA); .groups = "drop" returns an
# ungrouped table.
df_mean <- df %>%
  group_by(collection_date, editorial_type) %>%
  summarize(
    mean_distances = mean(distances_3, na.rm = TRUE),
    .groups = "drop"
  )

ggplot(
  df_mean,
  aes(
    x = collection_date,
    y = mean_distances,
    color = editorial_type
  )
  ) +
  geom_line(linewidth = 0.5) +
  # Linear trend per editorial type
  geom_smooth(method = "lm", se = FALSE, linewidth = 1) +
  labs(
    x = "Collection Date",
    y = "Mean distance"
  ) +
  theme(
    panel.background = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.major.y = element_line(color = "gray80"),
    axis.title.y = element_text(vjust = +2, size = 12),
    axis.title.x = element_text(vjust = 0.5, size = 12),
    axis.line.x = element_line(linewidth = 1, color = "black"),
    axis.line.y = element_line(linewidth = 1, color = "black"),
    axis.text.y = element_text(
      angle = 90,
      color = "black",
      size = 11,
      face = 1,
      hjust = 0.5
    ),
    aspect.ratio = 0.6,
    legend.position = "bottom"
  )

# Save the plot
ggsave(
    "div_3_time_plot.png",
    width = 8,
    height = 5,
    dpi = 300
)


#### Curator

In [None]:
# Time series plot of the mean k alpha Stirling measure vs collection_date,
# one line per playlist type (curator)
# na.rm = TRUE matches the other summaries in this notebook (a single NA
# would otherwise turn a group's mean into NA); .groups = "drop" returns an
# ungrouped table.
df_mean <- df %>%
  group_by(collection_date, playlist_type) %>%
  summarize(
    mean_stirling = mean(diversity_clean, na.rm = TRUE),
    .groups = "drop"
  )

ggplot(
  df_mean,
  aes(
    x = collection_date,
    y = mean_stirling,
    color = playlist_type
  )
  ) +
  geom_line() +
  #geom_point() +
  labs(
    x = "Collection Date",
    y = expression(k*alpha*" Rao-Stirling") #nolint
  ) +
  theme(
    panel.background = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.major.y = element_line(color = "gray80"),
    axis.title.y = element_text(vjust = +2, size = 12),
    axis.title.x = element_text(vjust = 0.5, size = 12),
    axis.line.x = element_line(linewidth = 1, color = "black"),
    axis.line.y = element_line(linewidth = 1, color = "black"),
    axis.text.y = element_text(
      angle = 90,
      color = "black",
      size = 11,
      face = 1,
      hjust = 0.5
    ),
    aspect.ratio = 0.6,
    legend.position = "bottom"
  )

# Save the plot under a curator-specific name: "div_1_time_plot.png" is
# already used for the editorial-type version of this figure and would be
# overwritten.
ggsave(
    "div_1_time_plot_curator.png",
    width = 8,
    height = 5,
    dpi = 300
)


In [None]:
# Time series plot of the mean k beta Stirling measure vs collection_date,
# one line per playlist type (curator)
# na.rm = TRUE matches the other summaries in this notebook (a single NA
# would otherwise turn a group's mean into NA); .groups = "drop" returns an
# ungrouped table.
df_mean <- df %>%
  group_by(collection_date, playlist_type) %>%
  summarize(
    mean_stirling = mean(diversity_clean_2, na.rm = TRUE),
    .groups = "drop"
  )

ggplot(
  df_mean,
  aes(
    x = collection_date,
    y = mean_stirling,
    color = playlist_type
  )
  ) +
  geom_line() +
  #geom_point() +
  labs(
    x = "Collection Date",
    y = expression(k*beta*" Rao-Stirling") #nolint
  ) +
  theme(
    panel.background = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.major.y = element_line(color = "gray80"),
    axis.title.y = element_text(vjust = +2, size = 12),
    axis.title.x = element_text(vjust = 0.5, size = 12),
    axis.line.x = element_line(linewidth = 1, color = "black"),
    axis.line.y = element_line(linewidth = 1, color = "black"),
    axis.text.y = element_text(
      angle = 90,
      color = "black",
      size = 11,
      face = 1,
      hjust = 0.5
    ),
    aspect.ratio = 0.6,
    legend.position = "bottom"
  )

# Save the plot under a curator-specific name: "div_2_time_plot.png" is
# already used for the editorial-type version of this figure and would be
# overwritten.
ggsave(
    "div_2_time_plot_curator.png",
    width = 8,
    height = 5,
    dpi = 300
)


In [None]:
# Time series plot of the mean of distances_3 vs collection_date, one line
# per playlist type (curator)
# na.rm = TRUE matches the other summaries in this notebook (a single NA
# would otherwise turn a group's mean into NA); .groups = "drop" returns an
# ungrouped table.
df_mean <- df %>%
  group_by(collection_date, playlist_type) %>%
  summarize(
    mean_distances = mean(distances_3, na.rm = TRUE),
    .groups = "drop"
  )

ggplot(
  df_mean,
  aes(
    x = collection_date,
    y = mean_distances,
    color = playlist_type
  )
  ) +
  geom_line() +
  #geom_point() +
  labs(
    x = "Collection Date",
    y = "Mean distance"
  ) +
  theme(
    panel.background = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.major.y = element_line(color = "gray80"),
    axis.title.y = element_text(vjust = +2, size = 12),
    axis.title.x = element_text(vjust = 0.5, size = 12),
    axis.line.x = element_line(linewidth = 1, color = "black"),
    axis.line.y = element_line(linewidth = 1, color = "black"),
    axis.text.y = element_text(
      angle = 90,
      color = "black",
      size = 11,
      face = 1,
      hjust = 0.5
    ),
    aspect.ratio = 0.6,
    legend.position = "bottom"
  )

# Save the plot under a curator-specific name: "div_3_time_plot.png" is
# already used for the editorial-type version of this figure and would be
# overwritten.
ggsave(
    "div_3_time_plot_curator.png",
    width = 8,
    height = 5,
    dpi = 300
)


## Models

### FE static

In [None]:
# k alpha Rao-Stirling
# NOTE(review): standard errors are clustered on editorial_type, which is
# also the regressor of interest -- confirm there are enough clusters for
# cluster-robust inference.

# Baseline: editorial type only, no controls or fixed effects
model_1 <- feols(
    fml = diversity_norm ~ editorial_type,
    data = df,
    cluster = ~editorial_type
)
summary(model_1)

# Controls added, with collection-date fixed effects
model_2 <- feols(
    fml = diversity_norm ~
        editorial_type + nb_tracks + playlist_date + followers_class |
        collection_date,
    data = df,
    cluster = ~editorial_type
)
summary(model_2)


In [None]:
# k beta Rao-Stirling
# Same two specifications as for k alpha; note that model_1 and model_2 are
# reassigned here.

# Baseline: editorial type only, no controls or fixed effects
model_1 <- feols(
    fml = diversity_norm_2 ~ editorial_type,
    data = df,
    cluster = ~editorial_type
)
summary(model_1)

# Controls added, with collection-date fixed effects
model_2 <- feols(
    fml = diversity_norm_2 ~
        editorial_type + nb_tracks + playlist_date + followers_class |
        collection_date,
    data = df,
    cluster = ~editorial_type
)
summary(model_2)


In [None]:
# Distance index
# Controls plus collection-date fixed effects, standard errors clustered on
# editorial_type
model <- feols(
    fml = dist_norm_3 ~
        editorial_type + nb_tracks + playlist_date + followers_class |
        collection_date,
    data = df,
    cluster = ~editorial_type
)
summary(model)


### Time series

In [None]:
# Create a time index
df <- df %>%
    mutate(
        # Elapsed time since the earliest collection date
        time_index = as.numeric(collection_date - min(collection_date)),
        # Calendar month label, e.g. "2023-05"
        year_month = format(as.Date(collection_date), "%Y-%m"),
        # Rank of the month label among the months present in the data
        # NOTE(review): months with no observations are skipped by this rank
        month_index = as.numeric(as.factor(year_month))
    )


In [None]:
# Linear time trend with a followers-class interaction; standard errors are
# two-way clustered on playlist and collection date.

# k alpha Rao-Stirling
model_trend <- feols(
  fml = diversity_norm ~
    month_index + nb_tracks + playlist_date + followers_class * month_index,
  data = df,
  cluster = ~playlist_id + collection_date
)
summary(model_trend)

# k beta Rao-Stirling
model_trend_2 <- feols(
  fml = diversity_norm_2 ~
    month_index + nb_tracks + playlist_date + followers_class * month_index,
  data = df,
  cluster = ~playlist_id + collection_date
)
summary(model_trend_2)


In [None]:
# Evolution of Diversity Over Time with Playlist Type Interaction
# month_index is interacted with both editorial_type and followers_class;
# standard errors are two-way clustered on playlist and collection date.

# k alpha Rao-Stirling
model_trend <- feols(
  fml = diversity_norm ~
    month_index * editorial_type +
    nb_tracks + playlist_date + followers_class * month_index,
  data = df,
  cluster = ~playlist_id + collection_date
)
summary(model_trend)

# k beta Rao-Stirling (model_trend is reassigned)
model_trend <- feols(
  fml = diversity_norm_2 ~
    month_index * editorial_type +
    nb_tracks + playlist_date + followers_class * month_index,
  data = df,
  cluster = ~playlist_id + collection_date
)
summary(model_trend)


In [None]:
# Distance index: time trend with an editorial-type interaction
# NOTE(review): unlike the diversity trend models, this one uses time_index
# rather than month_index and has no editorial_type main effect (only
# time_index:editorial_type) -- confirm this is intended.
model_trend <- feols(
  fml = dist_norm_3 ~
    time_index + time_index:editorial_type +
    nb_tracks + playlist_date + followers_class * time_index,
  data = df,
  cluster = ~playlist_id + collection_date
)
summary(model_trend)


In [None]:
# Quadratic model
# Evolution of diversity over time with a squared month term. The linear
# month_index term enters through followers_class * month_index.
# The square is wrapped in I() because `^` inside a formula is a formula
# operator in base R (month_index^2 reduces to month_index); I() makes the
# arithmetic square explicit regardless of how the formula is parsed.
model_trend <- feols(
  diversity_norm ~ I(month_index^2) +
  nb_tracks + playlist_date + followers_class * month_index,
  data = df, cluster = c("playlist_id", "collection_date")
  )
summary(model_trend)


### Secondary models

#### Playlist curator

In [14]:
# Data management for the combined sample (df_combined)

# Extract numeric values
# Drop every character that is not a digit, a dot or a minus sign, then
# parse what is left as a number
df_combined$diversity_clean <- as.numeric(
    gsub("[^[:digit:].-]", "", df_combined$stirling_index)
)
df_combined$diversity_clean_2 <- as.numeric(
    gsub("[^[:digit:].-]", "", df_combined$stirling_index_2)
)

# Scale values (lessR rescale with kind = "z")
df_combined$diversity_norm <- rescale(diversity_clean, df_combined, kind = "z")
df_combined$diversity_norm_2 <- rescale(
    diversity_clean_2, df_combined, kind = "z"
)

# Log values (shifted by one before taking the log)
df_combined$diversity_log <- log(df_combined$diversity_clean + 1)
df_combined$diversity_log_2 <- log(df_combined$diversity_clean_2 + 1)

# Log Followers
df_combined$log_followers <- log(df_combined$playlist_followers)

# Followers class
# The cut points are the quartiles printed by summary() for this sample
summary(df_combined$playlist_followers)
df_combined$followers_class <- case_when(
    df_combined$playlist_followers <= 25463 ~ "low",
    df_combined$playlist_followers > 25463 &
    df_combined$playlist_followers <= 98114 ~ "mid_low",
    df_combined$playlist_followers > 98114 &
    df_combined$playlist_followers <= 267642 ~ "mid_high",
    df_combined$playlist_followers > 267642 ~ "high"
)
table(df_combined$followers_class)

# Playlist dates
# "frontline" when the mean track date falls within the 1.5 years before the
# collection date; everything else (including NA comparisons) is "backline".
df_combined <- df_combined %>%
    mutate(
        collection_date = as.Date(collection_date),
        mean_track_date = as.Date(mean_track_date)
    ) %>%
    mutate(
        playlist_date = case_when(
            # Subtracting a fractional-day duration from a Date can return a
            # date-time rather than a Date; convert the cutoff back to Date
            # so both sides of the comparison share one class.
            mean_track_date >
                as.Date(collection_date - dyears(1.5)) ~ "frontline",
            TRUE ~ "backline"
        )
    )

# Relevel factors
# "Major label" is the reference curator in the combined sample
df_combined$curator <- relevel(
    as.factor(df_combined$playlist_type), ref = "Major label"
)
table(df_combined$curator)

df_combined$playlist_date <- relevel(
    as.factor(df_combined$playlist_date), ref = "frontline"
)
table(df_combined$playlist_date)


   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    118   25463   98114  216607  267642 2300846 


    high      low mid_high  mid_low 
    4867     4867     4866     4867 


Major label      Charts   Editorial 
       9577         314        9576 


frontline  backline 
     6934     12533 

In [16]:
# Curator models on the combined sample, with collection-date fixed effects
# NOTE(review): standard errors are clustered on curator, which has only
# three levels -- confirm that cluster-robust inference is reliable here.

# k alpha Rao-Stirling
model_1 <- feols(
    fml = diversity_norm ~
        curator + nb_tracks + playlist_date + followers_class |
        collection_date,
    data = df_combined,
    cluster = ~curator
)
summary(model_1)

# k beta Rao-Stirling
model_2 <- feols(
    fml = diversity_norm_2 ~
        curator + nb_tracks + playlist_date + followers_class |
        collection_date,
    data = df_combined,
    cluster = ~curator
)
summary(model_2)


OLS estimation, Dep. Var.: diversity_norm
Observations: 19,467 
Fixed-effects: collection_date: 157
Standard-errors: Clustered (curator) 
                         Estimate Std. Error    t value  Pr(>|t|)    
curatorCharts            0.055879   0.062940   0.887804 0.4683139    
curatorEditorial         0.284458   0.067068   4.241357 0.0513454 .  
nb_tracks               -0.005921   0.000196 -30.227268 0.0010927 ** 
playlist_datebackline    0.033005   0.099715   0.330995 0.7721097    
followers_classlow       0.523766   0.040780  12.843828 0.0060074 ** 
followers_classmid_high  0.263281   0.028403   9.269577 0.0114387 *  
followers_classmid_low   0.684341   0.056623  12.086012 0.0067764 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
RMSE: 0.956257     Adj. R2: 0.077822
                 Within R2: 0.082281

OLS estimation, Dep. Var.: diversity_norm_2
Observations: 19,467 
Fixed-effects: collection_date: 157
Standard-errors: Clustered (curator) 
                         Estimate Std. Error    t value  Pr(>|t|)    
curatorCharts            0.172120   0.124036   1.387668 0.2996239    
curatorEditorial         0.203967   0.129277   1.577751 0.2553550    
nb_tracks               -0.012180   0.000452 -26.949724 0.0013740 ** 
playlist_datebackline   -0.061069   0.142461  -0.428670 0.7099179    
followers_classlow       0.502642   0.071557   7.024343 0.0196709 *  
followers_classmid_high  0.178477   0.016370  10.902525 0.0083082 ** 
followers_classmid_low   0.605874   0.085431   7.091970 0.0193083 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
RMSE: 0.852179     Adj. R2: 0.267614
                 Within R2: 0.272158

In [18]:
# Create a time index
df_combined <- df_combined %>%
  mutate(
    # Elapsed time since the earliest collection date
    time_index = as.numeric(collection_date - min(collection_date)),
    # Calendar month label, e.g. "2023-05"
    year_month = format(as.Date(collection_date), "%Y-%m"),
    # Rank of the month label among the months present in the data
    month_index = as.numeric(as.factor(year_month))
  )

# Time trend interacted with curator and with followers class; standard
# errors are two-way clustered on playlist and collection date.

# k alpha Rao-Stirling
model_trend <- feols(
  fml = diversity_norm ~
    month_index * curator +
    nb_tracks + playlist_date + followers_class * month_index,
  data = df_combined,
  cluster = ~playlist_id + collection_date
)
summary(model_trend)

# k beta Rao-Stirling (model_trend is reassigned)
model_trend <- feols(
  fml = diversity_norm_2 ~
    month_index * curator +
    nb_tracks + playlist_date + followers_class * month_index,
  data = df_combined,
  cluster = ~playlist_id + collection_date
)
summary(model_trend)


OLS estimation, Dep. Var.: diversity_norm
Observations: 19,467 
Standard-errors: Clustered (playlist_id & collection_date) 
                                     Estimate Std. Error   t value   Pr(>|t|)
(Intercept)                         -0.061989   0.399188 -0.155289 0.87684832
month_index                         -0.000041   0.006868 -0.006020 0.99520642
curatorCharts                        0.138280   0.294190  0.470037 0.63916115
curatorEditorial                     0.162084   0.290141  0.558638 0.57742414
nb_tracks                           -0.005876   0.002695 -2.180605 0.03111580
playlist_datebackline                0.032948   0.100848  0.326710 0.74444223
followers_classlow                   0.503439   0.254228  1.980267 0.04990565
followers_classmid_high              0.377796   0.139904  2.700393 0.00790204
followers_classmid_low               0.570963   0.147147  3.880221 0.00016911
month_index:curatorCharts           -0.004201   0.006629 -0.633791 0.52739379
month_index:curato

OLS estimation, Dep. Var.: diversity_norm_2
Observations: 19,467 
Standard-errors: Clustered (playlist_id & collection_date) 
                                     Estimate Std. Error   t value    Pr(>|t|)
(Intercept)                          0.556324   0.464892  1.196674 0.233735110
month_index                          0.002229   0.008098  0.275210 0.783616631
curatorCharts                        0.325156   0.336330  0.966778 0.335552268
curatorEditorial                     0.125968   0.352617  0.357237 0.721526415
nb_tracks                           -0.012160   0.002993 -4.062780 0.000085775
playlist_datebackline               -0.058656   0.094651 -0.619705 0.536598088
followers_classlow                   0.437118   0.310627  1.407213 0.161886099
followers_classmid_high              0.346331   0.118829  2.914525 0.004233508
followers_classmid_low               0.525614   0.203809  2.578954 0.011087425
month_index:curatorCharts           -0.008261   0.006731 -1.227335 0.222040432
month

#### Playlist followers

In [None]:
# Whole sample
# Log followers on a month trend interacted with editorial type; standard
# errors are two-way clustered on playlist and collection date.
model_trend <- feols(
  fml = log_followers ~ month_index * editorial_type,
  data = df,
  cluster = ~playlist_id + collection_date
)
summary(model_trend)


In [None]:
# Split the sample by curator
# Rows whose curator is not "Editorial" vs rows whose curator is "Editorial"
df_major <- filter(df, curator != "Editorial")
df_spotify <- filter(df, curator == "Editorial")

# Same follower trend specification, estimated on each sub-sample
model_trend_1 <- feols(
  fml = log_followers ~ month_index * editorial_type,
  data = df_major,
  cluster = ~playlist_id + collection_date
)
summary(model_trend_1)

model_trend_2 <- feols(
  fml = log_followers ~ month_index * editorial_type,
  data = df_spotify,
  cluster = ~playlist_id + collection_date
)
summary(model_trend_2)
