In [1]:
%load_ext autoreload
%autoreload 2

# Explore the columns of data and find outliers
import pandas as pd
import numpy as np
import os
import datetime
from typing import List, Tuple, Dict, Any, Union, Callable

import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np


In [2]:
# Locations of the charts CSV and the audio-features CSV (relative paths).
CHARTS_PATH = "../data/charts_processed.csv"
AUDIO_PATH = "../data/audio_features_processed.csv"

# When truthy, the loading cells re-read the CSV files from disk.
REFRESH_DATA = True


In [3]:
# (Re)load the charts when a refresh is requested, or when no `charts_df` exists yet
# (e.g. on a fresh kernel). Without the second condition, REFRESH_DATA = False would
# make the lines below fail with a NameError.
if REFRESH_DATA or 'charts_df' not in globals():
    # `parse_dates` asks pandas to parse the 'date' column while reading the CSV.
    charts_df = pd.read_csv(CHARTS_PATH, parse_dates=['date'])

# Guarantee a datetime64 'date' column even when the frame was reused from memory;
# normally a no-op right after a fresh load.
charts_df['date'] = pd.to_datetime(charts_df['date'])

# Display the first few rows of the DataFrame to verify
charts_df.head()

In [None]:
# (Re)load the audio features when a refresh is requested, or when no `audio_df`
# exists yet (e.g. on a fresh kernel), so later cells never hit a NameError.
if REFRESH_DATA or 'audio_df' not in globals():
    # Load the audio features dataset and rename 'id' to 'track_id' so the key
    # column has the same name as the one used for the charts data.
    audio_df = pd.read_csv(AUDIO_PATH).rename(columns={'id': 'track_id'})
    print(audio_df.head())


   acousticness  danceability  duration_ms  energy  instrumentalness  key  \
0       0.18700         0.852       195840   0.773          0.000030    8   
1       0.00431         0.663       259196   0.920          0.000017   11   
2       0.40000         0.761       222560   0.838          0.000000    4   
3       0.55100         0.508       205600   0.687          0.000003    0   
4       0.07600         0.899       234320   0.626          0.000000    6   

   liveness  loudness  mode  speechiness    tempo  time_signature  valence  \
0    0.1590    -2.921     0       0.0776  102.034               4    0.907   
1    0.1010    -4.070     0       0.2260   99.935               4    0.533   
2    0.1760    -3.073     0       0.0502   93.974               4    0.710   
3    0.1260    -4.361     1       0.3260  180.044               4    0.555   
4    0.0631    -4.228     0       0.2920   88.007               4    0.873   

                 track_id  
0  6mICuAdrwEjh6Y6lroV2Kg  
1  7DM4BPaS7

In [None]:
# For now only restrict the dataset to top200 charts for stream/ranking analysis.
# The filter and the derived column are built in one chain so that no column is
# assigned onto a filtered slice (which triggers pandas' chained-assignment warning).
charts_df = (
    charts_df
    .loc[charts_df["chart"] == "top200"]
    # track_id is the last "/"-separated segment of the track URL.
    .assign(track_id=lambda df: df["url"].str.rsplit("/", n=1).str[-1])
)

## Topic : Regional Correlations
In the previous notebook, a popularity metric was defined for each track, and we explored the correlation between audio features and the songs' popularity. This notebook investigates, for every region, how the popularity of its top songs correlates with their audio features.


In [14]:
import sys
import tqdm
sys.path.append("..")
from source.utils.regions import get_regional_charts_delta_rank, get_charts_by_region,calculate_popularity_metrics
from scipy.stats import kendalltau, spearmanr, pearsonr
def get_region_correlations(region : str,
                            date : Union[str, Tuple[str,str]],
                            charts_df : pd.DataFrame,
                            audio_df : pd.DataFrame,
                            delta_k : int = 200,
                            correlation_column : str = 'weighted_popularity'
                            ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Correlate every audio feature with a popularity column for one region.

    The region's charts are passed through ``calculate_popularity_metrics``; the
    result must contain 'track_id', 'popularity' and 'average_stream_proportion'.
    A 'weighted_popularity' column (popularity * average_stream_proportion) is
    added, the rows are joined with ``audio_df`` on 'track_id', and only the
    first row per track is kept.

    Parameters
    ----------
    region : region name forwarded to ``get_charts_by_region``.
    date : a date or a (start, end) pair, forwarded to ``calculate_popularity_metrics``.
    charts_df : charts data containing the region's rows.
    audio_df : audio features with a 'track_id' column; every other column is
        treated as a feature to correlate.
    delta_k : forwarded unchanged to ``calculate_popularity_metrics``.
    correlation_column : column of the popularity table to correlate against.

    Returns
    -------
    (pearson, spearman, kendall) : three DataFrames indexed by feature name with
        columns 'corr' and 'p-value'.

    Notes
    -----
    Tracks with no match in ``audio_df`` come out of the left join as NaN rows.
    Rows with a missing value are dropped per feature before correlating, so a
    single unmatched track does not turn every coefficient into NaN. A feature
    with fewer than two valid rows gets NaN for both values instead of raising.
    """
    region_metrics = calculate_popularity_metrics(
        get_charts_by_region(charts_df, region),
        date,
        delta_k = delta_k,
    )
    # `assign` returns a new frame, so the helper's result is not mutated in place.
    region_metrics = region_metrics.assign(
        weighted_popularity=region_metrics['popularity'] * region_metrics['average_stream_proportion']
    )

    joined = (
        region_metrics[['track_id', correlation_column]]
        .join(audio_df.set_index('track_id'), on='track_id')
        .drop_duplicates(subset="track_id")
    )

    feature_columns = [col for col in audio_df.columns if col != 'track_id']

    # One result store per method, kept in the same order as the returned tuple.
    methods = (pearsonr, spearmanr, kendalltau)
    stores = tuple({} for _ in methods)

    for col in feature_columns:
        # Keep only rows where both the target and this feature are present.
        pair = joined[[correlation_column, col]].dropna()
        target, feature = pair[correlation_column], pair[col]
        for method, store in zip(methods, stores):
            if len(pair) < 2:
                # A correlation is undefined for fewer than two observations.
                store[col] = (float('nan'), float('nan'))
            else:
                statistic, p_value = method(target, feature)
                store[col] = (statistic, p_value)

    # Each store maps feature -> (corr, p-value); transpose so features become rows.
    pearson_df, spearman_df, kendall_df = (
        pd.DataFrame(store, index=['corr', 'p-value']).T for store in stores
    )

    return pearson_df, spearman_df, kendall_df

# Settings shared by every per-region run.
COI = "weighted_popularity"
delta_k = 200

# Maps region name -> the tuple returned by get_region_correlations.
corrs_dict = {}
region_progress = tqdm.tqdm(charts_df['region'].unique(), desc="Processing regions")
for unique_region in region_progress:
    corrs = get_region_correlations(
        unique_region,
        ("2017-01-01", "2022-01-01"),
        charts_df,
        audio_df,
        delta_k=delta_k,
        correlation_column=COI,
    )
    corrs_dict[unique_region] = corrs

Processing regions: 100%|██████████| 69/69 [01:57<00:00,  1.70s/it]


In [15]:
# Persist the per-region correlation results to a pickle file in the working directory.
import pickle

pickle_name = f'correlations_2017_2022_{COI}_with_p_value.pickle'
with open(pickle_name, 'wb') as out_file:
    pickle.dump(corrs_dict, out_file, protocol=pickle.HIGHEST_PROTOCOL)


In [27]:
# Kendall tau table for the "Global" region: get_region_correlations returns
# (Pearson, Spearman, Kendall), so index 2 selects the Kendall results.
corrs_dict["Global"][2]

Unnamed: 0,corr,p-value
acousticness,0.021045,0.006124759
danceability,0.060314,4.131393e-15
duration_ms,0.010818,0.1586804
energy,0.005494,0.4744642
instrumentalness,-0.042096,6.067499e-07
key,0.008875,0.2683236
liveness,-0.026198,0.0006545478
loudness,0.066277,5.872479e-18
mode,-0.008098,0.3888851
speechiness,-0.031093,5.143103e-05


In [92]:
# Collect, for every region, the Spearman coefficient and p-value of each audio feature.
# Element 1 of each stored tuple is the Spearman table (order: Pearson, Spearman, Kendall).
spearman_corrs = {region: tables[1]["corr"] for region, tables in corrs_dict.items()}
spearman_corrs_p = {region: tables[1]["p-value"] for region, tables in corrs_dict.items()}

# Rows are audio features, columns are regions.
spearman_df_corrs = pd.concat(spearman_corrs, axis=1)
spearman_df_corrs_p = pd.concat(spearman_corrs_p, axis=1)

# Smallest / largest coefficient per feature, and the region in which it occurs.
min_corr, max_corr = spearman_df_corrs.min(axis=1), spearman_df_corrs.max(axis=1)
min_corr_idx, max_corr_idx = spearman_df_corrs.idxmin(axis=1), spearman_df_corrs.idxmax(axis=1)

# p-value that belongs to each extreme: look up (feature, region) in the p-value table.
min_corr_p_values_list = [
    spearman_df_corrs_p.loc[feature, region] for feature, region in min_corr_idx.items()
]
max_corr_p_values_list = [
    spearman_df_corrs_p.loc[feature, region] for feature, region in max_corr_idx.items()
]

# Summary table, one row per audio feature. The name `empty_df` is kept in case
# later cells reference it.
empty_df = pd.DataFrame(
    {
        'max_corr_idx': max_corr_idx,
        'max_corr': max_corr,
        'max_corr_p_values': max_corr_p_values_list,
        'min_corr_idx': min_corr_idx,
        'min_corr': min_corr,
        'min_corr_p_values': min_corr_p_values_list,
    },
    index=spearman_df_corrs.index,
)

# Sort while the coefficients are still numeric. Sorting after string formatting
# is lexicographic and misorders negative values (e.g. "-0.200" before "-0.100").
empty_df = empty_df.sort_values(by="max_corr", ascending=False)

# Format for display: coefficients as :.3f, p-values as :.3e
for column in ('min_corr', 'max_corr'):
    empty_df[column] = empty_df[column].map(lambda x: f"{x:.3f}")
for column in ('min_corr_p_values', 'max_corr_p_values'):
    empty_df[column] = empty_df[column].map(lambda x: f"{x:.3e}")

empty_df

Unnamed: 0,max_corr_idx,max_corr,max_corr_p_values,min_corr_idx,min_corr,min_corr_p_values
loudness,Nicaragua,0.384,8.911999999999999e-73,Indonesia,-0.075,1.739e-06
danceability,Nicaragua,0.342,2.861e-57,Switzerland,0.014,0.139
speechiness,Nicaragua,0.284,2.321e-39,Morocco,-0.103,4.592e-08
energy,Nicaragua,0.255,7.876e-32,Indonesia,-0.122,8.281e-15
valence,Brazil,0.21,3.966e-49,Indonesia,-0.035,0.02474
acousticness,Indonesia,0.124,2.379e-15,Luxembourg,-0.159,9.734e-10
time_signature,Chile,0.098,3.249e-11,Morocco,-0.013,0.5076
tempo,Brazil,0.096,2.39e-11,Netherlands,-0.042,5.754e-05
duration_ms,Spain,0.083,2.839e-10,Brazil,-0.188,1.4769999999999998e-39
liveness,Brazil,0.068,2.706e-06,Philippines,-0.079,5.12e-07
