In [None]:
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
import os
import plotly.express as px


In [None]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): a blanket "ignore" also hides deprecation warnings from the
# plotting/data libraries — consider narrowing the filter.
warnings.filterwarnings("ignore")

In [None]:
# Apply seaborn's default theme to the figures drawn in the following cells.
sns.set_theme()

In [None]:
# Figure 1: for every (dataset, country) pair, the share of streams whose
# track country equals the country of the dataset.
# `proportion_local_datasets` is read again by later cells to draw the dataset
# baselines, so its name and column names must not change.
proportion_local_datasets = []

for dataset in ['DEEZER', 'LFM']:
    # The track metadata only depends on the dataset: read it once per
    # dataset instead of once per country.
    metadata = pd.read_csv('dataset/metadata_' + dataset + '.csv')[['media_id', 'country']]

    for country in ['DE', 'BR', 'FR']:
        f = dataset + '_' + country
        # original data
        df = pd.read_csv('dataset/' + f + '/' + f + '.inter')
        df = df.rename(columns={'user_id:token': 'user_id', 'item_id:token': 'media_id'})
        # Left join: streams without a metadata row are kept with a NaN country.
        df = pd.merge(df, metadata, on=['media_id'], how='left')
        # value_counts(normalize=True) leaves NaN countries out of the
        # denominator; a KeyError here means no stream is local at all.
        local_share = df['country'].value_counts(normalize=True)[country]
        proportion_local_datasets.append([dataset, country, local_share])

proportion_local_datasets = pd.DataFrame(proportion_local_datasets, columns=['Dataset', 'Country', 'Proportion of local streams'])
sns.barplot(proportion_local_datasets, x='Country', y='Proportion of local streams', hue='Dataset')
plt.legend(title='')
plt.xlabel('')
plt.savefig('figures/1.pdf')

In [None]:
# Dataset - Figure 2.1: Distribution of local streams proportion for each user,
# one figure per dataset (DEEZER, then LFM), one curve per country.

for dataset in ['DEEZER', 'LFM']:

    # The track metadata only depends on the dataset: read it once per dataset.
    metadata = pd.read_csv('dataset/metadata_' + dataset + '.csv')[['media_id', 'country']]

    per_country = []

    for country in ['DE', 'FR', 'BR']:

        # original data
        f = dataset + '_' + country
        df = pd.read_csv('dataset/' + f + '/' + f + '.inter')
        df = df.rename(columns={'user_id:token': 'user_id', 'item_id:token': 'media_id'})
        df = pd.merge(df, metadata, on=['media_id'], how='left')

        # Per-user share of local streams = mean of the boolean "is local" flag.
        # A stream whose track country is unknown (NaN) compares unequal and
        # therefore counts as non-local, but still counts in the user's total.
        ratio_local = (df['country'] == country).groupby(df['user_id']).mean()
        per_country.append(pd.DataFrame({
            'Country': country,
            'Proportion of local streams per user': ratio_local.to_numpy(),
        }))

    res = pd.concat(per_country, ignore_index=True)
    sns.histplot(
        data=res,
        x='Proportion of local streams per user',
        hue='Country',
        bins=10,
        stat='probability',
        element="poly",
        # The legend is hidden on the DEEZER figure. The original check compared
        # against 'deezer' (lower case) and therefore never fired.
        legend=(dataset != 'DEEZER'),
    )
    plt.xlabel('Proportion of local streams per user \n\n (' + dataset + ')')
    plt.xticks(np.arange(0, 1, step=0.2))
    # Save BEFORE show(): with the inline backend show() finalises the figure,
    # so a later savefig() would write an empty page.
    plt.savefig('figures/2-' + dataset + '.pdf')
    plt.show()
    plt.close()
    

In [None]:
# Share of local streams for FR / DE / BR users of the LFM_GLOBAL dataset,
# compared with the share of local tracks in each model's recommendations
# for the same users.
proportion_local = []

# None of the files below depends on the country: read each of them once.
demo = pd.read_csv('dataset/LFM_GLOBAL/demo.txt', delimiter='\t', names=['user_country', 'age', 'sex', 'date'])

# original data
interactions = pd.read_csv('dataset/LFM_GLOBAL/LFM_GLOBAL.inter')
interactions = interactions.rename(columns={'user_id:token': 'user_id', 'item_id:token': 'media_id'})

metadata = pd.read_csv('dataset/LFM_GLOBAL/tracks.txt', delimiter='\t', names=['artist', 'title', 'country'])
# The media id of a track is the index label of its row in tracks.txt.
metadata['media_id'] = metadata.index

# recommendation results
predictions = {}
for model in ['NeuMF', 'ItemKNN']:
    # NOTE(review): os.listdir() order is arbitrary; this presumably relies on
    # a single prediction file per model directory — confirm.
    filename = os.listdir('predicted/LFM/GLOBAL/' + model)[0]
    predictions[model] = pd.read_csv('predicted/LFM/GLOBAL/' + model + '/' + filename)

for country in ['FR', 'DE', 'BR']:

    # User ids are the index labels of the demo.txt rows of that country.
    users = demo.index[demo['user_country'] == country].unique().to_list()

    df = interactions[interactions['user_id'].isin(users)]
    df = pd.merge(df, metadata, on=['media_id'], how='left')
    proportion_local.append([country,
                             df['country'].value_counts(normalize=True)[country],
                             'dataset'])

    for model, predicted in predictions.items():
        df2 = predicted[predicted['user_id'].isin(users)]
        df2 = pd.merge(df2, metadata, on=['media_id'], how='left')

        proportion_local.append([country,
                                 df2['country'].value_counts(normalize=True)[country],
                                 model])

proportion_local = pd.DataFrame(proportion_local, columns=['Country', '% local streams', 'data'])
sns.barplot(proportion_local, x='Country', y='% local streams', hue='data')
# NOTE(review): the other cells save under 'figures/'; confirm this relative path is intended.
plt.savefig('../../figures/3-1.pdf')

In [None]:
# LFM per-country datasets: share of local streams in the data vs. share of
# local tracks in the recommendations of each model.
proportion_local = []

dataset = 'LFM'

# The metadata file is the same for every country: read it once.
metadata = pd.read_csv('dataset/metadata_' + dataset + '.csv')[['media_id', 'country']]

for country in ['FR', 'DE', 'BR']:
    # original data
    f = dataset + '_' + country
    df = pd.read_csv('dataset/' + f + '/' + f + '.inter')
    df = df.rename(columns={'user_id:token': 'user_id', 'item_id:token': 'media_id'})
    df = pd.merge(df, metadata, on=['media_id'], how='left')
    proportion_local.append(['dataset',
                             country,
                             df['country'].value_counts(normalize=True)[country]])
    # recommendation results
    for model in ['NeuMF', 'ItemKNN']:
        # Here the predictions are a single '<model>.csv' file per country.
        df2 = pd.read_csv('predicted/' + dataset + '/' + country + '/' + model + '.csv')
        df2 = pd.merge(df2, metadata, on=['media_id'], how='left')
        proportion_local.append([model,
                                 country,
                                 df2['country'].value_counts(normalize=True)[country]])

proportion_local = pd.DataFrame(proportion_local, columns=['Data', 'Country', '% local streams'])
sns.barplot(proportion_local, x='Country', y='% local streams', hue='Data')

In [None]:
# DEEZER per-country datasets: share of local streams in the data vs. share of
# local tracks in the recommendations of each model.
proportion_local = []

dataset = 'DEEZER'

# The metadata file is the same for every country: read it once.
metadata = pd.read_csv('dataset/metadata_' + dataset + '.csv')[['media_id', 'country']]

for country in ['DE', 'FR', 'BR']:
    # original data
    f = dataset + '_' + country
    df = pd.read_csv('dataset/' + f + '/' + f + '.inter')
    df = df.rename(columns={'user_id:token': 'user_id', 'item_id:token': 'media_id'})
    df = pd.merge(df, metadata, on=['media_id'], how='left')
    proportion_local.append(['dataset',
                             country,
                             df['country'].value_counts(normalize=True)[country]])
    # recommendation results
    for model in ['NeuMF', 'ItemKNN']:
        model_dir = 'predicted/' + dataset + '/' + country + '/' + model
        # NOTE(review): os.listdir() order is arbitrary; this presumably relies
        # on a single prediction file per directory — confirm.
        filename = os.listdir(model_dir)[0]
        df2 = pd.read_csv(model_dir + '/' + filename)
        df2 = pd.merge(df2, metadata, on=['media_id'], how='left')
        proportion_local.append([model,
                                 country,
                                 df2['country'].value_counts(normalize=True)[country]])

proportion_local = pd.DataFrame(proportion_local, columns=['Data', 'Country', '% local streams'])
sns.barplot(proportion_local, x='Country', y='% local streams', hue='Data')

### Testing which LFM global to choose

In [None]:
# Testing which LFM global to choose
#
# For each variant of the global LFM dataset (GLOBAL, GLOBAL2, GLOBAL3) and each
# country, compute the share of local streams in the data and in the
# recommendations of each model, restricted to the users of that country.

proportion_local = []

# The version loop is the outer one so that the files of a version are read
# once rather than once per country. Rows are therefore grouped by version;
# within a version the country order (FR, DE, BR) is unchanged.
for global_version in ['GLOBAL', 'GLOBAL2', 'GLOBAL3']:

    version_dir = 'dataset/LFM_' + global_version

    demo = pd.read_csv(version_dir + '/demo.txt', delimiter='\t', names=['user_country', 'age', 'sex', 'date'])

    # original data
    interactions = pd.read_csv(version_dir + '/LFM_' + global_version + '.inter')
    interactions = interactions.rename(columns={'user_id:token': 'user_id', 'item_id:token': 'media_id'})

    metadata = pd.read_csv(version_dir + '/tracks.txt', delimiter='\t', names=['artist', 'title', 'country'])
    # The media id of a track is the index label of its row in tracks.txt.
    metadata['media_id'] = metadata.index

    # recommendation results
    predictions = {}
    for model in ['NeuMF', 'ItemKNN']:
        model_dir = 'predicted/LFM/' + global_version + '/' + model
        # NOTE(review): os.listdir() order is arbitrary; presumably one file per directory — confirm.
        filename = os.listdir(model_dir)[0]
        predictions[model] = pd.read_csv(model_dir + '/' + filename)

    for country in ['FR', 'DE', 'BR']:

        # User ids are the index labels of the demo.txt rows of that country.
        users = demo.index[demo['user_country'] == country].unique().to_list()

        df = interactions[interactions['user_id'].isin(users)]
        df = pd.merge(df, metadata, on=['media_id'], how='left')
        proportion_local.append([country,
                                 df['country'].value_counts(normalize=True)[country],
                                 'dataset',
                                 global_version
                                 ])

        for model, predicted in predictions.items():
            df2 = predicted[predicted['user_id'].isin(users)]
            df2 = pd.merge(df2, metadata, on=['media_id'], how='left')

            proportion_local.append([country,
                                     df2['country'].value_counts(normalize=True)[country],
                                     model,
                                     global_version
                                     ])

proportion_local = pd.DataFrame(proportion_local, columns=['Country', '% local streams', 'data', 'version'])

In [None]:
# Bar chart restricted to the rows of the 'GLOBAL' version.
is_global = proportion_local['version'] == 'GLOBAL'
sns.barplot(proportion_local[is_global], x='Country', y='% local streams', hue='data')

In [None]:
# Bar chart restricted to the rows of the 'GLOBAL2' version.
is_global2 = proportion_local['version'] == 'GLOBAL2'
sns.barplot(proportion_local[is_global2], x='Country', y='% local streams', hue='data')

In [None]:
# Bar chart restricted to the rows of the 'GLOBAL3' version.
is_global3 = proportion_local['version'] == 'GLOBAL3'
sns.barplot(proportion_local[is_global3], x='Country', y='% local streams', hue='data')

In [None]:
# We keep dataset 1 because its NeuMF results for Brazil reflect the results of the ISMIR paper.

### Other

In [None]:
def extract_top_k_reco(df, k):
    """Return the first ``k`` rows of every user in ``df``.

    The rank of a row is its 1-based position among the rows sharing the same
    ``user_id``, in the order the rows appear in ``df``.
    NOTE(review): this presumes each user's rows are already ordered from best
    to worst recommendation — confirm against the prediction files.

    Unlike the previous implementation, the input frame is not modified (no
    temporary ``rank`` column is written to it), users may have different
    numbers of rows, and an empty frame is accepted.

    Parameters
    ----------
    df : pandas.DataFrame
        Recommendations; must contain a ``user_id`` column.
    k : int
        Maximum number of rows kept per user.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose per-user rank is at most ``k``, with the
        original index and columns.
    """
    # cumcount() numbers the rows of each user 0, 1, 2, ... in order of appearance.
    rank = df.groupby('user_id').cumcount() + 1
    return df[rank <= k]

In [None]:
# Share of local tracks among the top-k recommendations, for k = 10, 15, ..., 100,
# computed for every dataset, country, model and prediction file (run).
# `proportion_local` keeps one row per run; `proportion_local_agg` holds the
# mean and standard deviation over the runs.

# Maximum number of prediction files used per (dataset, country, model).
N_RUNS = 7

proportion_local = []


for dataset in ["LFM", "DEEZER"]:

    # The track metadata only depends on the dataset: read it once per dataset.
    metadata = pd.read_csv("dataset/metadata_" + dataset + ".csv")[
        ["media_id", "country"]
    ]

    for country in ["FR", "DE", "BR"]:

        # recommendation results
        for model in ["NeuMF", "ItemKNN"]:
            model_dir = "predicted/" + dataset + "/" + country + "/" + model
            # File names in reverse lexicographic order; only the first N_RUNS
            # are used (fewer if the directory holds fewer files).
            filenames = sorted(os.listdir(model_dir), reverse=True)

            for filename in filenames[:N_RUNS]:
                path = model_dir + "/" + filename
                print(path)
                df2 = pd.read_csv(path)
                df2 = pd.merge(df2, metadata, on=["media_id"], how="left")

                for k in range(10, 101, 5):

                    # KeyError here means no local track at all in the top k.
                    proportion_local_value = extract_top_k_reco(df2, k)[
                        "country"
                    ].value_counts(normalize=True)[country]
                    proportion_local.append(
                        [dataset, model, country, proportion_local_value, k]
                    )


proportion_local = pd.DataFrame(
    proportion_local, columns=["Data", "Model", "Country", "% local streams", "k"]
)

proportion_local_agg = (
    proportion_local.groupby(["Data", "Model", "Country", "k"])
    .agg({"% local streams": ["mean", "std"]})
    .reset_index()
)

# Replace the two-level column index produced by agg() with flat names.
proportion_local_agg.columns = [
    "Data",
    "Model",
    "Country",
    "k",
    "% local streams_mean",
    "% local streams_std",
]

In [None]:
import plotly.express as px


# Mean share of local tracks in the top-k recommendations as a function of k
# (error bars: standard deviation over runs), with the share observed in each
# country's dataset drawn as a dashed horizontal reference line.
DATASET = "LFM"

fig = px.line(
    proportion_local_agg[proportion_local_agg["Data"] == DATASET].sort_values(by="k"),
    color="Country",
    y="% local streams_mean",
    error_y="% local streams_std",
    x="k",
    line_dash="Model",
    template="none",
    height=700,
    width=1000,
    title=DATASET,
)

dataset_baselines = proportion_local_datasets[
    proportion_local_datasets["Dataset"] == DATASET
]

# One reference line per country. The "FR dataset" line previously looked up
# the BR value (copy-paste slip); each line now uses its own country.
# NOTE(review): these colour names are hard-coded and not tied to the colours
# plotly assigns to the country curves — confirm they match.
for country, line_color in [("BR", "blue"), ("DE", "orange"), ("FR", "green")]:
    fig.add_hline(
        y=dataset_baselines[dataset_baselines["Country"] == country][
            "Proportion of local streams"
        ].values[0],
        line_dash="dash",
        line_color=line_color,
        annotation_text=country + " dataset",
        annotation_position="bottom right",
    )

fig.show()

In [None]:
import plotly.express as px


# Mean share of local tracks in the top-k recommendations as a function of k
# (error bars: standard deviation over runs), with the share observed in each
# country's dataset drawn as a dashed horizontal reference line.
DATASET = "DEEZER"

fig = px.line(
    proportion_local_agg[proportion_local_agg["Data"] == DATASET].sort_values(by="k"),
    color="Country",
    y="% local streams_mean",
    error_y="% local streams_std",
    x="k",
    line_dash="Model",
    template="none",
    height=700,
    width=1000,
    title=DATASET,
)

dataset_baselines = proportion_local_datasets[
    proportion_local_datasets["Dataset"] == DATASET
]

# One reference line per country. The "FR dataset" line previously looked up
# the BR value (copy-paste slip); each line now uses its own country.
# NOTE(review): these colour names are hard-coded and not tied to the colours
# plotly assigns to the country curves — confirm they match.
for country, line_color in [("BR", "blue"), ("DE", "orange"), ("FR", "green")]:
    fig.add_hline(
        y=dataset_baselines[dataset_baselines["Country"] == country][
            "Proportion of local streams"
        ].values[0],
        line_dash="dash",
        line_color=line_color,
        annotation_text=country + " dataset",
        annotation_position="bottom right",
    )

fig.show()

In [None]:
# Display the current contents of `proportion_local` for inspection.
proportion_local

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Share of local tracks in the top-k recommendations as a function of k, one
# curve per (country, model), with each country's dataset share drawn as a
# dashed horizontal line.
# Uses the `proportion_local` (per-run values) and `proportion_local_datasets`
# dataframes built in earlier cells.

DATASET = "LFM"

# Filter data for the specific dataset
filtered_data = proportion_local[proportion_local["Data"] == DATASET].sort_values(by="k")

sns.set_style('whitegrid')

# Fix the country -> colour and model -> marker assignment explicitly. Left to
# seaborn, it follows the order of first appearance in the data, which the
# sort above does not guarantee, so curves and reference lines could disagree.
countries = ["FR", "BR", "DE"]
country_colors = dict(zip(countries, sns.color_palette(n_colors=len(countries))))

# Create the line plot using Seaborn
plt.figure(figsize=(12, 7))
sns.lineplot(
    data=filtered_data,
    x="k",
    y="% local streams",
    hue="Country",
    hue_order=countries,
    palette=country_colors,
    style="Model",
    style_order=["NeuMF", "ItemKNN"],
    markers=True,
    dashes=False,
    err_style='band',
)

# Add one dashed reference line and its label per country, in the same colour
# as that country's curves.
for country in countries:
    dataset_share = proportion_local_datasets[
        (proportion_local_datasets["Dataset"] == DATASET)
        & (proportion_local_datasets["Country"] == country)
    ]["Proportion of local streams"].values[0]
    plt.axhline(y=dataset_share, color=country_colors[country], linestyle='--')
    plt.text(115, dataset_share, country + " dataset", color=country_colors[country], ha='right')

# Set title and labels
plt.title(f"{DATASET} data : % of local streams recommended by the tested models, over k in top_k recommendation")
plt.xlabel('k')
plt.ylabel('% local streams_mean')
plt.legend(loc='center left', bbox_to_anchor=(1, .9))

plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Share of local tracks in the top-k recommendations as a function of k, one
# curve per (country, model), with each country's dataset share drawn as a
# dashed horizontal line.
# Uses the `proportion_local` (per-run values) and `proportion_local_datasets`
# dataframes built in earlier cells.

DATASET = "DEEZER"

# Filter data for the specific dataset
filtered_data = proportion_local[proportion_local["Data"] == DATASET].sort_values(by="k")

sns.set_style('whitegrid')

# Fix the country -> colour and model -> marker assignment explicitly. Left to
# seaborn, it follows the order of first appearance in the data, which the
# sort above does not guarantee, so curves and reference lines could disagree.
countries = ["FR", "BR", "DE"]
country_colors = dict(zip(countries, sns.color_palette(n_colors=len(countries))))

# Create the line plot using Seaborn
plt.figure(figsize=(12, 7))
sns.lineplot(
    data=filtered_data,
    x="k",
    y="% local streams",
    hue="Country",
    hue_order=countries,
    palette=country_colors,
    style="Model",
    style_order=["NeuMF", "ItemKNN"],
    markers=True,
    dashes=False,
    err_style='band',
)

# Add one dashed reference line and its label per country, in the same colour
# as that country's curves.
for country in countries:
    dataset_share = proportion_local_datasets[
        (proportion_local_datasets["Dataset"] == DATASET)
        & (proportion_local_datasets["Country"] == country)
    ]["Proportion of local streams"].values[0]
    plt.axhline(y=dataset_share, color=country_colors[country], linestyle='--')
    plt.text(115, dataset_share, country + " dataset", color=country_colors[country], ha='right')

# Set title and labels
plt.title(f"{DATASET} data : % of local streams recommended by the tested models, over k in top_k recommendation")
plt.xlabel('k')
plt.ylabel('% local streams_mean')
plt.legend(loc='center left', bbox_to_anchor=(1, .8))

plt.show()
