# AVA distribution for the recommended set

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Each experiment is described by a display name, a type tag and the folder
# that holds its JSON result files.
encode_config = dict(
    name='Autoencoder algorithm',
    type='encode',
    base_path='./encode_data',
)

social_graph_config = dict(
    name='Autoencoder + Social graph data',
    type='social_graph',
    base_path='./social_graph_data',
)

# NOTE(review): 'cotent_based' looks like a typo for 'content_based'; it is
# kept as is because the consumers of the 'type' value are not visible here.
content_based_config = dict(
    name='content-based data',
    type='cotent_based',
    base_path='./content_based_data',
)

# Folder where the generated figures are saved.
PLOT_PATH = '/root/work/rating analysis/plots'

# Experiments to compare, in plotting order.
experiment_configs = [encode_config, social_graph_config, content_based_config]

## Load data

In [None]:
# Load the artwork metadata table; find_metadata() selects rows from it by
# integer position.
df_metadata = pd.read_csv('/root/work/datasets/train_mayors_style_encoded_with_url.csv')
# Preview the first rows (notebook cell output).
df_metadata.head()

**Get pair (source artwork, rated artwork)**

In [None]:
import os
import json

In [None]:
def get_pairs(data_dict):
    '''
    Pair the source artwork id with the id of every similar artwork.

    data_dict => dict with a 'source_artwork' entry (holding an 'id') and a
                 'sim_artworks' list whose items each hold an 'id'
    returns => [(source id, similar artwork id), ... ] in 'sim_artworks' order
    '''
    source_id = data_dict['source_artwork']['id']
    return [(source_id, similar['id']) for similar in data_dict['sim_artworks']]

In [None]:
def get_subjects(base_path):
    '''
    Read every result file of an experiment folder into lists of artwork pairs.

    base_path => folder with one JSON file per subject; sub-folders are skipped
    returns => [[(source_artwork, target_artwork), ... ], ... ], one inner list
               per file, ordered by file name
    '''
    artworks_pair = []

    # os.listdir() returns entries in arbitrary order. Later cells select a
    # subject by its position in this list, so sort by file name to make that
    # position reproducible across runs and machines.
    for file_name in sorted(os.listdir(base_path)):
        file_path = os.path.join(base_path, file_name)
        if os.path.isdir(file_path):
            continue
        with open(file_path) as json_file:
            artworks_pair.append(get_pairs(json.load(json_file)))
    return artworks_pair

In [None]:
# Maps each experiment's display name to the list of subjects read from its
# folder; each subject is a list of (source artwork, target artwork) pairs.
artworks_pairs = {}

for config in experiment_configs:
    artworks_pair = get_subjects(config['base_path'])
    artworks_pairs[config['name']] = artworks_pair


In [None]:
# Number of loaded experiment configurations (notebook cell output).
len(artworks_pairs)

## Find metadata for the target artworks of each source id

In [None]:
def find_metadata(artworks_pair, df_metadata):
    '''
    Collect the metadata rows of the target artworks of every subject.

    artworks_pair => [[(source_artwork, target_artwork), ... ], ... ]
    df_metadata => dataframe whose rows are selected by integer position,
                   using the target artwork ids
    returns => list with one dataframe per subject, rows in pair order
    '''
    return [
        df_metadata.iloc[[target for _, target in pairs]]
        for pairs in artworks_pair
    ]


## Plot diversity

In [None]:
import matplotlib.pyplot as plt
import matplotlib.pylab as plb
import seaborn as sns

**Get the target metadata for each configuration to plot**

In [None]:
# Position (within each algorithm's list of subjects) of the subject to plot.
index = 0
# Maps algorithm name -> metadata rows of the artworks recommended for that subject.
targets_artworks_to_plot = {}

for name, artworks_pair in artworks_pairs.items():
    print(name)
    # Source artwork id, read from the first (source, target) pair of the subject.
    # NOTE(review): rebound on every iteration, so after the loop it holds the
    # value from the last algorithm only — presumably every algorithm has the
    # same source artwork at this position; confirm.
    reference_index = artworks_pair[index][0][0]
    targets_artworks_metadata = find_metadata(artworks_pair, df_metadata)
    df_target = targets_artworks_metadata[index]
    targets_artworks_to_plot[name] = df_target


**Auxiliary plot function**

In [None]:
def plot_distribution(df, index, axs, name, j, i):
    '''
    Draw a bar plot of per-group counts on one cell of a grid of axes.

    df => object whose count() returns a Series of counts (the plotting cell
          passes a column grouped by its own values)
    index => id of the reference artwork, shown in the title
    axs => 2-D array of matplotlib axes
    name => algorithm name, shown capitalized in the title
    j, i => row and column of the axes to draw on (axs[j, i])
    returns => the axes the bars were drawn on
    '''
    # Count once and reuse for both axes of the bar plot.
    counts = df.count()
    # Pass x/y by keyword: recent seaborn releases no longer accept them
    # positionally, and the keyword form also works on older releases.
    g = sns.barplot(x=counts.index, y=counts.values, ax=axs[j, i])
    g.set_xticklabels(g.get_xticklabels(), rotation=90)
    # Fixed y ticks (0, 2, ..., 18) so every subplot uses the same tick marks.
    g.yaxis.set_ticks(np.arange(0, 20, 2))
    g.set_title(name.capitalize() + ' \n Artwork id: '+ str(index), fontdict={'fontsize': 15, 'fontweight': 'medium'})
    return g

### Style and genre distribution

In [None]:
# One row of axes per feature, one column per algorithm.
nrows = 2
ncols = len(targets_artworks_to_plot)
fig, axs = plt.subplots(nrows=nrows, ncols=ncols, figsize=(16,10))
sns.set(style="whitegrid")

# i is the row (feature) index and j the column (algorithm) index into axs.
i = 0
j = 0

for feature in ['style', 'genre']:
    for name, df_target in targets_artworks_to_plot.items():
        # Group the recommended artworks by the feature value; plot_distribution
        # counts the rows of each group.
        df = df_target.groupby(feature)[feature]

        # plot_distribution's trailing parameters are (row, column), hence (i, j).
        g = plot_distribution(df, reference_index, axs, name, i, j)
        j += 1
    i += 1
    j = 0

        
fig.tight_layout()
plt.show()

In [None]:
# Save the style/genre distribution figure under PLOT_PATH.
fig.savefig(os.path.join(PLOT_PATH,"style_genre_dist.png"), dpi=100)

## Analyse diversity

**Base function to analyse different aspects of the recommended set**

In [None]:
def base_diversity_analysis(artworks_pairs, feature, analysis_function):
    '''
    Apply analysis_function to every recommended set of every algorithm.

    artworks_pairs => {name: [[(source_artwork, target_artwork), ... ], ... ], ... }
    feature => feature to analyze (e.g. style, genre)
    analysis_function => function to apply to the data, called as
                         analysis_function(df_target, feature)
    returns => dataframe with one row per (algorithm, subject) and the columns
               feature, 'algorithm name' and 'artwork reference index'

    The metadata is read from the module-level df_metadata dataframe.
    '''
    # Define columns data
    feature_target_list = []
    algorithm_name_list = []
    reference_index_list = []
    # Collect data
    for name, artworks_pair in artworks_pairs.items():
        print(name)
        # The metadata lookup does not depend on the subject, so do it once per
        # algorithm instead of once per subject.
        targets_artworks_metadata = find_metadata(artworks_pair, df_metadata)
        for pairs, df_target in zip(artworks_pair, targets_artworks_metadata):
            # Source artwork id, read from the first pair of the subject.
            reference_index = pairs[0][0]
            feature_target_list.append(analysis_function(df_target, feature))
            algorithm_name_list.append(name)
            reference_index_list.append(reference_index)

    # Put into a dataframe to plot
    data = {
        feature: feature_target_list,
        'algorithm name': algorithm_name_list,
        'artwork reference index': reference_index_list,
    }
    return pd.DataFrame(data)

In [None]:
def plot_diversity_analysis(df_data_to_plot, title):
    '''
    Draw the style and genre analyses side by side as grouped bar plots.

    df_data_to_plot => [style dataframe, genre dataframe], each with the columns
                       'artwork reference index', 'algorithm name' and the feature
    title => title prefix; the feature name is appended to it
    returns => the matplotlib figure holding both plots
    '''
    fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(12,8))
    sns.set(style="whitegrid")

    # The same position selects both the dataframe and the axes to draw on.
    for position, feature in enumerate(['style', 'genre']):
        ax = sns.barplot(
            x='artwork reference index',
            y=feature,
            hue='algorithm name',
            data=df_data_to_plot[position],
            ax=axs[position],
        )
        ax.set_title(title+feature, fontdict={'fontsize': 16, 'fontweight': 'medium'})

    fig.tight_layout()
    return fig

In [None]:
# Count of the most frequent feature value in one recommended set, divided by 25.
# NOTE(review): the divisor 25 is presumably the size of each recommended set — confirm.
# NOTE(review): this measures the most common value of the set, not an explicit
# comparison with the reference artwork's own value, although the plot title
# below reads as the latter — confirm.
same_feature_reference_artwork = lambda df,feature : df[feature].value_counts().max()/25

df_max_style = base_diversity_analysis(artworks_pairs, 'style', same_feature_reference_artwork)
df_max_genre = base_diversity_analysis(artworks_pairs, 'genre', same_feature_reference_artwork)
# Order matters: plot_diversity_analysis expects [style, genre].
df_data_to_plot = [df_max_style, df_max_genre]
fig = plot_diversity_analysis(df_data_to_plot, 'Percentage of the recommendations \n that match with the reference ')
plt.show()

In [None]:
# Save the specialization figure under PLOT_PATH.
fig.savefig(os.path.join(PLOT_PATH,"style_genre_specialization.png"), dpi=100)

In [None]:
# Number of distinct (non-null) feature values in one recommended set.
# The name is reused from the previous cell although the metric is different.
same_feature_reference_artwork = lambda df,feature : len(df[feature].value_counts())

df_max_style = base_diversity_analysis(artworks_pairs, 'style', same_feature_reference_artwork)
df_max_genre = base_diversity_analysis(artworks_pairs, 'genre', same_feature_reference_artwork)
# Order matters: plot_diversity_analysis expects [style, genre].
df_data_to_plot = [df_max_style, df_max_genre]
fig = plot_diversity_analysis(df_data_to_plot, 'Amount of the recommendations \n that vary from the reference ')
plt.show()

In [None]:
# Save the diversity figure under PLOT_PATH.
fig.savefig(os.path.join(PLOT_PATH,"style_genre_diversity.png"), dpi=100)