# Low-Rank and Linear Spectral Matrix Completion for Playlist Recommendation

In [None]:
# -*- coding: utf8 -*-
import sys
import os
import time
import operator
import numpy as np
import scipy as sp
import pandas as pd
import networkx as nx
import matplotlib as mpl
import matplotlib.pyplot as plt
import scipy.io
import scipy.sparse
import itertools
import random
import community
import IPython.utils.path
import cPickle as pickle 
from sklearn.cross_validation import train_test_split


%matplotlib inline
mpl.rcParams['axes.edgecolor'] = 'grey'
mpl.rcParams['grid.color'] = '#66CCCC'
mpl.rcParams['text.color'] = '#0EBFE9'
mpl.rcParams['xtick.color'] = '#66CCCC'
mpl.rcParams['ytick.color'] = '#66CCCC'
mpl.rcParams['axes.labelcolor'] = '#0EBFE9'

import recog 

%load_ext autoreload
%autoreload 2

# reload(sys)  # Reload does the trick!
# sys.setdefaultencoding('UTF8')

# pd.options.display.encoding = 'utf-8'

In [None]:
import bokeh.plotting as bp
from bokeh.palettes import brewer
bp.output_notebook()

# Test on real data

In [None]:
def plot_mat(x, title, fig=(512, 200), cmap='Greys', reverse=False, nb_colors=9):
    """Show the array `x` as a bokeh image.

    Parameters
    ----------
    x : array-like exposing ``shape[0]`` and ``shape[1]``
        Data to draw; ``shape[1]`` spans the x axis and ``shape[0]`` the y axis.
    title : str
        Figure title (rendered bold, in red).
    fig : (width, height)
        Plot size passed to ``bp.figure``.
    cmap : str
        Name of a palette family in ``bokeh.palettes.brewer``.
    reverse : bool
        When true, the palette is used in reverse order.
    nb_colors : int
        Which size variant of the brewer palette to pick.
    """
    n_rows = x.shape[0]
    n_cols = x.shape[1]
    plot = bp.figure(plot_width=fig[0], plot_height=fig[1],
                     x_range=[0, n_cols], y_range=[0, n_rows])

    palette = brewer[cmap][nb_colors]
    palette = palette[::-1] if reverse else palette

    # The image is anchored at the origin and stretched over the whole range.
    plot.image(image=[x], x=[0], y=[0], dw=[n_cols], dh=[n_rows], palette=palette)

    plot.title = title
    plot.title_text_color = 'red'
    plot.title_text_font_style = 'bold'
    bp.show(plot)

In [None]:
# DATA_DIR = os.path.join(IPython.utils.path.get_home_dir(), 'data/aotmv2/')
DATA_DIR = os.path.join( IPython.utils.path.get_home_dir(), 'local/aotmv2/')
print 'Data directory:', DATA_DIR

DATASET_NAME = 'aotm'
MAX_PROCESS = 8
song_id_key = 'aotm_id'
playlist_id_key = 'mix_id'
playlist_cat_key = 'playlist_category'

In [None]:
FULL_SONGS = pd.read_hdf(os.path.join(DATA_DIR, DATASET_NAME + '_songs.h5'), 'data')
FULL_SONGS.rename(columns={'temporal_echonest_features': 'ten'}, inplace=True)
FULL_PLAYLISTS = pd.read_hdf(os.path.join(DATA_DIR, DATASET_NAME + '_playlists.h5'), 'data')
FULL_MIXES = pd.read_hdf(os.path.join(DATA_DIR, DATASET_NAME + '_mixes.h5'), 'data')

In [None]:
# Keep only the feature columns: everything in FULL_SONGS that is not
# descriptive metadata or a derived label.
to_remove = {'title', 'artist_name', 'genre', 'top_genres', 'terms',
             'release', 'key', 'mode', 'genre_topics', 'genre_topic', 'ncut_id'}
columns = set(FULL_SONGS.columns.tolist())
feat_col = list(columns.difference(to_remove))

### Create smaller dataset from AOTM data

Here we remove ambiguous playlist categories and verify that each remaining category contains a sufficient number of playlists. Each playlist is composed of "popular songs" (songs that appear in at least a given number of playlists) and is neither too short nor too long.

In [None]:
# min_playlists = 70
min_playlists = 100
# min_playlist_size = 8
min_playlist_size = 5
max_playlist_size = 20
min_popularity = 5

to_remove = ['Mixed Genre', 'Theme', 'Single Artist', 'Alternating DJ', 'Mixed', 'Cover', 'Narrative']

In [None]:
# Remove ambiguous categories
FILT_MIXES = FULL_MIXES[~FULL_MIXES.playlist_category.isin(to_remove)]
# Remove too short or too long playlists
FILT_MIXES = FILT_MIXES[FILT_MIXES['size'].between(min_playlist_size, max_playlist_size)]

# Filter popular songs
good_playlist_categories = np.unique(FILT_MIXES.playlist_category.values)
FILT_PLAYLISTS = FULL_PLAYLISTS[FULL_PLAYLISTS[playlist_cat_key].isin(good_playlist_categories)]
song_popularity_hist = FILT_PLAYLISTS.aotm_id.value_counts()
good_songs = song_popularity_hist[song_popularity_hist >= min_popularity].index.values
FILT_MIXES[song_id_key] = FILT_MIXES[song_id_key].apply(lambda x: list((set(x) & set(good_songs))))
FILT_MIXES['size'] = FILT_MIXES[song_id_key].apply(len)

# Refilter size of playlists
FILT_MIXES = FILT_MIXES[FILT_MIXES['size'].between(min_playlist_size, max_playlist_size)]
# Keep a sufficient number of playlist in each category
p_hist = FILT_MIXES[playlist_cat_key].value_counts()
P_CATEGORIES = p_hist.index[np.where(p_hist > min_playlists)].values
FILT_MIXES = FILT_MIXES[FILT_MIXES[playlist_cat_key].isin(P_CATEGORIES)]

# Update the list of valid songs since we removed some playlists
good_songs = np.unique(list(itertools.chain(*list(FILT_MIXES[song_id_key].values))))
FILT_PLAYLISTS = FILT_PLAYLISTS[FILT_PLAYLISTS[playlist_id_key].isin(FILT_MIXES.index.values)]
FILT_PLAYLISTS = FILT_PLAYLISTS[FILT_PLAYLISTS[song_id_key].isin(good_songs)]

# Keep only valid song and features in playlists
FILT_SONGS = FULL_SONGS[FULL_SONGS.index.isin(good_songs)].sort('genre')
FILT_FEAT = FILT_SONGS[feat_col]

print 'Number of playlists:', len(FILT_MIXES)
print 'Number of songs:', len(FILT_SONGS)

In [None]:
FILT_MIXES[playlist_cat_key].value_counts()

### Create evenly sampled dataset

In [None]:
to_keep = ['Romantic', 'Depression', 'Break Up', 'Sleep',
           'Punk', 'Country', 'Hip Hop', 'Dance/House', 'Rock', 'Rhythm and Blues']

P_CATEGORIES = to_keep

cat_size = 100

In [None]:
MINI_MIXES = FILT_MIXES[FILT_MIXES.playlist_category.isin(to_keep)]

tmp = MINI_MIXES.reset_index().groupby(playlist_cat_key).agg({playlist_id_key: lambda x: random.sample(x, cat_size)})
good_mixes = np.unique(list(itertools.chain(*list(tmp[playlist_id_key].values))))

MINI_MIXES = MINI_MIXES[MINI_MIXES.index.isin(good_mixes)]
# sample_idx = random.sample(MINI_MIXES.index, len(MINI_MIXES) // sample_factor)
# MINI_MIXES = MINI_MIXES[MINI_MIXES.index.isin(sample_idx)]
# Update the list of valid songs since we removed some playlists
good_songs = np.unique(list(itertools.chain(*list(MINI_MIXES[song_id_key].values))))
MINI_PLAYLISTS = FILT_PLAYLISTS[FILT_PLAYLISTS[playlist_id_key].isin(MINI_MIXES.index.values)]
MINI_PLAYLISTS = MINI_PLAYLISTS[MINI_PLAYLISTS[song_id_key].isin(good_songs)]

# Keep only valid song and features in playlists
MINI_SONGS = FILT_SONGS[FILT_SONGS.index.isin(good_songs)].sort('genre')
MINI_FEAT = MINI_SONGS[feat_col]

print 'Number of playlists:', len(MINI_MIXES)
print 'Number of songs:', len(MINI_SONGS)

In [None]:
MINI_MIXES[playlist_cat_key].value_counts()

### Select working dataset

In [None]:
MIXES = MINI_MIXES
FEAT = MINI_FEAT
PLAYLISTS = MINI_PLAYLISTS
SONGS = MINI_SONGS
DATASET_VERSION = 'medium'

In [None]:
# MIXES = FILT_MIXES
# FEAT = FILT_FEAT
# PLAYLISTS = FILT_PLAYLISTS
# SONGS = FILT_SONGS
# SONG_TO_IDX = dict(zip(SONGS.index.values, itertools.count()))
# DATASET_VERSION = 'filt'

In [None]:
DUMP_DIR = os.path.join(DATA_DIR, 'dump_' + DATASET_VERSION + '/')

if not os.path.exists(DUMP_DIR):
    os.mkdir(DUMP_DIR)

## Prepare graphs

First, create train and test dataset.

In [None]:
train_set_size = 0.7

In [None]:
mixes_train_idx, mixes_test_idx = train_test_split(MIXES.index.values, train_size=train_set_size)
mixes_train_idx, mixes_test_idx = sorted(mixes_train_idx), sorted(mixes_test_idx)

MIXES_train = MIXES[MIXES.index.isin(mixes_train_idx)]
MIXES_test = MIXES[MIXES.index.isin(mixes_test_idx)]
PLAYLISTS_train = PLAYLISTS[PLAYLISTS[playlist_id_key].isin(mixes_train_idx)]

songs_train_idx = np.unique(list(itertools.chain(*list(MIXES_train[song_id_key].values))))
songs_test_idx = np.unique(list(itertools.chain(*list(MIXES_test[song_id_key].values))))

### Create song graph

To keep the same number of nodes, we create the full song graph and we remove edges not in the train set.

In [None]:
SONG_GRAPH, SONGS = recog.graph.create_song_graph(FEAT, SONGS, 5)
SONG_TO_IDX = dict(zip(SONGS.index.values, itertools.count()))
print nx.info(SONG_GRAPH)

#### Plot adjacency matrix

In [None]:
W_SONGS = nx.to_numpy_matrix(SONG_GRAPH)

In [None]:
# sample = W_SONGS[2300:3500, 2300:3500]
sample = W_SONGS
sample = (sample > 0).astype(np.float)
recog.plot_factor_mat(sample, 'Adjacency matrix for Song graph', 'Blues')
# plot_mat(W_SONGS.todense(), 'Song graph ncut', fig=(512, 512))
fig1 = plt.gcf()
ax = plt.gca()
fig1.frameon = False
ax.patch.set_visible(False)
ax.axis('off')
fig_path = os.path.join(DUMP_DIR, 'sample_song_graph_partitions_' + str(SONG_GRAPH.graph['partitions']) + '.png')
fig1.savefig(fig_path, dpi=500)

### Compute all shortest path between songs for validation

In [None]:
start = time.time()
pairs_distance = nx.all_pairs_dijkstra_path_length(SONG_GRAPH, weight='dist')
end = time.time()
print 'Created in:', end - start

In [None]:
# Use a context manager so the file is flushed and closed even if pickling fails.
with open(os.path.join(DUMP_DIR, 'song_pairs_distance.pickle'), 'wb') as pairs_file:
    pickle.dump(pairs_distance, pairs_file)

### Create playlist graph

In [None]:
PLAYLIST_GRAPH_train, MIXES_train, PLAYLISTS_train = recog.graph.create_playlist_graph(MIXES_train.copy(), PLAYLISTS_train, playlist_id_key, song_id_key, 
                                                       'playlist_category', 0.3, 0.2)
print nx.info(PLAYLIST_GRAPH_train)

# MIXES_train = MIXES[MIXES.index.isin(mixes_train_idx)]
# MIXES_test = MIXES[MIXES.index.isin(mixes_test_idx)]

#### Plot adjacency matrix

In [None]:
W_PLAYLISTS = nx.to_numpy_matrix(PLAYLIST_GRAPH_train)

In [None]:
# Binarise the adjacency matrix: 1.0 wherever an edge exists.
sample = (W_PLAYLISTS > 0).astype(float)
recog.plot_factor_mat(sample, 'Adjacency matrix for Playlist graph', 'Blues')
# plot_mat(W_SONGS.todense(), 'Song graph ncut', fig=(512, 512))
fig1 = plt.gcf()
ax = plt.gca()
fig1.frameon = False
ax.patch.set_visible(False)
ax.axis('off')
# Only PLAYLIST_GRAPH_train exists in this notebook; the bare name
# PLAYLIST_GRAPH raised NameError.
# NOTE(review): assumes create_playlist_graph stores a 'partitions' entry in
# the graph attributes, as the song graph does - confirm.
fig_path = os.path.join(DUMP_DIR, 'sample_playlist_graph_partitions_' + str(PLAYLIST_GRAPH_train.graph['partitions']) + '.png')
fig1.savefig(fig_path, dpi=100)

### Playlist graph without categories

In [None]:
def stripped_playlist_graph(g):
    """Return a new graph with the nodes of `g` and cosine-weighted edges.

    Only edges of `g` that carry a 'count' attribute are kept.  Each kept edge
    (u, v) gets the weight ``count / (sqrt(size_u) * sqrt(size_v))``, where
    ``size`` is read from the node attributes of `g`.  The number of
    communities found by ``community.best_partition`` and the resulting
    modularity are printed for information only; they are not stored.

    Parameters
    ----------
    g : networkx graph with a 'size' node attribute and 'count' edge attributes

    Returns
    -------
    networkx.Graph
        The re-weighted graph.
    """
    def cosine_weight(u, v, shared):
        return float(shared) / (np.sqrt(g.node[u]['size']) * np.sqrt(g.node[v]['size']))

    stripped = nx.Graph()
    stripped.add_nodes_from(g.nodes(data=True))

    weighted_edges = []
    for edge, shared in nx.get_edge_attributes(g, 'count').items():
        weighted_edges.append((edge[0], edge[1], cosine_weight(edge[0], edge[1], shared)))
    stripped.add_weighted_edges_from(weighted_edges)

    partition = community.best_partition(stripped)
    modularity = community.modularity(partition, stripped)
    nb_clusters = len(set(partition.values()))
    print('Number of clusters: {}, Modularity: {}'.format(nb_clusters, modularity))
    return stripped

In [None]:
STRIPPED_PLAYLIST_GRAPH_train = stripped_playlist_graph(PLAYLIST_GRAPH_train)

In [None]:
W_PLAYLISTS_train = nx.to_numpy_matrix(STRIPPED_PLAYLIST_GRAPH_train)

In [None]:
# Binarise the adjacency matrix: 1.0 wherever an edge exists.
sample = (W_PLAYLISTS_train > 0).astype(float)
recog.plot_factor_mat(sample, 'Adjacency matrix for Playlist graph train', 'Blues')
# plot_mat(W_SONGS.todense(), 'Song graph ncut', fig=(512, 512))
fig1 = plt.gcf()
ax = plt.gca()
fig1.frameon = False
ax.patch.set_visible(False)
ax.axis('off')
# Only PLAYLIST_GRAPH_train exists in this notebook; the bare name
# PLAYLIST_GRAPH raised NameError.  The stripped graph is built from a fresh
# nx.Graph(), so the partition count is read from the graph it derives from.
# NOTE(review): assumes create_playlist_graph stores a 'partitions' entry in
# the graph attributes - confirm.
fig_path = os.path.join(DUMP_DIR, 'sample_playlist_graph_train_partitions_' + str(PLAYLIST_GRAPH_train.graph['partitions']) + '.png')
fig1.savefig(fig_path, dpi=100)

In [None]:
MIXES_train.groupby('cluster_id')['playlist_category'].value_counts()

### Create C matrix

In [None]:
C_train = recog.create_recommendation_matrix(MIXES_train, SONGS.index,
                                     playlist_id_key, DATASET_NAME, normalize=True)

### Plot C

In [None]:
recog.plot_factor_mat(C_train.toarray())
# plot_mat(C_train.toarray(), 'C', (512, 512))
print 'Sparsity ratio:', C_train.nnz / float(C_train.shape[0] * C_train.shape[1])
print C_train.shape

## Dump data

In [None]:
def dump(outdir):
    # Write graphs
    nx.write_gpickle(SONG_GRAPH, os.path.join(outdir, 'song_graph.gpickle'))
#     nx.write_gpickle(SONG_GRAPH_train, os.path.join(outdir, 'song_graph_train.gpickle'))
#     nx.write_gpickle(PLAYLIST_GRAPH, os.path.join(outdir, 'playlist_graph.gpickle'))
    nx.write_gpickle(PLAYLIST_GRAPH_train, os.path.join(outdir, 'playlist_graph_train.gpickle'))
    
    # Write data
    store = pd.HDFStore(os.path.join(outdir, 'data.h5'))
    store['songs'] = SONGS
    store['mixes'] = MIXES
    store['playlists'] = PLAYLISTS
    store.close()

    matlab_export_path = os.path.join(outdir, 'recog_real_data.mat')
    matlab_data = dict()

#     matlab_data['C'] = C
    matlab_data['C_train'] = C_train
#     matlab_data['songs_train'] = songs_train_idx.tolist()
#     matlab_data['songs_test'] = songs_test_idx.tolist()
    matlab_data['mixes_train'] = mixes_train_idx
    matlab_data['mixes_test'] = mixes_test_idx

    sp.io.savemat(matlab_export_path, matlab_data)
    print 'Dump data to:', outdir
    
dump(DUMP_DIR)

## Load data

In [None]:
def load(outdir):
    """Read back the files written by `dump` from `outdir`.

    Parameters
    ----------
    outdir : str
        Directory containing ``song_graph.gpickle``,
        ``playlist_graph_train.gpickle``, ``data.h5`` and
        ``recog_real_data.mat``.

    Returns
    -------
    tuple or None
        ``(songs, mixes, playlists, song_graph, playlist_graph_train, c_train,
        mixes_train_idx, mixes_test_idx)``.  When `outdir` does not exist a
        message is printed and None is returned, so callers that unpack the
        result must check for that case.
    """
    if not os.path.exists(outdir):
        print('Error folder does not exists')
        return None

    song_graph = nx.read_gpickle(os.path.join(outdir, 'song_graph.gpickle'))
#     song_graph_train = nx.read_gpickle(os.path.join(outdir, 'song_graph_train.gpickle'))
#     playlist_graph = nx.read_gpickle(os.path.join(outdir, 'playlist_graph.gpickle'))
    playlist_graph_train = nx.read_gpickle(os.path.join(outdir, 'playlist_graph_train.gpickle'))

    # Always release the HDF5 handle, even when a key is missing.
    store = pd.HDFStore(os.path.join(outdir, 'data.h5'))
    try:
        songs = store['songs']
        mixes = store['mixes']
        playlists = store['playlists']
    finally:
        store.close()

    matlab_export_path = os.path.join(outdir, 'recog_real_data.mat')
    data = sp.io.loadmat(matlab_export_path)

#     c = data['C']
    c_train = data['C_train']
#     songs_train_idx = data['songs_train'].tolist()[0]
#     songs_test_idx = data['songs_test'].tolist()[0]
    # loadmat hands the index lists back as 2-D arrays; [0] takes the first row
    # (presumably the only one, since 1-D sequences were saved - verify).
    mixes_train_idx = data['mixes_train'].tolist()[0]
    mixes_test_idx = data['mixes_test'].tolist()[0]

    return songs, mixes, playlists, song_graph, playlist_graph_train, c_train, mixes_train_idx, mixes_test_idx
    

DATASET_VERSION = 'medium'
DUMP_DIR = os.path.join(DATA_DIR, 'dump_' + DATASET_VERSION + '/')
    
SONGS, MIXES, PLAYLISTS, SONG_GRAPH, PLAYLIST_GRAPH_train, C_train, mixes_train_idx, mixes_test_idx = load(DUMP_DIR)
FEAT = SONGS[feat_col]
SONG_TO_IDX = dict(zip(SONGS.index.values, itertools.count()))
MIXES_test = MIXES[MIXES.index.isin(mixes_test_idx)]
pairs_distance = None
PLAYLISTS_train = PLAYLISTS[PLAYLISTS[playlist_id_key].isin(mixes_train_idx)]
STRIPPED_PLAYLIST_GRAPH_train = stripped_playlist_graph(PLAYLIST_GRAPH_train)

In [None]:
# if needed
# Use a context manager so the file handle is closed after loading.
# NOTE: pickle must only be used on files this notebook produced itself.
with open(os.path.join(DUMP_DIR, 'song_pairs_distance.pickle'), 'rb') as pairs_file:
    pairs_distance = pickle.load(pairs_file)

# Experiments


In [None]:
# rank = len(P_CATEGORIES)
rank = 10  # number of clusters of song graph
playlist_size = 30
sample_size = 7
nb_sampled_playlists = 100

In [None]:
def agglomerate_results_per_category(data_path, key):
    """Gather one score column from three stored result tables and export it.

    Reads column `key` from the 'results_random', 'results_test_per_category'
    and 'results_sampled' tables of ``data_path + '.h5'``, puts them side by
    side as the columns 'Random', 'Test set' and 'Sampled' (aligned on the
    index of the random table), and writes the table as LaTeX to
    ``data_path + '_agg_' + key + '.tex'``.

    Parameters
    ----------
    data_path : str
        Path of the result store, without the '.h5' extension.
    key : str
        Name of the score column to extract.

    Returns
    -------
    pandas.DataFrame
        The aggregated table, index named 'Playlist category'.
    """
    store = pd.HDFStore(data_path + '.h5')
    try:
        # Local names deliberately avoid `random`, which would shadow the module.
        random_scores = store['results_random'][key]
        test_scores = store['results_test_per_category'][key]
        sampled_scores = store['results_sampled'][key]
    finally:
        # Release the HDF5 handle even when a table or column is missing.
        store.close()

    res = pd.DataFrame(index=random_scores.index)
    res['Random'] = random_scores
    res['Test set'] = test_scores
    res['Sampled'] = sampled_scores
    res.index.name = 'Playlist category'

    out = data_path + '_agg_' + key + '.tex'
    res.to_latex(out)
    return res


def training_closure():
    """Return a function that trains the model on the notebook-level data.

    The returned function takes ``(data_path, theta_playlists, theta_songs,
    playlist_graph)`` and forwards them to ``recog.proximal_training`` together
    with the globals C_train, SONG_GRAPH and rank.  `playlist_graph` may be the
    stripped or the normal playlist graph.  Callers in this notebook unpack the
    result as a pair ``(A, B)``.
    """
    def train(data_path, theta_playlists, theta_songs, playlist_graph):  # stripped or normal
        options = dict(theta_tv_a=theta_playlists,
                       theta_tv_b=theta_songs,
                       data_path=data_path,
                       verbose=1)
        return recog.proximal_training(C_train, playlist_graph, SONG_GRAPH, rank, **options)

    return train


def recommend_and_save(data_path, A, B):
    """Evaluate the factors `A` and `B`, store the raw scores and summarise them.

    Runs ``recog.test_playlists`` on the test mixes and
    ``recog.sampled_vs_random`` on sampled/random playlists, using the
    notebook-level data and settings (MIXES_test, PLAYLISTS, SONGS,
    SONG_TO_IDX, playlist_size, sample_size, nb_sampled_playlists,
    P_CATEGORIES, pairs_distance).  The four result tables are written to
    ``data_path + '.h5'`` under 'results_test', 'results_sampled',
    'results_random' and 'results_test_per_category'.

    Parameters
    ----------
    data_path : str
        Path of the output store, without the '.h5' extension.
    A, B :
        Factors returned by the training step; passed through to `recog`.

    Returns
    -------
    pandas.DataFrame
        Mean scores with one row each for 'test_set', 'sampled' and 'random'.
    """
    start = time.time()
    results_test = recog.test_playlists(MIXES_test, PLAYLISTS, SONGS, A, B, playlist_size,
                                        SONG_TO_IDX, song_id_key, playlist_cat_key,
                                        pairs_distance=pairs_distance, pgraph_full=None)
    # Average per playlist category first, so every category weighs the same
    # in the 'test_set' summary row.
    mean_results_test = results_test.groupby('p_' + playlist_cat_key).mean()
    results_sampled, results_random = recog.sampled_vs_random(
        nb_sampled_playlists, P_CATEGORIES, PLAYLISTS, SONGS,
        sample_size, A, B, playlist_size, SONG_TO_IDX, song_id_key,
        playlist_cat_key, pairs_distance=pairs_distance)
    res = [mean_results_test.mean(), results_sampled.mean(), results_random.mean()]
    results = pd.DataFrame(res)
    results.set_index(pd.Index(['test_set', 'sampled', 'random']), inplace=True)

    path = data_path + '.h5'
    store = pd.HDFStore(path)
    try:
        store['results_test'] = results_test
        store['results_sampled'] = results_sampled
        store['results_random'] = results_random
        store['results_test_per_category'] = mean_results_test
    finally:
        # Release the HDF5 handle even when a write fails.
        store.close()
    print('Done in {} seconds'.format(time.time() - start))
    return results


def rec_sys_factory(method=0):  # add common params here
    """Return the recommender function selected by `method`.

    Parameters
    ----------
    method : int
        0 selects ``recog.recommend``; 1 selects
        ``recog.recommend_playlist_graph_only``.

    Raises
    ------
    NotImplementedError
        For any other value of `method`.
    """
    if method == 0:
        return recog.recommend
    elif method == 1:  # `else if` is not valid Python
        return recog.recommend_playlist_graph_only
    else:
        raise NotImplementedError('unknown recommender method: {!r}'.format(method))
    
    
def create_sampled_mix_df(playlist_df, sample_size, nb_playlists, p_category):
    """Placeholder: not implemented yet, ignores its arguments and returns None."""
    return None


# TODO, create mix df if sampled from playlist or random and generated scores
def scores_from_mix_df(mix_df, rec_method):
    """Placeholder: not implemented yet, ignores its arguments and returns None."""
    return None

### Average pairwise distance of song graph

In [None]:
distances = nx.get_edge_attributes(SONG_GRAPH, 'dist')

res = sum(map(lambda x: x[1], distances.items()))
res /= len(distances)

print 'Average distance on the song graph:', res
    
print 'Diameter:', nx.diameter(SONG_GRAPH)

## Experiment 1

Here we set theta_song to 0 and compare our model to the random case and to a simple playlist-only recommender system.

We designed two scenarios: in the first, the playlist graph is created with cosine similarity only; in the second, it also uses the metadata.

We test different cases:

- NMF only $\theta_p = \theta_s = 0$
- NMF + playlist graph compared to a different rec sys using only playlist

In [None]:
EXPERIMENT_DIR = os.path.join(DUMP_DIR, 'experiment1')
if not os.path.exists(EXPERIMENT_DIR):
    os.mkdir(EXPERIMENT_DIR)

### NMF only

In [None]:
theta_playlists_nmf_only = 0
theta_songs_nmf_only = 0
data_path = os.path.join(EXPERIMENT_DIR, 'nmf_only')

In [None]:
A_nmf_only, B_nmf_only = training_closure()(data_path, 
                                            theta_playlists_nmf_only,
                                            theta_songs_nmf_only, 
                                            STRIPPED_PLAYLIST_GRAPH_train)

In [None]:
results_nmf_only = recommend_and_save(data_path, A_nmf_only, B_nmf_only)

In [77]:
results_nmf_only

Unnamed: 0,p_cat_in,p_cat_out,s_cluster_in,s_cluster_out,s_genre,s_graph_dist_in,s_graph_dist_out
test_set,1.0,0.197329,0.347708,0.2536,0.396629,63.028296,64.461499
sampled,1.0,0.174,0.350429,0.248567,0.371115,63.108911,64.373068
random,0.157214,0.1442,0.339143,0.252133,0.302794,63.517827,64.369289


### Scenario 1

In this scenario no playlist categories are given, we create a playlist similarity graph only using cosine similarity. This is the normal case (Spotify, Deezer, etc.).

In [78]:
SCENARIO_DIR = os.path.join(EXPERIMENT_DIR, 'scenario1')
if not os.path.exists(SCENARIO_DIR):
    os.mkdir(SCENARIO_DIR)

#### Playlist graph recommender

TODO

#### Our recommender system

Find best theta from grid search.

In [None]:
def grid_search_theta_playlist():
    """Train one model per playlist theta on the stripped playlist graph.

    theta_playlist takes the values 1, 4, ..., 25 while theta_songs is fixed
    to 0.0.  Each run is stored under SCENARIO_DIR as
    ``exp1_s1_theta_p_<theta>``.

    Returns
    -------
    list of dict
        One entry per theta with the keys 'theta_playlist', 'data_path',
        'a' and 'b' (the pair returned by the training function).
    """
    train = training_closure()
    results = []
    for theta in xrange(1, 27, 3):
        data_path = os.path.join(SCENARIO_DIR, 'exp1_s1_theta_p_' + str(theta))
        a, b = train(data_path, theta, 0.0, STRIPPED_PLAYLIST_GRAPH_train)
        results.append({'theta_playlist': theta,
                        'data_path': data_path,
                        'a': a,
                        'b': b})
    return results

results_gs_theta_playlist_exp1_s1 = grid_search_theta_playlist()

In [None]:
# TO CHANGE, best a and b and change recommend method
results_exp1_s1 = recommend_and_save(data_path, a, b)

In [None]:
results_exp1_s1

### Scenario 2

In this scenario, we have the playlist categories and the graph of playlists is well connected using both metadata and cosine similarity.

In [None]:
SCENARIO_DIR2 = os.path.join(EXPERIMENT_DIR, 'scenario2')
if not os.path.exists(SCENARIO_DIR2):
    os.mkdir(SCENARIO_DIR2)

#### Playlist graph recommender

TODO

#### Our recommender system

In [None]:
def grid_search_theta_playlist_scenario2():
    """Train one model per playlist theta on the full playlist graph.

    theta_playlist takes the values 1, 4, ..., 25 while theta_songs is fixed
    to 0.0.  Each run is stored under SCENARIO_DIR2 as
    ``exp1_s2_theta_p_<theta>``.

    Returns
    -------
    list of dict
        One entry per theta with the keys 'theta_playlist', 'data_path',
        'a' and 'b' (the pair returned by the training function).
    """
    train = training_closure()
    results = []
    for theta in xrange(1, 27, 3):
        data_path = os.path.join(SCENARIO_DIR2, 'exp1_s2_theta_p_' + str(theta))
        a, b = train(data_path, theta, 0.0, PLAYLIST_GRAPH_train)
        results.append({'theta_playlist': theta,
                        'data_path': data_path,
                        'a': a,
                        'b': b})
    return results

results_gs_theta_playlist_exp1_s2 = grid_search_theta_playlist_scenario2()

In [None]:
# TO CHANGE
results_exp1_s2 = recommend_and_save(data_path, A_nmf_playlist2, B_nmf_playlist2)

In [None]:
results_exp1_s2

In [None]:
agglomerate_results_per_category(data_path, 'p_cat_out')

## Experiment 2

In this experiment we show the diversity of the recommended playlists. Diversity increases as we increase the influence of the song graph. We use the STRIPPED playlist graph.

In [None]:
EXPERIMENT_DIR2 = os.path.join(DUMP_DIR, 'experiment2')
if not os.path.exists(EXPERIMENT_DIR2):
    os.mkdir(EXPERIMENT_DIR2)

#### Theta songs = 0

In [None]:
results_nmf_playlist_exp1

#### Grid search theta songs

In [None]:
def grid_search_theta_song():
    """Train one model per song theta on the stripped playlist graph.

    theta_song takes the values 1, 4, ..., 25 while the playlist theta is the
    notebook-level ``theta_playlists_nmf_playlist``.  Each run is stored under
    EXPERIMENT_DIR2 as ``exp2_theta_s_<theta>``.

    Returns
    -------
    list of dict
        One entry per theta with the keys 'theta_song', 'data_path',
        'a' and 'b' (the pair returned by the training function).
    """
    train = training_closure()
    results = []
    for theta in xrange(1, 27, 3):
        data_path = os.path.join(EXPERIMENT_DIR2, 'exp2_theta_s_' + str(theta))
        a, b = train(data_path, theta_playlists_nmf_playlist, theta,
                     STRIPPED_PLAYLIST_GRAPH_train)
        results.append({'theta_song': theta,
                        'data_path': data_path,
                        'a': a,
                        'b': b})
    return results

results_gs_theta_song = grid_search_theta_song()

### Agglomerate results to compare the different theta songs

In [None]:
# The training playlists are bound to PLAYLISTS_train (lower-case suffix);
# PLAYLISTS_TRAIN is never defined in this notebook.
recog.reco_playlist_graph_only(None, SONGS, PLAYLISTS_train, song_id_key, 214, playlist_id_key)

In [None]:
# The training playlists are bound to PLAYLISTS_train (lower-case suffix);
# PLAYLISTS_TRAIN is never defined in this notebook.
PLAYLISTS_train[PLAYLISTS_train.mix_id == 214]

### NMF + playlist + songs

In [None]:
theta_playlists_nmf_playlist_song = 10.
theta_songs_nmf_playlist_song = 10.
data_path = os.path.join(SCENARIO_DIR, 'nmf_playlist_song')

In [None]:
# The function returned by training_closure() requires the playlist graph as
# its fourth argument; this experiment uses the stripped playlist graph.
A_nmf_playlist_song, B_nmf_playlist_song = training_closure()(data_path,
                                                              theta_playlists_nmf_playlist_song,
                                                              theta_songs_nmf_playlist_song,
                                                              STRIPPED_PLAYLIST_GRAPH_train)

In [None]:
results_nmf_playlist_song = recommend_and_save(data_path, A_nmf_playlist_song, B_nmf_playlist_song)

In [None]:
results_nmf_playlist_song

In [None]:
results_nmf_playlist_song - results_playlist_only

### NMF + playlist + songs 2

In [None]:
theta_playlists_nmf_playlist_song2 = 10.
theta_songs_nmf_playlist_song2 = 2.
data_path = os.path.join(SCENARIO_DIR, 'nmf_playlist_song2')

In [None]:
# The function returned by training_closure() requires the playlist graph as
# its fourth argument; this experiment uses the stripped playlist graph.
A_nmf_playlist_song2, B_nmf_playlist_song2 = training_closure()(data_path,
                                                                theta_playlists_nmf_playlist_song2,
                                                                theta_songs_nmf_playlist_song2,
                                                                STRIPPED_PLAYLIST_GRAPH_train)

In [None]:
results_nmf_playlist_song2 = recommend_and_save(data_path, A_nmf_playlist_song2, B_nmf_playlist_song2)

In [None]:
results_nmf_playlist_song2 - results_nmf_playlist_song

In [None]:
results_nmf_playlist_song2 - results_playlist_only

## Scenario 2


No playlist categories, we keep only the edges created by cosine similarity.

In [None]:
def stripped_playlist_graph(g):
    """Return a new graph with the nodes of `g` and cosine-weighted edges.

    Only edges of `g` that carry a 'count' attribute are kept.  Each kept edge
    (u, v) gets the weight ``count / (sqrt(size_u) * sqrt(size_v))``, where
    ``size`` is read from the node attributes of `g`.  The number of
    communities found by ``community.best_partition`` and the resulting
    modularity are printed for information only; they are not stored.

    NOTE(review): this re-declares a function that is already defined, with
    the same behaviour, earlier in this notebook.

    Parameters
    ----------
    g : networkx graph with a 'size' node attribute and 'count' edge attributes

    Returns
    -------
    networkx.Graph
        The re-weighted graph.
    """
    def cosine_weight(u, v, shared):
        return float(shared) / (np.sqrt(g.node[u]['size']) * np.sqrt(g.node[v]['size']))

    stripped = nx.Graph()
    stripped.add_nodes_from(g.nodes(data=True))

    weighted_edges = []
    for edge, shared in nx.get_edge_attributes(g, 'count').items():
        weighted_edges.append((edge[0], edge[1], cosine_weight(edge[0], edge[1], shared)))
    stripped.add_weighted_edges_from(weighted_edges)

    partition = community.best_partition(stripped)
    modularity = community.modularity(partition, stripped)
    nb_clusters = len(set(partition.values()))
    print('Number of clusters: {}, Modularity: {}'.format(nb_clusters, modularity))
    return stripped

In [None]:
STRIPPED_PLAYLIST_GRAPH_train = stripped_playlist_graph(PLAYLIST_GRAPH_train)

In [None]:
SCENARIO_DIR = os.path.join(DUMP_DIR, 'scenario2')
if not os.path.exists(SCENARIO_DIR):
    os.mkdir(SCENARIO_DIR)

### Playlist only

In [None]:
data_path = os.path.join(SCENARIO_DIR, 'playlist_graph_only')

In [None]:
A_playlist_only_v2, B_playlist_only_v2 = training_closure()(data_path, 
                                                      theta_playlists_playlist_only, 
                                                      theta_songs_playlist_only,
                                                      STRIPPED_PLAYLIST_GRAPH_train)

In [None]:
results_playlist_only_v2 = recommend_and_save(data_path, A_playlist_only_v2, B_playlist_only_v2)

In [None]:
results_playlist_only_v2

### NMF + playlist

In [None]:
theta_playlists_nmf_playlist = 18 
theta_songs_nmf_playlist = 0.0
data_path = os.path.join(SCENARIO_DIR, 'nmf_playlist')

In [None]:
A_nmf_playlist_v2, B_nmf_playlist_v2 = training_closure()(data_path, 
                                                          theta_playlists_nmf_playlist, 
                                                          theta_songs_nmf_playlist,
                                                          STRIPPED_PLAYLIST_GRAPH_train)

In [None]:
results_nmf_playlist_v2_mod = recommend_and_save(data_path, A_nmf_playlist_v2, B_nmf_playlist_v2)

In [None]:
results_nmf_playlist_v2

In [None]:
results_nmf_playlist_v2_mod

## Find best nmf_playlist theta playlist

In [None]:
# Search over theta_playlists in 13..19 for the run that most improves the
# test-set 'p_cat_out' score relative to the playlist-only baseline
# (results_playlist_only_v2).  The current results_nmf_playlist_v2 /
# A_nmf_playlist_v2 / B_nmf_playlist_v2 serve as the starting point.
# NOTE(review): best_theta starts at 12 although the cell defining
# theta_playlists_nmf_playlist sets it to 18 - confirm which run the starting
# factors actually come from.
best_theta = 12
# 0.004379696413781331 (value noted from an earlier run; presumably the starting diff - TODO confirm)
last_diff = (results_nmf_playlist_v2 - results_playlist_only_v2)['p_cat_out']['test_set']

best_A = A_nmf_playlist_v2
best_B = B_nmf_playlist_v2
for i, theta in enumerate(xrange(13, 20)):
    # The notebook-level theta, data_path and factor variables are overwritten
    # on every iteration; only best_* keeps the winning run.
    theta_playlists_nmf_playlist = theta    
    # Output files are numbered by iteration (1..7), not by theta value.
    data_path = os.path.join(SCENARIO_DIR, 'nmf_playlist_' + str(i + 1))
    A_nmf_playlist_v2, B_nmf_playlist_v2 = training_closure()(data_path, 
                                                          theta_playlists_nmf_playlist, 
                                                          theta_songs_nmf_playlist,
                                                          STRIPPED_PLAYLIST_GRAPH_train)
    results_nmf_playlist_v2 = recommend_and_save(data_path, A_nmf_playlist_v2, B_nmf_playlist_v2)
    
    diff = (results_nmf_playlist_v2 - results_playlist_only_v2)['p_cat_out']['test_set']
    
    # Strict improvement only: on a tie the earlier theta is kept.
    if diff > last_diff:
        last_diff = diff
        best_theta = theta
        best_A = A_nmf_playlist_v2
        best_B = B_nmf_playlist_v2
# NOTE(review): after the loop, data_path still points at the last run
# ('nmf_playlist_7'), which is not necessarily the best one.

In [None]:
# recommend_and_save takes exactly (data_path, A, B); the extra
# STRIPPED_PLAYLIST_GRAPH argument was an undefined name and is dropped.
results_nmf_playlist_v3 = recommend_and_save(data_path, best_A, best_B)

In [None]:
results_nmf_playlist_v3

In [None]:
data_path = os.path.join(SCENARIO_DIR, 'nmf_playlist')
np.savez(data_path, A=best_A, B=best_B)

### Output neighbors of a song in the song graph

In [None]:
song_id =  SONGS[SONGS.artist_name == 'The Beatles'].index.values[0]

In [None]:
song_id = 545685 # yesterday

In [None]:
x = SONGS.reset_index()
song_id = 1308  # on graph

In [None]:
def get_sorted_neighbors(query, df, g, metadata=('artist_name', 'title', 'genre'), path=None):
    """List the graph neighbours of a song, strongest edge first.

    Parameters
    ----------
    query : int
        Node of `g` to look up; also used as the positional row of `df`
        (``df.iloc[query]``), so nodes are expected to be row positions.
    df : pandas.DataFrame
        Song table containing the `metadata` columns.
    g : mapping
        Graph such that ``g[node]`` maps each neighbour to an attribute dict
        holding a 'weight' entry (e.g. a networkx graph).
    metadata : sequence of str
        Columns to report for each song.
    path : str or None
        When given, the neighbour table is also written there as LaTeX.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        The neighbours' metadata, indexed 1..k by decreasing edge weight, and
        the metadata of the query song itself.
    """
    columns = list(metadata)
    songs = df.reset_index()

    # Neighbours are looked up for the `query` argument, never for a
    # notebook-level variable.
    weighted = [(node, attrs['weight']) for node, attrs in g[query].items()]

    # reversed(sorted(...)) rather than sorted(reverse=True): the two differ in
    # how equal weights are ordered, and the original ordering is kept.
    ranked = list(reversed(sorted(weighted, key=operator.itemgetter(1))))
    sorted_neighbors = [node for node, _ in ranked]

    res = songs[songs.index.isin(sorted_neighbors)][columns]
    res = res.reindex(sorted_neighbors).reset_index(drop=True)
    res.index += 1  # rank neighbours starting from 1
    res2 = df.iloc[query][columns]

    if path is not None:
        with open(path, 'w') as f:
            f.write(res.to_latex())

    return res, res2

path = os.path.join(DATA_DIR, 'song_knn.tex')
neighbors, query_song = get_sorted_neighbors(song_id, SONGS, SONG_GRAPH, path=path)

In [None]:
neighbors

In [None]:
query_song

## Model parameters