In [None]:
import pandas as pd
pd.options.mode.chained_assignment = None 
import numpy as np
import re
import nltk

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import matplotlib as mpl


%matplotlib inline

In [None]:
def discrete_cmap(N, base_cmap=None):
    """Create an N-bin discrete colormap from the specified input map.

    Parameters
    ----------
    N : int
        Number of discrete colour bins in the returned colormap.
    base_cmap : str, matplotlib.colors.Colormap or None
        Colormap to sample from, given by name or as an instance. ``None``
        selects matplotlib's default colormap.

    Returns
    -------
    matplotlib.colors.LinearSegmentedColormap
        A colormap named ``<base name><N>`` built from N colours sampled at
        evenly spaced positions of the base map.
    """
    # plt.get_cmap accepts a name, None, or a Colormap instance, and exists in
    # both old and new Matplotlib (plt.cm.get_cmap was removed in 3.9).
    base = plt.get_cmap(base_cmap)
    color_list = base(np.linspace(0, 1, N))
    cmap_name = base.name + str(N)
    # from_list is defined on LinearSegmentedColormap only. Calling it through
    # the base instance fails for ListedColormap bases such as the default
    # viridis, so go through the class explicitly.
    return mpl.colors.LinearSegmentedColormap.from_list(cmap_name, color_list, N)

In [None]:
def tsne_plot(rows, file_name):
    """Fit a 2-D t-SNE on the given embeddings and save a scatter plot of it.

    Each point is coloured by the number of comma-separated entries in its
    "items" value (shown as "Session length" on the colorbar), capped at 20.

    Parameters
    ----------
    rows : pandas.DataFrame
        Needs a "vector" column holding comma-separated floats and an "items"
        column holding comma-separated strings.
    file_name : str
        File name of the figure; it is written to ``../plots/<file_name>``.

    Returns
    -------
    None
        The figure is saved to disk. It is not closed here.
    """
    N = 20  # lengths above N are clipped to N and share the top colour

    # Parse the serialized columns directly instead of iterating row by row.
    tokens = [[float(v) for v in vector.split(',')] for vector in rows["vector"]]
    labels = [min(len(items.split(',')), N) for items in rows["items"]]

    tsne_kwargs = dict(perplexity=40, n_components=2, init='pca', random_state=123,
                       early_exaggeration=10, n_iter_without_progress=150)
    # scikit-learn 1.5 renamed ``n_iter`` to ``max_iter`` and later dropped the
    # old name; older releases reject ``max_iter`` with a TypeError.
    try:
        tsne_model = TSNE(max_iter=3000, **tsne_kwargs)
    except TypeError:
        tsne_model = TSNE(n_iter=3000, **tsne_kwargs)
    new_values = np.asarray(tsne_model.fit_transform(tokens))
    x, y = new_values[:, 0], new_values[:, 1]

    # Rebuild the jet colormap from its own colour table, then bin it with one
    # boundary per integer label in [0, N].
    jet = plt.cm.jet
    cmap = mpl.colors.LinearSegmentedColormap.from_list(
        'Custom cmap', [jet(i) for i in range(jet.N)], jet.N)
    bounds = np.linspace(0, N, N + 1)
    norm = mpl.colors.BoundaryNorm(bounds, cmap.N)

    fig, ax = plt.subplots(1, 1, figsize=(16, 16))
    ax.scatter(x, y, c=labels, cmap=cmap, norm=norm, rasterized=True)

    # Dedicated narrow axes to the right of the plot for the colorbar.
    ax2 = fig.add_axes([0.95, 0.1, 0.03, 0.8])
    mpl.colorbar.ColorbarBase(ax2, cmap=cmap, norm=norm, spacing='proportional',
                              ticks=bounds, boundaries=bounds, format='%1i')
    ax.set_title('t-SNE on session embeddings', size=30)
    ax2.set_ylabel('Session length', size=30)

    # ``Tick.label`` was removed in Matplotlib 3.8; tick_params sets the same
    # font size on the major tick labels of both axes.
    font_tick = 25
    ax.tick_params(axis='both', which='major', labelsize=font_tick)

    fig.savefig('../plots/' + file_name, dpi=300, bbox_inches='tight')


In [None]:
def init_tsne_plot(algo, file_prefix = "", att = False):
    """Load one inference CSV and hand its embeddings to ``tsne_plot``.

    The input is read from
    ``../data/recsys17/interim/infer/<file_prefix>[att_]<algo>_encoder_train.csv``
    and the figure is named
    ``tsne_recsys17_best_<file_prefix>[att_]<algo>.pdf``; the ``att_`` part is
    inserted only when ``att == True``.

    Parameters
    ----------
    algo : str
        Algorithm tag used in both file names (the notebook uses "ae", "dae"
        and "vae").
    file_prefix : str
        Text placed in front of the optional ``att_`` part in both names.
    att : bool
        Selects the ``att_`` variant of the input and output files.
    """
    # Both file names share the same "<prefix>[att_]<algo>" stem.
    att_tag = 'att_' if att == True else ''
    stem = file_prefix + att_tag + algo

    csv_path = '../data/recsys17/interim/infer/' + stem + '_encoder_train' + ".csv"
    pdf_name = "tsne_recsys17_best_" + stem + ".pdf"

    # Tab-separated file without a header row; only the first three columns
    # are read.
    sessions = pd.read_csv(csv_path, sep='\t', header=None, usecols=[0,1,2])
    sessions.columns = ['sid','vector','items']
    print(len(sessions))

    tsne_plot(sessions[["vector","items"]], pdf_name)

# Plot every algorithm with the "content_" prefix, first with att=False and
# then with att=True, in the order ae, dae, vae.
# For files without a prefix, pass "" as the second argument:
#   init_tsne_plot(algo, "", att)
for algo in ("ae", "dae", "vae"):
    for att in (False, True):
        init_tsne_plot(algo, "content_", att)