<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Logger-(replace-with-better)" data-toc-modified-id="Logger-(replace-with-better)-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Logger (replace with better)</a></span></li><li><span><a href="#Imports" data-toc-modified-id="Imports-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Imports</a></span></li><li><span><a href="#Experiment-setup" data-toc-modified-id="Experiment-setup-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Experiment setup</a></span></li><li><span><a href="#Run-experiment" data-toc-modified-id="Run-experiment-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Run experiment</a></span></li></ul></div>

# Logger (replace with better)

In [1]:
import inspect
import time

def logger(msg, level=False, endspace="\n"):
    if level:
        print(time.ctime(), " ", inspect.stack()[1][3], "(): ", msg, sep="", end=endspace, flush=True)

# Imports

In [3]:
from random import random, sample
import copy
import math
import numpy
from sklearn.metrics.pairwise import cosine_distances as cosine

In [4]:
#  Very simple implementation of index vectors
#  Some standard libraries for sparse vectors are available
#  in Python but appear to have prohibitive overhead of implementation.
def newrandomvector(dimensionality, denseness):
    """
    Generates a sparse vector in the form of a dict with denseness entries composed of
    integer keys and 1 or -1 values, equal number of each.
    :param dimensionality: int
    :param denseness: int
    :return: dict int->int
    """
    vec = {}
    if denseness % 2 != 0:
        denseness += 1
    if denseness > 0:  # no need to be careful about this, right? and k % 2 == 0):
        nonzeros = sample(range(dimensionality), denseness)
        negatives = denseness // 2
        for ix in nonzeros[:negatives]:
            vec[ix] = 1
        for ix in nonzeros[negatives:]:
            vec[ix] = -1
    return vec

In [5]:
def sparseadd(onevec, othervec, weight=1.0):
    """
    Adds two sparse vectors represented as dicts with numerical values, optionally weighting othervec with the
    weight parameter which defaults to 1.0 (equal weighting).
    :param onevec: dict with numerical values
    :param othervec: dict with numerical values
    :param weight: float
    :return: dict with float values
    """
    result = copy.copy(onevec)
    for ll in onevec:
        result[ll] = onevec[ll]
    for kk in othervec:
        if kk in result:
            result[kk] = result[kk] + othervec[kk] * float(weight)
        else:
            result[kk] = othervec[kk] * float(weight)
    return result

In [6]:
def sparsecosine(xvec, yvec):
    """
    Calculates cosine between two sparse vectors. (Can be replaced with sklearn.dist.cosine,
    but that seems to be more costly in processing time.)
    :param xvec: dict with numerical values
    :param yvec: dict with numerical values
    :return: float
    """
    x2 = 0
    y2 = 0
    xy = 0
    for ix in xvec:
        x2 += xvec[ix] * xvec[ix]
    for jx in yvec:
        y2 += yvec[jx] * yvec[jx]
        if jx in xvec:
            xy += xvec[jx] * yvec[jx]
    if x2 * y2 == 0:
        cos = 0
    else:
        cos = xy / (math.sqrt(x2) * math.sqrt(y2))
    return cos

# Experiment setup

In [9]:
# vector space setup
dimensionality = 1000
denseness = 10
testsampleproportion = 500

# loglevels for logger
debug   = True    
monitor = True
error   = True

# simulated data
numberofplaylists = 1000
numberoftracks = 10000
minimumnumberoftracksperplaylist = 50
maximumnumberoftracksperplaylist = 200

playlists = []
for i in range(numberofplaylists):
    playlistlength = minimumnumberoftracksperplaylist + \
                 int(random() * (maximumnumberoftracksperplaylist - minimumnumberoftracksperplaylist))
    oneplaylist = sample(range(numberoftracks), playlistlength)
    playlists.append(oneplaylist)

# Run experiment

In [10]:
# for every playlist
#   generate sparse random index vector for that playlist
#   for every song in that playlist
#       if not seen, initiate empty context vector
#       add the playlist index vector to the context vector
logger("Start building space.", debug)
tracks = {}
for oneplaylist in playlists:
    indexvector = newrandomvector(dimensionality, denseness)
    for track in oneplaylist:
        if track not in tracks:
            contextvector = newrandomvector(dimensionality, denseness)
        else:
            contextvector = tracks[track]
        contextvector = sparseadd(contextvector, indexvector)
        tracks[track] = contextvector
logger("Done building space of {} items.".format(len(tracks)), debug)
samplesize = len(tracks) // testsampleproportion
testsample = sample(tracks.keys(), samplesize)
for t1 in testsample:
    for t2 in testsample:
        c1 = sparsecosine(tracks[t1], tracks[t2])
logger("Done testing distances for {} items against {} items.".format(len(testsample), len(tracks)),
       debug)

Thu Jan  9 13:38:22 2020 <module>(): Start building space.
Thu Jan  9 13:38:24 2020 <module>(): Done building space of 10000 items.
Thu Jan  9 13:38:24 2020 <module>(): Done testing distances for 20 items against 10000 items.
