# Explore the neighborhood of song recommendations

In [None]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
import pandas as pd
import numpy as np
import matplotlib as plt
import importlib

import mpd

In [None]:
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Change the default figure size for every plot drawn later in the notebook.
# NOTE: `plt` is the top-level `matplotlib` package here (it is imported as
# `import matplotlib as plt`), not `matplotlib.pyplot`.
plt.rcParams['figure.figsize'] = (15,10)

## Load and prep data

* Load the full data set
* Vectorize the playlists into sparse vectors
* Extract the vocabulary with tid to allow translation back to track_uri

In [None]:
# Make sure a SparkSession is bound to `spark` before it is used.
# getOrCreate() hands back the session that is already active (for example the
# one a PySpark notebook kernel provides) and only builds a new one otherwise.
spark = SparkSession.builder.getOrCreate()

# Load the playlist data through the project's mpd helper.
# NOTE(review): the meaning of the "onebig" and 1 arguments is defined inside
# mpd.load, which is not visible here -- confirm before changing them.
mpd_all = mpd.load(spark, "onebig", 1)

In [None]:
# Vectorize each playlist's track URIs into a "features" column.
# `model` exposes the vocabulary used below; `result` holds the vectors.
playlist_tracks = mpd_all.select("pid", "tracks.track_uri")
model, result = mpd.vectorizecol(playlist_tracks, "track_uri", "features")

In [None]:
# Re-import the mpd helper module so edits to its source take effect
# without restarting the notebook kernel.
importlib.reload(mpd)

In [None]:
# Turn the vectorizer's vocabulary into a DataFrame; the cells below rely on
# its `tid` and `term` columns to translate vector indices back to track URIs.
vdf = mpd.buildvocabdf(spark, model.vocabulary)

In [None]:
# Preview the first 5 rows of the vocabulary DataFrame.
vdf.show(5)

Demonstrate mapping of tracks in the vocabulary to human-readable names from the mpd dataset

In [None]:
from pyspark.sql.functions import explode

# One row per distinct (track_name, track_uri, artist_name) combination found
# in any playlist's `tracks` array.
tname = (
    mpd_all
    .select(explode("tracks").alias("tracks"))
    .select("tracks.track_name", "tracks.track_uri", "tracks.artist_name")
    .distinct()
)

In [None]:
# Attach human-readable names to the vocabulary terms and preview the first
# five rows ordered by tid.
(
    vdf
    .join(tname, vdf.term == tname.track_uri)
    .drop(vdf.term)
    .orderBy("tid")
    .show(5)
)

## Prepare data for kNN search

* Prep dataset with vector length to eliminate empty playlists from the MinHash input

In [None]:
from pyspark.ml.feature import MinHashLSH

In [None]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType


def _count_nonzeros(vec):
    """Return the number of non-zero entries stored in the vector."""
    return vec.numNonzeros()


# Column-level UDF wrapping the helper above; yields an integer column.
vectorlength = udf(_count_nonzeros, IntegerType())

In [None]:
# Annotate every playlist vector with its count of non-zero entries ("vlen").
r2 = result.withColumn("vlen", vectorlength(result.features))

In [None]:
# Keep only playlists whose vector has at least one non-zero entry.
sparsevec = r2.filter(r2["vlen"] > 0)

In [None]:
# Work on a 1% sample (without replacement, fixed seed) of the playlists.
sampled_rows = sparsevec.rdd.sample(withReplacement=False, fraction=0.01, seed=1)
sparsevec = sampled_rows.toDF()

In [None]:
# MinHash LSH estimator: reads "features", writes "hashes", 5 hash tables.
mh = MinHashLSH(
    inputCol="features",
    outputCol="hashes",
    numHashTables=5,
)

In [None]:
# Fit the MinHash LSH model on the sampled playlists.
# NOTE: this rebinds `model`, which earlier held the vectorizer model returned
# by mpd.vectorizecol; from here on `model` is the 5-table MinHashLSH model.
model = mh.fit(sparsevec)

In [None]:
# Apply the fitted MinHash model to the sampled playlists; the estimator was
# configured with outputCol="hashes".
transformA = model.transform(sparsevec)

## Explore results for full playlist match

In [None]:
# Use the feature vector of the first playlist as the query key.
feature_vectors = result.select("features").rdd.map(lambda row: row.features)
testpl = feature_vectors.take(1)[0]

In [None]:
print("Approximately searching dfA for 100 nearest neighbors of the key:")
# Query the hashed sample for the 100 playlists closest to the test playlist.
k100nn = model.approxNearestNeighbors(
    dataset=transformA,
    key=testpl,
    numNearestNeighbors=100,
)

In [None]:
# Plot the "distCol" values of the neighbors using the project helper.
# NOTE(review): the third argument (11) is presumably the number of histogram
# bins -- confirm against mpd.plothist.
mpd.plothist(k100nn, "distCol", 11)

In [None]:
# Inspect which columns the neighbor search returned.
k100nn.printSchema()

In [None]:
# Flatten the neighbors' track_uri arrays: one row per (playlist, track).
k100nntracks = k100nn.select(
    f.explode("track_uri").alias("track_uri")
)

In [None]:
# Count how often each track appears among the neighbors, most frequent first.
trackrank = (
    k100nntracks
    .select("track_uri")
    .groupBy("track_uri")
    .count()
    .orderBy(f.desc("count"))
)

In [None]:
# Confirm the ranking DataFrame's columns (track_uri and its count).
trackrank.printSchema()

In [None]:
# Re-import the mpd helper module so edits to its source take effect
# without restarting the notebook kernel.
importlib.reload(mpd)

In [None]:
# Plot the per-track frequency ranking with the project's plotting helper.
mpd.scatterplotfreq(trackrank)

## Explore results for a subset of playlist

In [None]:
# Display the query playlist's feature vector.
testpl

Sparse vectors have an [indices attribute](http://spark.apache.org/docs/2.2.0/api/python/pyspark.mllib.html#pyspark.mllib.linalg.SparseVector.indices) that lists the positions of the array elements that hold values.

In [None]:
# Display the positions of the entries stored in the query vector.
testpl.indices

Getting a subset of the playlist is easy by looking at the indices.
They are returned as a numpy array. 
[numpy has a built-in function to choose a random sample from an array](https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.random.choice.html)

In [None]:
# Pick up to 5 *distinct* track indices from the query playlist.
# np.random.choice samples with replacement by default, which can return the
# same index twice; SparseVector requires strictly increasing indices, so a
# duplicate would make the vector construction below fail.
# The size is clamped so playlists with fewer than 5 tracks still work.
rand5npa = np.random.choice(
    testpl.indices,
    min(5, len(testpl.indices)),
    replace=False,
)

Note, the machine learning libraries expect sparse vectors from the new ml.linalg package, not the mllib.linalg package.  There is a [conversion method for the old format to return as the new ML format](http://spark.apache.org/docs/2.2.0/api/python/pyspark.mllib.html#pyspark.mllib.linalg.SparseVector.asML).

In [None]:
from pyspark.ml.linalg import SparseVector 

In [None]:
# Display the sampled indices in ascending order (np.sort returns a sorted
# copy; rand5npa itself is left unchanged).
np.sort(rand5npa)

In [None]:
# Show the three ingredients of the new sparse vector: its size, the sorted
# indices, and a value of 1.0 for each selected index.
sorted_idx = np.sort(rand5npa)
unit_values = np.ones(len(rand5npa))
print(testpl.size, sorted_idx, unit_values)

Use np.sort and np.ones to help create a new sparse vector for the search

In [None]:
# Build the query vector: same dimensionality as the full playlist vector,
# with a 1.0 at each selected (sorted) index.
rand5pl = SparseVector(
    testpl.size,
    np.sort(rand5npa),
    np.ones(len(rand5npa)),
)

In [None]:
# Display the subset query vector.
rand5pl

In [None]:
# Search the hashed sample for the 100 nearest neighbors of the subset vector.
k100nn5seed = model.approxNearestNeighbors(
    dataset=transformA,
    key=rand5pl,
    numNearestNeighbors=100,
)

In [None]:
# How many neighbors did the approximate search actually return?
k100nn5seed.count()

Note we're only getting 5 candidates, so that must mean we [don't have enough candidates in the hash bucket](https://spark.apache.org/docs/2.2.0/ml-features.html#approximate-nearest-neighbor-search).  I wonder whether increasing the number of hash tables in the MinHash model will improve this.

In [None]:
# Plot the "distCol" values for the subset-seeded search.
# NOTE(review): the third argument (11) is presumably the number of histogram
# bins -- confirm against mpd.plothist.
mpd.plothist(k100nn5seed, "distCol", 11)

## Explore impact of choice of sampled playlist

I would have expected more results, not fewer, from a smaller playlist.
I'm assuming that a smaller set of tracks would match a larger collection of playlists.

This could be a luck of the draw for the random selection. Repeating the random choice selection from the playlist now gives me 11 results.

So this is challenging. How do I ensure I get enough results from the search?

I assume what is happening is that if there are tracks that are less popular in the search then I'm going to be more of an edge case playlist.

Yeah, and if I take just the top-5 most popular songs in the playlist then I get my 100 results as expected.

Randomly choose tracks from playlist. Leads to small result set for near neighbors

In [None]:
# Pick up to 5 *distinct* track indices from the query playlist.
# np.random.choice samples with replacement by default, which can return the
# same index twice; SparseVector requires strictly increasing indices, so a
# duplicate would make the vector construction below fail.
# The size is clamped so playlists with fewer than 5 tracks still work.
rand5npa = np.random.choice(
    testpl.indices,
    min(5, len(testpl.indices)),
    replace=False,
)

Choose 5 most popular songs from playlist. Leads to high number of near neighbors.

In [None]:
# Take the five smallest vocabulary indices present in the playlist.
ascending_indices = np.sort(testpl.indices)
rand5npa = ascending_indices[:5]

Choose the first 5 songs from the playlist. This is the continuation model.  You hear 5 songs and now recommend the rest.

Note, in the current construction of testpl we have lost the original playlist order since we are deriving this from the sparse vector.

Really need to go back and redefine testpl as coming from the original playlist

In [None]:
# Take the first five entries of the vector's index array.
rand5npa = testpl.indices[:5]

In [None]:
# Build the query vector: same dimensionality as the full playlist vector,
# with a 1.0 at each selected (sorted) index.
rand5pl = SparseVector(
    testpl.size,
    np.sort(rand5npa),
    np.ones(len(rand5npa)),
)

In [None]:
# Display the subset query vector.
rand5pl

In [None]:
# Second MinHash LSH estimator with twice as many hash tables (10 vs. 5).
mh10 = MinHashLSH(
    inputCol="features",
    outputCol="hashes",
    numHashTables=10,
)

In [None]:
# Fit the 10-table MinHash LSH model on the same sampled playlists.
model10 = mh10.fit(sparsevec)

In [None]:
# Hash the sampled playlists with the 10-table model (outputCol="hashes").
transformA10 = model10.transform(sparsevec)

In [None]:
# Query with the model that produced the hashes in transformA10.
# The key is hashed by the model the method is called on, so that model must
# be `model10`; the 5-table `model` would compare hashes that do not
# correspond and defeat the purpose of this 10-table experiment.
k100nn5seed10 = model10.approxNearestNeighbors(transformA10, rand5pl, 100)

In [None]:
# How many neighbors did the search against the 10-table hashes return?
k100nn5seed10.count()

In [None]:
# Plot the "distCol" values for the search against the 10-table hashes.
# NOTE(review): the third argument (11) is presumably the number of histogram
# bins -- confirm against mpd.plothist.
mpd.plothist(k100nn5seed10, "distCol", 11)

## Explore artists as clustering

In [None]:
# Vectorize each playlist's artist URIs into a "features" column.
# `amodel` exposes the artist vocabulary; `aresult` holds the vectors.
playlist_artists = mpd_all.select("pid", "tracks.artist_uri")
amodel, aresult = mpd.vectorizecol(playlist_artists, "artist_uri", "features")

In [None]:
# Turn the artist vocabulary into a DataFrame; the join below relies on its
# `tid` and `term` columns.
avdf = mpd.buildvocabdf(spark, amodel.vocabulary)

In [None]:
# Preview the first 5 rows of the artist vocabulary DataFrame.
avdf.show(5)

In [None]:
# One row per distinct (artist_uri, artist_name) pair found in any playlist.
aname = (
    mpd_all
    .select(explode("tracks").alias("tracks"))
    .select("tracks.artist_uri", "tracks.artist_name")
    .distinct()
)

In [None]:
# Attach artist names to the vocabulary terms and preview the first five rows
# ordered by tid.
(
    avdf
    .join(aname, avdf.term == aname.artist_uri)
    .drop(avdf.term)
    .orderBy("tid")
    .show(5)
)

In [None]:
# Annotate every artist vector with its count of non-zero entries ("vlen").
ar2 = aresult.withColumn("vlen", vectorlength(aresult.features))

In [None]:
# Row count before dropping empty vectors.
ar2.count()

In [None]:
# Keep only playlists whose artist vector has at least one non-zero entry.
asparsevec = ar2.filter(ar2["vlen"] > 0)

In [None]:
# Row count after dropping empty vectors.
asparsevec.count()

In [None]:
# Work on a 1% sample (without replacement, fixed seed) of the artist vectors.
sampled_artist_rows = asparsevec.rdd.sample(
    withReplacement=False, fraction=0.01, seed=1
)
asparsevec = sampled_artist_rows.toDF()

In [None]:
# MinHash LSH estimator for the artist vectors: 5 hash tables.
amh = MinHashLSH(
    inputCol="features",
    outputCol="hashes",
    numHashTables=5,
)

In [None]:
# Fit the artist MinHash LSH model on the sampled artist vectors.
amhmodel = amh.fit(asparsevec)

In [None]:
# Hash the sampled artist vectors with the artist model (outputCol="hashes").
amhtransform = amhmodel.transform(asparsevec)

In [None]:
# Use the artist feature vector of the first playlist as the query key.
artist_vectors = aresult.select("features").rdd.map(lambda row: row.features)
atestpl = artist_vectors.take(1)[0]

In [None]:
# Display the artist query vector.
atestpl

In [None]:
print("Approximately searching dfA for 100 nearest neighbors of the artist:")
# Query with the model that produced the hashes in amhtransform.
# `model` is the MinHash model fitted on *track* vectors; using it here would
# hash the artist key with unrelated hash functions, so `amhmodel` (fitted on
# the artist vectors) is the one that must run the search.
ak100nn = amhmodel.approxNearestNeighbors(amhtransform, atestpl, 100)

In [None]:
# Plot the "distCol" values of the artist-based neighbors.
# NOTE(review): the third argument (11) is presumably the number of histogram
# bins -- confirm against mpd.plothist.
mpd.plothist(ak100nn, "distCol", 11)

In [None]:
# Flatten the neighbors' artist_uri arrays: one row per (playlist, artist).
k100nnartists = ak100nn.select(
    f.explode("artist_uri").alias("artist_uri")
)

In [None]:
# Count how often each artist appears among the neighbors, most frequent first.
artistrank = (
    k100nnartists
    .select("artist_uri")
    .groupBy("artist_uri")
    .count()
    .orderBy(f.desc("count"))
)

In [None]:
# Plot the per-artist frequency ranking with the project's plotting helper.
mpd.scatterplotfreq(artistrank)

### Explore subset of playlist match with artists

In [None]:
# Display the artist query vector again before sampling from it.
atestpl

In [None]:
# Pick up to 5 distinct artist indices from the query playlist.
# With replace=False, asking for more samples than there are indices raises a
# ValueError, so the size is clamped for playlists with fewer than 5 artists.
arand5npa = np.random.choice(
    atestpl.indices,
    min(5, len(atestpl.indices)),
    replace=False,
)

In [None]:
# Display the sampled artist indices.
arand5npa

In [None]:
# Build the artist query vector: same dimensionality as the full artist
# vector, with a 1.0 at each selected (sorted) index.
arand5pl = SparseVector(
    atestpl.size,
    np.sort(arand5npa),
    np.ones(len(arand5npa)),
)

In [None]:
# Query with the model that produced the hashes in amhtransform.
# `model` is the MinHash model fitted on *track* vectors; the artist key must
# be hashed by `amhmodel`, which was fitted on the artist vectors.
ak100nn5seed = amhmodel.approxNearestNeighbors(amhtransform, arand5pl, 100)

In [None]:
# How many neighbors did the artist-subset search return?
ak100nn5seed.count()

In [None]:
# Inspect which columns the artist-subset search returned.
ak100nn5seed.printSchema()

In [None]:
# Preview the first 5 neighbor rows ordered by playlist id.
ak100nn5seed.orderBy("pid").show(5)

So the playlist name matches are less perfect than with the full playlist search, at least based on superficial name matching.

Get the sense that searching for knn with the full sample playlist by tracks got many similarly named playlists.

In [None]:
# Look up the playlist names of the neighbors and preview five of them.
(
    ak100nn5seed
    .join(mpd_all.select("pid", "name"), mpd_all.pid == ak100nn5seed.pid)
    .drop(mpd_all.pid)
    .select("pid", "name")
    .show(5)
)