# Explore the neighborhood of song recommendations

In [None]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
import pandas as pd
import numpy as np
import matplotlib as plt
import importlib

import mpd

In [None]:
# Will allow us to embed images in the notebook
%matplotlib inline
# change default plot size
# NOTE: `plt` is the top-level `matplotlib` package here (the imports cell does
# `import matplotlib as plt`), not `matplotlib.pyplot`.
plt.rcParams['figure.figsize'] = (15,10)

## Load and prep data

* Load the full data set
* Vectorize the playlists into sparse vectors
* Extract the vocabulary with tid to allow translation back to track_uri

In [None]:
# Reuse the Spark session the kernel already provides, or create one, so that
# `spark` is defined even after a fresh kernel restart.
spark = SparkSession.builder.getOrCreate()

# Load the data set through the project's `mpd` helper module.
# NOTE(review): the meaning of "onebig" and 1 is defined by mpd.load -- confirm there.
mpd_all = mpd.load(spark, "onebig", 1)

In [None]:
# Keep only the playlist id and the track URIs of each playlist, then hand them
# to the mpd helper, which returns the fitted model and the DataFrame with the
# new "features" column.
playlist_tracks = mpd_all.select("pid", "tracks.track_uri")
model, result = mpd.vectorizecol(playlist_tracks, "track_uri", "features")

In [None]:
# Re-import the local `mpd` module so that edits made to it are picked up
# without restarting the kernel.
importlib.reload(mpd)

In [None]:
# `model` is still the model returned by mpd.vectorizecol here; it is rebound to
# the MinHash model further down, so this cell must run before that happens.
vocabulary = model.vocabulary
vdf = mpd.buildvocabdf(spark, vocabulary)

In [None]:
# Preview the first five rows of the vocabulary DataFrame.
vdf.show(5)

Demonstrate the mapping of tracks in the vocabulary to human-readable names from the mpd dataset

In [None]:
from pyspark.sql.functions import explode

# One row per track occurrence, then reduce to the distinct
# (track_name, track_uri, artist_name) combinations.
exploded_tracks = mpd_all.select(explode("tracks").alias("tracks"))
tname = exploded_tracks.select(
    "tracks.track_name",
    "tracks.track_uri",
    "tracks.artist_name",
).distinct()

In [None]:
# Attach the human-readable track information to each vocabulary entry by
# matching the vocabulary term against the track URI; the duplicate `term`
# column is dropped after the join.
named_vocab = vdf.join(tname, tname.track_uri == vdf.term).drop(vdf.term)
named_vocab.orderBy("tid").show(5)

## Prepare data for kNN search

* Prep dataset with vector length to eliminate empty playlists from the MinHash input

In [None]:
from pyspark.ml.feature import MinHashLSH

In [None]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

def _count_nonzeros(vec):
    """Return the number of non-zero entries of a vector."""
    return vec.numNonzeros()


# Spark UDF wrapping the helper above; the result is declared as an integer column.
vectorlength = udf(_count_nonzeros, IntegerType())

In [None]:
# Annotate every playlist with "vlen", the number of non-zero entries in its
# feature vector.
r2 = result.withColumn("vlen", vectorlength(result.features))

In [None]:
# Keep only the playlists whose feature vector has at least one non-zero entry.
sparsevec = r2.filter(f.col("vlen") > 0)

In [None]:
# Work on a 1% sample (without replacement, seed 1) of the non-empty playlists.
# The sample is derived from `r2` instead of from `sparsevec` itself, so
# re-running this cell does not shrink the data a second time, and
# DataFrame.sample is used directly instead of a round trip through the RDD API.
sparsevec = r2.where(r2.vlen > 0).sample(False, .01, 1)

In [None]:
# MinHash LSH estimator: reads the "features" vectors and writes the hash
# values to a new "hashes" column, using five hash tables.
mh = MinHashLSH(
    inputCol="features",
    outputCol="hashes",
    numHashTables=5,
)

In [None]:
# Fit the MinHash LSH model on the sampled playlist vectors.
# NOTE: this rebinds `model`, which until now held the model returned by
# mpd.vectorizecol; cells that read `model.vocabulary` must run before this one.
model = mh.fit(sparsevec)

In [None]:
# Apply the fitted MinHash model to the sampled playlists; the result is the
# data set that the nearest-neighbor searches below run against.
transformA = model.transform(sparsevec)

## Explore results for full playlist match

In [None]:
# Use the first playlist vector that has at least one non-zero entry as the
# search key. `result` is not filtered for empty playlists, and MinHash cannot
# hash a vector without non-zero entries, so empty vectors are skipped here.
testpl = (
    result.select("features")
    .rdd.map(lambda row: row.features)
    .filter(lambda vec: vec.numNonzeros() > 0)
    .take(1)[0]
)

In [None]:
# The message names what is actually searched (there is no `dfA` in this notebook).
print("Approximately searching the sampled playlists for the 100 nearest neighbors of the test playlist:")
# Approximate nearest-neighbor search for `testpl`, asking for 100 neighbors.
k100nn = model.approxNearestNeighbors(transformA, testpl, 100)

In [None]:
# Plot the "distCol" column of the neighbors with the mpd helper.
# NOTE(review): 11 is presumably the number of histogram bins -- confirm in mpd.plothist.
mpd.plothist(k100nn, "distCol", 11)

In [None]:
# Inspect which columns the neighbor search returned.
k100nn.printSchema()

In [None]:
# Flatten the neighbors' track lists: one row per track occurrence.
exploded_uri = f.explode("track_uri").alias("track_uri")
k100nntracks = k100nn.select(exploded_uri)

In [None]:
# Count how often each track occurs among the neighbors and rank the tracks
# from most to least frequent.
track_counts = k100nntracks.select("track_uri").groupby("track_uri").count()
trackrank = track_counts.sort(f.desc("count"))

In [None]:
# Inspect the columns of the ranked track counts.
trackrank.printSchema()

In [None]:
# Re-import the local `mpd` module again so that recent edits to its helpers
# are picked up without restarting the kernel.
importlib.reload(mpd)

In [None]:
# Plot the ranked track counts with the mpd helper.
# NOTE(review): the exact plot layout is defined in mpd.scatterplotfreq -- confirm there.
mpd.scatterplotfreq(trackrank)

## Explore results for a subset of playlist

In [None]:
# Display the test playlist vector that was used as the search key above.
testpl

Sparse vectors have an [`indices` attribute](http://spark.apache.org/docs/2.2.0/api/python/pyspark.mllib.html#pyspark.mllib.linalg.SparseVector.indices) that holds the positions of the entries with values.

In [None]:
# Display the positions of the entries that are set in the test playlist vector.
testpl.indices

Getting a subset of the playlist is easy by looking at the indices.
They are returned as a numpy array.
[numpy has a built-in function to choose a random sample from an array](https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.random.choice.html)

In [None]:
# Pick up to five *distinct* track indices from the test playlist.
# np.random.choice samples with replacement by default, so it can return the
# same index more than once; duplicate indices cannot be used to build the
# sparse search vector later on. The size is capped at the playlist length
# because sampling without replacement cannot return more items than exist.
# A fixed seed keeps the chosen subset reproducible across runs.
seed_size = min(5, len(testpl.indices))
rand5npa = np.random.RandomState(1).choice(testpl.indices, seed_size, replace=False)

Note: the machine learning libraries expect sparse vectors from the new ml.linalg package, not the mllib.linalg package.  There is a [conversion method that returns the old format as the new ML format](http://spark.apache.org/docs/2.2.0/api/python/pyspark.mllib.html#pyspark.mllib.linalg.SparseVector.asML).

In [None]:
from pyspark.ml.linalg import SparseVector 

In [None]:
# Display the sampled indices in ascending order.
np.sort(rand5npa)

In [None]:
# Show the three ingredients of the new sparse vector: its size, the sorted
# indices, and one value of 1.0 per sampled index.
sorted_indices = np.sort(rand5npa)
unit_values = np.ones(len(rand5npa))
print(testpl.size, sorted_indices, unit_values)

Use np.sort and np.ones to help create a new sparse vector for the search

In [None]:
# Build the search vector for the playlist subset: same size as the full
# playlist vector, with a 1.0 at each sampled index.
# np.unique returns the indices sorted and without duplicates; a repeated index
# in `rand5npa` would otherwise make the SparseVector constructor fail.
seed_indices = np.unique(rand5npa)
rand5pl = SparseVector(testpl.size, seed_indices, np.ones(len(seed_indices)))

In [None]:
# Display the sparse vector built from the sampled subset of the playlist.
rand5pl

In [None]:
# Same approximate nearest-neighbor search as for the full playlist (100
# neighbors requested), but keyed on the subset vector `rand5pl`.
k100nn5seed = model.approxNearestNeighbors(transformA, rand5pl, 100)

In [None]:
# Number of neighbors the search actually returned.
k100nn5seed.count()

In [None]:
# Plot the "distCol" column of the subset-search neighbors with the mpd helper.
# NOTE(review): 11 is presumably the number of histogram bins -- confirm in mpd.plothist.
mpd.plothist(k100nn5seed, "distCol", 11)