# Build song recommendations

In [None]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
import pandas as pd
import numpy as np
import matplotlib as plt
import importlib
from pyspark.ml.feature import Tokenizer, CountVectorizer, MinHashLSH
from pyspark.sql.types import IntegerType, StringType, ArrayType

import mpd

In [None]:
# Render matplotlib figures inline, directly beneath the cell that creates them.
%matplotlib inline
# Change the default plot size.
# NOTE(review): `plt` is bound to the top-level `matplotlib` package by the
# import cell (not to `matplotlib.pyplot`); `rcParams` is reachable from both,
# but pyplot functions such as `plt.plot` are not available through this alias.
plt.rcParams['figure.figsize'] = (15,10)

## Load and prep data

* Load the full data set
* Vectorize the playlists into sparse vectors
* Extract the vocabulary with tid to allow translation back to track_uri

In [None]:
# `spark` is never created in this notebook. Reuse the active session when the
# kernel already provides one, otherwise start one, so that a fresh
# "Restart & Run All" does not fail with a NameError.
spark = SparkSession.builder.getOrCreate()

# Load the playlist data through the project-local `mpd` helper module.
mpd_all = mpd.load(spark, "onebig", 1)

## Build track, artist and name features

The track_uri and artist_uri columns are already lists and can be processed in their raw format.

In [None]:
# Per playlist: its pid plus the track_uri values extracted from `tracks`.
trackdf = mpd_all.select(f.col("pid"), f.col("tracks.track_uri"))

In [None]:
# Per playlist: its pid plus the artist_uri values extracted from `tracks`.
artistdf = mpd_all.select(f.col("pid"), f.col("tracks.artist_uri"))

In [None]:
# Replace missing playlist names with "" before handing them to the project
# tokenizer, then keep only pid and the `filtered` output column.
# (presumably `filtered` holds the cleaned name tokens — see mpd.canonicaltokens)
_names_raw = mpd_all.select("pid", "name").fillna({"name": ""})
namedf = mpd.canonicaltokens(_names_raw, "name", "filtered").drop("name")

In [None]:
def _merge_arrays(left, right):
    """Concatenate the values of two array columns.

    A null array is treated as empty so that a single missing value does not
    fail the whole Spark task with a TypeError.
    """
    return (left or []) + (right or [])


# UDF that appends one array<string> column to another.
mergeCols = f.udf(_merge_arrays, ArrayType(StringType()))

In [None]:
# Both frames are projections of `mpd_all`, so `trackdf.pid == artistdf.pid`
# compares a column with itself (a self-join) and the following drop-by-column
# is ambiguous. Joining on the column name is unambiguous and yields a single
# `pid` column, so no drop is needed.
featuredf = trackdf.join(artistdf, on="pid")

In [None]:
# Fuse track and artist URIs into one `tafeatures` array, then discard the inputs.
featuredf = (
    featuredf
    .withColumn("tafeatures", mergeCols(f.col("track_uri"), f.col("artist_uri")))
    .drop("track_uri", "artist_uri")
)

In [None]:
featuredf.printSchema()

In [None]:
# Attach the name tokens to each playlist. Joining on the column name avoids
# ambiguous `pid` references between the two frames and keeps a single `pid`
# column, so no drop is needed.
featuredf = featuredf.join(namedf, on="pid")

In [None]:
# Final token list per playlist: name tokens followed by track/artist URIs.
featuredf = (
    featuredf
    .withColumn("features", mergeCols(f.col("filtered"), f.col("tafeatures")))
    .drop("tafeatures", "filtered")
)

In [None]:
featuredf.printSchema()

In [None]:
featuredf.show(5)

## Build feature vector

In [None]:
# Vectorizer over the merged token lists: a token must appear in at least two
# playlists (minDF=2) and the vocabulary is capped at two million entries.
cv = CountVectorizer(
    inputCol="features",
    outputCol="featurevector",
    minDF=2,
    vocabSize=2000000,
)

In [None]:
# Learn the vocabulary from the training playlists.
model = cv.fit(dataset=featuredf)

In [None]:
# Vectorize each playlist and drop the raw token list it was built from.
_featurevec_full = model.transform(dataset=featuredf)
featurevec = _featurevec_full.drop("features")

In [None]:
# From here on the vector column goes by the name "features".
featurevec = featurevec.withColumnRenamed(existing="featurevector", new="features")

In [None]:
featurevec.show(5)

## Build LSH 

Eliminate any zero length feature vectors from the input

In [None]:
# UDF: number of non-zero entries in an ML vector column.
vectorlength = f.udf(lambda vec: vec.numNonzeros(), returnType=IntegerType())

In [None]:
# UDF: number of elements in an array column.
arraylength = f.udf(lambda arr: len(arr), returnType=IntegerType())

In [None]:
# Annotate every training playlist with its count of non-zero features.
f2 = featurevec.withColumn("vlen", vectorlength(f.col("features")))

In [None]:
# Keep only playlists whose feature vector has more than one non-zero entry.
# NOTE(review): the section text talks about removing zero-length vectors, but
# `> 1` also removes vectors with exactly one non-zero entry — confirm whether
# `> 0` was intended.
sparsevec = f2.where(f2.vlen > 1)

In [None]:
# MinHash LSH over the playlist vectors, using five hash tables.
mh = MinHashLSH(
    inputCol="features",
    outputCol="hashes",
    numHashTables=5,
)

In [None]:
# Fit the LSH model on the filtered training vectors.
mhmodel = mh.fit(dataset=sparsevec)

In [None]:
# Add the "hashes" column to every training playlist.
transform = mhmodel.transform(dataset=sparsevec)

In [None]:
transform.show(5)

In [None]:
transform.count()

In [None]:
# Record how many hash values each playlist received.
t2 = transform.withColumn("hlen", arraylength(f.col("hashes")))

In [None]:
t2.orderBy("vlen").show(5)

In [None]:
t2.orderBy("hlen").show(5)

## Load Challenge set

In [None]:
# The challenge set is one JSON document spanning many lines, hence multiLine.
mpd_test = (
    spark.read
    .option("multiLine", True)
    .json("../mpd-challenge/challenge_set.json")
)

In [None]:
# One output row per element of the top-level `playlists` array.
cpl = mpd_test.select(f.explode(f.col("playlists")).alias("playlist"))

In [None]:
# Flatten the fields needed downstream out of the `playlist` struct.
_playlist_fields = ["name", "num_holdouts", "pid", "num_tracks", "tracks", "num_samples"]
recdf = cpl.select(["playlist." + field for field in _playlist_fields])

In [None]:
# Challenge playlists: pid plus the track_uri values extracted from `tracks`.
chtracks = recdf.select(f.col("pid"), f.col("tracks.track_uri"))

In [None]:
# Challenge playlists: pid plus the artist_uri values extracted from `tracks`.
chartist = recdf.select(f.col("pid"), f.col("tracks.artist_uri"))

In [None]:
# Both frames are projections of `recdf`, so `chtracks.pid == chartist.pid`
# compares a column with itself (a self-join) and the following drop-by-column
# is ambiguous. Joining on the column name is unambiguous and yields a single
# `pid` column, so no drop is needed.
challengedf = chtracks.join(chartist, on="pid")

In [None]:
# Fuse track and artist URIs into one `features` array, then discard the inputs.
challengedf = (
    challengedf
    .withColumn("features", mergeCols("track_uri", "artist_uri"))
    .drop("track_uri", "artist_uri")
)

In [None]:
# Same name handling as for the training set: null names become "" before the
# project tokenizer runs; only pid and the `filtered` output column are kept.
_challenge_names = recdf.select("pid", "name").fillna({"name": ""})
tokedf = mpd.canonicaltokens(_challenge_names, "name", "filtered").drop("name")

In [None]:
challengedf.printSchema()

In [None]:
tokedf.printSchema()

In [None]:
# Attach the name tokens to each challenge playlist. Joining on the column name
# avoids ambiguous `pid` references between the two frames (both originate from
# `recdf`) and keeps a single `pid` column, so no drop is needed.
challengedf = challengedf.join(tokedf, on="pid")

In [None]:
challengedf.printSchema()

In [None]:
# Name tokens first, then track/artist URIs — the same order as the training features.
challengedf = (
    challengedf
    .withColumn("featurevec", mergeCols(f.col("filtered"), f.col("features")))
    .drop("filtered", "features")
)

In [None]:
# The fitted CountVectorizer reads its input from a column named "features".
challengedf = challengedf.withColumnRenamed(existing="featurevec", new="features")

In [None]:
challengedf.printSchema()

In [None]:
challengedf.show(5)

## Map challenge set into training vocab

In [None]:
# Vectorize challenge playlists with the vocabulary fitted on the training set,
# then drop the raw token list.
_challengevec_full = model.transform(dataset=challengedf)
challengevec = _challengevec_full.drop("features")

In [None]:
# Match the column name the LSH model was configured with.
challengevec = challengevec.withColumnRenamed(existing="featurevector", new="features")

In [None]:
challengevec.printSchema()

In [None]:
challengevec.show(5)

In [None]:
# Annotate every challenge playlist with its count of non-zero features.
c2 = challengevec.withColumn("vlen", vectorlength(f.col("features")))

In [None]:
c2.orderBy("vlen").show(5)

Some challenge set vectors have a length of zero.  This shouldn't cause any problems because we can just recommend top songs from the global data set for those playlists.

In [None]:
# Summary statistics of `pid` for the challenge playlists with an all-zero vector.
_zero_len = c2.filter(f.col("vlen") == 0)
_zero_len.describe("pid").show()

## Find playlist matches for one challenge set

Select the first challenge playlist to test.

In [None]:
# Mark the challenge vectors for caching; several later cells read from them.
challengevec.cache()

In [None]:
# A one-row frame to inspect a single challenge playlist.
testpl = challengevec.limit(num=1)

In [None]:
testpl.show(truncate=False)

In [None]:
# Bring the feature vector of the first challenge row back to the driver.
_first_feature_rows = challengevec.select("features").rdd.take(1)
testvec = [row.features for row in _first_feature_rows][0]

In [None]:
testvec

In [None]:
type(testvec)

In [None]:
# Bring the pid of the first challenge row back to the driver.
_first_pid_rows = challengevec.select("pid").rdd.take(1)
testpid = [row.pid for row in _first_pid_rows][0]

In [None]:
testpid

In [None]:
mh.getOutputCol()

In [None]:
mhmodel.params

In [None]:
transform.printSchema()

In [None]:
# Mark the hashed training playlists for caching; the nearest-neighbour
# queries below all search this frame.
transform.cache()

In [None]:
# Approximate 100 nearest training playlists for the test vector.
hot100 = mhmodel.approxNearestNeighbors(
    dataset=transform,
    key=testvec,
    numNearestNeighbors=100,
)

In [None]:
# hot100.explain()

In [None]:
hot100.count()

In [None]:
hot100.printSchema()

In [None]:
hot100.show(5)

In [None]:
# Materialise the neighbour pids on the driver as a plain Python list.
_hot100_pids = hot100.select("pid").toPandas()
pidlist = _hot100_pids["pid"].tolist()

In [None]:
pidlist[0:5]

In [None]:
type(pidlist)

In [None]:
# The pid list is nested in an outer list so pandas builds a single row from it.
recommend = dict(pid=testpid, recpl=[pidlist])

In [None]:
recommend

In [None]:
# Empty frame with the two result columns, used as the starting point.
testpd = pd.DataFrame(data={"pid": 0, "recpl": []})

In [None]:
testpd

In [None]:
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` produces the same
# combined frame (original row indices are kept).
pd.concat([testpd, pd.DataFrame(recommend)])

In [None]:
# Single-row frame holding one playlist's recommendation list.
recpd = pd.DataFrame(data=recommend)

In [None]:
recpd

In [None]:
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` produces the same
# combined frame (original row indices are kept).
pd.concat([recpd, pd.DataFrame(recommend)])

In [None]:
spark.createDataFrame(pd.DataFrame(recommend)).show()

## Build nearest neighbor playlists

In [None]:
def getrecommend(chpl, model, transform, numNearestNeighbors=100):
    """Find the training playlists closest to one challenge playlist.

    Args:
        chpl: Row-like object exposing ``pid`` and ``features`` (the playlist's
            feature vector).
        model: Fitted LSH model providing ``approxNearestNeighbors``.
        transform: DataFrame of hashed training playlists to search; it must
            contain a ``pid`` column.
        numNearestNeighbors: Maximum number of neighbours to request
            (defaults to 100, the value that was previously hard-coded).

    Returns:
        dict: ``{"pid": <challenge pid>, "recpl": [<list of neighbour pids>]}``.
        The pid list is wrapped in an outer list so that ``pd.DataFrame``
        builds exactly one row from the dict.
    """
    query_vec = chpl.features
    challenge_pid = chpl.pid

    # Challenge playlists whose tokens all fall outside the training vocabulary
    # end up with an all-zero vector, which MinHash cannot hash. Return an empty
    # neighbour list for them instead of failing the whole batch.
    if query_vec.numNonzeros() == 0:
        return {"pid": challenge_pid, "recpl": [[]]}

    neighbours = model.approxNearestNeighbors(transform, query_vec, numNearestNeighbors)
    neighbour_pids = neighbours.select("pid").toPandas()["pid"].tolist()

    return {"pid": challenge_pid, "recpl": [neighbour_pids]}

In [None]:
# Start from an empty frame with the two result columns.
testpd = pd.DataFrame(data={"pid": 0, "recpl": []})

In [None]:
# Build one single-row frame per challenge playlist and concatenate once.
# `DataFrame.append` was removed in pandas 2.0, and growing a frame inside the
# loop copies all accumulated rows on every iteration.
_rec_frames = [
    pd.DataFrame(getrecommend(row, mhmodel, transform))
    for row in challengevec.limit(100).rdd.collect()
]
testpd = pd.concat([testpd] + _rec_frames)

In [None]:
testpd