# Hot song recommendations

Build song recommendation out of the k=100 nearest neighbors

In [None]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
import pandas as pd
import numpy as np
import matplotlib as plt
import importlib
from pyspark.ml.feature import Tokenizer, CountVectorizer, MinHashLSH
from pyspark.sql.types import IntegerType, StringType, ArrayType

import mpd

In [None]:
# Will allow us to embed images in the notebook
%matplotlib inline
# change default plot size
plt.rcParams['figure.figsize'] = (15,10)

## Load and prep data

* Load the full data set
* Load the picked k=100 approx Nearest Neighbor results
* Build song recommendations based on songs in the nearest playlists

In [None]:
mpd_all=mpd.load(spark, "onebig", 1)

Get the ranked popularity of songs in the mpd.

In [None]:
cv = CountVectorizer(inputCol="track_uri", outputCol="features", minDF=2, vocabSize=2000000)

In [None]:
model=cv.fit(mpd_all.select("pid", "tracks.track_uri"))

In [None]:
result=model.transform(mpd_all.select("pid", "tracks.track_uri"))

In [None]:
#model, result = mpd.vectorizecol(mpd_all.select("pid", "tracks.track_uri"), "track_uri", "features", 2000000)

In [None]:
result.printSchema()

In [None]:
result.count()

In [None]:
importlib.reload(mpd)

In [None]:
vdf = mpd.buildvocabdf(spark, model.vocabulary)

In [None]:
vdf.show(5)

In [None]:
vdf.describe("tid").show()

In [None]:
vdf.printSchema()

In [None]:
vdf.count()

Get the Hot100 playlists that match the challenge set.

In [None]:
hot100 = spark.createDataFrame(pd.read_pickle("ex-neighborpl.pkl"))

In [None]:
hot100 = spark.createDataFrame(pd.read_pickle("neighborpl.pkl"))

In [None]:
hot100.orderBy("pid").show(5)

In [None]:
# Length of an array column.  Uses Spark's built-in size() rather than a
# Python UDF around len(): the built-in is evaluated by Spark itself (no
# per-row round trip through Python) and does not raise on a null array the
# way len(None) would.  Called the same way: arraylength(df.some_array_col).
arraylength = f.size

In [None]:
h100cnt = hot100.withColumn("reclen", arraylength(hot100.recpl))

In [None]:
h100cnt.orderBy("reclen").show()

In [None]:
h100cnt.groupBy("reclen").count().orderBy("reclen").show(5)

In [None]:
h100cnt.orderBy("reclen").groupBy("reclen").count().describe("count").show()

In [None]:
h100cnt.describe("reclen").show()

We can see that most results will have gotten 100 neighbors

In [None]:
mpd.plothist(h100cnt, "reclen", 11)

In [None]:
h100 = hot100.select("pid", f.explode("recpl").alias("recpid"))

In [None]:
# Pair every challenge pid with the track list of each of its neighbor
# playlists.  The neighbor's own pid and its feature vector are dropped after
# the join, leaving the challenge pid, the neighbor id (recpid) and track_uri.
h100withtracks = (
    h100.join(result, result.pid == h100.recpid)
    .drop(result.pid)
    .drop(result.features)
    .orderBy("pid")
)

In [None]:
h100withtracks.show(5)

In [None]:
h100withtracks.count()

## Eliminate duplicates by subtracting dataframes

It's possible to [use subtraction to remove the duplicates](https://stackoverflow.com/a/42380533)

Don't actually do this here though. Save it till after all the ordering takes place, otherwise we lose duplicates and the ability to count popularity.

challengetracks = recdf.select("pid", f.explode("tracks.track_uri").alias("track"))

challengetracks.count()

challengetracks.where(challengetracks.pid == 1000000).count()

tracklist = h100withtracks.select("pid", f.explode("track_uri").alias("track"))

tracklist.count()

newtracks = tracklist.subtract(challengetracks)

newtracks.count()

## Get the ranked results of tracks from the recommended neighboring playlists

Make sure that eliminating tracks doesn't lose detail for a playlist. The order by count for pid 1000000 is different when the tracks aren't eliminated. However, the total count of tracks matches the expected difference: 646 original, less 5 given, is 641.

In [None]:
# One row per (challenge pid, track) with the number of times that track
# occurs across the pid's neighbor playlists, largest count first.
trackrank = (
    h100withtracks.select("pid", f.explode("track_uri").alias("track"))
    .groupBy("pid", "track")
    .count()
    .sort(f.desc("count"))
)

In [None]:
trackrank.orderBy("pid", f.desc("count")).show(5)

In [None]:
trackrank.where(trackrank.pid == 1000000).count()

Commented out for now, favoring elimination at the end

trackrank = newtracks.groupBy("pid","track").count().sort(f.desc("count"))

trackrank.orderBy("pid", f.desc("count")).show(5)

trackrank.where(trackrank.pid == 1000000).count()

In [None]:
trackrank.where(trackrank.pid == 1000000).orderBy(f.asc("count")).show()

In [None]:
trackrank.describe("count").show()

In [None]:
trackrank.printSchema()

### Explore a single playlist

In [None]:
testpid = 1000061

In [None]:
trackrank.where(f.col("pid") == testpid).show()

In [None]:
trackrank.where(f.col("pid") == testpid).count()

Add the global rank

In [None]:
grank=trackrank.join(vdf, trackrank.track == vdf.term).drop(vdf.term)

In [None]:
grank.printSchema()

Here is the track recommendation for one playlist, based on the popularity of the track in the neighborhood with additional sorting by global popularity. Global popularity is based on the count vectorizer, with the most popular track receiving the lowest value.

In [None]:
grank.where(f.col("pid") == testpid).orderBy(f.desc("count"), f.asc("tid")).show()

## Eliminate tracks included in the search

In [None]:
mpd_test=spark.read.json("../mpd-challenge/challenge_set.json", multiLine=True)

In [None]:
cpl=mpd_test.select(f.explode("playlists").alias("playlist"))

In [None]:
recdf=cpl.select("playlist.pid", "playlist.tracks")

In [None]:
recdf.describe("pid").show()

test a playlist

In [None]:
existingtracks = recdf.where(recdf.pid == testpid).select(f.explode("tracks.track_uri").alias("track"))

In [None]:
existingtracks.printSchema()

In [None]:
existingtracks.show()

In [None]:
existingtracks.toPandas()["track"].tolist()

In [None]:
trank = grank.where(f.col("pid") == testpid).where(~grank.track.isin(existingtracks.toPandas()["track"].tolist()))

In [None]:
trank.orderBy(f.desc("count"), f.asc("tid")).show()

## Iterate over search results and provide track list

In [None]:
def gettracks(chpl, grank, recdf):
    """Build the recommendation record for a single challenge playlist.

    Args:
        chpl: Row-like object whose ``pid`` attribute is the challenge
            playlist id.
        grank: Spark DataFrame with ``pid``, ``track``, ``count`` and ``tid``
            columns (ranked candidate tracks per playlist).
        recdf: Spark DataFrame with ``pid`` and ``tracks`` columns, where
            ``tracks.track_uri`` holds the tracks already in the playlist.

    Returns:
        dict with keys ``"pid"`` and ``"tracks"``.  ``"tracks"`` is a
        one-element list wrapping the ordered track list, so the dict can be
        turned into a single-row pandas DataFrame.
    """
    playlist_id = chpl.pid

    # Tracks that were supplied with the challenge playlist itself.
    seed_df = recdf.filter(recdf.pid == playlist_id).select(
        f.explode("tracks.track_uri").alias("track")
    )
    seed_tracks = seed_df.toPandas()["track"].tolist()

    # Candidates for this playlist, minus anything it already contains.
    not_already_present = ~grank["track"].isin(seed_tracks)
    candidates = grank.filter(f.col("pid") == playlist_id).filter(not_already_present)

    # Highest neighborhood count first; ties broken by ascending tid.
    ordered = candidates.orderBy(f.desc("count"), f.asc("tid"))
    ranked_tracks = ordered.toPandas()["track"].tolist()

    return {"pid": playlist_id, "tracks": [ranked_tracks]}


In [None]:
# Empty starting frame that already carries the "pid" and "tracks" columns.
recommended = pd.DataFrame(dict(pid=0, tracks=[]))

Don't do this. It's awfully slow.

for row in hot100.limit(10).rdd.collect():
    rec = gettracks(row, grank, recdf)
    recommended = recommended.append(pd.DataFrame(rec))

In [None]:
recommended

In [None]:
recommended["tracks"].apply(lambda x: len(x))

recommended.to_pickle("rectracks.pkl")

recommended.to_csv("rectracks.csv")

## The iterate-over-data-set approach is too damn slow

The global ranking doesn't take much time even with a few cores.

In [None]:
grank.orderBy("pid", f.desc("count"), f.asc("tid")).show()

Add monotonic id to preserve order.

In [None]:
tgrank = grank.withColumn("mid", f.monotonically_increasing_id())

In [None]:
tgrank = grank.orderBy("pid", f.desc("count"), f.asc("tid")).withColumn("mid", f.monotonically_increasing_id())

Now we have an mid that preserves the global order of our data set and can be used to sort tracks within playlists independent of nearest neighbor or global popularity counts.

In [None]:
tgrank.orderBy("mid").show()

tgrank = tgrank.orderBy("pid", f.desc("count"), f.asc("tid")).withColumn("mid", f.monotonically_increasing_id()).drop("count").drop("tid")

In [None]:
tgrank.count()

In [None]:
tgrank = tgrank.drop("count").drop("tid")

In [None]:
tgrank.printSchema()

### Eliminate duplicates by subtracting dataframes

In [None]:
challengetracks = recdf.select("pid", f.explode("tracks.track_uri").alias("track"))

In [None]:
challengetracks.printSchema()

Impart monotonic id on challenge set to create identical rows

In [None]:
# Rows of tgrank whose (pid, track) pair also appears in the challenge set.
# Only tgrank's columns are kept, so these rows are identical to their
# counterparts in tgrank (including mid).
ctracks = (
    tgrank.join(
        challengetracks,
        [tgrank.pid == challengetracks.pid, tgrank.track == challengetracks.track],
    )
    .drop(challengetracks.pid)
    .drop(challengetracks.track)
)

In [None]:
ctracks.printSchema()

In [None]:
ctracks.orderBy("pid", "mid").show(truncate=False)

In [None]:
newtracks = tgrank.subtract(ctracks)

Comparing to the above results, it's clear we've eliminated the provided tracks

In [None]:
newtracks.count()

In [None]:
newtracks.orderBy("pid", "mid").show()

Need to do the elimination sooner because this approach loses the desired sorting.

Actually can do it here if we introduce a row index and do a join with the challenge set to get the duplicates with the same row id.  Then we can subtract and have identical rows and also retain the relative ordering.

Monotonically increasing should be fine since all we care about is sorting.

### Save data 

In [None]:
# Gather each playlist's recommended tracks into one array, ordered by mid.
# groupBy shuffles the rows, so an orderBy placed before it does not fix the
# order in which collect_list receives them.  Instead, collect (mid, track)
# pairs, sort each array (structs compare on their first field, mid, first),
# and then keep only the track field of every pair.
trackrec = (
    newtracks.groupBy("pid")
    .agg(f.sort_array(f.collect_list(f.struct("mid", "track"))).alias("ranked"))
    .select("pid", f.col("ranked.track").alias("tracks"))
)

In [None]:
trackrec.printSchema()

In [None]:
trackrec.show(5)

In [None]:
trackrec = trackrec.withColumn("len", arraylength(trackrec.tracks))

In [None]:
trackrec.where(trackrec.len < 500).show()

In [None]:
trackrec.count()

In [None]:
trackrec.where(trackrec.len < 500).count()

In [None]:
# UDF that keeps at most the first 500 entries of a string array.
trunc500 = f.udf(
    lambda tracks: tracks[:500],
    returnType=ArrayType(StringType()),
)

In [None]:
cleanlist = trackrec.withColumn("first500", trunc500(trackrec.tracks))

## Format output

In [None]:
trpd = cleanlist["pid","first500"].toPandas()

In [None]:
trpd.to_csv("reclist.csv")

In [None]:
trpd.to_csv("reclist2.csv", quoting=0)

In [None]:
trpd["formatted"] = trpd["pid"].map(str) + ", " + trpd["first500"].apply(', '.join)

In [None]:
trpd["formatted"]

In [None]:
type(trpd)

In [None]:
trpd["formatted"].to_csv("recpl.csv", index=False, header=False)

In [None]:
!echo -e "team_info,main,jprorama,jpr@uab.edu\n\n" > recpl-clean.csv

Fix the output because the quoting flag doesn't work above

In [None]:
!sed -e 's/^"//' -e 's/"$//' recpl.csv >> recpl-clean.csv