# Explore the neighborhood of song recommendations

In [None]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
import pandas as pd
import numpy as np
import matplotlib as plt
import importlib

import mpd

In [None]:
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Change the default (width, height) of every figure to (15, 10).
# `plt` is bound to the top-level matplotlib package in the imports above,
# which exposes the global rcParams mapping.
plt.rcParams['figure.figsize'] = (15,10)

## Load and prep data

* Load the full data set
* Vectorize the playlists into sparse vectors
* Extract the vocabulary with tid to allow translation back to track_uri

In [None]:
# Reuse the Spark session the kernel already provides, or start one if none
# exists, so that `spark` is defined when the notebook is run top to bottom.
spark = SparkSession.builder.getOrCreate()
# Load the playlist data through the project's mpd helper.
# NOTE(review): the meaning of "onebig" and 1 is defined by mpd.load — confirm there.
mpd_all=mpd.load(spark, "onebig", 1)

## Build track, artist and name features

In [None]:
# Vectorize each playlist's list of track URIs with the project's
# mpd.vectorizecol helper, which returns a (model, result) pair.
# NOTE(review): size=1188873 presumably sets the vocabulary size — confirm
# against mpd.vectorizecol and the data set's track count.
model, result = mpd.vectorizecol(mpd_all.select("pid", "tracks.track_uri"), "track_uri", "features", size=1188873 )

In [None]:
# Vectorize each playlist's list of artist URIs with the same helper.
# No size is passed here, so mpd.vectorizecol's default applies.
amodel, aresult = mpd.vectorizecol(mpd_all.select("pid", "tracks.artist_uri"), "artist_uri", "features")

In [None]:
# Turn the artist model's vocabulary into a DataFrame via the mpd helper.
# NOTE(review): presumably maps vocabulary index to artist_uri — confirm in
# mpd.buildvocabdf. avdf is not referenced again in the cells shown here.
avdf = mpd.buildvocabdf(spark, amodel.vocabulary)

In [None]:
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

In [None]:
# Tokenizer that reads the playlist "name" column and writes its tokens to "words".
tokenizer = Tokenizer(inputCol="name", outputCol="words")

In [None]:
# Token counter for an array column. The built-in `size` function is evaluated
# by Spark itself, which avoids shipping every row through a Python UDF, and it
# does not raise on a null array the way `len(None)` would.
# It is called the same way as before: countTokens(col("words")).
countTokens = f.size

In [None]:
# Tokenize every playlist name; only pid and name are passed in, and the
# tokenizer adds its "words" output column.
tokenized = tokenizer.transform(mpd_all.select("pid", "name"))

In [None]:
# Keep each name with its tokens and add the number of tokens as "tokens".
tokecount = (
    tokenized
    .select("name", "words")
    .withColumn("tokens", countTokens(col("words")))
)

In [None]:
# Preview the first 10 rows of names, tokens and token counts.
tokecount.show(10)

The top 20 playlist names match those from the stats file.  They also show how useful the playlist name is in categorizing music as being of a certain type.  These are natural clusters.

They could be seen as providing a labeled dataset for regression or other classification tasks.

In [None]:
# Count how many playlists share each token list (i.e. each tokenized name)
# and print the 30 most frequent ones.
(
    tokecount
    .groupBy("words")
    .count()
    .orderBy("count", ascending=False)
    .show(30)
)

## Build combined feature vector

In [None]:
# Track vectorization output (first 5 rows).
result.show(5)

In [None]:
# Artist vectorization output (first 5 rows).
aresult.show(5)

In [None]:
# Tokenized playlist names (first 5 rows).
tokenized.show(5)

In [None]:
# Raw playlist data (first 5 rows).
mpd_all.show(5)

This post shows how to [merge columns using an RDD built-in to combine two columns](https://stackoverflow.com/a/46635404).  So if I build a new df with all the rows joined, I should be able to accomplish that.

In [None]:
# Join the track and artist frames on pid and drop the artist side's pid so
# the key appears only once.
# NOTE(review): both inputs were built with "features" as the column argument
# to mpd.vectorizecol, so the joined frame presumably carries two "features"
# columns — confirm before selecting "features" from combo.
combo = result.join(aresult, result.pid == aresult.pid).drop(aresult.pid)

In [None]:
# Preview the joined track/artist frame (first 5 rows).
combo.show(5)

Throws a Java result size error:

import itertools
newcombo=combo.rdd.map(lambda x: [item for item in itertools.chain(x.track_uri, x.artist_uri)]).collect()

newcombo.show(5)

Combining two lists into a new list requires a udf.  The [udf has to describe its output data type](https://changhsinlee.com/pyspark-udf/).  This example comes from [here](https://stackoverflow.com/a/50333492).

In [None]:
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType,StringType
# UDF that concatenates two array<string> columns into one array<string>.
# A null array arrives in Python as None; it is treated as an empty list so a
# playlist with a missing column yields the other side's items instead of
# failing the whole job with a TypeError.
mergeCols = udf((lambda x, y: (x or []) + (y or [])), ArrayType(StringType()))

In [None]:
# Concatenate each playlist's track and artist URI lists into a single
# "global_feature" array column.
newcombo = combo.withColumn(
    "global_feature",
    mergeCols(col("track_uri"), col("artist_uri")),
)

In [None]:
# Check that global_feature was added with the declared array<string> type.
newcombo.printSchema()

In [None]:
# Preview the merged track/artist lists (first 5 rows).
newcombo.show(5)

In [None]:
# Attach the tokenized playlist name to each playlist by pid, dropping the
# duplicate pid that comes from the tokenized side.
# NOTE: this rebinds newcombo, so re-running the cell joins the tokenized
# columns in a second time.
newcombo=newcombo.join(tokenized, tokenized.pid==newcombo.pid).drop(tokenized.pid)

In [None]:
# Preview the frame after the name tokens were joined in (first 2 rows).
newcombo.show(2)

In [None]:
# Schema after the join with the tokenized names.
newcombo.printSchema()

In [None]:
# Append the playlist-name tokens to the track+artist URI list, giving one
# combined "all_features" list per playlist.
allcombo=newcombo.withColumn("all_features", mergeCols(col("global_feature"), col("words")))

In [None]:
# Show one full row; False disables truncation of long cell values.
allcombo.show(1, False)

In [None]:
# Schema including the new all_features column.
allcombo.printSchema()

In [None]:
# Reduce to the playlist id plus the combined token list, renamed to
# "features" so it can serve as the vectorizer's input column.
features = allcombo.select(
    "pid",
    allcombo["all_features"].alias("features"),
)

In [None]:
# Schema of the two-column (pid, features) frame.
features.printSchema()

In [None]:
# Preview the combined feature lists (first 3 rows).
features.show(3)

## Convert features to a vector

In [None]:
from pyspark.ml.feature import CountVectorizer

Build a count vectorizer that contains all the terms that appear in at least two playlists.

In [None]:
# Count vectorizer over the combined token lists: a term must occur in at
# least two rows (minDF=2) and the vocabulary is capped at 2,000,000 terms.
cv = CountVectorizer(
    inputCol="features",
    outputCol="featurevector",
    minDF=2,
    vocabSize=2000000,
)

In [None]:
# Fit the vectorizer's vocabulary on the combined features.
# NOTE: this rebinds `model`, which earlier held the track model returned by
# mpd.vectorizecol; that object is no longer reachable under this name.
model=cv.fit(features)

In [None]:
# Apply the fitted model, adding the "featurevector" column to each playlist.
featurevec = model.transform(features)

In [None]:
# Show one full, untruncated row to inspect the resulting vector.
featurevec.show(1, False)

A problem with the Jaccard distance and nearest neighbor is that the playlist names will only match distinct tokens, not close terms, as an edit distance would.