# Debug Challenge set parsing

In [None]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
import pandas as pd
import numpy as np
import matplotlib as plt
import importlib
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.types import IntegerType

import mpd

In [None]:
# Will allow us to embed images in the notebook
%matplotlib inline
# change default plot size
# NOTE(review): `plt` is bound by the imports cell to the top-level `matplotlib`
# package (not `matplotlib.pyplot`); `rcParams` is reached through that alias.
plt.rcParams['figure.figsize'] = (15,10)

### Load challenge data set

In [None]:
# `spark` is never created in this notebook, so the cell only works in kernels
# that pre-define it.  getOrCreate() returns the already-running session when
# there is one and otherwise starts a new one.
spark = SparkSession.builder.getOrCreate()

# multiLine=True lets the reader parse a JSON document that spans several lines
# instead of expecting one JSON record per line.
mpd_test = spark.read.json("../mpd-challenge/challenge_set.json", multiLine=True)

In [None]:
mpd_test.printSchema()

In [None]:
# One output row per element of the `playlists` column, exposed as `playlist`
cpl = mpd_test.select(
    f.explode(f.col("playlists")).alias("playlist")
)

In [None]:
cpl.printSchema()

In [None]:
cpl.show(5)

In [None]:
# Lift the fields of the `playlist` struct into top-level columns
recdf = cpl.select(
    "playlist.name",
    "playlist.num_holdouts",
    "playlist.pid",
    "playlist.num_tracks",
    "playlist.tracks",
    "playlist.num_samples",
)

In [None]:
recdf.printSchema()

In [None]:
recdf.select("pid", recdf.tracks.artist_uri, recdf.tracks.track_uri).show(5)

In [None]:
recdf.select("pid", "name", f.explode("tracks")).show()

In [None]:
def _token_count(wordlist):
    """Return the number of entries in *wordlist*."""
    return len(wordlist)


# Spark UDF wrapping the counter; its result column is declared as integer
countTokens = f.udf(_token_count, IntegerType())

In [None]:
# Playlist id plus the `track_uri` field taken from its `tracks` column
chtracks = recdf.select(f.col("pid"), f.col("tracks.track_uri"))

In [None]:
# Playlist id plus the `artist_uri` field taken from its `tracks` column
chartist = recdf.select(f.col("pid"), f.col("tracks.artist_uri"))

In [None]:
# Join on the shared key by name.  `chtracks` and `chartist` are both
# projections of `recdf`, so `chtracks.pid == chartist.pid` refers to the same
# underlying column on both sides, and dropping `chartist.pid` afterwards
# depends on how Spark disambiguates the self-join.  A name-based equi-join
# keeps a single `pid` column, so no follow-up drop is needed.
challengedf = chtracks.join(chartist, on="pid")

In [None]:
challengedf.show()

In [None]:
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType,StringType


def _concat_arrays(x, y):
    """Concatenate two lists, treating a missing (None) list as empty."""
    # A null array column arrives here as None, and `None + list` raises
    # TypeError inside the worker, failing the whole job.
    return (x or []) + (y or [])


# UDF that merges two array<string> columns into a single array<string>
mergeCols = udf(_concat_arrays, ArrayType(StringType()))

In [None]:
# Fold the two URI arrays into one `features` array and discard the originals
challengedf = (
    challengedf
    .withColumn("features", mergeCols("track_uri", "artist_uri"))
    .drop("track_uri", "artist_uri")
)

In [None]:
challengedf.show(5)

In [None]:
# Playlist id and name only
chpname = recdf.select(["pid", "name"])

In [None]:
recdf.select("name").count()

In [None]:
chpname.describe("name").show()

In [None]:
# Tokenizer that reads the `name` column and writes its tokens to `words`
chtokenizer = Tokenizer().setInputCol("name").setOutputCol("words")

In [None]:
# Tokenize the names straight from a (pid, name) projection of `recdf`
chtokenized = chtokenizer.transform(
    recdf.select(["pid", "name"])
)

In [None]:
# Same tokenization, this time fed from `chpname`.  Per the traceback quoted
# below, the failure surfaces when rows are materialized (the `show` call).
chtokenized = chtokenizer.transform(chpname)

In [None]:
chtokenized.show(5)

### Debug NullPointerException in transform on name column

Any attempt to work with the full set of playlist names results in a [NullPointerException](https://stackoverflow.com/a/41659622) when the transform method is called to generate the data, e.g.
```
chtokecount.select("tokens").show(10)
...
Py4JJavaError                             Traceback (most recent call last)
<ipython-input-46-83c9f75e1c45> in <module>()
----> 1 chtokecount.select("tokens").show(10)
...
Py4JJavaError: An error occurred while calling o287.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 30.0 failed 4 times, most recent failure: Lost task 0.3 in stage 30.0 (TID 38, 172.20.201.103, executor 2): org.apache.spark.SparkException: Failed to execute user defined function($anonfun$createTransformFunc$1: (string) => array<string>)
...
Caused by: java.lang.NullPointerException
	at org.apache.spark.ml.feature.Tokenizer$$anonfun$createTransformFunc$1.apply(Tokenizer.scala:39)
	at org.apache.spark.ml.feature.Tokenizer$$anonfun$createTransformFunc$1.apply(Tokenizer.scala:39)
	... 15 more
```

The playlist dataset clearly contains all expected entries

In [None]:
chtokenized.count()

But 2000 of the entries have no name field at all. This results in [a null value for the resulting name field](https://stackoverflow.com/a/44631639), which in turn causes the Tokenizer transform to the words column to fail.

In [None]:
chtokenized.where(chtokenized.name.isNull()).count()

In [None]:
chtokenized.where(chtokenized.name.isNotNull()).count()

In [None]:
chtokenized.where(chtokenized.name == "").count()

This is fixed by [replacing null values when loading the data](http://spark.apache.org/docs/2.1.0/api/python/pyspark.sql.html#pyspark.sql.DataFrameNaFunctions.fill) with the fillna() function.

In [None]:
# Replace null playlist names with the empty string so the Tokenizer can run
chpname = recdf.select("pid", "name").na.fill({"name": ""})

In [None]:
# Re-run the tokenizer on the names whose nulls were replaced with ""
chtokenized = chtokenizer.transform(chpname)

Now there are no longer any null values in the name

In [None]:
chtokenized.where(chtokenized.name.isNull()).count()

In [None]:
chtokenized.where(chtokenized.name.isNotNull()).count()

## Explore frequent items in playlist names

This was originally part of debugging the null pointer exception, but the results are interesting to view in their own right.

In [None]:
# Frequent values of the `name` column, returned as a DataFrame
freq = chtokenized.stat.freqItems(["name"])

In [None]:
type(freq)

In [None]:
freq.printSchema()

In [None]:
freq.select(f.explode('name_freqItems')).show()

In [None]:
freq.select(f.explode('name_freqItems')).count()

In [None]:
countTokens

In [None]:
# Keep name and tokens, then add the token count per playlist name
chtokecount = (
    chtokenized
    .select("name", "words")
    .withColumn("tokens", countTokens(f.col("words")))
)

In [None]:
# Same token count, but keeping every column of `chtokenized`
chtokecount = chtokenized.withColumn(
    "tokens",
    countTokens(f.col("words")),
)

In [None]:
chtokecount.explain(True)

Interestingly, the longest playlist names are actually simple words spelled with spaces in between the letters to make them easier to read in the UI.

In [None]:
chtokecount.select("words","tokens").orderBy(f.desc("tokens")).show(10)

This should be fixable with a regex tokenizer that looks for individual letters separated by spaces.  Or a clean up script for the input values.

## Build full feature vectors

In [None]:
# Join on the key by name.  `challengedf` and `chtokenized` both derive from
# `recdf`, so `challengedf.pid == chtokenized.pid` and the subsequent
# `drop(chtokenized.pid)` use ambiguous column references.  A name-based
# equi-join returns a single `pid` column and needs no drop.
fullfeature = challengedf.join(chtokenized, on="pid")

In [None]:
fullfeature.printSchema()

In [None]:
# Merge URI features and name tokens into `all_features`; drop the inputs
chfeatures = (
    fullfeature
    .withColumn("all_features", mergeCols("features", "words"))
    .drop("words", "name", "features")
)

In [None]:
chfeatures

In [None]:
chfeatures.count()

In [None]:
chfeatures.show(5)

## Create name cleanup

In [None]:
chpname.show()

## Clean playlist space pretty print

Collapse easy-read printing of playlist names like "t h r o w b a c k s" into a single word.  The first replacement pattern matches two single characters separated by spaces.  The second pattern catches sequences not evenly divisible by four.  The simpler single pattern " \w " is not used because it would match phrases like "sing a song", which should remain three words.  Stop words will be removed below.

In [None]:
# Raw strings keep the regex backslashes literal; in a normal string "\w" is
# an invalid escape sequence that Python warns about.
newname = chpname.withColumn(
    "cleanname",
    f.regexp_replace(
        # First pass: " x y " (two single word characters, each bounded by
        # spaces) is collapsed to "xy".  The name is trimmed of surrounding
        # spaces first.
        f.regexp_replace(f.trim(f.col("name")), r" (\w) (\w) ", "$1$2"),
        # Second pass: a trailing "x y z" at the end of the string is
        # collapsed to "xyz".
        r"(\w) (\w) (\w)$",
        "$1$2$3",
    ),
)

In [None]:
newname.printSchema()

In [None]:
# Tokenizer over the collapsed names: reads `cleanname`, writes `words`
newtokenizer = Tokenizer().setInputCol("cleanname").setOutputCol("words")

In [None]:
# Tokenize the `cleanname` column of `newname` into `words`
chtokenized = newtokenizer.transform(newname)

In [None]:
chtokenized.printSchema()

In [None]:
# Token count per cleaned playlist name
chtokecount = chtokenized.withColumn(
    "tokens",
    countTokens(f.col("words")),
)

In [None]:
chtokecount.select("name", "cleanname", "words","tokens").orderBy(f.desc("tokens")).show(10)

## Remove stop words

In [None]:
from pyspark.ml.feature import StopWordsRemover

In [None]:
# Stop-word filter: reads the `words` column and writes `filtered`
stopwordremover = StopWordsRemover().setInputCol("words").setOutputCol("filtered")

In [None]:
# Apply the stop-word remover: adds `filtered`, derived from the `words` column
canonicalname = stopwordremover.transform(chtokenized)

In [None]:
canonicalname.show(5)

In [None]:
# Token count after stop-word removal
chtokecount = canonicalname.withColumn(
    "tokens",
    countTokens(f.col("filtered")),
)

In [None]:
chtokecount.select("name", "cleanname", "filtered","tokens").orderBy(f.desc("tokens")).show(20)

### Test canonical tokenizer

In [None]:
importlib.reload(mpd)

In [None]:
# Run the project's canonical tokenizer on the null-free (pid, name) frame.
# NOTE(review): presumably reads `name` and writes `filtered` — the following
# cells read a `filtered` column from the result; confirm against `mpd`.
tokedf = mpd.canonicaltokens(
    recdf.select("pid", "name").na.fill({"name": ""}),
    "name",
    "filtered",
)

In [None]:
tokedf.show()

In [None]:
# Token count on the canonical tokenizer output
chtokecount = tokedf.withColumn(
    "tokens",
    countTokens(f.col("filtered")),
)

In [None]:
chtokecount.select("name", "filtered","tokens").orderBy(f.desc("tokens")).show(20)