Determine the distribution of playlist data attributes in the recsys18 challenge dataset.

We need to know the profile of the unknown playlists so we can build a representative dataset
from the MPD training set.

The challenge data set is 10x1000 playlists.
There is no info header describing the file as in the mpd slice files, just the playlist dictionary.
Each playlist has a tracks dictionary that may be empty.
Each playlist has a 'num_samples: "x"' parameter where x is 0, 1, 5, 10, 25, or 100 to match the challenge category.
For the title-only playlists 'num_samples: "0"' and there are no tracks in the tracks dictionary.
For the title and no-title variations, a playlist either has a title attribute or it does not.
The num_samples + num_holdouts = num_tracks, which is the length of the original playlist.
The track entries list tracks as in the original mpd data set and include position.
Position is always 0 through num_samples-1 for the sequential seeds.
Position appears to be 0 through num_tracks with some number of drop-outs for the random samples.
That is, the sampling appears to drop out the tracks throughout the playlist.
This means the recommenders should have a rich embedding and are just filling in small holes.

In [None]:
import sys
import json
import re
import collections
import os
import datetime

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Running totals for the batch of playlists currently being accumulated.
total_playlists = total_tracks = total_descriptions = 0

# Distinct values seen in the current batch (URIs for tracks/artists/albums,
# raw and normalized playlist titles).
tracks, artists, albums = set(), set(), set()
titles, ntitles = set(), set()

# Frequency tables keyed by the value being counted.
title_histogram = collections.Counter()
artist_histogram = collections.Counter()
track_histogram = collections.Counter()
playlist_length_histogram = collections.Counter()
last_modified_histogram = collections.Counter()
num_edits_histogram = collections.Counter()
num_followers_histogram = collections.Counter()

# Quick mode: when enabled, process_mpd stops after a handful of files.
max_files_for_quick_processing = 5
quick = False

In [None]:
def process_mpd(path, subset_size=1000):
    """Accumulate and report statistics for the playlists stored under ``path``.

    Every file in ``path`` whose name ends in ``.json`` is read in sorted
    filename order. Each file must decode to a dict with a ``'playlists'``
    list; every playlist is passed to ``process_playlist``. After each
    ``subset_size`` playlists the running statistics are printed with
    ``show_summary`` and cleared with ``reset_stats``, so every report covers
    exactly one batch. A trailing partial batch is not reported.

    The ``'info'`` header of MPD slice files is not read, because the
    challenge file does not have one.

    Args:
        path: Directory containing the JSON files.
        subset_size: Number of playlists per report. Defaults to 1000, the
            size of one challenge category.
    """
    file_count = 0
    playlist_count = 0
    for filename in sorted(os.listdir(path)):
        if not filename.endswith(".json"):
            continue

        # The context manager closes the file even if decoding fails; the
        # encoding is pinned so non-ASCII names decode the same everywhere.
        with open(os.path.join(path, filename), encoding="utf-8") as f:
            mpd_slice = json.load(f)

        for playlist in mpd_slice['playlists']:
            process_playlist(playlist)
            playlist_count += 1
            if playlist_count % subset_size == 0:
                print("============================")
                # Label the report with the 1-based batch number. The label
                # used to come from the file counter, which never advances
                # inside a single file and therefore always printed 0.
                print("challenge subset {}".format(playlist_count // subset_size))
                show_summary()
                print("============================")
                reset_stats()
        file_count += 1

        # ">=" so quick mode stops after exactly max_files_for_quick_processing
        # files; the previous ">" let one extra file through.
        if quick and file_count >= max_files_for_quick_processing:
            break




In [None]:
def reset_stats():
    """Rebind every module-level statistics accumulator to a fresh empty value.

    Counters go back to 0, the uniqueness sets become new empty sets and the
    histograms become new empty ``collections.Counter`` objects.
    """
    global total_playlists, total_tracks, total_descriptions
    global tracks, artists, albums, titles, ntitles
    global title_histogram, artist_histogram, track_histogram
    global last_modified_histogram, num_edits_histogram
    global playlist_length_histogram, num_followers_histogram

    total_playlists = total_tracks = total_descriptions = 0

    # Each name gets its own set / Counter instance (the generators create a
    # new object per target).
    tracks, artists, albums, titles, ntitles = (set() for _ in range(5))
    (
        title_histogram,
        artist_histogram,
        track_histogram,
        last_modified_histogram,
        num_edits_histogram,
        playlist_length_histogram,
        num_followers_histogram,
    ) = (collections.Counter() for _ in range(7))

In [None]:
def process_playlist(playlist):
    """Fold one playlist dictionary into the module-level statistics.

    Reads ``playlist['num_tracks']`` and ``playlist['tracks']`` (both
    required), and the optional ``'description'`` and ``'name'`` keys. Each
    track entry must provide ``album_uri``, ``track_uri``, ``artist_uri``,
    ``track_name`` and ``artist_name``.
    """
    global total_playlists, total_tracks, total_descriptions

    total_playlists += 1

    if 'description' in playlist:
        total_descriptions += 1

    # Title statistics are only collected when the playlist has a 'name' key.
    if 'name' in playlist:
        raw_title = playlist['name']
        titles.add(raw_title)
        normalized_title = normalize_name(raw_title)
        ntitles.add(normalized_title)
        title_histogram[normalized_title] += 1

    # Only the length histogram is fed here; the last-modified, num-edits and
    # num-followers histograms are not populated by this function.
    playlist_length_histogram[playlist['num_tracks']] += 1

    for entry in playlist['tracks']:
        total_tracks += 1
        albums.add(entry['album_uri'])
        tracks.add(entry['track_uri'])
        artists.add(entry['artist_uri'])

        # Tracks are tallied under "<track name> by <artist name>".
        track_label = entry['track_name'] + " by " + entry['artist_name']
        artist_histogram[entry['artist_name']] += 1
        track_histogram[track_label] += 1



In [None]:
def process_info(_):
    """Accept one argument and do nothing.

    The only reference to this function in the notebook is a commented-out
    call in ``process_mpd``.
    """
    return None



In [None]:
def normalize_name(name):
    """Return ``name`` lower-cased, with listed punctuation blanked and whitespace collapsed.

    Each punctuation character in the pattern below is replaced by a space,
    then every whitespace run becomes a single space and the ends are
    stripped.
    """
    lowered = name.lower()
    without_punctuation = re.sub(r"[.,\/#!$%\^\*;:{}=\_`~()@]", ' ', lowered)
    return re.sub(r'\s+', ' ', without_punctuation).strip()


def to_date(epoch):
    """Format a POSIX timestamp as ``YYYY-MM-DD`` in the machine's local time zone."""
    moment = datetime.datetime.fromtimestamp(epoch)
    return moment.strftime("%Y-%m-%d")



In [None]:
def _print_histogram(heading, histogram, row_format, key_transform=None):
    """Print a blank line, ``heading``, then the 20 most common entries of ``histogram``.

    Each row is ``row_format % (count, key)``; when ``key_transform`` is given
    the key is passed through it before formatting.
    """
    print()
    print(heading)
    for key, count in histogram.most_common(20):
        shown = key if key_transform is None else key_transform(key)
        print(row_format % (count, shown))


def show_summary():
    """Print the statistics accumulated in the module-level counters.

    Output is the scalar totals followed by the top-20 rows of each
    histogram. Reads the module-level accumulators only; nothing is modified.
    Histograms that were never fed print their heading with no rows.
    """
    print()
    print("number of playlists", total_playlists)
    print("number of tracks", total_tracks)
    print("number of unique tracks", len(tracks))
    print("number of unique albums", len(albums))
    print("number of unique artists", len(artists))
    print("number of unique titles", len(titles))
    print("number of playlists with descriptions", total_descriptions)
    print("number of unique normalized titles", len(ntitles))
    # Guard the mean: with no playlists accumulated (e.g. right after
    # reset_stats) the division would raise ZeroDivisionError.
    avg_length = float(total_tracks) / total_playlists if total_playlists else 0.0
    print("avg playlist length", avg_length)

    _print_histogram("top playlist titles", title_histogram, "%7d %s")
    _print_histogram("top tracks", track_histogram, "%7d %s")
    _print_histogram("top artists", artist_histogram, "%7d %s")
    _print_histogram("numedits histogram", num_edits_histogram, "%7d %d")
    # Keys of this histogram are epoch timestamps, shown as dates.
    _print_histogram("last modified histogram", last_modified_histogram,
                     "%7d %s", to_date)
    _print_histogram("playlist length histogram", playlist_length_histogram,
                     "%7d %d")
    _print_histogram("num followers histogram", num_followers_histogram,
                     "%7d %d")



In [None]:
# Start from empty accumulators, then walk the .json files under ./data/;
# process_mpd prints one summary per 1000 playlists and resets in between.
reset_stats()
process_mpd("./data/")

The above processing is not very useful because I have to write new code for each statistic I want to gather. It would be better to have a data structure to query.

## Use Pandas to Explore Challenge Structure

Pandas provides lots of convenience routines and a data frame that is easily constructed and queried.

Follow this recipe to load the data and split the results into a table for playlists and tracks.
https://towardsdatascience.com/how-to-convert-json-into-a-pandas-dataframe-100b2ae1e0d8

In [None]:
# load data using Python JSON module
# Parse the challenge file into plain Python objects. The encoding is pinned
# to UTF-8 so non-ASCII track/artist names decode the same on every platform.
with open('data/challenge_set.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [None]:
# Flatten data
# Build the playlist table: one row per entry of data['playlists'].
playlists = pd.json_normalize(data, record_path=['playlists'])

In [None]:
# Render the per-playlist table to check which columns the flattening produced.
playlists

In [None]:
# Build the track table: one row per entry of each playlist's 'tracks' list,
# with the owning playlist's 'pid' carried along as a column.
# NOTE: this rebinds the global `tracks`, which the counter code earlier in
# the notebook uses as a set (process_playlist calls tracks.add). Call
# reset_stats() before re-running process_mpd after this cell.
tracks = pd.json_normalize(data['playlists'], record_path=['tracks'], meta=['pid'])

In [None]:
# Render the per-track table; each row carries the pid of its playlist.
tracks

## Gather stats on each challenge subtype

### Title Only

The source playlist lengths of the title-only seeds range from 10 to 50 tracks, with each length represented fairly evenly.  

In [None]:
# Histogram of original playlist length (num_tracks) for rows 0-999.
# 41 bins: presumably one per integer length in the 10..50 range noted above.
playlists[0:1000].hist( column="num_tracks", bins=41)

In [None]:
# Original playlist lengths of rows 0-999 in ascending order.
playlists[0:1000].num_tracks.sort_values()

In [None]:
# Sanity check: number of rows in the slice (sorting does not change the count).
len(playlists[0:1000].num_tracks.sort_values())

In [None]:
# Distinct original playlist lengths, ascending. The slice ends at 1000
# (exclusive) so all 1000 title-only rows are included, like the other cells
# in this section; 0:999 silently dropped row 999.
np.sort(playlists[0:1000].num_tracks.unique())

In [None]:
# Number of distinct original playlist lengths among rows 0-999.
len(np.sort(playlists[0:1000].num_tracks.unique()))

In [None]:
# Count rows 0-999 per num_tracks interval; bins=40 asks value_counts for 40
# equal-width intervals (note the plot above uses 41 bins).
histvals = playlists[0:1000].num_tracks.value_counts(bins=40) #.plot(kind='hist')

In [None]:
# Show the per-interval counts.
histvals

In [None]:
# NOTE: this plots a histogram of the values held in histvals (the
# per-interval counts), i.e. how many intervals share a given count -- it is
# not a histogram of playlist lengths.
histvals.hist(bins=40)

### 5 Seed with Title

In [None]:
# Show row 1000, the first row of this subset, to inspect its fields.
playlists[1000:1001]

In [None]:
# Histogram of original playlist length (num_tracks) for rows 1000-1999, 100 bins.
playlists[1000:2000].hist( column="num_tracks", bins=100)

In [None]:
# Longest original playlist among rows 1000-1999.
playlists[1000:2000].num_tracks.max()

### 5 Seed without Title

In [None]:
# Show row 2000, the first row of this subset, to inspect its fields.
playlists[2000:2001]

In [None]:
# Histogram of original playlist length (num_tracks) for rows 2000-2999, 100 bins.
playlists[2000:3000].hist( column="num_tracks", bins=100)

In [None]:
# Longest original playlist among rows 2000-2999.
playlists[2000:3000].num_tracks.max()

### 10 Seed with Title

In [None]:
# Show row 3000, the first row of this subset, to inspect its fields.
playlists[3000:3001]

In [None]:
# Histogram of original playlist length (num_tracks) for rows 3000-3999, 100 bins.
playlists[3000:4000].hist( column="num_tracks", bins=100)

### 10 Seed without Title

In [None]:
# Show row 4000, the first row of this subset, to inspect its fields.
playlists[4000:4001]

In [None]:
# Histogram of original playlist length (num_tracks) for rows 4000-4999, 100 bins.
playlists[4000:5000].hist( column="num_tracks", bins=100)

### 25 Seed with Title

In [None]:
# Show row 5000, the first row of this subset, to inspect its fields.
playlists[5000:5001]

In [None]:
# First seed tracks of playlist 1000001, to inspect their positions.
# NOTE(review): the pid is hard-coded; presumably it was copied from the row
# displayed above -- confirm it belongs to this subset.
tracks[tracks.pid == 1000001].head()

In [None]:
# Histogram of original playlist length (num_tracks) for rows 5000-5999, 100 bins.
playlists[5000:6000].hist( column="num_tracks", bins=100)

### 25 Rand with Title

In [None]:
# Show row 6000, the first row of this subset, to inspect its fields.
playlists[6000:6001]

In [None]:
# First seed tracks of playlist 1007147, to inspect their positions.
# NOTE(review): hard-coded pid, presumably taken from the row displayed
# above -- confirm.
tracks[tracks.pid == 1007147].head()

In [None]:
# Histogram of original playlist length (num_tracks) for rows 6000-6999, 100 bins.
playlists[6000:7000].hist( column="num_tracks", bins=100)

### 100 Seed with Title

In [None]:
# Show row 7000, the first row of this subset, to inspect its fields.
playlists[7000:7001]

In [None]:
# First seed tracks of playlist 1010382, to inspect their positions.
# NOTE(review): hard-coded pid, presumably taken from the row displayed
# above -- confirm.
tracks[tracks.pid == 1010382].head()

In [None]:
# Histogram of original playlist length (num_tracks) for rows 7000-7999, 100 bins.
playlists[7000:8000].hist( column="num_tracks", bins=100)

### 100 Rand with Title

In [None]:
# Show row 8000, the first row of this subset, to inspect its fields.
playlists[8000:8001]

In [None]:
# First seed tracks of playlist 1018569, to inspect their positions.
# NOTE(review): hard-coded pid, presumably taken from the row displayed
# above -- confirm.
tracks[tracks.pid == 1018569].head()

In [None]:
# Histogram of original playlist length (num_tracks) for rows 8000-8999.
# Slice ends are exclusive, so the end must be 9000 to include row 8999.
playlists[8000:9000].hist( column="num_tracks", bins=100)

### 1 Seed with Title

In [None]:
# Show row 9000, the first row of this subset, to inspect its fields.
playlists[9000:9001]

In [None]:
# Histogram of original playlist length (num_tracks) for rows 9000-9999.
# Slice ends are exclusive, so the end must be 10000 to include row 9999.
playlists[9000:10000].hist( column="num_tracks", bins=100)

In [None]:
# Display the whole subset, rows 9000-9999 (slice end is exclusive, so 10000
# is needed to include the last row).
playlists[9000:10000]

### Total num_tracks distribution across all challenge playlists

In [None]:
# Histogram of original playlist length (num_tracks) over rows 0-9999, 240 bins.
playlists[0:10000].hist( column="num_tracks", bins=240)

In [None]:
# Distinct original playlist lengths over rows 0-9999, ascending.
np.sort(playlists[0:10000].num_tracks.unique())

In [None]:
# Number of distinct original playlist lengths over rows 0-9999.
len(np.sort(playlists[0:10000].num_tracks.unique()))

In [None]:
# How many of rows 0-9999 have each original playlist length, most frequent first.
playlists[0:10000].num_tracks.value_counts()

## Conclusions

- the distribution for 0-seed, 5-seed-title, and 1-seed are all unique.
- the distribution for 5-notitle, 10-seed, and 10-notitle are roughly the same.
- the distribution for 25-seed and 25-rand are the same
- the distribution for 100-seed and 100-rand are the same

need to show the histograms in a 5x2 image plot.

But I can use these counts and the min/max range of playlist lengths in each category to select random playlists.

Rather than making this super difficult, I could just sum up all the distributions and select an equivalent count for each num_tracks.
Then I can split those up according to the overall distribution.

can also compare how the challenge distribution compares to the entire mpd.

Now that I have the challenge data set analyzed, I need to get back to parsing the full mpd.
How best to pull out my 10k test set from the 1mil list?

I can use the random filter method I used before to get a much smaller training set.
But this is a bit different: I want to keep selecting randomly from the values that appear in the challenge set.

Looking at the unique num_tracks, each num_tracks from 10-250 is represented,
just at different selection densities.
So if I just randomly select according to the counts of the distribution I should be good.


