# Data split and preprocess for Naive model
by DM

In [1]:
%cd ..
%load_ext autoreload
%autoreload 2

/home/dongmin/userdata/dongmin/robot-radio-station


In [2]:
import os
from pathlib import Path
from operator import itemgetter

import math
import random

import json
import csv

import matplotlib.pyplot as plt

import torch

In [3]:
HOME = Path.home()
CWD = Path.cwd()

In [4]:
DATASET_DIR = HOME / 'userdata' / 'dongmin' / 'smp_dataset'
DATA_DIR = DATASET_DIR / 'data'
META_DIR = CWD / 'metadata'

## Import data

In [5]:
json_paths = DATA_DIR.glob('*.json')
json_paths = sorted(json_paths, key=lambda x: int(x.stem.split('.')[-1].split('-')[0]))
json_paths = list(json_paths)

len(json_paths), json_paths[:5]

(1000,
 [PosixPath('/home/dongmin/userdata/dongmin/smp_dataset/data/mpd.slice.0-999.json'),
  PosixPath('/home/dongmin/userdata/dongmin/smp_dataset/data/mpd.slice.1000-1999.json'),
  PosixPath('/home/dongmin/userdata/dongmin/smp_dataset/data/mpd.slice.2000-2999.json'),
  PosixPath('/home/dongmin/userdata/dongmin/smp_dataset/data/mpd.slice.3000-3999.json'),
  PosixPath('/home/dongmin/userdata/dongmin/smp_dataset/data/mpd.slice.4000-4999.json')])

In [6]:
playlists = []

for j_p in json_paths:
  with open(j_p, 'r') as f:
    chunk = json.load(f)
    playlists += chunk['playlists']

len(playlists), playlists[0]

(1000000,
 {'name': 'Throwbacks',
  'collaborative': 'false',
  'pid': 0,
  'modified_at': 1493424000,
  'num_tracks': 52,
  'num_albums': 47,
  'num_followers': 1,
  'tracks': [{'pos': 0,
    'artist_name': 'Missy Elliott',
    'track_uri': 'spotify:track:0UaMYEvWZi0ZqiDOoHU3YI',
    'artist_uri': 'spotify:artist:2wIVse2owClT7go1WT98tk',
    'track_name': 'Lose Control (feat. Ciara & Fat Man Scoop)',
    'album_uri': 'spotify:album:6vV5UrXcfyQD1wu4Qo2I9K',
    'duration_ms': 226863,
    'album_name': 'The Cookbook'},
   {'pos': 1,
    'artist_name': 'Britney Spears',
    'track_uri': 'spotify:track:6I9VzXrHxO9rA9A5euc8Ak',
    'artist_uri': 'spotify:artist:26dSoYclwsYLMAKD3tpOr4',
    'track_name': 'Toxic',
    'album_uri': 'spotify:album:0z7pVBGOD7HCIB7S8eLkLI',
    'duration_ms': 198800,
    'album_name': 'In The Zone'},
   {'pos': 2,
    'artist_name': 'Beyoncé',
    'track_uri': 'spotify:track:0WqIKmW4BTrj3eJFmnCKMv',
    'artist_uri': 'spotify:artist:6vWDO969PvNqNYHIOW5v0m',
    

## Preprocess

In [7]:
clean_uri = lambda uri: uri.split(':')[-1]

playlists_processed = []

for p in playlists:
  tracks = p['tracks']
  tracks = [ 
    f'{t['artist_name']}:{t['album_name']}:{t['track_name']}'
    for t in tracks 
  ]
  playlists_processed.append(tracks)

len(playlists_processed), playlists_processed[0]

(1000000,
 ['Missy Elliott:The Cookbook:Lose Control (feat. Ciara & Fat Man Scoop)',
  'Britney Spears:In The Zone:Toxic',
  'Beyoncé:Dangerously In Love (Alben für die Ewigkeit):Crazy In Love',
  'Justin Timberlake:Justified:Rock Your Body',
  "Shaggy:Hot Shot:It Wasn't Me",
  'Usher:Confessions:Yeah!',
  'Usher:Confessions:My Boo',
  'The Pussycat Dolls:PCD:Buttons',
  "Destiny's Child:The Writing's On The Wall:Say My Name",
  'OutKast:Speakerboxxx/The Love Below:Hey Ya! - Radio Mix / Club Mix',
  'Nelly Furtado:Loose:Promiscuous',
  'Jesse McCartney:Right Where You Want Me:Right Where You Want Me - Radio Edit Version',
  'Jesse McCartney:Beautiful Soul:Beautiful Soul',
  "Jesse McCartney:Departure - Recharged:Leavin'",
  'Cassie:Cassie:Me & U',
  'Omarion:21:Ice Box',
  'Avril Lavigne:Let Go:Sk8er Boi',
  'Chris Brown:Chris Brown:Run It!',
  "Beyoncé:B'Day:Check On It - feat. Bun B and Slim Thug",
  "Destiny's Child:The Writing's On The Wall:Jumpin', Jumpin'",
  "Sheryl Crow:C'Mon C

## Split
* 8:1:1 = train:valid:test

In [8]:
SAVE_DIR = META_DIR / 'smp_naive'
SAVE_DIR.mkdir(exist_ok=True, parents=True)

In [9]:
total = len(playlists_processed)

train_len = int(total * 0.8)

eval_len = total - train_len
valid_len = eval_len // 2
test_len = eval_len - valid_len

splits = [('train', train_len), ('valid', valid_len), ('test', test_len)]
train_len + valid_len + test_len == total

True

In [10]:
# shuffle
random.seed(0)
remainder = random.sample(playlists_processed, total)

for split, length in splits:
  split_data, remainder = remainder[:length], remainder[length:]
  torch.save(split_data, SAVE_DIR / f'{split}-segments.pt')

In [11]:
# save vocab
vocab = set()

for l in playlists_processed:
  vocab.update(l)

vocab = list(vocab)

len(vocab), vocab[:5]

(2256502,
 ['Brainwave-Sync:Various Wonders 2 - Relaxation & Meditation Music with Brainwave Entrainment - Alpha, Delta, Gamma & Theta Frequency:Quiet (Tranquility Mix) - Delta Frequency',
  'Circle of Dust:Disengage:Blindeye',
  'Bombay Bicycle Club:A Different Kind Of Fix:Bad Timing',
  'Fit For An Autopsy:Absolute Hope Absolute Hell:Mask Maker',
  'Alison Wonderland:Run:I Want U - DJ Hoodboi Remix'])

In [12]:
with open(SAVE_DIR / 'vocabulary.txt', 'w') as f:
  f.write('\n'.join(vocab))

### debug dataset

In [13]:
SAVE_DIR = META_DIR / 'smp_naive_debug'
SAVE_DIR.mkdir(exist_ok=True, parents=True)

In [14]:
total = len(playlists_processed)

train_len = 1000
valid_len = 500
test_len = 500

splits = [('train', train_len), ('valid', valid_len), ('test', test_len)]

In [15]:
# shuffle
random.seed(0)
remainder = random.sample(playlists_processed, total)

for split, length in splits:
  split_data, remainder = remainder[:length], remainder[length:]
  torch.save(split_data, SAVE_DIR / f'{split}-segments.pt')