# Model
Let's use word2vec to train a user intent model. Nothing too fancy.

In [None]:
import gensim
from idomaar import *
import progressbar
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
class PlaylistIterator():
    """Stream playlists from an idomaar file as flat token lists.

    Every playlist produced by the underlying ``idomaarReader`` is turned into
    one list of string tokens, two per linked object and in the order the
    objects appear: ``"track_<id>"`` followed by ``"artist_<id>"``, where the
    artist is the first entry of the object's linked artists.

    Args:
        path: Location of the idomaar file handed to ``idomaarReader``.
        verbose: ``0`` iterates silently; any positive value wraps the reader
            in a ``progressbar`` progress bar.

    Raises:
        ValueError: If ``verbose`` is negative.
    """

    def __init__(self, path, verbose=0):
        self.path = path
        if verbose < 0:
            raise ValueError("verbosity level must be above or equal to 0")
        self.verbose = verbose
        self.reader = idomaarReader(self.path, tolerant=False)

    def preload_metadata(self, filename, expected_number = 0):
        """Forward ``filename`` and ``expected_number`` to the reader's ``preload_entities``."""
        self.reader.preload_entities(filename, expected_number)

    def __iter__(self):
        """Yield one token list per playlist.

        The reader is entered for the duration of the iteration. Using a
        ``with`` block (instead of manual ``__enter__``/``__exit__`` calls)
        guarantees it is released when the consumer stops early or when an
        error propagates.

        Raises:
            Exception: Whatever building the tokens raised; the error and the
                offending playlist are printed before it is re-raised.
        """
        with self.reader:
            if self.verbose == 0:
                iterator = self.reader
            else:
                # Wrap the same reader the silent branch iterates over.
                iterator = progressbar.progressbar(self.reader)
            for playlist in iterator:
                try:
                    strings = ["track_{} artist_{}".format(x.id, x.linked.artists[0].id)
                               for x in playlist.linked.objects]
                    tokens = [item for string in strings for item in string.split(" ")]
                except Exception as e:
                    # Show which playlist was malformed before propagating.
                    print(e)
                    print(playlist)
                    raise
                yield tokens

# Session- and playlist-based similarity

In [None]:
# Build the token stream over the ThirtyMusic playlist entities file.
playliterator = PlaylistIterator("../data/ThirtyMusic/entities/playlist.idomaar")
# Metadata preloading is left disabled in this cell.
#playliterator.preload_metadata("../data/ThirtyMusic/entities/persons.idomaar", 595049)
#playliterator.preload_metadata("../data/ThirtyMusic/entities/tracks.idomaar", 5675143)

In [None]:
# Preload the track entities into the reader; the second argument is passed
# on as `expected_number`.
playliterator.preload_metadata("../data/ThirtyMusic/entities/tracks.idomaar", 5675143)

In [None]:
# Display the properties of the first artist linked to entry 0 of the
# "track" registry.
idomaarRegistry.registry["track"][0].linked.artists[0].properties

In [None]:
from itertools import islice

# Print the first three token lists as a quick sanity check. islice stops
# after the third item instead of fetching a fourth one only to discard it.
for playlist in islice(playliterator, 3):
    print(playlist)

In [None]:
import os

def w2v_model(session, out, overwrite=False, min_count=1, workers=4, size=100):
    """Load a saved Word2Vec model from ``out`` or train and save a new one.

    Args:
        session: Path of the idomaar file streamed through ``PlaylistIterator``
            to produce the training sentences.
        out: File the model is loaded from and saved to.
        overwrite: When false and ``out`` already exists, the saved model is
            returned without retraining.
        min_count: Passed unchanged to ``gensim.models.Word2Vec``.
        workers: Passed unchanged to ``gensim.models.Word2Vec``.
        size: Passed unchanged to ``gensim.models.Word2Vec``.
            NOTE(review): gensim 4 renamed this keyword to ``vector_size`` —
            confirm against the installed version.

    Returns:
        The loaded or freshly trained ``gensim.models.Word2Vec`` model.
    """
    if not overwrite and os.path.exists(out):
        return gensim.models.Word2Vec.load(out)
    playliterator = PlaylistIterator(session)
    playliterator.preload_metadata("../data/ThirtyMusic/entities/persons.idomaar", 595049)
    # Pass the expected track count, matching the other preloads of this file
    # elsewhere in the notebook (the argument was previously left out).
    playliterator.preload_metadata("../data/ThirtyMusic/entities/tracks.idomaar", 5675143)
    model = gensim.models.Word2Vec(playliterator, min_count=min_count, workers=workers, size=size)
    model.save(out)
    return model

In [None]:
# Train on the playlist entities and save to cheap_playlists.w2v;
# overwrite=True retrains even if that file already exists.
model = w2v_model("../data/ThirtyMusic/entities/playlist.idomaar", "cheap_playlists.w2v", overwrite=True, workers=8, size=100)

In [None]:
# Warning: training on the sessions file will likely exhaust your RAM.
#model = w2v_model("../data/ThirtyMusic/relations/sessions.idomaar", "cheap_sessions.w2v", overwrite=True, workers=8, size=30)

# Song data

In [None]:
import pandas as pd
from idomaar import *
import progressbar
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [None]:
!head -n5 ../data/ThirtyMusic/entities/tracks.idomaar

In [None]:
def idomaar_df(path): # TODO: chunksize
    """Read every entity of the idomaar file at ``path`` into a DataFrame.

    Each entity contributes one row holding its ``id`` followed by the
    ``MBID``, ``duration``, ``name`` and ``playcount`` attributes of its
    ``properties``. A progress bar is shown while reading.

    Returns:
        A ``pandas.DataFrame`` with columns
        ``id, MBID, duration, name, playcount``.
    """
    column_names = ["id", "MBID", "duration", "name", "playcount"]
    records = []
    with idomaarReader(path) as ier:
        for entity in progressbar.progressbar(ier):
            props = entity.properties
            records.append(
                [entity.id, props.MBID, props.duration, props.name, props.playcount]
            )
    return pd.DataFrame(records, columns=column_names)

# Load the ThirtyMusic track entities into a DataFrame.
df = idomaar_df("../data/ThirtyMusic/entities/tracks.idomaar")

In [None]:
# Preview the first rows of the DataFrame.
df.head()

In [None]:
class registry:
    """Class-level key/value store in which the first value per key wins."""

    # Shared by the class itself; findorcreate reads and writes it via `cls`.
    db = {}

    @classmethod
    def findorcreate(cls, key, val):
        """Return the value stored under ``key``, storing ``val`` first if absent.

        Args:
            key: Hashable lookup key.
            val: Value to register when ``key`` has no entry yet; ignored
                otherwise.

        Returns:
            The value stored under ``key`` after the call: the previously
            registered object if there was one, else ``val``.
        """
        return cls.db.setdefault(key, val)
#registry.findorcreate("hola","adios")

In [None]:
# Display the current contents of the registry dictionary.
registry.db