In [1]:
import os
# Pin OpenBLAS and OpenMP to a single thread each. This runs before the
# numeric libraries are imported in the next cell.
# NOTE(review): presumably meant to avoid oversubscription with the
# multi-threaded fit/predict calls further down -- confirm.
os.environ.update({
    'OPENBLAS_NUM_THREADS': '1',
    'OMP_NUM_THREADS': '1',
})

In [2]:
import os
import json
import joblib

import numpy as np
import pandas as pd
import scipy.sparse as sp
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer

from lightfm import LightFM

In [3]:
# Load the track table and the playlist / playlist-info tables (regular and
# test variants) from their HDF files, in the same order as the names below.
(
    df_tracks,
    df_playlists,
    df_playlists_info,
    df_playlists_test,
    df_playlists_test_info,
) = map(
    pd.read_hdf,
    (
        'df_data/df_tracks.hdf',
        'df_data/df_playlists.hdf',
        'df_data/df_playlists_info.hdf',
        'df_data/df_playlists_test.hdf',
        'df_data/df_playlists_test_info.hdf',
    ),
)

In [4]:
# Train / validation interaction frames; later cells group both by `pid`
# and collect their `tid` values.
train, val = (
    pd.read_hdf(path)
    for path in ('df_data/train.hdf', 'df_data/val1.hdf')
)
# Playlist ids that make up the validation split.
val1_pids = joblib.load('df_data/val1_pids.pkl')

In [5]:
def _tid_sets_by_pid(interactions):
    """Return a dict mapping each `pid` in `interactions` to the set of its `tid` values."""
    return interactions.groupby('pid')['tid'].apply(set).to_dict()


# Tracks already present in each playlist (train) and held-out tracks (val).
user_seen = _tid_sets_by_pid(train)
val_tracks = _tid_sets_by_pid(val)

In [6]:
# Matrix dimensions: largest id + 1, so that pid / tid values can be used
# directly as row / column indices of the interaction matrix.
# NOTE(review): num_playlists is derived from the *test* info table only;
# this assumes test pids are the largest pids overall -- confirm.
config = dict(
    num_playlists=df_playlists_test_info['pid'].max() + 1,
    num_tracks=df_tracks['tid'].max() + 1,
)

In [7]:
# Validation playlists that have no rows at all in `train`.
zeros_pids = np.array([*set(val1_pids).difference(train['pid'].unique())])

In [8]:
# At most 1000 of the remaining validation playlists (the ones that do have
# rows in `train`). The slice is taken from a set-derived list, so which
# playlists are kept follows the set's iteration order.
no_zeros_pids = np.array([*set(val1_pids).difference(zeros_pids)][:1000])

In [9]:
# All playlists to score during evaluation: the ones absent from train
# first, then the sampled ones that are present in train.
target_pids = np.concatenate((zeros_pids, no_zeros_pids))

In [10]:
# Playlist names indexed by pid, taken from the test and the regular info tables.
playlist_name1, playlist_name2 = (
    info.set_index('pid')['name']
    for info in (df_playlists_test_info, df_playlists_info)
)
# One name per pid in 0..num_playlists-1; pids without a name get ''.
playlist_name = (
    pd.concat([playlist_name1, playlist_name2])
    .sort_index()
    .reindex(np.arange(config['num_playlists']))
    .fillna('')
)

In [11]:
# Bag-of-words token counts of the playlist names, with the vocabulary
# capped at 20000 terms. Row i corresponds to pid i because `playlist_name`
# was reindexed to 0..num_playlists-1.
vectorizer = CountVectorizer(max_features=20_000)
user_features = vectorizer.fit_transform(playlist_name)

In [12]:
# Per-playlist feature matrix: an identity block (one indicator column per
# playlist) followed by the bag-of-words counts of the playlist name.
# The text block is rebuilt from the fitted `vectorizer` instead of reading
# `user_features` back in, so re-running this cell cannot stack another
# identity block onto an already-augmented matrix.
user_features = sp.hstack([
    sp.eye(config['num_playlists']),
    vectorizer.transform(playlist_name),
])

In [13]:
# Where the best model found during training is pickled (and reloaded from).
config.update(model_path='models/lightfm_model_text.pkl')

In [14]:
# Sparse playlist x track interaction matrix: one entry of 1.0 at
# (pid, tid) for every row of `train`.
X_train = sp.coo_matrix(
    (np.ones(train.shape[0]), (train['pid'], train['tid'])),
    shape=(config['num_playlists'], config['num_tracks']),
)

In [15]:
# LightFM model trained with the WARP loss; only the user-feature side is
# regularised (user_alpha). random_state is fixed to 1.
model = LightFM(
    loss='warp',
    no_components=200,
    max_sampled=400,
    learning_rate=0.03,
    user_alpha=1e-5,
    random_state=1,
)

In [16]:
def _mean_hit_rate(pids, recommendations, holdout, seen):
    """Average, over `pids`, the fraction of held-out tracks that were recovered.

    For each playlist the candidates in ``recommendations[pid][0]`` are taken
    in order, tracks found in ``seen[pid]`` are dropped, and the list is cut
    to as many items as the playlist has held-out tracks. The playlist's
    score is (number of kept candidates that are held-out) / (number of
    held-out tracks).

    Parameters
    ----------
    pids : iterable of playlist ids to evaluate.
    recommendations : mapping pid -> sequence whose element 0 is the ordered
        candidate track ids.
    holdout : mapping pid -> set of held-out track ids.
    seen : mapping pid -> set of track ids already in the playlist; playlists
        missing from it are treated as having seen nothing.

    Returns
    -------
    Mean of the per-playlist scores (``np.mean``).
    """
    rates = []
    for pid in pids:
        relevant = holdout[pid]
        # Looked up once per playlist rather than once per candidate.
        known = seen.get(pid, set())
        picks = [tid for tid in recommendations[pid][0] if tid not in known][:len(relevant)]
        hits = np.sum([tid in relevant for tid in picks])
        rates.append(hits / len(relevant))
    return np.mean(rates)


best_score = 0

# 10 rounds of 5 epochs each; after every round the model is scored on the
# validation playlists and kept on disk if it beats the best score so far.
for _round in range(10):

    model.fit_partial(X_train, epochs=5, num_threads=50, user_features=user_features)

    # NOTE(review): batch_setup / batch_predict / batch_cleanup appear to come
    # from a LightFM fork -- confirm the installed package provides them.
    model.batch_setup(
        item_chunks={0: np.arange(config['num_tracks'])},
        n_process=50,
        user_features=user_features,
    )
    # `res` is indexed by pid below; element 0 of each entry is presumably the
    # ranked list of the top 600 track ids -- confirm.
    res = model.batch_predict(chunk_id=0, user_ids=target_pids, top_k=600)
    model.batch_cleanup()

    # score: playlists absent from train; score2: playlists present in train.
    score = _mean_hit_rate(zeros_pids, res, val_tracks, user_seen)
    score2 = _mean_hit_rate(no_zeros_pids, res, val_tracks, user_seen)

    print(score, score2)
    # Model selection uses only the score of the playlists absent from train.
    if score > best_score:
        # Context manager so the file is flushed and closed before any later
        # cell reads it back.
        with open(config['model_path'], 'wb') as model_file:
            joblib.dump(model, model_file)
        best_score = score

1 1


In [17]:
# Persist the user feature matrix next to the model. The context manager
# guarantees the file handle is flushed and closed (the bare open() call
# previously left it to the garbage collector).
with open('models/user_features.pkl', 'wb') as features_file:
    joblib.dump(user_features, features_file)

In [18]:
# Replace the in-memory model with the best checkpoint written during
# training. The file is opened in a context manager so the handle is closed.
# joblib.load unpickles the file: only load model files you produced yourself.
with open(config['model_path'], 'rb') as model_file:
    model = joblib.load(model_file)