## Setting up environment

In [4]:
import os
from glob import glob

import dask
import dask.bag as db
import dask.dataframe as dd
from distributed import Client
from dask_jobqueue import SLURMCluster

from IPython.display import display
import matplotlib.pyplot as plt

import logging
import pickle

from lenskit.batch import MultiEval, predict, recommend
from lenskit.crossfold import partition_users, SampleN
from lenskit.algorithms import basic, als, svd, bias
from lenskit import topn, util, Recommender, batch
from lenskit.topn import precision, ndcg, recall

import pandas as pd
import numpy as np
from scipy import stats
import binpickle

import argparse
import time

from tqdm.auto import tqdm

In [2]:
# Execution mode switch:
#   LOCAL = True  -> single-machine dask client (use while developing)
#   LOCAL = False -> dask workers launched through SLURM
LOCAL = True

if not LOCAL:
    # SLURM_JOB_USER (presumably your netid) selects the per-user directories.
    slurm_user = os.environ['SLURM_JOB_USER']
    # Worker-local scratch space under /tmp/<user>/
    worker_tmp = '/tmp/{}/'.format(slurm_user)
    # Logging outputs are stored in /scratch/<user>/
    log_option = '--output=/scratch/{}/slurm-%j.out'.format(slurm_user)

    cluster = SLURMCluster(
        memory='4GB',
        cores=2,
        python='/scratch/work/public/dask/bin/python',
        local_directory=worker_tmp,
        job_extra=[log_option],
    )

    cluster.submit_command = 'slurm'
    cluster.scale(100)

    display(cluster)
    client = Client(cluster)
else:
    # Single-machine dask client
    client = Client()

display(client)

0,1
Client  Scheduler: tcp://127.0.0.1:44089  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 1  Cores: 1  Memory: 17.18 GB


## Load data

In [15]:
# Size of the top-t subsample to evaluate; data paths are defined for
# t in {10, 100, 1000, 10000}.
t = 10_000

In [16]:
# Resolve the train / validation parquet paths for the chosen top-t subsample.
# The file names differ only in the subsample size, so they are built from a
# single template instead of one hand-written branch per size.
_SUPPORTED_T = (10, 100, 1000, 10000)

if t not in _SUPPORTED_T:
    # Fail loudly: otherwise train_dir / test_dir would stay undefined, or
    # silently keep the values left over from a previous run of this cell.
    raise ValueError(
        'unsupported top-t subsample {!r}; expected one of {}'.format(t, _SUPPORTED_T)
    )

_data_root = '/scratch/zh2095/quarantini'
# int(t) keeps the file name exact even if t was given as e.g. 10.0
train_dir = '{}/cf_train_subtrain_top{}_1004.parquet'.format(_data_root, int(t))
test_dir = '{}/cf_train_subval_top{}_1004.parquet'.format(_data_root, int(t))

In [17]:
# Load the selected train / validation subsamples into pandas DataFrames.
df_train = pd.read_parquet(train_dir)
df_test = pd.read_parquet(test_dir)

In [98]:
# Drop the leftover parquet index column and rename the interaction columns to
# user / item / rating (presumably the names the LensKit calls below expect).
#
# This cell is safe to re-run: errors='ignore' makes the drop a no-op once the
# column is gone, rename() skips names that are already absent, and new frames
# are assigned instead of mutating the loaded ones in place.
_column_names = {'user_id': 'user', 'track_id': 'item', 'count': 'rating'}

df_train = (
    df_train
    .drop(columns=['__index_level_0__'], errors='ignore')
    .rename(columns=_column_names)
)
df_test = (
    df_test
    .drop(columns=['__index_level_0__'], errors='ignore')
    .rename(columns=_column_names)
)

In [18]:
# Preview the training interactions (rich DataFrame display of the cell's last expression).
df_train

Unnamed: 0,user_id,count,track_id,__index_level_0__,user_id_index,track_id_index
0,00043d7bc800ceff4a90459e189eba5d442a1d3d,1,TRAAPNM128F4280813,11406638,5905,21705
1,00043d7bc800ceff4a90459e189eba5d442a1d3d,1,TRAERCP12903CF1368,11406630,5905,47212
2,00043d7bc800ceff4a90459e189eba5d442a1d3d,1,TRAJSQT128EF34334B,11406814,5905,37714
3,00043d7bc800ceff4a90459e189eba5d442a1d3d,1,TRAOWBP128F4257C64,11406915,5905,5739
4,00043d7bc800ceff4a90459e189eba5d442a1d3d,1,TRATNYA12903CD684E,11406829,5905,28408
...,...,...,...,...,...,...
3215104,fffc0df75a48d823ad5abfaf2a1ee61eb1e3302c,9,TRIFVDW128F4279DE0,1970006,5610,705
3215105,fffc0df75a48d823ad5abfaf2a1ee61eb1e3302c,9,TRIJMQZ128F14683A9,1970101,5610,9324
3215106,fffc0df75a48d823ad5abfaf2a1ee61eb1e3302c,9,TRIXPPS12903CE26B1,1970124,5610,8084
3215107,fffc0df75a48d823ad5abfaf2a1ee61eb1e3302c,10,TRNQBVT128F931E01D,1970133,5610,2690


## Test Performance and Runtime

In [100]:
# best configuration
# Each value below is forwarded to als.ImplicitMF under the keyword noted next to it.
best_features = 100  # features=
best_reg = 1  # reg=
best_weight = 10  # weight=
iterations = 20  # iterations=
use_ratings = True  # use_ratings=

# (train, test) pair that is registered with the evaluation via add_datasets().
data = (df_train, df_test)

In [101]:
# Evaluation run: outputs go under 'my-test', 500 recommendations are requested.
# Set save_models=True when running on the full dataset.
# NOTE(review): `eval` shadows the Python builtin of the same name; the name is
# kept as-is because other cells may refer to it.
eval = MultiEval('my-test', recommend=500, save_models=False)
eval.add_datasets(data, name='Song')

# One ImplicitMF instance configured with the "best configuration" values.
implicit_mf = als.ImplicitMF(
    features=best_features,
    iterations=iterations,
    reg=best_reg,
    use_ratings=use_ratings,
    weight=best_weight,
    method='cg',
    progress=tqdm,
)
best_mdl = [implicit_mf]
eval.add_algorithms(best_mdl, attrs=['features'], name='ImplicitMF')

eval.run(progress=tqdm)

initializing deprecated MultiEval


  0%|          | 0/1 [00:00<?, ?it/s]

ImplicitMF:   0%|          | 0/20 [00:00<?, ?it/s]

In [102]:
# Load the recommendation lists from the 'my-test' directory, i.e. the path
# given to MultiEval (presumably written there by eval.run()).
recs_test = pd.read_parquet('my-test/recommendations.parquet')

In [103]:
# Register precision, nDCG and recall, each with k=500 (the same number of
# recommendations requested from MultiEval), then score the recommendation
# lists against the held-out interactions.
rla = topn.RecListAnalysis()
for metric_fn in (precision, ndcg, recall):
    rla.add_metric(metric_fn, k=500)

raw_metrics_test = rla.compute(recs_test, df_test, include_missing=False)

In [104]:
# Discard the 'nrecs' bookkeeping column, treat missing metric values as 0,
# and average every remaining metric column.
metric_columns = raw_metrics_test.drop(columns=['nrecs'])
metrics_test = metric_columns.fillna(0).mean()
metrics_test

precision    0.055451
ndcg         0.220240
recall       0.353165
dtype: float64

## Test the time to fit model

In [92]:
# Distinct values of the 'user' column in the training interactions.
users = df_train['user'].unique()

In [93]:
# Stand-alone ImplicitMF instance built from the "best configuration" values;
# this is the object whose fit() is timed below.
ALS = als.ImplicitMF(
    features=best_features,
    iterations=iterations,
    reg=best_reg,
    use_ratings=use_ratings,
    weight=best_weight,
    method='cg',
    progress=tqdm,
)

In [51]:
# Recommender.adapt returns a recommender object (the recorded output shows a
# lenskit TopN instance). The return value is only displayed, not assigned to
# a name; the fit below is called on `ALS` directly.
Recommender.adapt(ALS)

<lenskit.algorithms.ranking.TopN at 0x14debecaf7c0>

In [94]:
# Time a single fit of the model on the training interactions.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() is wall-clock time and can jump if the system clock
# is adjusted while the fit is running.
start = time.perf_counter()

ALS.fit(df_train)

print('took %f s'%(time.perf_counter()-start))

ImplicitMF:   0%|          | 0/20 [00:00<?, ?it/s]

took 25.439674 s
