In [1]:
import getpass
import pandas as pd
import numpy as np


import itertools
from lightfm.evaluation import precision_at_k
import lightfm
from lightfm.data import Dataset

from scipy.sparse import csr_matrix

import os
from glob import glob
from lightfm import LightFM
from distributed import Client
from dask_jobqueue import SLURMCluster
from lightfm.data import Dataset

from time import time

In [2]:
# Set LOCAL to True for single-machine execution while developing
# Set LOCAL to False for cluster execution
LOCAL = False

if LOCAL:
    # This line creates a single-machine dask client
    client = Client()
else:    
    # This line creates a SLURM cluster dask and dask client
    # Logging outputs will be stored in /scratch/{your-netid}
    
    cluster = SLURMCluster(memory='8GB', cores=4, python='/scratch/work/public/dask/bin/python', 
                               local_directory='/tmp/{}/'.format(os.environ['SLURM_JOB_USER']),
                               job_extra=['--output=/scratch/{}/slurm-%j.out'.format(os.environ['SLURM_JOB_USER'])])

    cluster.submit_command = 'slurm'
    cluster.scale(100)

    display(cluster)
    client = Client(cluster)

display(client)

VBox(children=(HTML(value='<h2>SLURMCluster</h2>'), HBox(children=(HTML(value='\n<div>\n  <style scoped>\n    …

0,1
Client  Scheduler: tcp://10.32.33.5:33511  Dashboard: http://10.32.33.5:8787/status,Cluster  Workers: 0  Cores: 0  Memory: 0 B


In [3]:
train_path = glob('/scratch/work/courses/DSGA1004-2021/MSD/cf_train.parquet')
validation_path = glob('/scratch/work/courses/DSGA1004-2021/MSD/cf_validation.parquet')

# Train on 1% of the training data

In [4]:
train = pd.read_parquet(train_path, engine='fastparquet')
validation = pd.read_parquet(validation_path, engine='fastparquet')
df = train.sample(frac = 0.01,random_state=42)

In [9]:
validation = validation[validation.user_id.isin(df.user_id)]
validation = validation[validation.track_id.isin(df.track_id)]

In [10]:
# LightFM
learning_rate = [1,10,0.1]
loss = ['warp','bpr']
param_choice = itertools.product(learning_rate, loss)

data = Dataset()
data.fit_partial(df.user_id.unique(), df.track_id.unique())
csr_train, train_weights = data.build_interactions(((x['user_id'], x['track_id']) for index,x in df.iterrows()))
data.fit_partial(validation.user_id.unique(), validation.track_id.unique())
csr_validation, validation_weights = data.build_interactions(((x['user_id'], x['track_id']) for index,x in validation.iterrows()))

In [11]:
for i in param_choice:
    start = time()
    lightfm = LightFM(learning_rate = i[0], loss = i[1])
    model = lightfm.fit(csr_train,epochs = 30)
    print('Finish training for {} combination'.format(i))
    print('Precision is: ' + str(precision_at_k(model, test_interactions = csr_validation, train_interactions = csr_train,k=500).mean()))
    time_taken = time() - start
    print('Time taken: ' + str(time_taken))

Finish training for (1, 'warp') combination
Precision is: 6.8199835e-05
Time taken: 26.528757333755493
Finish training for (1, 'bpr') combination
Precision is: 0.00013798573
Time taken: 21.978182554244995
Finish training for (10, 'warp') combination
Precision is: 6.9785885e-05
Time taken: 25.809032201766968
Finish training for (10, 'bpr') combination
Precision is: 3.8065027e-05
Time taken: 22.51911187171936
Finish training for (0.1, 'warp') combination
Precision is: 0.0034432989
Time taken: 23.367972135543823
Finish training for (0.1, 'bpr') combination
Precision is: 0.0009421095
Time taken: 21.066409826278687


# Train on 2% of the training data

In [5]:
df = train.sample(frac = 0.02,random_state=42)
validation = validation[validation.user_id.isin(df.user_id)]
validation = validation[validation.track_id.isin(df.track_id)]

In [7]:
data = Dataset()
data.fit_partial(df.user_id.unique(), df.track_id.unique())
csr_train, train_weights = data.build_interactions(((x['user_id'], x['track_id']) for index,x in df.iterrows()))
data.fit_partial(validation.user_id.unique(), validation.track_id.unique())
csr_validation, validation_weights = data.build_interactions(((x['user_id'], x['track_id']) for index,x in validation.iterrows()))

In [8]:
for i in param_choice:
    start = time()
    lightfm = LightFM(learning_rate = i[0], loss = i[1])
    model = lightfm.fit(csr_train,epochs = 30)
    print('Finish training for {} combination'.format(i))
    print('Precision is: ' + str(precision_at_k(model, test_interactions = csr_validation, train_interactions = csr_train,k=500).mean()))
    time_taken = time() - start
    print('Time taken: ' + str(time_taken))

Finish training for (1, 'warp') combination
Precision is: 2.9490617e-05
Time taken: 66.98209285736084
Finish training for (1, 'bpr') combination
Precision is: 0.00016800714
Time taken: 57.017768144607544
Finish training for (10, 'warp') combination
Precision is: 3.8427166e-05
Time taken: 61.681060552597046
Finish training for (10, 'bpr') combination
Precision is: 2.4128685e-05
Time taken: 49.98122024536133
Finish training for (0.1, 'warp') combination
Precision is: 0.0040080436
Time taken: 53.60380220413208
Finish training for (0.1, 'bpr') combination
Precision is: 0.0010563002
Time taken: 47.82618975639343


# Train on 10% of the training data

In [6]:
df = train.sample(frac = 0.1,random_state=42)
validation = validation[validation.user_id.isin(df.user_id)]
validation = validation[validation.track_id.isin(df.track_id)]

In [7]:
data = Dataset()
data.fit_partial(df.user_id.unique(), df.track_id.unique())
csr_train, train_weights = data.build_interactions(((x['user_id'], x['track_id']) for index,x in df.iterrows()))
data.fit_partial(validation.user_id.unique(), validation.track_id.unique())
csr_validation, validation_weights = data.build_interactions(((x['user_id'], x['track_id']) for index,x in validation.iterrows()))

In [8]:
for i in param_choice:
    start = time()
    lightfm = LightFM(learning_rate = i[0], loss = i[1])
    model = lightfm.fit(csr_train,epochs = 30)
    print('Finish training for {} combination'.format(i))
    print('Precision is: ' + str(precision_at_k(model, test_interactions = csr_validation, train_interactions = csr_train,k=500).mean()))
    time_taken = time() - start
    print('Time taken: ' + str(time_taken))

Finish training for (1, 'warp') combination
Precision is: 1.30331755e-05
Time taken: 335.01461148262024
Finish training for (1, 'bpr') combination
Precision is: 0.00046267774
Time taken: 252.49253726005554
Finish training for (10, 'warp') combination
Precision is: 2.2808059e-05
Time taken: 294.813068151474
Finish training for (10, 'bpr') combination
Precision is: 1.007109e-05
Time taken: 259.68553137779236
Finish training for (0.1, 'warp') combination
Precision is: 0.005136552
Time taken: 279.66125082969666
Finish training for (0.1, 'bpr') combination
Precision is: 0.0030971563
Time taken: 246.64594626426697
