In [1]:
import numpy as np
import pandas as pd
from lightfm import LightFM
import itertools
from lightfm.evaluation import precision_at_k
from scipy.sparse import csr_matrix, lil_matrix, coo_matrix
from pyspark.sql import SparkSession
from time import time



In [2]:
def clean(train, test, val):
    """Restrict the validation and test sets to users that occur in training.

    Parameters
    ----------
    train, test, val : Spark DataFrames
        Each must expose the columns ``user_id_numer``, ``track_id_numer``
        and ``count``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, val, test)`` -- note that the order differs from the
        parameter order.

    Side effects
    ------------
    Registers the temp views ``train``, ``val`` and ``test``; the latter two
    end up pointing at the filtered frames.
    """
    # Operate on the arguments themselves rather than on notebook globals, so
    # the function does not depend on hidden kernel state.
    train.createOrReplaceTempView('train')
    test.createOrReplaceTempView('test')
    val.createOrReplaceTempView('val')

    columns = ['user_id_numer', 'track_id_numer', 'count']
    known_users = train.select('user_id_numer').distinct()

    # Keep only rows whose user also appears in the training set.  A semi
    # join filters without multiplying rows; a plain inner join would emit
    # one copy of each row per matching training row.
    val = val.join(known_users, on='user_id_numer', how='left_semi').select(*columns)
    test = test.join(known_users, on='user_id_numer', how='left_semi').select(*columns)

    val.createOrReplaceTempView('val')
    test.createOrReplaceTempView('test')

    return train.toPandas(), val.toPandas(), test.toPandas()

In [3]:
def create_interactions(train, test, val):
    """Build dense user x track interaction tables on a shared set of axes.

    Each input is a pandas DataFrame with the columns ``user_id_numer``,
    ``track_id_numer`` and ``count``.  The frames are pivoted so that rows are
    users, columns are tracks and cells hold the count (0 where a pair is
    absent; duplicate pairs are combined by ``pivot_table``'s default
    aggregation).

    The test and validation tables are re-indexed onto the training table's
    users and tracks.  Downstream code converts the tables to positional
    matrices, so without this alignment row/column ``i`` would refer to a
    different user/track in each split.  Users or tracks that do not occur in
    the training data are dropped from test/validation.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_interaction, test_interaction, val_interaction)``, all with
        identical index and columns.
    """
    def _pivot(frame):
        return pd.pivot_table(
            frame,
            index='user_id_numer',
            columns='track_id_numer',
            values='count',
            fill_value=0,
        )

    train_interaction = _pivot(train)
    users = train_interaction.index
    tracks = train_interaction.columns

    def _aligned(frame):
        # fill_value=0 covers the (user, track) labels introduced by reindexing.
        return _pivot(frame).reindex(index=users, columns=tracks, fill_value=0)

    return train_interaction, _aligned(test), _aligned(val)

In [4]:
def create_matrix(train_interaction, test_interaction, val_interaction):
    """Convert the three interaction tables into CSR sparse matrices.

    Each argument must expose a ``.values`` array (e.g. a pandas DataFrame).
    Returns a ``(train, test, val)`` tuple of ``scipy.sparse.csr_matrix``.
    """
    tables = (train_interaction, test_interaction, val_interaction)
    return tuple(csr_matrix(table.values) for table in tables)

In [5]:
def sample_hyperparameters(random_state=None):
    """
    Yield possible hyperparameter choices.

    Infinite generator; every item is a dict with one randomly drawn value
    per hyperparameter, so it can be unpacked directly into a model
    constructor (``Model(**params)``):

    * ``no_components`` -- int drawn from {5, 10, 15, 20}
    * ``learning_rate`` -- float drawn from {0.01, 0.1, 1}

    Parameters
    ----------
    random_state : int or None, optional
        Seed for the sampler.  ``None`` (the default) gives a different
        sequence on every run; pass an int for a reproducible search.
    """
    rng = np.random.RandomState(random_state)
    component_choices = [5, 10, 15, 20]
    learning_rate_choices = [.01, .1, 1]

    while True:
        yield {
            # Cast to plain Python scalars so consumers never see numpy types.
            "no_components": int(rng.choice(component_choices)),
            "learning_rate": float(rng.choice(learning_rate_choices)),
        }

In [6]:
def random_search(train, val, m_iter, num_samples):
    """Lazily fit and score one LightFM model per sampled hyperparameter set.

    Draws ``num_samples`` items from ``sample_hyperparameters()``, prints each
    one, fits a LightFM model on ``train`` for ``m_iter`` epochs and scores it
    with the mean of ``precision_at_k`` on ``val``.

    Yields
    ------
    tuple
        ``(score, hyperparams, model)`` for every sampled configuration.
    """
    candidates = itertools.islice(sample_hyperparameters(), num_samples)

    for params in candidates:
        print(params)

        fitted = LightFM(**params)
        fitted.fit(train, epochs=m_iter)

        yield (precision_at_k(fitted, val).mean(), params, fitted)

In [7]:
def train_model(train_matrix, rank, reg, m_iter):
    """Fit a LightFM model with a fixed random seed (123).

    Parameters
    ----------
    train_matrix : interactions passed to ``LightFM.fit``.
    rank : forwarded to LightFM as ``no_components``.
    reg : forwarded to LightFM as ``learning_rate`` (despite the name, this
        is not a regularisation strength).
    m_iter : number of training epochs.

    Returns
    -------
    The value returned by ``LightFM.fit``.
    """
    estimator = LightFM(no_components=rank, learning_rate=reg, random_state=123)
    return estimator.fit(train_matrix, epochs=m_iter)

In [8]:
def test_model(model, test_matrix):
    """Return the mean ``precision_at_k`` of ``model`` on ``test_matrix``.

    Parameters
    ----------
    model : fitted model accepted by ``lightfm.evaluation.precision_at_k``.
    test_matrix : held-out interactions to score against.

    Returns
    -------
    The mean of the per-user precision values (``k`` is left at the
    library default).
    """
    # Score against the matrix that was passed in; the previous version read
    # an unrelated name (`val`) and ignored its own argument.
    return precision_at_k(model, test_matrix).mean()

In [9]:
def main():
    """Run the full pipeline: load data, tune, retrain and evaluate.

    Steps:
      1. Read the train/test/validation CSVs from ``../data``.
      2. Pivot them into interaction tables and convert those to CSR matrices.
      3. Random-search hyperparameters, scoring on the validation matrix.
      4. Retrain with the best hyperparameters and report the score on the
         test matrix.

    Progress, scores and timings are printed; nothing is returned.
    """
    train_df = pd.read_csv('../data/pandas_train_df')
    test_df = pd.read_csv('../data/pandas_test_df')
    val_df = pd.read_csv('../data/pandas_val_df')

    # clean() is not applied here: it calls Spark DataFrame methods
    # (createOrReplaceTempView / toPandas), whereas these are pandas frames.
    train_interaction, test_interaction, val_interaction = create_interactions(train_df, test_df, val_df)

    print('pivot tables created')

    train_matrix, test_matrix, val_matrix = create_matrix(train_interaction, test_interaction, val_interaction)

    print('matrices created')

    st = time()
    (score, hyperparams, model) = max(
        random_search(train_matrix, val_matrix, m_iter = 4, num_samples = 4),
        key=lambda x: x[0],
    )
    end = round(time()-st, 3)

    print("Best score {} at {}".format(score, hyperparams))
    print("Hyperparameter tuning took {}".format(end))

    st = time()
    # Train on the training matrix (not on the train_model function itself)
    # and translate the search's LightFM-style keys into train_model's
    # parameter names.
    model = train_model(
        train_matrix,
        rank = hyperparams["no_components"],
        reg = hyperparams["learning_rate"],
        m_iter = 4,
    )
    bestMAP = test_model(model, test_matrix)
    end = round(time()-st, 3)

    print("Best MAP on test data: {}".format(bestMAP))
    print("Final model training and fitting took {}".format(end))
    

In [None]:
# Load the three pre-split datasets (train, test, validation) in one pass.
train_df, test_df, val_df = map(
    pd.read_csv,
    (
        '../data/pandas_train_df',
        '../data/pandas_test_df',
        '../data/pandas_val_df',
    ),
)

# Optional Spark-based filtering step, currently disabled:
#train_df, val_df, test_df = clean(train_df, test_df, val_df)

# Dense user x track tables, one per split.
(train_interaction,
 test_interaction,
 val_interaction) = create_interactions(train_df, test_df, val_df)

print('pivot tables created')

# Sparse CSR versions of the tables for model fitting / evaluation.
(train_matrix,
 test_matrix,
 val_matrix) = create_matrix(train_interaction, test_interaction, val_interaction)

print('matrices created')

In [None]:
# --- Hyperparameter search: keep the configuration with the best validation score ---
st = time()
(score, hyperparams, model) = max(
    random_search(train_matrix, val_matrix, m_iter = 4, num_samples = 4),
    key=lambda x: x[0],
)
end = round(time()-st, 3)

print("Best score {} at {}".format(score, hyperparams))
print("Hyperparameter tuning took {}".format(end))

# --- Retrain with the winning hyperparameters and score on the test matrix ---
st = time()
# Pass the training matrix (not the train_model function) and map the
# search's LightFM-style keys onto train_model's parameter names.
model = train_model(
    train_matrix,
    rank = hyperparams["no_components"],
    reg = hyperparams["learning_rate"],
    m_iter = 4,
)
bestMAP = test_model(model, test_matrix)
end = round(time()-st, 3)

print("Best MAP on test data: {}".format(bestMAP))
print("Final model training and fitting took {}".format(end))
    