In [1]:
import numpy as np
import pandas as pd
from lightfm import LightFM
import itertools
from lightfm.evaluation import precision_at_k
from scipy.sparse import csr_matrix, lil_matrix, coo_matrix
from pyspark.sql import SparkSession
from time import time
from lightfm.cross_validation import random_train_test_split



In [3]:
def create_interactions(df):
    
    return pd.pivot_table(df, index='user_id_numer', columns='track_id_numer', values='count', fill_value = 0)

In [13]:
def create_matrix(interaction):
    
    total_matrix = csr_matrix(interaction.values)
    
    (train_matrix, val_and_test_matrix) = random_train_test_split(total_matrix, test_percentage=0.8)
    
    (val_matrix, test_matrix) = random_train_test_split(val_and_test_matrix, test_percentage=0.5)
    
    return train_matrix, val_matrix, test_matrix

In [20]:
def sample_hyperparameters():
    """
    Yield possible hyperparameter choices.
    """
    
    return {"no_components": [5,10,20,40,80,160], "learning_rate": [.01,.1,1,2]}

In [21]:
def random_search(train, val, m_iter):
    
    MAP_dict = {}

    hyperparams = sample_hyperparameters()
    
    for rank in hyperparams['no_components']:
            
        key = '{} Rank'.format(rank)
        nested_dict = {}
            
        for reg in hyperparams['learning_rate']:

            model = LightFM(random_state = 123, learning_rate = reg, no_components = rank)
            model.fit(train, epochs=m_iter)

            MAP = precision_at_k(model, val, k = 500).mean()
                
            nested_key = '{} Reg Param'.format(reg)
                
            nested_dict[nested_key] = MAP
        
        MAP_dict[key] = nested_dict
                
    return MAP_dict


In [54]:
def get_best_params(MAP_dict):
    
    max_MAP = 0

    for rank in dic.keys():
        for reg in dic[rank]:
            if dic[rank][reg] > max_MAP:
                max_rank = rank
                max_reg = reg
                
    return float(str.split(max_reg, ' ')[0]), int(str.split(max_rank, ' ')[0])
    

In [9]:
def train_model(train_matrix, rank, reg, m_iter):
    
    model = LightFM(random_state = 123, learning_rate = reg, no_components = rank)
    model = model.fit(train_matrix, epochs = m_iter)
    
    return model

In [59]:
def test_model(model, test_matrix):
    
    return precision_at_k(model, test_matrix).mean()

In [60]:
def main():
    
    train_df = pd.read_csv('../data/pandas_train_df')
    test_df = pd.read_csv('../data/pandas_test_df')
    val_df = pd.read_csv('../data/pandas_val_df')
    
    df_total = pd.concat([train_df, val_df, test_df])
    df_interaction = create_interactions(df_total)
    
    print('pivot table created')
    
    train_matrix, val_matrix, test_matrix = create_matrix(df_interaction)
    
    print('matrices created')
    
    st = time()
    MAP_dict = random_search(train_matrix, val_matrix, m_iter = 4)
    end = round(time()-st, 3)
    
    print("Hyperparameter tuning took {} seconds".format(end))
    
    best_reg, best_rank = get_best_params(MAP_dict)
    
    print("Best rank: {}, best reg: {}".format(best_rank, best_reg))

    st = time()
    model = train_model(train_matrix, best_rank, best_reg, m_iter = 4)
    MAP = test_model(model, test_matrix)  
    end = round(time()-st, 3)
    
    print("MAP on test data: {}".format(MAP))
    print("Final model training and fitting took {}".format(end))
    
    return pd.DataFrame(MAP_dict)
    


In [61]:
main()

pivot table created
matrices created
Hyperparameter tuning took 869.107 seconds
Best rank: 20, best reg: 1.0
MAP on test data: 0.0019304865272715688
Final model training and fitting took 35.086


Unnamed: 0,5 Rank,10 Rank,20 Rank,40 Rank,80 Rank,160 Rank
0.01 Reg Param,0.000278,0.00029,0.000292,0.00029,0.000292,0.00029
0.1 Reg Param,0.00029,0.00029,0.00029,0.000289,0.000291,0.000289
1 Reg Param,0.00029,0.00029,0.00029,0.000291,0.00029,0.000289
2 Reg Param,0.00029,0.000291,0.00029,0.000291,0.00029,0.000289
