In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from sklearn import preprocessing
from lightfm import LightFM
import time
from lightfm.evaluation import auc_score, precision_at_k, recall_at_k, reciprocal_rank
import pickle
import re
import seaborn as sns
import pyarrow
from lightfm.data import Dataset
from scipy.sparse import coo_matrix

In [2]:
path = '/home/drh382/final-project-the-team/ms_train.csv/part-00000-10909035-b61b-408d-bd49-c8cd1482a3d3-c000.csv'
header_list = ["user_id", "count", "track_id"]

In [3]:
data = pd.read_csv(path, names=header_list)

In [4]:
data.head()

Unnamed: 0,user_id,count,track_id
0,7,1,1102098
1,7,1,1102098
2,7,1,1102098
3,7,1,1102098
4,7,1,1102098


In [5]:
dataset = Dataset()

In [6]:
dataset.fit((data['user_id']),
            (data['track_id']))

In [7]:
num_users, num_items = dataset.interactions_shape()
print('In train data, Num users: {}, num_items {}.'.format(num_users, num_items))

In train data, Num users: 163206, num_items 110000.


In [8]:
afterfilter = data[data['count']>=5]

In [9]:
dataseta = Dataset()
dataseta.fit((afterfilter['user_id']),
            (afterfilter['track_id']))

In [10]:
num_users, num_items = dataseta.interactions_shape()
print('In train data, Num users: {}, num_items {}.'.format(num_users, num_items))

In train data, Num users: 61624, num_items 67437.


In [11]:
data = afterfilter

In [12]:
dataset = Dataset()
dataset.fit((data['user_id']),
            (data['track_id']))

In [13]:
train = dict()
# alldata = dict()
for i in data[['user_id','track_id']]:
    cate_enc = preprocessing.LabelEncoder()
    train[i] = cate_enc.fit_transform(data[i].values)


In [14]:
n_users = len(np.unique(train['user_id']))
n_items = len(np.unique(train['track_id']))

In [15]:
ratings = dict()
ratings['train'] = cate_enc.fit_transform(data['count'])

In [16]:
train_matrix = coo_matrix((ratings['train'], (train['user_id'], \
                                                          train['track_id'])) \
                                      , shape=(n_users, n_items))

In [17]:
train_matrix

<61624x67437 sparse matrix of type '<class 'numpy.int64'>'
	with 4566073 stored elements in COOrdinate format>

In [18]:
val_path = '/home/drh382/final-project-the-team/ms_val.csv/part-00000-cc14572e-0a50-4450-a754-575b8d44007b-c000.csv'
header_list = ["user_id", "count", "track_id"]
val_data = pd.read_csv(val_path, names=header_list)
val_data = val_data[val_data['count']>=5]

In [19]:
val_dataset = Dataset()
val_dataset.fit((val_data['user_id']),
            (val_data['track_id']))

In [20]:
num_users, num_items = val_dataset.interactions_shape()
print('In val data, Num users: {}, num_items {}.'.format(num_users, num_items))

In val data, Num users: 59921, num_items 62148.


In [21]:
val = dict()
# alldata = dict()
for i in val_data[['user_id','track_id']]:
    cate_enc = preprocessing.LabelEncoder()
    val[i] = cate_enc.fit_transform(val_data[i].values)



In [22]:
val_n_users = len(np.unique(val['user_id']))
val_n_items = len(np.unique(val['track_id']))

val_ratings = dict()
val_ratings['val'] = cate_enc.fit_transform(val_data['count'])

In [23]:
val_matrix = coo_matrix((val_ratings['val'], (val['user_id'], \
                                                          val['track_id'])) \
                                      ,shape=(n_users, n_items)) #not mistake. the shape needs to be the same 

In [24]:
val_matrix

<61624x67437 sparse matrix of type '<class 'numpy.int64'>'
	with 233554 stored elements in COOrdinate format>

In [None]:
#there are 4 metrics - AUC, prec_at_k, recall_at_k, and reciprocal_rank

In [25]:
#looking for the best hyperparam. source: https://stackoverflow.com/questions/49896816/how-do-i-optimize-the-hyperparameters-of-lightfm
import itertools

import numpy as np

def sample_hyperparameters():
    """
    Yield possible hyperparameter choices.
    """

    while True:
        yield {
            "no_components": np.random.randint(16, 64),
            "learning_schedule": np.random.choice(["adagrad", "adadelta"]),
            "loss": np.random.choice(["bpr", "warp", "warp-kos"]),
            "learning_rate": np.random.exponential(0.05),
            "item_alpha": np.random.exponential(1e-8),
            "user_alpha": np.random.exponential(1e-8),
            "max_sampled": np.random.randint(5, 15),
            "num_epochs": np.random.randint(5, 50),
        }


def random_search(train, test, num_samples=10, num_threads=8):
    """
    Sample random hyperparameters, fit a LightFM model, and evaluate it
    on the test set.

    Parameters
    ----------

    train: np.float32 coo_matrix of shape [n_users, n_items]
        Training data.
    test: np.float32 coo_matrix of shape [n_users, n_items]
        Test data.
    num_samples: int, optional
        Number of hyperparameter choices to evaluate.


    Returns
    -------

    generator of (auc_score, hyperparameter dict, fitted model)

    """

    for hyperparams in itertools.islice(sample_hyperparameters(), num_samples):
        num_epochs = hyperparams.pop("num_epochs")

        model = LightFM(**hyperparams)
        model.fit(train, epochs=num_epochs, num_threads=num_threads)

        score = auc_score(model, test, train_interactions=train, num_threads=num_threads).mean()

        hyperparams["num_epochs"] = num_epochs

        yield (score, hyperparams, model)

In [26]:
from lightfm import LightFM
LightFM._check_test_train_intersections = lambda x, y, z: True

In [27]:
(score, hyperparams, model) = max(random_search(train_matrix, val_matrix, num_threads=4), key=lambda x: x[0])

print("Best score {} at {}".format(score, hyperparams))

Best score 0.5531360507011414 at {'no_components': 49, 'learning_schedule': 'adagrad', 'loss': 'warp-kos', 'learning_rate': 0.0040578167078020215, 'item_alpha': 1.0949384353203165e-08, 'user_alpha': 4.398305362432972e-09, 'max_sampled': 9, 'num_epochs': 22}


In [28]:
test_path = '/home/drh382/final-project-the-team/ms_test.csv/part-00000-e8dc8482-9e4f-45a3-827f-7e95e6499267-c000.csv'
header_list = ["user_id", "count", "track_id"]
test_data = pd.read_csv(test_path, names=header_list)
test_data = test_data[test_data['count']>=5]

In [29]:
test_dataset = Dataset()
test_dataset.fit((test_data['user_id']),
            (test_data['track_id']))

In [30]:
num_users, num_items = test_dataset.interactions_shape()
print('In test data, Num users: {}, num_items {}.'.format(num_users, num_items))

In test data, Num users: 6204, num_items 13193.


In [31]:
test = dict()
# alldata = dict()
for i in test_data[['user_id','track_id']]:
    cate_enc = preprocessing.LabelEncoder()
    test[i] = cate_enc.fit_transform(test_data[i].values)



In [32]:
test_ratings = dict()
test_ratings['test'] = cate_enc.fit_transform(test_data['count'])

In [33]:
test_matrix = coo_matrix((test_ratings['test'], (test['user_id'], \
                                                          test['track_id'])) \
                                      ,shape=(n_users, n_items)) #not mistake. the shape needs to be the same 

In [34]:
test_matrix

<61624x67437 sparse matrix of type '<class 'numpy.int64'>'
	with 23028 stored elements in COOrdinate format>

In [35]:
#formal best hyperparameters - without filtering count >= 5

start_time = time.time()

# no_components': 33, 'learning_schedule': 'adagrad', 'loss': 'warp-kos', 'learning_rate': 0.0012108928998233663, 'item_alpha': 4.008980346554833e-09, 'user_alpha': 1.7940262832295985e-08, 'max_sampled': 5, 'num_epochs': 36

model=LightFM(no_components=33,learning_schedule='adagrad',learning_rate=0.0012108928998233663,loss='warp-kos', item_alpha=4.008980346554833e-09,user_alpha=1.7940262832295985e-08,max_sampled=5)
model.fit(train_matrix,epochs=36,num_threads=8)

auc_train = auc_score(model, train_matrix).mean()
auc_test = auc_score(model, test_matrix).mean()
prec_train = precision_at_k(model, train_matrix, k = 500).mean()
prec_test = precision_at_k(model, test_matrix, k = 500).mean()

recall_train = recall_at_k(model, train_matrix, k = 500).mean()
recall_test = recall_at_k(model, test_matrix, k = 500).mean()

print("--- Run time:  {} mins ---".format((time.time() - start_time)/60))
print("Train AUC Score: {}".format(auc_train))
print("Test AUC Score: {}".format(auc_test))
print("Train Prec Score: {}".format(prec_train))
print("Test Prec Score: {}".format(prec_test))
print("Train Recall Score: {}".format(recall_train))
print("Test Recall Score: {}".format(recall_test))

--- Run time:  9.806059106190999 mins ---
Train AUC Score: 0.8230760097503662
Test AUC Score: 0.7002538442611694
Train Prec Score: 0.002928209723904729
Test Prec Score: 0.0003787878667935729
Train Recall Score: 0.014677551431378567
Test Recall Score: 0.053097017099701924


In [36]:
#new best hyperparameters - with filtering count >= 5

start_time = time.time()
# 'no_components': 49, 'learning_schedule': 'adagrad', 'loss': 'warp-kos', 'learning_rate': 0.0040578167078020215, 'item_alpha': 1.0949384353203165e-08, 'user_alpha': 4.398305362432972e-09, 'max_sampled': 9, 'num_epochs': 22
# no_components': 33, 'learning_schedule': 'adagrad', 'loss': 'warp-kos', 'learning_rate': 0.0012108928998233663, 'item_alpha': 4.008980346554833e-09, 'user_alpha': 1.7940262832295985e-08, 'max_sampled': 5, 'num_epochs': 36

model=LightFM(no_components=49,learning_schedule='adagrad',learning_rate=0.0040578167078020215,loss='warp-kos', item_alpha=1.0949384353203165e-08,user_alpha=4.398305362432972e-09,max_sampled=9)
model.fit(train_matrix,epochs=22,num_threads=8)

auc_train = auc_score(model, train_matrix).mean()
auc_test = auc_score(model, test_matrix).mean()
prec_train = precision_at_k(model, train_matrix, k = 500).mean()
prec_test = precision_at_k(model, test_matrix, k = 500).mean()

recall_train = recall_at_k(model, train_matrix, k = 500).mean()
recall_test = recall_at_k(model, test_matrix, k = 500).mean()

print("--- Run time:  {} mins ---".format((time.time() - start_time)/60))
print("Train AUC Score: {}".format(auc_train))
print("Test AUC Score: {}".format(auc_test))
print("Train Prec Score: {}".format(prec_train))
print("Test Prec Score: {}".format(prec_test))
print("Train Recall Score: {}".format(recall_train))
print("Test Recall Score: {}".format(recall_test))

--- Run time:  9.664506697654724 mins ---
Train AUC Score: 0.9180307388305664
Test AUC Score: 0.621467113494873
Train Prec Score: 0.005684505682438612
Test Prec Score: 0.0002582205052021891
Train Recall Score: 0.03815161218132962
Test Recall Score: 0.033274214451000025
