In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from sklearn import preprocessing
from lightfm import LightFM
import time
from lightfm.evaluation import auc_score, precision_at_k, recall_at_k, reciprocal_rank
import pickle
import re
import seaborn as sns
import pyarrow
from lightfm.data import Dataset
from scipy.sparse import coo_matrix

In [5]:
path = '/home/drh382/final-project-the-team/ms_train.csv/part-00000-10909035-b61b-408d-bd49-c8cd1482a3d3-c000.csv'
header_list = ["user_id", "count", "track_id"]

In [6]:
data = pd.read_csv(path, names=header_list)

In [7]:
data.head()

Unnamed: 0,user_id,count,track_id
0,7,1,1102098
1,7,1,1102098
2,7,1,1102098
3,7,1,1102098
4,7,1,1102098


In [8]:
dataset = Dataset()

In [9]:
dataset.fit((data['user_id']),
            (data['track_id']))

In [10]:
num_users, num_items = dataset.interactions_shape()
print('In train data, Num users: {}, num_items {}.'.format(num_users, num_items))

In train data, Num users: 163206, num_items 110000.


In [16]:
def informed_train_test(rating_df, train_ratio):
    '''
    split_cut = np.int(np.round(rating_df.shape[0] * train_ratio)) #ok
    train_df = rating_df.iloc[0:split_cut] #ok
    test_df = rating_df.iloc[split_cut::] #ok
#     test_df = test_df[(test_df['userID'].isin(train_df['userID'])) & (test_df['ISBN'].isin(train_df['ISBN']))]
#     id_cols = ['userID', 'ISBN']
    id_cols = ['user_id', 'track_id']
    trans_cat_train = dict()
    trans_cat_test = dict()
    for k in id_cols:
        cate_enc = preprocessing.LabelEncoder()
        trans_cat_train[k] = cate_enc.fit_transform(train_df[k].values)
        trans_cat_test[k] = cate_enc.transform(test_df[k].values)
'''
    # --- Encode ratings:
    cate_enc = preprocessing.LabelEncoder()
    ratings = dict()
    ratings['train'] = cate_enc.fit_transform(train_df.count)
    ratings['test'] = cate_enc.transform(test_df.count)

    n_users = len(np.unique(trans_cat_train['user_id']))
    n_items = len(np.unique(trans_cat_train['track_id']))


    train = coo_matrix((ratings['train'], (trans_cat_train['user_id'], \
                                                          trans_cat_train['track_id'])) \
                                      , shape=(n_users, n_items))
    test = coo_matrix((ratings['test'], (trans_cat_test['user_id'], \
                                                        trans_cat_test['track_id'])) \
                                     , shape=(n_users, n_items))
    return train, test, train_df

In [11]:
train = dict()
# alldata = dict()
for i in data[['user_id','track_id']]:
    cate_enc = preprocessing.LabelEncoder()
    train[i] = cate_enc.fit_transform(data[i].values)


In [12]:
n_users = len(np.unique(train['user_id']))
n_items = len(np.unique(train['track_id']))

In [13]:
ratings = dict()
ratings['train'] = cate_enc.fit_transform(data['count'])

In [14]:
train_matrix = coo_matrix((ratings['train'], (train['user_id'], \
                                                          train['track_id'])) \
                                      , shape=(n_users, n_items))

In [15]:
train_matrix

<163206x110000 sparse matrix of type '<class 'numpy.int64'>'
	with 27010946 stored elements in COOrdinate format>

In [16]:
val_path = '/home/drh382/final-project-the-team/ms_val.csv/part-00000-cc14572e-0a50-4450-a754-575b8d44007b-c000.csv'
header_list = ["user_id", "count", "track_id"]
val_data = pd.read_csv(val_path, names=header_list)

In [17]:
val_dataset = Dataset()
val_dataset.fit((val_data['user_id']),
            (val_data['track_id']))

In [18]:
num_users, num_items = val_dataset.interactions_shape()
print('In val data, Num users: {}, num_items {}.'.format(num_users, num_items))

In val data, Num users: 159717, num_items 100000.


In [19]:
val = dict()
# alldata = dict()
for i in val_data[['user_id','track_id']]:
    cate_enc = preprocessing.LabelEncoder()
    val[i] = cate_enc.fit_transform(val_data[i].values)



In [21]:
val_n_users = len(np.unique(val['user_id']))
val_n_items = len(np.unique(val['track_id']))

val_ratings = dict()
val_ratings['val'] = cate_enc.fit_transform(val_data['count'])

In [22]:
val_matrix = coo_matrix((val_ratings['val'], (val['user_id'], \
                                                          val['track_id'])) \
                                      ,shape=(n_users, n_items)) #not mistake. the shape needs to be the same 

In [23]:
val_matrix

<163206x110000 sparse matrix of type '<class 'numpy.int64'>'
	with 1368430 stored elements in COOrdinate format>

In [None]:
#there are 4 metrics - AUC, prec_at_k, recall_at_k, and reciprocal_rank

In [25]:
#looking for the best hyperparam. source: https://stackoverflow.com/questions/49896816/how-do-i-optimize-the-hyperparameters-of-lightfm
import itertools

import numpy as np

def sample_hyperparameters():
    """
    Yield possible hyperparameter choices.
    """

    while True:
        yield {
            "no_components": np.random.randint(16, 64),
            "learning_schedule": np.random.choice(["adagrad", "adadelta"]),
            "loss": np.random.choice(["bpr", "warp", "warp-kos"]),
            "learning_rate": np.random.exponential(0.05),
            "item_alpha": np.random.exponential(1e-8),
            "user_alpha": np.random.exponential(1e-8),
            "max_sampled": np.random.randint(5, 15),
            "num_epochs": np.random.randint(5, 50),
        }


def random_search(train, test, num_samples=10, num_threads=8):
    """
    Sample random hyperparameters, fit a LightFM model, and evaluate it
    on the test set.

    Parameters
    ----------

    train: np.float32 coo_matrix of shape [n_users, n_items]
        Training data.
    test: np.float32 coo_matrix of shape [n_users, n_items]
        Test data.
    num_samples: int, optional
        Number of hyperparameter choices to evaluate.


    Returns
    -------

    generator of (auc_score, hyperparameter dict, fitted model)

    """

    for hyperparams in itertools.islice(sample_hyperparameters(), num_samples):
        num_epochs = hyperparams.pop("num_epochs")

        model = LightFM(**hyperparams)
        model.fit(train, epochs=num_epochs, num_threads=num_threads)

        score = auc_score(model, test, train_interactions=train, num_threads=num_threads).mean()

        hyperparams["num_epochs"] = num_epochs

        yield (score, hyperparams, model)

In [27]:
from lightfm import LightFM
LightFM._check_test_train_intersections = lambda x, y, z: True

In [28]:
(score, hyperparams, model) = max(random_search(train_matrix, val_matrix, num_threads=4), key=lambda x: x[0])

print("Best score {} at {}".format(score, hyperparams))

Best score 0.6478155255317688 at {'no_components': 33, 'learning_schedule': 'adagrad', 'loss': 'warp-kos', 'learning_rate': 0.0012108928998233663, 'item_alpha': 4.008980346554833e-09, 'user_alpha': 1.7940262832295985e-08, 'max_sampled': 5, 'num_epochs': 36}


In [29]:
test_path = '/home/drh382/final-project-the-team/ms_test.csv/part-00000-e8dc8482-9e4f-45a3-827f-7e95e6499267-c000.csv'
header_list = ["user_id", "count", "track_id"]
test_data = pd.read_csv(test_path, names=header_list)

In [30]:
test_dataset = Dataset()
test_dataset.fit((test_data['user_id']),
            (test_data['track_id']))

In [31]:
num_users, num_items = test_dataset.interactions_shape()
print('In test data, Num users: {}, num_items {}.'.format(num_users, num_items))

In test data, Num users: 10000, num_items 50074.


In [32]:
test = dict()
# alldata = dict()
for i in test_data[['user_id','track_id']]:
    cate_enc = preprocessing.LabelEncoder()
    test[i] = cate_enc.fit_transform(test_data[i].values)



In [33]:
test_ratings = dict()
test_ratings['test'] = cate_enc.fit_transform(test_data['count'])

In [35]:
test_matrix = coo_matrix((test_ratings['test'], (test['user_id'], \
                                                          test['track_id'])) \
                                      ,shape=(n_users, n_items)) #not mistake. the shape needs to be the same 

In [36]:
test_matrix

<163206x110000 sparse matrix of type '<class 'numpy.int64'>'
	with 135938 stored elements in COOrdinate format>

In [37]:
LightFM?

[0;31mInit signature:[0m
[0mLightFM[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mno_components[0m[0;34m=[0m[0;36m10[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mk[0m[0;34m=[0m[0;36m5[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mn[0m[0;34m=[0m[0;36m10[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mlearning_schedule[0m[0;34m=[0m[0;34m'adagrad'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mloss[0m[0;34m=[0m[0;34m'logistic'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mlearning_rate[0m[0;34m=[0m[0;36m0.05[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mrho[0m[0;34m=[0m[0;36m0.95[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mepsilon[0m[0;34m=[0m[0;36m1e-06[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mitem_alpha[0m[0;34m=[0m[0;36m0.0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0muser_alpha[0m[0;34m=[0m[0;36m0.0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmax_sampled[0m[0;34m=[0m[0;36m10[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mrandom_sta

In [42]:
start_time = time.time()

# no_components': 33, 'learning_schedule': 'adagrad', 'loss': 'warp-kos', 'learning_rate': 0.0012108928998233663, 'item_alpha': 4.008980346554833e-09, 'user_alpha': 1.7940262832295985e-08, 'max_sampled': 5, 'num_epochs': 36

model=LightFM(no_components=33,learning_schedule='adagrad',learning_rate=0.0012108928998233663,loss='warp-kos', item_alpha=4.008980346554833e-09,user_alpha=1.7940262832295985e-08,max_sampled=5)
model.fit(train_matrix,epochs=36,num_threads=8)

auc_train = auc_score(model, train_matrix).mean()
auc_test = auc_score(model, test_matrix).mean()
prec_train = precision_at_k(model, train_matrix, k = 50).mean()
prec_test = precision_at_k(model, test_matrix, k = 50).mean()

recall_train = recall_at_k(model, train_matrix, k = 50).mean()
recall_test = recall_at_k(model, test_matrix, k = 50).mean()

print("--- Run time:  {} mins ---".format((time.time() - start_time)/60))
print("Train AUC Score: {}".format(auc_train))
print("Test AUC Score: {}".format(auc_test))
print("Train Prec Score: {}".format(prec_train))
print("Test Prec Score: {}".format(prec_test))
print("Train Recall Score: {}".format(recall_train))
print("Test Recall Score: {}".format(recall_test))

--- Run time:  53.14894810517629 mins ---
Train AUC Score: 0.8025360703468323
Test AUC Score: 0.7990469932556152
Train Prec Score: 0.006121710874140263
Test Prec Score: 0.0018179998733103275
Train Recall Score: 0.005499042943948241
Test Recall Score: 0.007811867575982336
