Importing all necessary libraries -

In [2]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
import implicit
import time
from scipy import sparse
import scipy.sparse as sp
import matplotlib.pyplot as plt
%matplotlib inline
from lightfm import LightFM
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import recall_at_k
from lightfm.evaluation import auc_score

Reading the entire data -

In [3]:
Ratings_base = pd.read_table("ml-100k/u.data", header=None)
Ratings_base.columns = ["userId", "movieId", "rating", "timestamp"]

In [4]:
train = pd.read_table("ml-100k/u1.base", header = None)
train.columns = ["userId", "movieId", "rating", "timestamp"]
test = pd.read_table("ml-100k/u1.base", header = None)
test.columns = ["userId", "movieId", "rating", "timestamp"]

In [5]:
user_feature = pd.read_table("ml-100k/u.user", header = None, sep = '|')
user_feature.columns = ["user_id", "age", "gender", "occupation", "zip_code"]

In [6]:
item_feature = pd.read_table("ml-100k/u.item", header = None, sep = '|', encoding = 'latin')
item_feature.columns = ['movie_id', 'movie_title', 'release_date', 'video_release_date', 'imdb_url',
                        'unknown', 'action', 'adventure',
                        'animation', 'children', 'comedy', 'crime', 'documentary', 'drama', 'fantasy',
                        'film_noir', 'horror', 'musical', 'mystery', 'romance', 'sci_fi',
                        'thriller', 'war', 'western']

In [7]:
def create_user_item(Ratings, data):
    uniq_users = list(Ratings.userId.unique())
    uniq_item = list(Ratings.movieId.unique())
    user_item_mat = sp.lil_matrix((len(uniq_users), len(uniq_item)), dtype=np.int32)
    for i in range(len(data)):
        user_index = uniq_users.index(Ratings.userId.iloc[i])
        item_index = uniq_item.index(Ratings.movieId.iloc[i])
        user_item_mat[user_index,item_index] = Ratings.rating.iloc[i]
    return user_item_mat.tocoo(), uniq_users , uniq_item

In [8]:
def create_item_feature(Ratings, data):
    uniq_item = list(Ratings.movieId.unique())
    feature_column_mapping = {'movie_id':0, 'movie_title':1, 'release_date':2, 'video_release_date':3, 'imdb_url':4,
                              'unknown':5, 'action':6, 'adventure': 7,'animation': 8, 'children': 9, 'comedy':10,
                              'crime':11, 'documentary':12, 'drama':13, 'fantasy':14, 'film_noir':15, 'horror':16,
                              'musical':17, 'mystery':18, 'romance':19, 'sci_fi':20, 'thriller':21, 'war':22,
                              'western':23}
    feature_encoding = {'unknown':0, 'action':1, 'adventure':2,
                        'animation':3, 'children':4, 'comedy':5, 'crime':6, 'documentary':7, 'drama':8, 'fantasy':9,
                        'film_noir':10, 'horror':11, 'musical':12, 'mystery':13, 'romance':14, 'sci_fi':15,
                        'thriller':16, 'war':17, 'western':18}
    item_feature_mat = sp.lil_matrix((len(uniq_item), len(feature_encoding.keys())), dtype=np.int32)
    for i in range(len(data)):
        item_index = uniq_item.index(data.movie_id.iloc[i])
        for j in data.columns:
            if j in feature_encoding.keys():
                feature_index = feature_encoding[j]
                item_feature_mat[item_index, feature_index] = data.iloc[i, feature_column_mapping[j]]
    return item_feature_mat.tocsr()

In [9]:
def create_user_feature(Ratings, data):
    uniq_user = list(Ratings.userId.unique())
    uniq_gender = list(data.gender.unique())
    uniq_occupation = list(data.occupation.unique())
    uniq_zip = list(data.zip_code.unique())
    feature_column_mapping = {"user_id":0, "age":1, "gender":2, "occupation":3, "zip_code":4}
    feature_encoding = {"age":0, "gender":1, "occupation":2, "zip_code":3}
    user_feature_mat = sp.lil_matrix((len(uniq_user), len(feature_encoding.keys())), dtype=np.int32)
    for i in range(len(data)):
        user_index = uniq_user.index(data.user_id.iloc[i])
        for j in data.columns:
            if j in feature_encoding.keys():
                feature_index = feature_encoding[j]
                #print(j)
                if j == "gender":
                    value = uniq_gender.index(data.iloc[i, feature_column_mapping[j]])
                elif j == "occupation":
                    value = uniq_occupation.index(data.iloc[i, feature_column_mapping[j]])
                elif j == "zip_code":
                    value = uniq_zip.index(data.iloc[i, feature_column_mapping[j]])
                else:
                    value = data.iloc[i, feature_column_mapping[j]]
                user_feature_mat[user_index, feature_index] = value
    return user_feature_mat.tocsr()

In [10]:
def subset_data_density(Ratings,parameter, nlargest):
    if parameter == "movie":
        movie = Ratings.groupby(['movieId']).movieId.value_counts().nlargest(nlargest)
        l1 = list(dict(movie).keys())
        l2 = [ x[0] for x in l1]
        Ratings = Ratings[Ratings.movieId.isin(l2)]
        return Ratings
    elif parameter == "user":
        user = Ratings.groupby(['userId']).userId.value_counts().nlargest(nlargest)
        l1 = list(dict(user).keys())
        l2 = [ x[0] for x in l1]
        Ratings = Ratings[Ratings.userId.isin(l2)]
        return Ratings

In [11]:
def subset_data_size(Ratings, size):
    return Ratings.sample(n = size)

In [12]:
def train_test_split(df, percent):
    indices = np.random.permutation(df.shape[0])
    num= int(np.floor((1-percent)*df.shape[0]))
    training_idx, test_idx = indices[:num], indices[num:]
    train_set = df.iloc[training_idx]
    test_set = df.iloc[test_idx]
    return train_set, test_set

In [13]:
def predict_fm_user_item(train, test, learn_rate, latent_dimension):
    model = LightFM(learning_rate = learn_rate, no_components = latent_dimension, loss = 'warp')
    model.fit(train, epochs=50)
    
    train_precision = precision_at_k(model, train, k=10).mean()
    test_precision = precision_at_k(model, test, k=10).mean()
    
    train_recall = recall_at_k(model, train, k=10).mean()
    test_recall = recall_at_k(model, test, k=10).mean()

    train_auc = auc_score(model, train).mean()
    test_auc = auc_score(model, test).mean()

    print('Precision: train %.2f, test %.2f.' % (train_precision, test_precision))
    print('Recall: train %.2f, test %.2f.' % (train_recall, test_recall))
    print('AUC: train %.2f, test %.2f.' % (train_auc, test_auc))

In [14]:
def predict_fm_user_item_feature(train, test, item_feature_mat, user_feature_mat, learn_rate, latent_dimension):
    model = LightFM(learning_rate = learn_rate, no_components = latent_dimension, loss = 'warp')
    model.fit(train, item_features = item_feature_mat, user_features = user_feature_mat, epochs=50)
    
    train_precision = precision_at_k(model, train, item_features=item_feature_mat,
                                     user_features = user_feature_mat, k=10).mean()
    test_precision = precision_at_k(model, test, item_features=item_feature_mat,
                                    user_features = user_feature_mat, k=10).mean()
    
    train_recall = recall_at_k(model, train, item_features=item_feature_mat,
                               user_features = user_feature_mat, k=10).mean()
    test_recall = recall_at_k(model, test, item_features=item_feature_mat,
                              user_features = user_feature_mat, k=10).mean()

    train_auc = auc_score(model, train, item_features=item_feature_mat, user_features = user_feature_mat).mean()
    test_auc = auc_score(model, test, item_features=item_feature_mat, user_features = user_feature_mat).mean()

    print('Precision: train %.2f, test %.2f.' % (train_precision, test_precision))
    print('Recall: train %.2f, test %.2f.' % (train_recall, test_recall))
    print('AUC: train %.2f, test %.2f.' % (train_auc, test_auc))

# Checking performance on different test - train proportions -

In [15]:
train_test_proportions = [0.5, 0.4, 0.3, 0.2, 0.1]

In [16]:
for proportion in train_test_proportions:
    train, test = train_test_split(Ratings_base, proportion)
    user_item_test, uniq_users , uniq_item = create_user_item(Ratings_base, test)
    user_item_train, uniq_users , uniq_item = create_user_item(Ratings_base, train)
    print("Performance metrics for split ", proportion, "-")
    print("Without user/item features -")
    predict_fm_user_item(user_item_train, user_item_test, 0.1, 10)
    item_feature_mat = create_item_feature(Ratings_base, item_feature)
    user_feature_mat = create_user_feature(Ratings_base, user_feature)
    print("--------------------------------------------------")
    print("With user/item features -")
    predict_fm_user_item_feature(user_item_train, user_item_test, item_feature_mat, user_feature_mat, 0.1, 10)
    print("##################################################")

Performance metrics for split  0.5 -
Without user/item features -
Precision: train 0.49, test 0.49.
Recall: train 0.15, test 0.15.
AUC: train 0.95, test 0.95.
--------------------------------------------------
With user/item features -
Precision: train 0.12, test 0.12.
Recall: train 0.02, test 0.02.
AUC: train 0.54, test 0.54.
##################################################
Performance metrics for split  0.4 -
Without user/item features -
Precision: train 0.52, test 0.45.
Recall: train 0.15, test 0.14.
AUC: train 0.95, test 0.95.
--------------------------------------------------
With user/item features -
Precision: train 0.09, test 0.08.
Recall: train 0.01, test 0.01.
AUC: train 0.52, test 0.51.
##################################################
Performance metrics for split  0.3 -
Without user/item features -
Precision: train 0.56, test 0.38.
Recall: train 0.15, test 0.12.
AUC: train 0.95, test 0.94.
--------------------------------------------------
With user/item features -
Prec

# Checking performance on different alpha rates -

In [17]:
alpha_rates = [0.005, 0.01, 0.05, 0.1, 0.5]

In [18]:
for alpha in alpha_rates:
    train, test = train_test_split(Ratings_base, 0.2)
    user_item_test, uniq_users , uniq_item = create_user_item(Ratings_base, test)
    user_item_train, uniq_users , uniq_item = create_user_item(Ratings_base, train)
    print("Performance metrics for learning rate ", alpha, "-")
    print("Without user/item features -")
    predict_fm_user_item(user_item_train, user_item_test, alpha, 10)
    item_feature_mat = create_item_feature(Ratings_base, item_feature)
    user_feature_mat = create_user_feature(Ratings_base, user_feature)
    print("--------------------------------------------------")
    print("With user/item features -")
    predict_fm_user_item_feature(user_item_train, user_item_test, item_feature_mat, user_feature_mat, alpha, 10)
    print("##################################################")

Performance metrics for learning rate  0.005 -
Without user/item features -
Precision: train 0.44, test 0.21.
Recall: train 0.08, test 0.07.
AUC: train 0.86, test 0.84.
--------------------------------------------------
With user/item features -
Precision: train 0.21, test 0.10.
Recall: train 0.03, test 0.03.
AUC: train 0.63, test 0.62.
##################################################
Performance metrics for learning rate  0.01 -
Without user/item features -
Precision: train 0.52, test 0.24.
Recall: train 0.11, test 0.10.
AUC: train 0.90, test 0.89.
--------------------------------------------------
With user/item features -
Precision: train 0.20, test 0.09.
Recall: train 0.03, test 0.03.
AUC: train 0.61, test 0.61.
##################################################
Performance metrics for learning rate  0.05 -
Without user/item features -
Precision: train 0.63, test 0.31.
Recall: train 0.14, test 0.13.
AUC: train 0.95, test 0.93.
--------------------------------------------------
Wi

# Checking performance on different number of latent dimensions -

In [19]:
latent_dimensions = [1, 5, 10, 20, 50]

In [20]:
for dimension in latent_dimensions:
    train, test = train_test_split(Ratings_base, 0.2)
    user_item_test, uniq_users , uniq_item = create_user_item(Ratings_base, test)
    user_item_train, uniq_users , uniq_item = create_user_item(Ratings_base, train)
    print("Performance metrics for ", dimension, "latent dimensions -")
    print("Without user/item features -")
    predict_fm_user_item(user_item_train, user_item_test, 0.1, dimension)
    item_feature_mat = create_item_feature(Ratings_base, item_feature)
    user_feature_mat = create_user_feature(Ratings_base, user_feature)
    print("--------------------------------------------------")
    print("With user/item features -")
    predict_fm_user_item_feature(user_item_train, user_item_test, item_feature_mat, user_feature_mat, 0.1, dimension)
    print("##################################################")

Performance metrics for  1 latent dimensions -
Without user/item features -
Precision: train 0.51, test 0.24.
Recall: train 0.10, test 0.10.
AUC: train 0.91, test 0.89.
--------------------------------------------------
With user/item features -
Precision: train 0.06, test 0.03.
Recall: train 0.01, test 0.01.
AUC: train 0.52, test 0.52.
##################################################
Performance metrics for  5 latent dimensions -
Without user/item features -
Precision: train 0.60, test 0.29.
Recall: train 0.13, test 0.13.
AUC: train 0.94, test 0.93.
--------------------------------------------------
With user/item features -
Precision: train 0.11, test 0.04.
Recall: train 0.01, test 0.01.
AUC: train 0.51, test 0.47.
##################################################
Performance metrics for  10 latent dimensions -
Without user/item features -
Precision: train 0.64, test 0.30.
Recall: train 0.14, test 0.13.
AUC: train 0.95, test 0.94.
--------------------------------------------------

# Checking performance by picking different item-set sizes -

In [21]:
item_set_size = [1000, 500, 250, 100, 50]

In [22]:
for size in item_set_size:
    subset_base = subset_data_density(Ratings_base, "movie", size)
    train, test = train_test_split(subset_base, 0.2)
    user_item_test, uniq_users , uniq_item = create_user_item(subset_base, test)
    user_item_train, uniq_users , uniq_item = create_user_item(subset_base, train)
    print("Performance metrics for top", size, " rated items -")
    print("Without user/item features -")
    predict_fm_user_item(user_item_train, user_item_test, 0.1, 50)
    item_feature_mat = create_item_feature(Ratings_base, item_feature)
    user_feature_mat = create_user_feature(Ratings_base, user_feature)
    print("--------------------------------------------------")
    print("With user/item features -")
    predict_fm_user_item_feature(user_item_train, user_item_test, item_feature_mat, user_feature_mat, 0.1, 50)
    print("##################################################")

Performance metrics for top 1000  rated items -
Without user/item features -
Precision: train 0.87, test 0.39.
Recall: train 0.22, test 0.19.
AUC: train 0.98, test 0.95.
--------------------------------------------------
With user/item features -
Precision: train 0.09, test 0.04.
Recall: train 0.01, test 0.01.
AUC: train 0.48, test 0.48.
##################################################
Performance metrics for top 500  rated items -
Without user/item features -
Precision: train 0.93, test 0.41.
Recall: train 0.29, test 0.24.
AUC: train 0.98, test 0.93.
--------------------------------------------------
With user/item features -
Precision: train 0.15, test 0.07.
Recall: train 0.03, test 0.03.
AUC: train 0.48, test 0.48.
##################################################
Performance metrics for top 250  rated items -
Without user/item features -
Precision: train 0.97, test 0.44.
Recall: train 0.39, test 0.33.
AUC: train 0.99, test 0.91.
--------------------------------------------------

# Checking performance by picking different user-set sizes -

In [23]:
user_set_size = [1000, 500, 250, 100, 50]

In [24]:
for size in user_set_size:
    subset_base = subset_data_density(Ratings_base, "user", size)
    train, test = train_test_split(subset_base, 0.2)
    user_item_test, uniq_users , uniq_item = create_user_item(subset_base, test)
    user_item_train, uniq_users , uniq_item = create_user_item(subset_base, train)
    print("Performance metrics for top", size, " users -")
    print("Without user/item features -")
    predict_fm_user_item(user_item_train, user_item_test, 0.1, 50)
    item_feature_mat = create_item_feature(Ratings_base, item_feature)
    user_feature_mat = create_user_feature(Ratings_base, user_feature)
    print("--------------------------------------------------")
    print("With user/item features -")
    predict_fm_user_item_feature(user_item_train, user_item_test, item_feature_mat, user_feature_mat, 0.1, 50)
    print("##################################################")

Performance metrics for top 1000  users -
Without user/item features -
Precision: train 0.83, test 0.37.
Recall: train 0.20, test 0.17.
AUC: train 0.99, test 0.97.
--------------------------------------------------
With user/item features -
Precision: train 0.08, test 0.04.
Recall: train 0.01, test 0.01.
AUC: train 0.56, test 0.55.
##################################################
Performance metrics for top 500  users -
Without user/item features -
Precision: train 0.90, test 0.39.
Recall: train 0.09, test 0.08.
AUC: train 0.99, test 0.96.
--------------------------------------------------
With user/item features -
Precision: train 0.12, test 0.07.
Recall: train 0.01, test 0.01.
AUC: train 0.50, test 0.49.
##################################################
Performance metrics for top 250  users -
Without user/item features -
Precision: train 0.96, test 0.41.
Recall: train 0.06, test 0.05.
AUC: train 0.99, test 0.95.
--------------------------------------------------
With user/item fe

In [25]:
train, test = train_test_split(Ratings_base, 0.2)

In [26]:
user_item_test, uniq_users , uniq_item = create_user_item(Ratings_base, test)

In [27]:
user_item_train, uniq_users , uniq_item = create_user_item(Ratings_base, train)

In [28]:
predict_fm_user_item(user_item_train, user_item_test, 0.1, 10)

Precision: train 0.64, test 0.30.
Recall: train 0.14, test 0.13.
AUC: train 0.95, test 0.94.


In [29]:
item_feature_mat = create_item_feature(Ratings_base, item_feature)

In [30]:
user_feature_mat = create_user_feature(Ratings_base, user_feature)

In [31]:
predict_fm_user_item_feature(user_item_train, user_item_test, item_feature_mat, user_feature_mat, 0.1, 10)

Precision: train 0.02, test 0.01.
Recall: train 0.00, test 0.00.
AUC: train 0.43, test 0.42.
