Importing all necessary libraries -

In [5]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
import implicit
import time
from scipy import sparse
import scipy.sparse as sp
import matplotlib.pyplot as plt
%matplotlib inline
from lightfm import LightFM
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import recall_at_k
from lightfm.evaluation import auc_score

Reading the entire data -

In [6]:
Ratings_base = pd.read_table("ml-100k/u.data", header=None)
Ratings_base.columns = ["userId", "movieId", "rating", "timestamp"]

FileNotFoundError: File b'ml-100k/u.data' does not exist

In [20]:
train = pd.read_table("ml-100k/u1.base", header = None)
train.columns = ["userId", "movieId", "rating", "timestamp"]
test = pd.read_table("ml-100k/u1.base", header = None)
test.columns = ["userId", "movieId", "rating", "timestamp"]

In [69]:
user_feature = pd.read_table("ml-100k/u.user", header = None, sep = '|')
user_feature.columns = ["user_id", "age", "gender", "occupation", "zip_code"]

In [73]:
item_feature = pd.read_table("ml-100k/u.item", header = None, sep = '|', encoding = 'latin')
item_feature.columns = ['movie_id', 'movie_title', 'release_date', 'video_release_date', 'imdb_url',
                        'unknown', 'action', 'adventure',
                        'animation', 'children', 'comedy', 'crime', 'documentary', 'drama', 'fantasy',
                        'film_noir', 'horror', 'musical', 'mystery', 'romance', 'sci_fi',
                        'thriller', 'war', 'western']

In [33]:
def create_user_item(Ratings, data):
    uniq_users = list(Ratings.userId.unique())
    uniq_item = list(Ratings.movieId.unique())
    user_item_mat = sp.lil_matrix((len(uniq_users), len(uniq_item)), dtype=np.int32)
    for i in range(len(data)):
        user_index = uniq_users.index(Ratings.userId.iloc[i])
        item_index = uniq_item.index(Ratings.movieId.iloc[i])
        user_item_mat[user_index,item_index] = Ratings.rating.iloc[i]
    return user_item_mat.tocoo(), uniq_users , uniq_item

In [80]:
def create_item_feature(Ratings, data):
    uniq_item = list(Ratings.movieId.unique())
    feature_column_mapping = {'movie_id':0, 'movie_title':1, 'release_date':2, 'video_release_date':3, 'imdb_url':4,
                              'unknown':5, 'action':6, 'adventure': 7,'animation': 8, 'children': 9, 'comedy':10,
                              'crime':11, 'documentary':12, 'drama':13, 'fantasy':14, 'film_noir':15, 'horror':16,
                              'musical':17, 'mystery':18, 'romance':19, 'sci_fi':20, 'thriller':21, 'war':22,
                              'western':23}
    feature_encoding = {'unknown':0, 'action':1, 'adventure':2,
                        'animation':3, 'children':4, 'comedy':5, 'crime':6, 'documentary':7, 'drama':8, 'fantasy':9,
                        'film_noir':10, 'horror':11, 'musical':12, 'mystery':13, 'romance':14, 'sci_fi':15,
                        'thriller':16, 'war':17, 'western':18}
    item_feature_mat = sp.lil_matrix((len(uniq_item), len(feature_encoding.keys())), dtype=np.int32)
    for i in range(len(data)):
        item_index = uniq_item.index(data.movie_id.iloc[i])
        for j in data.columns:
            if j in feature_encoding.keys():
                feature_index = feature_encoding[j]
                item_feature_mat[item_index, feature_index] = data.iloc[i, feature_column_mapping[j]]
    return item_feature_mat.tocsr()

In [95]:
def create_user_feature(Ratings, data):
    uniq_user = list(Ratings.userId.unique())
    uniq_gender = list(data.gender.unique())
    uniq_occupation = list(data.occupation.unique())
    uniq_zip = list(data.zip_code.unique())
    feature_column_mapping = {"user_id":0, "age":1, "gender":2, "occupation":3, "zip_code":4}
    feature_encoding = {"age":0, "gender":1, "occupation":2, "zip_code":3}
    user_feature_mat = sp.lil_matrix((len(uniq_user), len(feature_encoding.keys())), dtype=np.int32)
    for i in range(len(data)):
        user_index = uniq_user.index(data.user_id.iloc[i])
        for j in data.columns:
            if j in feature_encoding.keys():
                feature_index = feature_encoding[j]
                #print(j)
                if j == "gender":
                    value = uniq_gender.index(data.iloc[i, feature_column_mapping[j]])
                elif j == "occupation":
                    value = uniq_occupation.index(data.iloc[i, feature_column_mapping[j]])
                elif j == "zip_code":
                    value = uniq_zip.index(data.iloc[i, feature_column_mapping[j]])
                else:
                    value = data.iloc[i, feature_column_mapping[j]]
                user_feature_mat[user_index, feature_index] = value
    return user_feature_mat.tocsr()

In [5]:
def subset_data_density(Ratings,parameter, nlargest):
    if parameter == "movie":
        movie = Ratings.groupby(['movieId']).movieId.value_counts().nlargest(nlargest)
        l1 = list(dict(movie).keys())
        l2 = [ x[0] for x in l1]
        Ratings = Ratings[Ratings.movieId.isin(l2)]
        return Ratings
    elif parameter == "user":
        user = Ratings.groupby(['userId']).userId.value_counts().nlargest(nlargest)
        l1 = list(dict(user).keys())
        l2 = [ x[0] for x in l1]
        Ratings = Ratings[Ratings.userId.isin(l2)]
        return Ratings

In [6]:
def subset_data_size(Ratings, size):
    return Ratings.sample(n = size)

In [7]:
def train_test_split(df, percent):
    indices = np.random.permutation(df.shape[0])
    num= int(np.floor((1-percent)*df.shape[0]))
    training_idx, test_idx = indices[:num], indices[num:]
    train_set = df.iloc[training_idx]
    test_set = df.iloc[test_idx]
    return train_set, test_set

In [1]:
def predict_fm_user_item(train, test, learn_rate, latent_dimension):
    model = LightFM(learning_rate = learn_rate, no_components = latent_dimension, loss = 'warp')
    model.fit(train, epochs=50)
    
    train_precision = precision_at_k(model, train, k=10).mean()
    test_precision = precision_at_k(model, test, k=10).mean()
    
    train_recall = recall_at_k(model, train, k=10).mean()
    test_recall = recall_at_k(model, test, k=10).mean()

    train_auc = auc_score(model, train).mean()
    test_auc = auc_score(model, test).mean()

    print('Precision: train %.2f, test %.2f.' % (train_precision, test_precision))
    print('Recall: train %.2f, test %.2f.' % (train_recall, test_recall))
    print('AUC: train %.2f, test %.2f.' % (train_auc, test_auc))

In [97]:
def predict_fm_user_item_feature(train, test, item_feature_mat, user_feature_mat, learn_rate, latent_dimension):
    model = LightFM(learning_rate = learn_rate, no_components = latent_dimension)
    model.fit(train, item_features = item_feature_mat, user_features = user_feature_mat, epochs=50)
    
    train_precision = precision_at_k(model, train, item_features=item_feature_mat,
                                     user_features = user_feature_mat, k=10).mean()
    test_precision = precision_at_k(model, test, item_features=item_feature_mat,
                                    user_features = user_feature_mat, k=10).mean()
    
    train_recall = recall_at_k(model, train, item_features=item_feature_mat,
                               user_features = user_feature_mat, k=10).mean()
    test_recall = recall_at_k(model, test, item_features=item_feature_mat,
                              user_features = user_feature_mat, k=10).mean()

    train_auc = auc_score(model, train, item_features=item_feature_mat, user_features = user_feature_mat).mean()
    test_auc = auc_score(model, test, item_features=item_feature_mat, user_features = user_feature_mat).mean()

    print('Precision: train %.2f, test %.2f.' % (train_precision, test_precision))
    print('Recall: train %.2f, test %.2f.' % (train_recall, test_recall))
    print('AUC: train %.2f, test %.2f.' % (train_auc, test_auc))

In [3]:
train, test = train_test_split(Ratings_base, 0.2)

NameError: name 'train_test_split' is not defined

In [65]:
user_item_test, uniq_users , uniq_item = create_user_item(Ratings_base, test)

In [66]:
user_item_train, uniq_users , uniq_item = create_user_item(Ratings_base, train)

In [2]:
predict_fm_user_item(user_item_train, user_item_test, 0.1, 10)

NameError: name 'user_item_train' is not defined

In [81]:
item_feature_mat = create_item_feature(Ratings_base, item_feature)

In [96]:
user_feature_mat = create_user_feature(Ratings_base, user_feature)

In [98]:
predict_fm_user_item_feature(user_item_train, user_item_test, item_feature_mat, user_feature_mat, 0.1, 10)

Precision: train 0.06, test 0.03.
Recall: train 0.01, test 0.01.
AUC: train 0.45, test 0.45.
