In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell executes and edits to library code take
# effect without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [2]:
import os
from pathlib import Path

import pandas as pd

from recommender_lib.dataprep.movielens import MovieLens
from recommender_lib.training.pipeline import train_recommender, evaluate_recommender

### Explore Dataset

In [3]:
# Folder holding the extracted MovieLens "ml-latest-small" CSV files.
# Set the MOVIELENS_DIR environment variable to point somewhere else; the
# default is resolved from the current user's home directory so the notebook
# is not tied to one machine's absolute path.
DATA_DIR = Path(os.environ.get("MOVIELENS_DIR", Path.home() / "Downloads" / "ml-latest-small"))

# Paths are handed over as plain strings, exactly as before.
# No movie-content file is supplied (empty string).
ml = MovieLens(
    ratings_path=str(DATA_DIR / "ratings.csv"),
    movies_path=str(DATA_DIR / "movies.csv"),
    movie_content_path="",
)

In [8]:
# Load the ratings through the MovieLens helper; the recorded output shows
# the result is a surprise `DatasetAutoFolds` object.
print("Loading movie ratings...")
data_ratings = ml.load_dataset()
# Bare expression so the notebook displays the loaded object's repr.
data_ratings

Loading movie ratings...


<surprise.dataset.DatasetAutoFolds at 0x7f8f1300d040>

#### Load CSV files using pandas

In [4]:
# The first line of the ratings file is skipped and the columns are named
# explicitly with the identifiers used throughout this notebook.
ratings_columns = ['user_id', 'item_id', 'rating', 'timestamp']
df_ratings = pd.read_csv(
    ml.ratings_path,
    names=ratings_columns,
    skiprows=1,
)
df_ratings.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [5]:
# The first line of the movies file is skipped and the columns are named
# explicitly; `item_id` matches the key column name used for the ratings frame.
movie_columns = ['item_id', 'item_name', 'genres']
df_movie = pd.read_csv(
    ml.movies_path,
    names=movie_columns,
    skiprows=1,
)
df_movie.head()

Unnamed: 0,item_id,item_name,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
# Attach movie metadata to every rating: inner join (the pandas default)
# of the movies frame with the ratings frame on the shared `item_id` column.
df_movie_user_ratings = df_movie.merge(df_ratings, on='item_id')
df_movie_user_ratings.head()

Unnamed: 0,item_id,item_name,genres,user_id,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,3.0,851866703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,9,4.0,938629179
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,13,5.0,1331380058
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.0,997938310
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,19,3.0,855190091


In [7]:
# (rows, columns) of the merged movie/rating frame.
df_movie_user_ratings.shape

(100004, 6)

### Train Recommender Model

In [9]:
# Top-N size: the number of recommendations per user. Passed as
# `no_of_recommended_items` to both the training and the evaluation call.
N = 10

In [10]:
# Train an SVD recommender on the loaded ratings. Per the recorded output,
# this call also prints accuracy (RMSE/MAE) and top-N hit-rate metrics.
# Both random states are fixed so repeated runs use the same seeds.
similarity_svd_model = train_recommender(
    data_ratings,
    algorithm="SVD",
    no_of_recommended_items=N,
    model_random_state=10,
    test_size=0.25,
    no_of_items_dropped=1,
    training_split_random_state=1,
)


Building recommendation model using SVD...

Computing recommendations...

Evaluating accuracy of model...
RMSE:  0.9033701087151801
MAE:  0.6977882196132263

Evaluating top-10 recommendations...
Computing recommendations with leave-one-out...
Predict ratings for left-out set...
Predict all missing ratings...
Compute top 10 recs per user...

Hit Rate:  0.029806259314456036

rHR (Hit Rate by Rating value): 
3.5 0.017241379310344827
4.0 0.0425531914893617
4.5 0.020833333333333332
5.0 0.06802721088435375

cHR (Cumulative Hit Rate, rating >= 4):  0.04960835509138381

ARHR (Average Reciprocal Hit Rank):  0.0111560570576964


### Evaluate Recommender Model

In [11]:
# Fetch the movie popularity rankings from the MovieLens helper; they are
# passed as `rankings` to the evaluation call, which reports novelty.
print("\nComputing movie popularity ranks so we can measure novelty later...")
rankings = ml.get_popularity_rankings()


Computing movie popularity ranks so we can measure novelty later...


In [12]:
# Evaluate the trained model with diversity and novelty reporting switched
# on. `rankings` supplies the popularity ranks; the baseline_* options are
# echoed as the similarity options in the recorded output.
evaluate_recommender(
    data_ratings,
    similarity_svd_model,
    no_of_recommended_items=N,
    rating_threshold=4.0,
    evaluate_diversity=True,
    evaluate_novelty=True,
    baseline_name='pearson_baseline',
    baseline_user_based=False,
    rankings=rankings,
)


Computing item similarities so we can measure diversity later...
similarity options: {'name': 'pearson_baseline', 'user_based': False}

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.

Computing complete recommendations, no hold outs...

User coverage:  0.9552906110283159

Novelty (average popularity rank):  491.5767777960256
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.

Diversity:  0.9665208258150911
