In [1]:
# Enable IPython's autoreload extension in mode 2, which re-imports all modules
# before each cell executes, so edits to the imported project code take effect
# without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [2]:
from recommender_lib.dataprep.movielens import MovieLens
from recommender_lib.training.pipeline import Pipeline
import pandas as pd
import os

### Explore Dataset

In [3]:
# Directory holding the MovieLens "latest-small" CSV files. It is resolved once
# so the three paths below cannot drift apart. expanduser("~") is used instead
# of os.environ['HOME'] because HOME is not guaranteed to be set (e.g. on
# Windows), where the direct lookup would raise KeyError.
ML_DATA_DIR = os.path.join(os.path.expanduser("~"), "Downloads", "ml-latest-small")

# Point the MovieLens helper at the ratings, movie-title and visual-feature files.
ml = MovieLens(
    ratings_path=os.path.join(ML_DATA_DIR, "ratings.csv"),
    movies_path=os.path.join(ML_DATA_DIR, "movies.csv"),
    movie_content_path=os.path.join(ML_DATA_DIR, "LLVisualFeatures13K_Log.csv"),
)

In [4]:
# Load the ratings through the MovieLens helper. The bare expression on the last
# line makes the notebook display the returned object (a surprise
# DatasetAutoFolds in the recorded run).
print("Loading movie ratings...")
data_ratings = ml.load_dataset()
data_ratings

Loading movie ratings...


<surprise.dataset.DatasetAutoFolds at 0x7f8c74f84eb0>

#### Load CSV files using pandas

In [5]:
# Names for the seven visual-feature columns of the content file.
names = ['avg_shot_length', 'mean_color_variance', 'stddev_color_variance', 
         'mean_motion', 'stddev_motion', 'mean_lighting_key', 'num_shots']

# Each data row carries one more field than there are feature names: a leading
# id column. With only seven names pandas silently promotes that column to an
# unnamed index. Name it explicitly and make it the index so the frame can be
# aligned with the other tables by name.
# NOTE(review): the id is presumably the MovieLens movie id, hence 'item_id'
# to match the ratings/movies frames — confirm against the data file.
df_movie_metadata = pd.read_csv(
    ml.movie_content_path,
    skiprows=1,                     # skip the file's own first (header) line
    names=['item_id'] + names,
    index_col='item_id',
)
df_movie_metadata.head()

Unnamed: 0,avg_shot_length,mean_color_variance,stddev_color_variance,mean_motion,stddev_motion,mean_lighting_key,num_shots
89,0.44083,0.763504,0.784965,0.132239,0.176285,0.275521,0.707383
93,0.467434,0.657441,0.65194,0.024859,0.061322,0.226915,0.687485
94,0.700268,0.652688,0.653051,0.020993,0.050809,0.201239,0.498546
95,0.522593,0.720691,0.725353,0.017811,0.040945,0.240973,0.660679
96,0.782697,0.658655,0.648308,0.138313,0.172365,0.191381,0.353793


In [6]:
# Read the ratings CSV, skipping its first line and applying the notebook's own
# column names instead.
ratings_columns = ['user_id', 'item_id', 'rating', 'timestamp']
df_ratings = pd.read_csv(
    ml.ratings_path,
    names=ratings_columns,
    skiprows=1,
)
df_ratings.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [7]:
# Read the movies CSV, skipping its first line and applying the notebook's own
# column names instead.
movie_columns = ['item_id', 'item_name', 'genres']
df_movie = pd.read_csv(
    ml.movies_path,
    names=movie_columns,
    skiprows=1,
)
df_movie.head()

Unnamed: 0,item_id,item_name,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [8]:
# Attach every rating to its movie record by joining on the shared item_id
# column (merge defaults to an inner join).
df_movie_user_ratings = df_movie.merge(df_ratings, on='item_id')
df_movie_user_ratings.head()

Unnamed: 0,item_id,item_name,genres,user_id,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7,3.0,851866703
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,9,4.0,938629179
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,13,5.0,1331380058
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15,2.0,997938310
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,19,3.0,855190091


In [9]:
# Sanity check: display the merged frame's (rows, columns) dimensions.
df_movie_user_ratings.shape

(100004, 6)

### Train Recommender Model

In [10]:
# Top-N size: the number of recommendations generated per user. It is passed to
# run_evaluation as no_of_recommended_items in the evaluation cell.
N = 10

In [12]:
# Compute popularity ranks up front; they are handed to the evaluation Pipeline
# and, per the message printed here, are intended for the novelty metric.
print("\nComputing movie popularity ranks so we can measure novelty later...")
rankings = ml.get_popularity_rankings()


Computing movie popularity ranks so we can measure novelty later...


In [13]:
# Build the evaluation pipeline from the loaded ratings and the popularity ranks.
# The keyword settings are gathered in one place so they are easy to scan and tweak.
pipeline_settings = dict(
    baseline_sim_options_name="pearson",
    baseline_sim_options_user_based=False,
    test_size=0.25,
    no_of_items_dropped=1,
    training_split_random_state=1,  # fixed seed for the train/test split
)
evaluator = Pipeline(data_ratings, rankings, **pipeline_settings)

Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.


### Evaluate Recommender Model

In [14]:
# Register the contenders: an SVD recommender with a fixed model seed, and a
# baseline that just makes random recommendations.
algorithms_to_compare = [
    ("SVD", {"model_random_state": 10}),
    ("Random", {}),
]
for algorithm_name, algorithm_options in algorithms_to_compare:
    evaluator.add_algorithm(algorithm_name, **algorithm_options)

In [15]:
# Evaluate every registered algorithm, including the top-N metrics with
# N recommended items, and print progress along the way.
evaluator.run_evaluation(
    run_top_n=True,
    no_of_recommended_items=N,
    verbose=True,
)

Evaluating  SVD ...
Evaluating accuracy...
Evaluating top-N with leave-one-out...
Computing hit-rate and rank metrics...
Computing recommendations with full data set...
Analyzing coverage, diversity, and novelty...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Analysis complete.
Evaluating  Random ...
Evaluating accuracy...
Evaluating top-N with leave-one-out...
Computing hit-rate and rank metrics...
Computing recommendations with full data set...
Analyzing coverage, diversity, and novelty...
Computing the pearson similarity matrix...
Done computing similarity matrix.
Analysis complete.


Algorithm  RMSE       MAE        HR         cHR        ARHR       Coverage   Diversity  Novelty   
SVD        0.9034     0.6978     0.0298     0.0298     0.0112     0.9553     0.7381     491.5768  
Random     1.4457     1.1526     0.0179     0.0179     0.0063     1.0000     0.8398     525.4292  

Legend:

RMSE:      Root Mean Squared Error. Lower values mean better accur