In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Markdown, display, HTML
from collections import defaultdict
from sklearn.model_selection import KFold

# Workaround for the "dying kernel" problem that shows up in some installations;
# it can be deleted if the notebook runs fine without it.
# NOTE(review): the variable name suggests it tells the OpenMP (KMP) runtime to
# tolerate a duplicated library — confirm against the runtime's documentation.
import os
os.environ.update({'KMP_DUPLICATE_LIB_OK': 'True'})

# Raise the pandas display limits: up to 300 rows and 50 columns are rendered
# before a frame is truncated. set_option accepts several (name, value) pairs.
pd.set_option('display.max_rows', 300, 'display.max_columns', 50)

# MovieLens data

In [3]:
# Read the MovieLens "small" CSV files and rename their id columns to the
# user_id / item_id names used throughout the rest of the notebook.
movielens_dir = os.path.join("..", "data", "movielens_small")
ml_ratings_df = (
    pd.read_csv(os.path.join(movielens_dir, "ratings.csv"))
    .rename(columns={'userId': 'user_id', 'movieId': 'item_id'})
)
ml_movies_df = (
    pd.read_csv(os.path.join(movielens_dir, "movies.csv"))
    .rename(columns={'movieId': 'item_id'})
)
# Ratings joined with the movie columns on item_id (default inner join).
ml_df = ml_ratings_df.merge(ml_movies_df, on='item_id')

# Dataset size summary.
n_movies = len(ml_movies_df)
n_users = len(ml_ratings_df.user_id.unique())
n_interactions = len(ml_ratings_df)
print("Number of movies: {}".format(n_movies))
print("Number of users: {}".format(n_users))
print("Number of interactions: {}".format(n_interactions))
print()

# Preview of both tables.
print("Movies")
display(ml_movies_df.head(10))
print("Interactions")
display(ml_ratings_df.head(30))

Number of movies: 9742
Number of users: 610
Number of interactions: 100836

Movies


Unnamed: 0,item_id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


Interactions


Unnamed: 0,user_id,item_id,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
5,1,70,3.0,964982400
6,1,101,5.0,964980868
7,1,110,4.0,964982176
8,1,151,5.0,964984041
9,1,157,5.0,964984100


# Train a chosen recommender

In [7]:
from model.nearest_neighbors_recommender import ItemBasedCosineNearestNeighborsRecommender

# Value passed to the recommender's n_neighbors constructor argument.
IBCNN_N_NEIGHBORS = 30

# Create the (not yet fitted) item-based cosine nearest-neighbours recommender.
ibcnn_recommender = ItemBasedCosineNearestNeighborsRecommender(n_neighbors=IBCNN_N_NEIGHBORS)

In [5]:
# Train the recommender on the ratings table and the movie metadata; None is
# passed as the second positional argument.
# NOTE(review): the second argument is presumably a users DataFrame, which this
# dataset does not provide — confirm against the recommender's fit() signature.
ibcnn_recommender.fit(ml_ratings_df, None, ml_movies_df)

In [11]:
import pickle

# Serialize the recommender to model/model.pkl.
# The file is opened in a `with` block so the handle is flushed and closed as
# soon as the dump finishes, rather than being left open until it happens to
# be garbage-collected.
# NOTE: loading a pickle can execute arbitrary code — only unpickle this file
# when it comes from a trusted source.
with open(os.path.join("model", "model.pkl"), 'wb') as model_file:
    pickle.dump(ibcnn_recommender, model_file)

In [6]:
import joblib

# Write the recommender in joblib format to model/model.joblib.
joblib_path = os.path.join("model", "model.joblib")
with open(joblib_path, 'wb') as joblib_file:
    joblib.dump(ibcnn_recommender, joblib_file)

# (Optional) Evaluation

## Take a look at user 6's preferences

In [6]:
# Show every movie rated by user 6, highest rating first.

# Boolean mask selecting the rows of the merged ratings/movies table for user 6.
is_active_user = ml_df['user_id'].eq(6)
active_user_movies = ml_df[is_active_user]
print("Active user history")
display(active_user_movies.sort_values(by='rating', ascending=False))

Active user history


Unnamed: 0,user_id,item_id,rating,timestamp,title,genres
216,6,3,5.0,845554296,Grumpier Old Men (1995),Comedy|Romance
5339,6,780,5.0,845556915,Independence Day (a.k.a. ID4) (1996),Action|Adventure|Sci-Fi|Thriller
16298,6,318,5.0,845553200,"Shawshank Redemption, The (1994)",Crime|Drama
19628,6,368,5.0,845553726,Maverick (1994),Adventure|Comedy|Western
19941,6,588,5.0,845553146,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical
...,...,...,...,...,...,...
30804,6,327,1.0,845554062,Tank Girl (1995),Action|Comedy|Sci-Fi
33122,6,640,1.0,845555362,Diabolique (1996),Drama|Thriller
29246,6,179,1.0,845555362,Mad Love (1995),Drama|Romance
33245,6,762,1.0,845555006,Striptease (1996),Comedy|Crime


## Generate recommendations

In [7]:
# Ask every recommender in the list for recommendations for user 6 and show
# them together with the movie title and genres.
recommenders = [ibcnn_recommender]

for recommender in recommenders:
    # Single-row frame holding the target user id.
    target_users = pd.DataFrame({'user_id': [6]})
    # NOTE(review): the trailing 5 is presumably the number of recommendations
    # to return (the recorded output shows five rows) — confirm.
    recommendations = recommender.recommend(target_users, ml_movies_df, 5)

    # Left join so each recommended item is kept even without a metadata match.
    recommendations = pd.merge(left=recommendations, right=ml_movies_df, how='left', on='item_id')
    print("Recommendations for {}".format(type(recommender).__name__))
    display(recommendations)

Recommendations for ItemBasedCosineNearestNeighborsRecommender


Unnamed: 0,user_id,item_id,score,title,genres
0,6,551,76.10934,"Nightmare Before Christmas, The (1993)",Animation|Children|Fantasy|Musical
1,6,648,74.424101,Mission: Impossible (1996),Action|Adventure|Mystery|Thriller
2,6,420,73.459661,Beverly Hills Cop III (1994),Action|Comedy|Crime|Thriller
3,6,586,72.446457,Home Alone (1990),Children|Comedy
4,6,442,70.079843,Demolition Man (1993),Action|Adventure|Sci-Fi


### Train-test split test

In [8]:
from evaluation_and_testing.testing import evaluate_train_test_split_implicit

# Evaluate on a train/test split using a newly constructed recommender.
# NOTE(review): this rebinds ibcnn_recommender, so the instance fitted earlier
# in the notebook is no longer reachable under that name after this cell runs.
ibcnn_recommender = ItemBasedCosineNearestNeighborsRecommender(n_neighbors=30)

recommenders = [ibcnn_recommender]

# Column layout of one result row: recommender name followed by the metrics
# in the order the evaluator yields them.
metric_columns = ['Recommender', 'HR@1', 'HR@3', 'HR@5', 'HR@10', 'NDCG@1', 'NDCG@3', 'NDCG@5', 'NDCG@10']

all_results = []

for recommender in recommenders:
    scores = evaluate_train_test_split_implicit(recommender, ml_ratings_df, ml_movies_df)
    # One-row frame for this recommender.
    # NOTE(review): the recorded output reports HR@5 and HR@10 above 1, which a
    # plain hit ratio would not exceed — confirm how the evaluator aggregates hits.
    results = pd.DataFrame([[type(recommender).__name__, *scores]], columns=metric_columns)
    all_results.append(results)

    display(results)

# Stack the per-recommender rows into a single table with a clean 0..n-1 index.
all_results = pd.concat(all_results).reset_index(drop=True)
display(all_results)

Unnamed: 0,Recommender,HR@1,HR@3,HR@5,HR@10,NDCG@1,NDCG@3,NDCG@5,NDCG@10
0,ItemBasedCosineNearestNeighborsRecommender,0.331148,0.801639,1.281967,2.193443,0.331148,0.60138,0.797685,1.090682


Unnamed: 0,Recommender,HR@1,HR@3,HR@5,HR@10,NDCG@1,NDCG@3,NDCG@5,NDCG@10
0,ItemBasedCosineNearestNeighborsRecommender,0.331148,0.801639,1.281967,2.193443,0.331148,0.60138,0.797685,1.090682
