# Item-Item Collaborative Filtering Recommender

BitTiger DS501

In [None]:
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity
from time import time

## Load data into pandas

In [None]:
# Read the tab-separated ratings file. No header row is expected because the
# column names are supplied explicitly.
rating_columns = ["user", "movie", "rating", "timestamp"]
df_ratings_contents = pd.read_csv("../data/u.data", sep="\t", names=rating_columns)

In [None]:
# Preview the first few rating records.
df_ratings_contents.head()

In [None]:
# Column dtypes, non-null counts and memory usage of the ratings table.
df_ratings_contents.info()

In [None]:
# Summary statistics for every numeric column of the ratings table.
df_ratings_contents.describe()

## Convert rating records to user-movie utility matrix

### Option #1, use pandas.pivot_table

In [None]:
# Dense utility matrix: one row per user, one column per movie. Cells hold the
# rating (aggregated with pivot_table's default aggregation if a pair repeats);
# user/movie pairs with no rating are filled with 0.
df_utility = df_ratings_contents.pivot_table(
    values='rating',
    index='user',
    columns='movie',
    fill_value=0,
)

In [None]:
# Preview the first few rows (users) of the dense utility matrix.
df_utility.head()

In [None]:
# Column dtypes and memory usage of the dense utility matrix.
df_utility.info()

### Option #2, convert to sparse matrix using scipy.sparse.lil_matrix

In [None]:
# The largest ids determine the matrix dimensions: one row per possible user id
# and one column per possible movie id.
highest_user_id = df_ratings_contents["user"].max()
highest_movie_id = df_ratings_contents["movie"].max()

# Empty users x movies matrix in LIL format; it is filled in afterwards.
ratings_mat = sparse.lil_matrix((highest_user_id, highest_movie_id))
ratings_mat

In [None]:
# Fill the sparse matrix with every rating in one vectorised assignment
# instead of a Python-level iterrows() loop, which builds a Series per record.
# Ids are shifted by 1 so they can serve as 0-based row/column positions
# (the matrix was sized by the highest user and movie id).
user_positions = df_ratings_contents["user"].values - 1
movie_positions = df_ratings_contents["movie"].values - 1
ratings_mat[user_positions, movie_positions] = df_ratings_contents["rating"].values

In [None]:
# Display the matrix summary again, now that the ratings have been stored.
ratings_mat

### Let's carry on with option #2

In [None]:
# Alias, not a copy: utility_mat and ratings_mat refer to the same sparse
# matrix object (rows = users, columns = movies).
utility_mat = ratings_mat

### Calculate item-item similarity matrix

In [None]:
# Item-item similarity matrix.
# Transposing the users x items utility matrix puts one item per row, so the
# cosine similarity is computed between every pair of items from their
# user-rating vectors.
item_sim_mat = cosine_similarity(utility_mat.transpose())


### Calculate neighborhood

In [None]:
# Number of most-similar items kept for each item.
neighborhood_size = 75

# For every item (row), the indexes of all items ordered from least to most
# similar.
least_to_most_sim_indexes = item_sim_mat.argsort(axis=1)

# Neighborhoods: the last `neighborhood_size` columns of each row are the most
# similar items for that row's item.
neighborhoods = least_to_most_sim_indexes[:, -neighborhood_size:]

In [None]:
# Sanity check: one row per item, one column per kept neighbor.
neighborhoods.shape

## Make rating predictions for a user

In [None]:
# Let's pick a lucky user
# NOTE(review): this value is used directly as a row index into ratings_mat,
# whose rows were filled at (user id - 1). Row 100 therefore holds the ratings
# of the user whose id in the data file is 101 - confirm this is intended.
user_id = 100

In [None]:
n_users, n_items = utility_mat.shape

start_time = time()

# Densify this user's row once. Indexing the sparse LIL matrix inside the loop
# is slow, and multiplying a 1 x k sparse slice by a vector yields a size-1
# array rather than a scalar.
# NOTE(review): user_id is used as a 0-based row index here, while rows were
# filled at (user id - 1) - confirm which user is meant.
user_ratings = ratings_mat[user_id].toarray().ravel()
items_rated_by_this_user = np.flatnonzero(user_ratings)

# Just initializing so we have somewhere to put rating preds
out = np.zeros(n_items)
for item_to_rate in range(n_items):
    # Neighbors of the target item that this user has actually rated.
    relevant_items = np.intersect1d(neighborhoods[item_to_rate],
                                    items_rated_by_this_user,
                                    assume_unique=True)  # assume_unique speeds up intersection op
    similarities = item_sim_mat[item_to_rate, relevant_items]
    similarity_total = similarities.sum()
    # With no rated neighbors (or only zero similarities) the weighted average
    # is undefined. Keep the prediction at 0 instead of computing 0/0, which
    # would produce NaN and a RuntimeWarning.
    if similarity_total != 0:
        # Similarity-weighted average of the user's ratings on the neighbors.
        out[item_to_rate] = (
            np.dot(user_ratings[relevant_items], similarities) / similarity_total
        )

# Kept for compatibility with the original flow; replaces any remaining
# non-finite value with a finite number.
pred_ratings = np.nan_to_num(out)
print(pred_ratings)
print("Execution time: %f seconds" % (time()-start_time))

In [None]:
# Sanity check: one predicted rating per item.
pred_ratings.shape

## Get final recommendations for a user

In [None]:
# Recommend n movies
n = 10

# Item indexes ordered from highest to lowest predicted rating
item_index_sorted_by_pred_rating = list(np.argsort(pred_ratings)[::-1])

# Find items that have been rated by user
items_rated_by_this_user = ratings_mat[user_id].nonzero()[1]

# Build a set once so each membership test below is O(1). Testing
# `item not in <ndarray>` scans the whole array for every candidate item.
rated_item_lookup = set(items_rated_by_this_user.tolist())

# We want to exclude the items that have been rated by user
unrated_items_by_pred_rating = [item for item in item_index_sorted_by_pred_rating
                                if item not in rated_item_lookup]

# Top-n recommendations. These are 0-based column indexes of ratings_mat,
# i.e. (movie id - 1), not the movie ids from the data file.
unrated_items_by_pred_rating[:n]
