# Recommender system - Item-based Collaborative Filtering

Source: [Item-based Collaborative Filtering in Python (Towards Data Science)](https://towardsdatascience.com/item-based-collaborative-filtering-in-python-91f747200fab)

In [20]:
import pandas as pd
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity


In [21]:
# Load the movie x user ratings table (first CSV column becomes the movie index)
# and replace missing ratings with 0, so that 0 means "not rated" from here on.
df = pd.read_csv("data/item-based-cf.csv", index_col=0).fillna(0)
df

Unnamed: 0,user_0,user_1,user_2,user_3,user_4,user_5,user_6,user_7,user_8,user_9
movie_0,4.0,5.0,4.0,3.0,5.0,1.0,0.0,3.0,2.0,0.0
movie_1,1.0,0.0,1.0,0.0,2.0,4.0,5.0,4.0,0.0,4.0
movie_2,0.0,1.0,3.0,0.0,0.0,5.0,5.0,0.0,4.0,3.0
movie_3,0.0,0.0,0.0,2.0,0.0,0.0,4.0,5.0,0.0,4.0
movie_4,4.0,4.0,0.0,5.0,0.0,3.0,1.0,4.0,0.0,0.0
movie_5,5.0,4.0,4.0,3.0,4.0,2.0,1.0,0.0,0.0,1.0
movie_6,2.0,0.0,0.0,2.0,3.0,0.0,0.0,0.0,5.0,4.0
movie_7,1.0,2.0,1.0,0.0,0.0,4.0,0.0,4.0,4.0,0.0
movie_8,0.0,0.0,1.0,0.0,2.0,0.0,5.0,4.0,0.0,5.0
movie_9,5.0,5.0,3.0,3.0,5.0,2.0,0.0,0.0,2.0,1.0


In [22]:
from sklearn.neighbors import NearestNeighbors

# Brute-force nearest-neighbour search with cosine distance; every row of
# df.values (one movie's ratings across all users) is one sample.
knn = NearestNeighbors(metric='cosine', algorithm='brute').fit(df.values)

# Query with the same rows that were fitted. In the output below the first
# neighbour of each movie is the movie itself, so n_neighbors=3 leaves two
# genuine neighbours per movie.
distances, indices = knn.kneighbors(df.values, n_neighbors=3)

indices

array([[0, 9, 5],
       [1, 8, 3],
       [2, 1, 7],
       [3, 8, 1],
       [4, 0, 5],
       [5, 9, 0],
       [6, 9, 0],
       [7, 2, 4],
       [8, 3, 1],
       [9, 5, 0]])

In [23]:
# Cosine distances returned by knn.kneighbors; row i, column k is the distance
# from movie i to the movie at indices[i, k].
distances

array([[0.00000000e+00, 6.27034924e-02, 9.49277133e-02],
       [0.00000000e+00, 1.18745768e-01, 1.93304631e-01],
       [1.11022302e-16, 2.67802500e-01, 3.94829828e-01],
       [0.00000000e+00, 8.82890759e-02, 1.93304631e-01],
       [0.00000000e+00, 2.93014934e-01, 3.21347253e-01],
       [1.11022302e-16, 3.94942472e-02, 9.49277133e-02],
       [2.22044605e-16, 4.14942662e-01, 5.00246366e-01],
       [0.00000000e+00, 3.94829828e-01, 4.02519047e-01],
       [0.00000000e+00, 8.82890759e-02, 1.18745768e-01],
       [0.00000000e+00, 3.94942472e-02, 6.27034924e-02]])

# Predict a Rating for a Movie by a User

In [24]:
def get_movies_distances(movie_title):
    """Return the nearest neighbours of a movie, excluding the movie itself.

    Reads the module-level ``df`` (to map the title to a row position) and the
    module-level ``indices`` / ``distances`` arrays produced by
    ``knn.kneighbors``.

    Parameters
    ----------
    movie_title : label in ``df.index``
        Title of the movie to look up.

    Returns
    -------
    (sim_movies, movie_distances) : tuple of two lists
        Row positions of the neighbouring movies and the matching distances,
        in the order stored in ``indices`` / ``distances``.

    Raises
    ------
    ValueError
        If ``movie_title`` is not in ``df.index``.

    Notes
    -----
    Prints the neighbours and their distances as a side effect.
    """
    # row position of movie_title in the ratings table
    index_for_movie = df.index.tolist().index(movie_title)

    neighbour_ids = indices[index_for_movie].tolist()
    neighbour_distances = distances[index_for_movie].tolist()

    # Keep every neighbour except the queried movie. Filtering (rather than
    # list.index / list.remove) also works when the movie is missing from its
    # own neighbour row, e.g. when several movies tie at distance 0 -- the
    # index/remove approach raises ValueError in that case.
    kept_positions = [
        position
        for position, neighbour_id in enumerate(neighbour_ids)
        if neighbour_id != index_for_movie
    ]
    sim_movies = [neighbour_ids[position] for position in kept_positions]
    movie_distances = [neighbour_distances[position] for position in kept_positions]

    print('The Nearest Movies to {}:{}'.format(movie_title, sim_movies))
    print('The Distance from {}:{}'.format(movie_title, movie_distances))

    return sim_movies, movie_distances

    
# Example: neighbours of movie_0 with movie_0 itself removed from the result.
get_movies_distances("movie_0")

The Nearest Movies to movie_0:[9, 5]
The Distance from movie_0:[0.062703492361875, 0.09492771329807914]


([9, 5], [0.062703492361875, 0.09492771329807914])

## Equation for rating prediction:

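$$R(m, u) = \frac{\sum_{j} S(m, j)\, R(j, u)}{\sum_{j} S(m, j)}$$

where $S(m, j) = 1 - \text{cosine distance}(m, j)$ is the similarity between movie $m$ and neighbour $j$, $R(j, u)$ is the rating user $u$ gave to movie $j$, and the sums run over the nearest neighbours of $m$ that user $u$ has rated.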

In [25]:
import numpy as np

def get_rating(movie, user, verbose=False):
    """Return the rating of ``movie`` by ``user``, predicting it if missing.

    If ``df`` already holds a non-zero rating it is returned unchanged.
    Otherwise the rating is predicted as the similarity-weighted average of the
    user's ratings of the movie's nearest neighbours::

        R(m, u) = sum_j S(m, j) * R(j, u) / sum_j S(m, j)

    where ``S = 1 - distance`` and ``j`` runs over the neighbours returned by
    ``get_movies_distances`` that the user has rated (rating > 0).

    Parameters
    ----------
    movie : label in ``df.index``
    user : label in ``df.columns``
    verbose : bool, default False
        Print the neighbour ratings, similarities and their sum.

    Returns
    -------
    float
        The stored or predicted rating. ``0.0`` (the table's "not rated"
        marker) when no prediction is possible: none of the neighbours has
        been rated by the user, or their similarities sum to zero.

    Raises
    ------
    ValueError
        If ``movie`` or ``user`` is not present in ``df``.

    Notes
    -----
    ``get_movies_distances`` prints the neighbour list regardless of
    ``verbose``.
    """
    user_i = df.columns.tolist().index(user)
    movie_i = df.index.tolist().index(movie)

    # 0 marks "not rated"; anything else is an existing rating
    existing_rating = df.iloc[movie_i, user_i]
    if existing_rating != 0:
        return existing_rating

    similar_movies, distance_movies = get_movies_distances(movie)

    # keep only the neighbours this user has actually rated
    rated_neighbours = [
        (neighbour_i, distance)
        for neighbour_i, distance in zip(similar_movies, distance_movies)
        if df.iloc[neighbour_i, user_i] > 0
    ]

    all_ratings = [df.iloc[neighbour_i, user_i] for neighbour_i, _ in rated_neighbours]
    # convert distances into similarities
    movie_similarities = [(1 - distance) for _, distance in rated_neighbours]
    all_movie_similarities = sum(movie_similarities)

    if verbose:
        print("User ratings" , all_ratings)
        print("Movie similarities" , movie_similarities)
        print("Sum of similarities (denominator)" , all_movie_similarities)

    # Without any usable neighbour the weighted average is 0/0; report the
    # movie as still unrated instead of raising ZeroDivisionError.
    if all_movie_similarities == 0:
        return 0.0

    weighted_sum = sum(
        similarity * rating
        for similarity, rating in zip(movie_similarities, all_ratings)
    )
    return weighted_sum / all_movie_similarities

# movie_4 has a 0 (no rating) for user_2 in df, so this call exercises the
# prediction branch rather than returning a stored rating.
print("Predicted rating: " , get_rating("movie_4","user_2", verbose=True))

The Nearest Movies to movie_4:[0, 5]
The Distance from movie_4:[0.29301493441486826, 0.32134725305868195]
User ratings [4.0, 4.0]
Movie similarities [0.7069850655851317, 0.678652746941318]
Sum of similarities (denominator) 1.3856378125264497
Predicted rating:  4.0


In [26]:
# Show the ratings table again; get_rating only reads df, so the predicted
# value is not written back into it.
df

Unnamed: 0,user_0,user_1,user_2,user_3,user_4,user_5,user_6,user_7,user_8,user_9
movie_0,4.0,5.0,4.0,3.0,5.0,1.0,0.0,3.0,2.0,0.0
movie_1,1.0,0.0,1.0,0.0,2.0,4.0,5.0,4.0,0.0,4.0
movie_2,0.0,1.0,3.0,0.0,0.0,5.0,5.0,0.0,4.0,3.0
movie_3,0.0,0.0,0.0,2.0,0.0,0.0,4.0,5.0,0.0,4.0
movie_4,4.0,4.0,0.0,5.0,0.0,3.0,1.0,4.0,0.0,0.0
movie_5,5.0,4.0,4.0,3.0,4.0,2.0,1.0,0.0,0.0,1.0
movie_6,2.0,0.0,0.0,2.0,3.0,0.0,0.0,0.0,5.0,4.0
movie_7,1.0,2.0,1.0,0.0,0.0,4.0,0.0,4.0,4.0,0.0
movie_8,0.0,0.0,1.0,0.0,2.0,0.0,5.0,4.0,0.0,5.0
movie_9,5.0,5.0,3.0,3.0,5.0,2.0,0.0,0.0,2.0,1.0


# Recommending Movies

In [27]:
import numpy as np
user = "user_2"

# A 0 in the user's column means "not rated yet"; those movies are the
# recommendation candidates, each paired with its predicted rating.
recommended_movies = [
    (title, get_rating(title, user))
    for title in df.index[(df[user] == 0).to_numpy()]
]

# Highest predicted rating first.
sorted_movies = sorted(recommended_movies, key=lambda pair: pair[1], reverse=True)

for rank, (title, predicted) in enumerate(sorted_movies, start=1):
    print("{}: {} - rating:{}".format(rank, title, predicted))

The Nearest Movies to movie_3:[8, 1]
The Distance from movie_3:[0.08828907593962254, 0.19330463088501193]
The Nearest Movies to movie_4:[0, 5]
The Distance from movie_4:[0.29301493441486826, 0.32134725305868195]
The Nearest Movies to movie_6:[9, 0]
The Distance from movie_6:[0.41494266157111326, 0.5002463661149819]
1: movie_4 - rating:4.0
2: movie_6 - rating:3.460682687251072
3: movie_3 - rating:1.0
