## Collaborative Filtering
###### Code adapted from https://medium.com/@urvimidha/recommendation-system-using-collaborative-filtering-in-python-83992251c8f7

In [2]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_absolute_error

## Reading the data

In [3]:
# Load the raw ratings table from the CSV file in the working directory.
ratings = pd.read_csv(filepath_or_buffer="ratings.csv")

In [4]:
# Preview the first five rows of the loaded table.
ratings.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [6]:
# Drop the timestamp column (the last column in the preview above) by name
# rather than by position, so a change in the CSV column order cannot
# silently remove a different column.
df = ratings.drop(columns="timestamp")

In [7]:
# distinct rating values that occur in the data
pd.unique(df['rating'])

array([4. , 5. , 3. , 2. , 1. , 4.5, 3.5, 2.5, 0.5, 1.5])

In [8]:
# total number of rating rows
df.shape[0]

100836

In [9]:
# how many distinct users appear in the ratings
df['userId'].nunique(dropna=False)

610

In [10]:
# how many distinct movies appear in the ratings
df['movieId'].nunique(dropna=False)

9724

## Ratings Matrix - UserID vs MovieID

In [41]:
# User x movie ratings matrix: one row per userId, one column per movieId.
# Cells are NaN where the user has not rated the movie (a dense frame with
# missing values, not a pandas sparse structure).
df_ratings = ratings.pivot(index='userId', columns='movieId', values='rating')
# Movie x user view of the same matrix.
df_ratings_transformed = df_ratings.T
# Zero-filled variant of the matrix. fillna() already returns a new frame and
# leaves df_ratings untouched, so no explicit copy() is needed beforehand.
df_ratings2 = df_ratings.fillna(0)
df_ratings.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,


## Similarity Matrix - Item

In [30]:
# Pairwise cosine similarity between movies: each movie is represented by its
# column of the zero-filled ratings matrix (transposed so movies become rows).
similarity_matrix_item = cosine_similarity(df_ratings2.transpose(), df_ratings2.transpose())
# Label both axes of the similarity array with the movie ids.
similarity_matrix_item_df = pd.DataFrame(
    data=similarity_matrix_item,
    index=df_ratings.columns,
    columns=df_ratings.columns,
)
similarity_matrix_item_df.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.410562,0.296917,0.035573,0.308762,0.376316,0.277491,0.131629,0.232586,0.395573,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.410562,1.0,0.282438,0.106415,0.287795,0.297009,0.228576,0.172498,0.044835,0.417693,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.296917,0.282438,1.0,0.092406,0.417802,0.284257,0.402831,0.313434,0.30484,0.242954,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.035573,0.106415,0.092406,1.0,0.188376,0.089685,0.275035,0.158022,0.0,0.095598,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.308762,0.287795,0.417802,0.188376,1.0,0.298969,0.474002,0.283523,0.335058,0.218061,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Function to predict ratings with user-based collaborative filtering

In [27]:
#function to predict ratings
def calculate_ratings(id_movie, id_user, top_k_similar, user_similarity, ratings_matrix=None):
    """Predict the rating of ``id_user`` for ``id_movie`` from similar users.

    The prediction is the similarity-weighted average of the ratings given to
    ``id_movie`` by the ``top_k_similar`` users most similar to ``id_user``
    among those who have rated the movie.

    Parameters
    ----------
    id_movie : label
        Column label of the movie in the ratings matrix.
    id_user : label
        Label of the target user in ``user_similarity``.
    top_k_similar : int
        Maximum number of neighbours (raters of the movie) to use.
    user_similarity : pandas.DataFrame
        User x user similarity table; ``user_similarity[id_user]`` must be
        indexed by the same user labels as the ratings matrix.
    ratings_matrix : pandas.DataFrame, optional
        User x movie ratings with NaN for "not rated". Defaults to the
        notebook-level ``df_ratings``.

    Returns
    -------
    float
        The predicted rating, or NaN when nobody rated the movie or the
        selected neighbours have a total similarity of zero (a weighted
        average is undefined in that case).

    Notes
    -----
    ``id_user`` is not filtered out of the neighbour set. The caller in this
    notebook only passes movies the user has not rated, so the user never
    appears among the raters there.
    """
    if ratings_matrix is None:
        ratings_matrix = df_ratings

    # ratings of id_movie, restricted to the users who actually rated it
    movie_ratings = ratings_matrix[id_movie].dropna()

    # similarity of id_user to exactly those raters (users who have not rated
    # the movie cannot contribute to the prediction)
    neighbour_similarity = user_similarity[id_user].loc[movie_ratings.index]

    # keep the top-k most similar raters and their ratings, in the same order
    top_similarity = neighbour_similarity.nlargest(top_k_similar)
    top_ratings = movie_ratings.loc[top_similarity.index]

    similarity_total = top_similarity.sum()
    if similarity_total == 0:
        # no usable neighbours: avoid the 0/0 division and its RuntimeWarning
        return np.nan

    return np.dot(top_ratings.to_numpy(), top_similarity.to_numpy()) / similarity_total

In [28]:
# function that returns the top k recommendations for a user
def recommend_movies_user(id_user, top_k_recommendations, top_k_similar=30):
    """Return the highest-predicted unrated movies for a user.

    Parameters
    ----------
    id_user : label
        Row label of the user in the notebook-level ``df_ratings`` matrix.
    top_k_recommendations : int
        Number of movies to return.
    top_k_similar : int, optional
        Number of most-similar raters used for each prediction (default 30,
        the value that was previously hard-coded).

    Returns
    -------
    pandas.DataFrame
        Columns ``movieId`` and ``rating`` (the predicted rating), sorted by
        predicted rating in descending order and truncated to
        ``top_k_recommendations`` rows.
    """
    # user x user cosine similarity computed on the zero-filled ratings matrix
    user_similarity = pd.DataFrame(
        cosine_similarity(df_ratings.fillna(0)),
        index=df_ratings.index,
        columns=df_ratings.index,
    )

    # movies for which id_user has no rating yet
    user_ratings = df_ratings.loc[id_user]
    movies_not_rated = user_ratings[user_ratings.isnull()].index.values

    # predicted rating for every unrated movie
    predicted_ratings = [
        calculate_ratings(id_movie, id_user, top_k_similar, user_similarity)
        for id_movie in movies_not_rated
    ]

    # best predictions first, keep only the requested number
    top_movies = (
        pd.DataFrame({'movieId': movies_not_rated, 'rating': predicted_ratings})
        .sort_values(by='rating', ascending=False)
        .head(top_k_recommendations)
    )

    return top_movies

In [29]:
# time a single run (-n 1 -r 1) of recommend_movies_user(1, 3) and print its result
%timeit -n 1 -r 1  print(recommend_movies_user(1, 3))





      movieId  rating
4356     6835     5.0
4351     6818     5.0
3807     5746     5.0
12 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
