# collaborative filtering 

In [1]:
#import required libraries
import pandas as pd
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity


In [3]:
ratings = pd.read_csv("datasets/toy_dataset.csv", index_col = 0)

#get rid of NaN values
ratings = ratings.fillna(0)
ratings


Unnamed: 0,action1,action2,action3,romantic1,romantic2,romantic3
user 1,4.0,5.0,3.0,0.0,2.0,1.0
user 2,5.0,3.0,3.0,2.0,2.0,0.0
user 3,1.0,0.0,0.0,4.0,5.0,4.0
user 4,0.0,2.0,1.0,4.0,0.0,3.0
user 5,1.0,0.0,2.0,3.0,3.0,4.0


#### normalize ratings

In [4]:
#having 0's doesn't make sense because it implies a rating of 0 was given 
#need to standardize/normalize

def standardize(row):
    new_row = (row - row.mean())/(row.max()-row.min())
    #corrects for users that are too harsh or too lenient
    #makes the mean of the new ratings 0 and the range 1
    return new_row

ratings_std = ratings.apply(standardize)
ratings_std


Unnamed: 0,action1,action2,action3,romantic1,romantic2,romantic3
user 1,0.36,0.6,0.4,-0.65,-0.08,-0.35
user 2,0.56,0.2,0.4,-0.15,-0.08,-0.6
user 3,-0.24,-0.4,-0.6,0.35,0.52,0.4
user 4,-0.44,0.0,-0.266667,0.35,-0.48,0.15
user 5,-0.24,-0.4,0.066667,0.1,0.12,0.4


## item-item CF with cosine similarity

In [5]:
#cos similarity calculates row-wise which would be user, so to do item, need to transpose 
item_similarity = cosine_similarity(ratings_std.T)

#similarity matrix
print(item_similarity)

[[ 1.          0.70668875  0.81368151 -0.79941088 -0.02539184 -0.91410609]
 [ 0.70668875  1.          0.72310153 -0.84515425 -0.5189993  -0.84337386]
 [ 0.81368151  0.72310153  1.         -0.84794611 -0.3799803  -0.80218063]
 [-0.79941088 -0.84515425 -0.84794611  1.          0.14803913  0.72374686]
 [-0.02539184 -0.5189993  -0.3799803   0.14803913  1.          0.39393939]
 [-0.91410609 -0.84337386 -0.80218063  0.72374686  0.39393939  1.        ]]


In [6]:
#create dataframe
item_similarity_df = pd.DataFrame(item_similarity, index = ratings.columns, columns = ratings.columns)
item_similarity_df

Unnamed: 0,action1,action2,action3,romantic1,romantic2,romantic3
action1,1.0,0.706689,0.813682,-0.799411,-0.025392,-0.914106
action2,0.706689,1.0,0.723102,-0.845154,-0.518999,-0.843374
action3,0.813682,0.723102,1.0,-0.847946,-0.37998,-0.802181
romantic1,-0.799411,-0.845154,-0.847946,1.0,0.148039,0.723747
romantic2,-0.025392,-0.518999,-0.37998,0.148039,1.0,0.393939
romantic3,-0.914106,-0.843374,-0.802181,0.723747,0.393939,1.0


#### make recommendation

In [7]:
# make reco
def get_similar_movies(movie_name, user_rating):
    similar_score = item_similarity_df[movie_name]*(user_rating - 2.5)
    #need the amplifier of user_rating - 2.5 so that a negative rating is seen as a bad thing
    similar_score = similar_score.sort_values(ascending=False)
    
    return similar_score

print(get_similar_movies("romantic3", 1))

action1      1.371159
action2      1.265061
action3      1.203271
romantic2   -0.590909
romantic1   -1.085620
romantic3   -1.500000
Name: romantic3, dtype: float64


In [8]:
# user rates multiple movies
action_lover = [("action1", 5), ("romantic2", 1), ("romantic3",1)]

#create empty DF to collect similar movies that can be suggested
similar_movies = pd.DataFrame()

for movie, rating in action_lover:
    similar_movies = similar_movies.append(get_similar_movies(movie,rating), ignore_index = True)
    
similar_movies.head()


Unnamed: 0,action1,action2,action3,romantic1,romantic2,romantic3
0,2.5,1.766722,2.034204,-1.998527,-0.06348,-2.285265
1,0.038088,0.778499,0.56997,-0.222059,-1.5,-0.590909
2,1.371159,1.265061,1.203271,-1.08562,-0.590909,-1.5


In [9]:

#sum all values for each movie to find the best recommendations
similar_movies.sum().sort_values(ascending = False)

action1      3.909247
action2      3.810282
action3      3.807445
romantic2   -2.154389
romantic1   -3.306206
romantic3   -4.376174
dtype: float64

## user-user cosine similarity

In [10]:
#cos similarity calculates row-wise which would be user
user_similarity = cosine_similarity(ratings_std)

#similarity matrix
print(user_similarity)

[[ 1.          0.76409098 -0.83718317 -0.57786068 -0.7361606 ]
 [ 0.76409098  1.         -0.78062564 -0.60148288 -0.74685173]
 [-0.83718317 -0.78062564  1.          0.23334449  0.64173153]
 [-0.57786068 -0.60148288  0.23334449  1.          0.24550092]
 [-0.7361606  -0.74685173  0.64173153  0.24550092  1.        ]]


In [11]:
#create dataframe
user_similarity_df = pd.DataFrame(user_similarity, index = ratings.index, columns = ratings.index)
user_similarity_df
#thus the below dataframe represents the similarity levels between users based on how they rated the movies


Unnamed: 0,user 1,user 2,user 3,user 4,user 5
user 1,1.0,0.764091,-0.837183,-0.577861,-0.736161
user 2,0.764091,1.0,-0.780626,-0.601483,-0.746852
user 3,-0.837183,-0.780626,1.0,0.233344,0.641732
user 4,-0.577861,-0.601483,0.233344,1.0,0.245501
user 5,-0.736161,-0.746852,0.641732,0.245501,1.0


In [12]:

#create new DF and copy old ratings
updated_ratings = ratings.copy()

def input_user_rating(user_name, movie_name, user_rating):
    #add new user rating into updated ratings dataframe
    updated_ratings.at[user_name, movie_name] = user_rating
    
    #normalize the new dataframe
    updated_ratings_std = updated_ratings.apply(standardize)
    
    #perform cosine similarity on users
    updated_user_similarity = cosine_similarity(updated_ratings_std)
    
    #put into dataframe  
    updated_user_similarity_df = pd.DataFrame(updated_user_similarity, index = ratings.index, columns = ratings.index)
    
    return updated_user_similarity_df



In [13]:
#every time a user inputs a new rating, the table gets updated and the user-user similarities are recalculated
new_user_ratings_df = input_user_rating('user 1', 'action2', 3)


#### find the similar users

In [14]:
#find and sort in ascending order the similar users 
user_similarity_score = new_user_ratings_df['user 1']
user_similaritiy_score = user_similarity_score.sort_values(ascending=False)

user_similarity_score


user 1    1.000000
user 2    0.835250
user 3   -0.853989
user 4   -0.532524
user 5   -0.697882
Name: user 1, dtype: float64