# COLLABORATIVE FILTERING RECOMMENDATION MODEL

###### This collaborative filtering recommendation model utilizes cosine similarity between users' rating vectors to predict movie ratings for a given user. By comparing a user's ratings with those of similar users, it identifies movies the user hasn't watched and predicts ratings for them. Finally, it recommends the top-rated unseen movies to the user based on these predictions. 

In [8]:
import pandas as pd
import math
import os

In [9]:
# Both CSV files are looked up in the directory the notebook is run from
current_dir = os.getcwd()

# Build the full path of each input file
movies_path, ratings_path = (
    os.path.join(current_dir, file_name)
    for file_name in ("movies.csv", "ratings.csv")
)

# Load movies first, then ratings, each into its own DataFrame
movies, ratings = (
    pd.read_csv(csv_path) for csv_path in (movies_path, ratings_path)
)

In [10]:
# Preview the first three rows of the movies table
movies.head(3)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


In [11]:
# Preview the first three rows of the ratings table
ratings.head(3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224


In [12]:
# Attach each movie's title and genres to its ratings by joining on "movieId"
data = pd.merge(ratings, movies, on="movieId")
data.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [13]:
# Remove the timestamp and genres columns, which are not used further below
data.drop(columns=["timestamp", "genres"], inplace=True)

# Count missing values per column (this only reports them; no rows are removed)
data.isna().sum()

userId     0
movieId    0
rating     0
title      0
dtype: int64

In [14]:
# Lookup table from movieId to title: one row per distinct (movieId, title) pair
titles = data[["movieId", "title"]].drop_duplicates().set_index("movieId")
titles.head()

Unnamed: 0_level_0,title
movieId,Unnamed: 1_level_1
1,Toy Story (1995)
3,Grumpier Old Men (1995)
6,Heat (1995)
47,Seven (a.k.a. Se7en) (1995)
50,"Usual Suspects, The (1995)"


In [15]:
# Rating matrix: one row per movieId, one column per userId, cells hold the rating
# (NaN where the user has not rated the movie)
user_movie_table = pd.pivot_table(
    data,
    values="rating",
    index="movieId",
    columns="userId",
)
user_movie_table.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,,,4.0,,4.5,,,,...,4.0,,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,,,,,,4.0,,4.0,,,...,,4.0,,5.0,3.5,,,2.0,,
3,4.0,,,,,5.0,,,,,...,,,,,,,,2.0,,
4,,,,,,3.0,,,,,...,,,,,,,,,,
5,,,,,,5.0,,,,,...,,,,3.0,,,,,,


In [16]:

def similarity_coefficient(user_1, user_2):

    """
    Computes the cosine similarity coefficient between two users' vectors.

    Only movies rated by both users are compared. The ratings are read from
    the module-level ``user_movie_table`` (rows are movies, columns are users).

    Parameters:
        user_1: column label of the reference user in ``user_movie_table``.
        user_2: column label of the user being compared against ``user_1``.

    Returns:
        The cosine of the angle between the two rating vectors restricted to
        the movies both users rated, or 0 when
          * the users have no rated movie in common,
          * the number of common movies is below 30% of the movies rated by
            ``user_1``, or
          * either restricted vector has zero magnitude.

    Raises:
        KeyError: if either label is not a column of ``user_movie_table``.

    """

    ratings_1 = user_movie_table[user_1]
    ratings_2 = user_movie_table[user_2]

    # Use ratings for films that both watched. A boolean mask is used instead
    # of concatenating the columns so that comparing a user with itself does
    # not produce two columns with the same label.
    rated_by_both = ratings_1.notna() & ratings_2.notna()
    common_1 = ratings_1[rated_by_both]
    common_2 = ratings_2[rated_by_both]

    # Without any common movie there is nothing to compare. This must be
    # checked explicitly: for a user with a single rating the 30% threshold
    # below rounds to 0 and would otherwise let an empty overlap through,
    # ending in a division by zero.
    if common_1.empty:
        return 0

    # If both users haven't watched at least 30% of user_1's movies in common,
    # return 0 similarity
    if len(common_1) < round(int(ratings_1.count()) * 0.3):
        return 0

    # Compute the dot product of the two users' vectors
    dot_product = float((common_1 * common_2).sum())

    # Compute the magnitude of each user's vector
    len_1 = math.sqrt(float((common_1 ** 2).sum()))
    len_2 = math.sqrt(float((common_2 ** 2).sum()))

    # A zero-length vector has no direction, so the cosine is undefined
    if len_1 == 0 or len_2 == 0:
        return 0

    # Compute the cosine of the angle between the two vectors
    return dot_product / (len_1 * len_2)




def find_similar_users(user, threshold=0.5):   # User ID must be a column of user_movie_table

    """
    Finds users similar to the given user based on cosine similarity coefficients.

    Parameters:
        user: column label of the user in ``user_movie_table``.
        threshold: only users whose similarity coefficient is strictly greater
            than this value are kept (defaults to 0.5).

    Returns:
        A DataFrame indexed by user ID with a single column labelled ``user``
        that holds the similarity coefficient of each similar user, sorted in
        descending order. The given user itself is never included. The frame
        is empty when no other user exceeds the threshold.

    Raises:
        KeyError: if ``user`` is not a column of ``user_movie_table``.

    """

    # Every user except the given one; comparing a user with itself carries
    # no information, so that comparison is skipped altogether
    other_users = user_movie_table.columns.drop(user)

    # Calculate similarity coefficients between the given user and all other users
    coefficients = pd.Series(
        [similarity_coefficient(user, other) for other in other_users],
        index=other_users,
        name=user,
        dtype=float,
    )

    # Keep users above the threshold, most similar first
    similar_users_sorted = coefficients[coefficients > threshold].sort_values(ascending=False)

    return similar_users_sorted.to_frame()


In [17]:
def predict_movie_ratings_for_user(user):

    """
    Predicts ratings for the movies the given user has not rated yet.

    The prediction for a movie is the average of the ratings given to it by
    the users returned by ``find_similar_users``, weighted by each user's
    similarity coefficient.

    Parameters:
        user: column label of the user in ``user_movie_table``.

    Returns:
        A dict mapping movie ID to predicted rating. Movies that none of the
        similar users rated are left out. The dict is empty when the user has
        no similar users.

    Raises:
        KeyError: if ``user`` is not a column of ``user_movie_table``.

    """

    # Find movies the user hasn't watched
    own_ratings = user_movie_table[user]
    not_watched_movie_id = own_ratings.index[own_ratings.isna()]

    # Find similar users to the given user; the single column labelled `user`
    # holds their similarity coefficients, which serve as weights
    similar_users = find_similar_users(user)
    weights = similar_users[user].astype(float)
    if weights.empty:
        return {}

    # Ratings of the similar users for the unseen movies (NaN = not rated)
    neighbour_ratings = user_movie_table.loc[not_watched_movie_id, weights.index]

    # Weighted sum of the available ratings per movie; NaN cells are skipped
    weighted_sum = neighbour_ratings.mul(weights, axis=1).sum(axis=1)

    # Sum of the weights of only those similar users who rated the movie
    weight_total = neighbour_ratings.notna().mul(weights, axis=1).sum(axis=1)

    # Keep only movies that at least one similar user rated
    has_rating = weight_total != 0
    predicted_movie_rate = weighted_sum[has_rating] / weight_total[has_rating]

    return predicted_movie_rate.to_dict()

    

In [18]:
def recommended_movies(user_id, quantity = 10):

    """
    Recommends the unseen movies with the highest predicted rating for a user.

    Parameters:
        user_id: ID of the user; must be a column of ``user_movie_table``.
        quantity: maximum number of titles to return (defaults to 10).

    Returns:
        A list of movie titles ordered from highest to lowest predicted
        rating. It holds fewer than ``quantity`` titles when fewer predictions
        are available. Returns None, after printing a message, when the user
        is not in the data.

    """

    # Ensure user_id exists in the database (checked against the actual data
    # rather than a hard-coded ID range)
    if user_id not in user_movie_table.columns:
        print("Such user doesn't exist")
        return

    # Predict movie ratings for the user
    predictions = predict_movie_ratings_for_user(user_id)

    # Movie IDs ordered by predicted rating, best first
    ranked_movie_ids = sorted(predictions, key=predictions.get, reverse=True)

    # Retrieve movie names from the 'titles' DataFrame, only for the movies
    # that are actually returned
    return [titles.loc[movie_id, "title"] for movie_id in ranked_movie_ids[:quantity]]


In [19]:
# Example
# Get the default number of recommendations (quantity = 10) for user 10

recommended_movies(10)

['Love and Death (1975)',
 "Adam's Rib (1949)",
 'Jackass 3.5 (2011)',
 'Tom Segura: Completely Normal (2014)',
 'Tom Segura: Mostly Stories (2016)',
 'Smoke (1995)',
 'Once Were Warriors (1994)',
 'In the Mouth of Madness (1995)',
 'Barcelona (1994)',
 'Shadowlands (1993)']