# DSC 630 Week 10 Assignment
## By: Jamie Tran 
## Date: 5/14/2024

### Libraries imported:

In [32]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

### Week 10 code:

In [2]:
#loading the user-by-movie ratings table from disk
ratings = pd.read_csv('ratings.csv')
#showing the first five rows as a sanity check
ratings.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [3]:
#loading the movie metadata table (id, title, genres) from disk
movies = pd.read_csv('movies.csv')
#showing the first five rows as a sanity check
movies.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
#size of the ratings table: total rows, distinct movies, distinct users
ratings_count = ratings.shape[0]
movies_count = ratings['movieId'].nunique(dropna=False)
users_count = ratings['userId'].nunique(dropna=False)
#reporting the three totals
print(f'Number of ratings: {ratings_count}')
print(f'Number of different movies: {movies_count}')
print(f'Number of different users: {users_count}')

Number of ratings: 100836
Number of different movies: 9724
Number of different users: 610


In [7]:
#average number of ratings given by a user and received by a movie
print(f'Average ratings per user : {ratings_count/users_count}')
print(f'Average ratings per movie : {ratings_count/movies_count}')

Average ratings per user : 165.30491803278687
Average ratings per movie : 10.369806663924312


In [54]:
#counting how many movies each user has rated
user_freq = (
    ratings
    .groupby('userId')['movieId']
    .count()
    .reset_index(name='n_ratings')
)
print(user_freq.head())

   userId  n_ratings
0       1        232
1       2         29
2       3         39
3       4        216
4       5         44


In [56]:
#average rating received by each movie (indexed by movieId)
mean_rating = ratings[['movieId', 'rating']].groupby('movieId').mean()
#movieId whose average rating is the smallest
lowest_rated = mean_rating['rating'].idxmin()
movies[movies['movieId'].eq(lowest_rated)]

Unnamed: 0,movieId,title,genres
2689,3604,Gypsy (1962),Musical


In [57]:
#movieId whose average rating is the largest
highest_rated = mean_rating['rating'].idxmax()
movies[movies['movieId'].eq(highest_rated)]

Unnamed: 0,movieId,title,genres
48,53,Lamerica (1994),Adventure|Drama


In [58]:
#displaying every individual rating behind the highest rated movie
ratings.loc[ratings['movieId'].eq(highest_rated)]

Unnamed: 0,userId,movieId,rating,timestamp
13368,85,53,5.0,889468268
96115,603,53,5.0,963180003


In [59]:
#displaying every individual rating behind the lowest rated movie
ratings.loc[ratings['movieId'].eq(lowest_rated)]

Unnamed: 0,userId,movieId,rating,timestamp
13633,89,3604,0.5,1520408880


In [60]:
#removing bias from smaller movies
#per-movie number of ratings and plain mean rating (flat 'count' / 'mean' columns)
movie_stats = ratings.groupby('movieId')['rating'].agg(['count', 'mean'])
#prior for the Bayesian average: a "typical" movie has avg_count ratings whose mean is avg_mean
avg_count = movie_stats['count'].mean()
avg_mean = movie_stats['mean'].mean()
#Bayesian average = (C*m + sum of ratings) / (C + n); count*mean is the movie's rating sum,
#so movies with few ratings are pulled toward the prior instead of topping or bottoming the ranking
movie_stats['bayesian_avg'] = (
    (avg_count * avg_mean + movie_stats['count'] * movie_stats['mean'])
    / (avg_count + movie_stats['count'])
)

In [67]:
#creating matrix of corresponding ratings to different users and movies
def movie_matrix(data):
    """Build a sparse movie-by-user rating matrix plus id/index lookup tables.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'userId', 'movieId' and 'rating'.

    Returns
    -------
    x : scipy.sparse.csr_matrix
        Shape (number of distinct movies, number of distinct users);
        entry [m, u] holds the rating user u gave movie m.
    user_mapper : dict
        userId -> column index of x.
    movie_mapper : dict
        movieId -> row index of x.
    user_inv_mapper : dict
        column index of x -> userId.
    movie_inv_mapper : dict
        row index of x -> movieId.
    """
    #one pass per axis: sorted distinct ids plus, for every row of data, the position of its id
    user_ids, user_index = np.unique(data['userId'].to_numpy(), return_inverse=True)
    movie_ids, movie_index = np.unique(data['movieId'].to_numpy(), return_inverse=True)
    uni_user = len(user_ids)
    uni_movie = len(movie_ids)
    #id -> matrix index
    user_mapper = dict(zip(user_ids, range(uni_user)))
    movie_mapper = dict(zip(movie_ids, range(uni_movie)))
    #matrix index -> id
    user_inv_mapper = dict(zip(range(uni_user), user_ids))
    movie_inv_mapper = dict(zip(range(uni_movie), movie_ids))
    #movies are rows and users are columns, so each row is one movie's rating vector
    x = csr_matrix(
        (data['rating'].to_numpy(), (movie_index, user_index)),
        shape = (uni_movie, uni_user),
    )

    return x, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper

#building the movie-by-user matrix and the id/index lookup dictionaries from the ratings data
x, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper = movie_matrix(ratings)

In [76]:
#creating function to recommend similarly rated movies
def similar_movies(movie_id, x, k, metric = 'cosine', show_distance = False):
    """Return the k movies whose rating vectors are nearest to movie_id's.

    Reads the module-level ``movie_mapper`` and ``movie_inv_mapper`` dictionaries,
    so ``x`` must be the matrix whose row order those dictionaries describe.

    Parameters
    ----------
    movie_id : movieId of the query movie; must be a key of ``movie_mapper``.
    x : movie-by-user rating matrix (one row per movie).
    k : number of neighbours wanted. Fewer are returned when x has fewer
        than k other rows.
    metric : distance metric handed to NearestNeighbors.
    show_distance : when False (default) return a list of movieIds; when True
        return a list of (movieId, distance) tuples.

    Raises
    ------
    KeyError
        If movie_id is not in ``movie_mapper``.
    """
    #row of the query movie and its rating vector as a single-row 2D input
    movie_ind = movie_mapper[movie_id]
    movie_vec = x[movie_ind].reshape(1, -1)
    #ask for one extra neighbour because the query movie is part of the fitted data,
    #but never more than the number of rows available (sklearn raises otherwise)
    n_neighbors = min(k + 1, x.shape[0])
    kNN = NearestNeighbors(n_neighbors = n_neighbors, algorithm = 'brute', metric = metric)
    kNN.fit(x)
    #always fetch distances so both return shapes come from one code path
    distances, indices = kNN.kneighbors(movie_vec, return_distance = True)
    #drop the query movie by index rather than by position: with tied distances
    #it is not guaranteed to be the first neighbour
    neighbors = [
        (movie_inv_mapper[int(ind)], float(dist))
        for dist, ind in zip(distances[0], indices[0])
        if ind != movie_ind
    ][:k]
    if show_distance:
        return neighbors
    return [neighbor_id for neighbor_id, _ in neighbors]
#lookup table from movieId to movie title
movie_titles = dict(zip(movies['movieId'], movies['title']))
#movie used to try the function out
movie_id = 10

movie_title = movie_titles[movie_id]
similar_ids = similar_movies(movie_id, x, k=10)

#printing the header followed by one recommended title per line
print(f'If you enjoyed {movie_title} give these a try:')
print('----------------------------------------------------------')
for i in similar_ids:
    print(movie_titles[i])

If you enjoyed GoldenEye (1995) give these a try:
----------------------------------------------------------
Die Hard: With a Vengeance (1995)
True Lies (1994)
Clear and Present Danger (1994)
Speed (1994)
Batman (1989)
Stargate (1994)
Jurassic Park (1993)
Batman Forever (1995)
Mission: Impossible (1996)
Terminator 2: Judgment Day (1991)


In [75]:
#creating function that recommends movies for a user
def recommend_movies_for_user(user_id, x, user_mapper, movie_mapper, movie_inv_mapper, k=10):
    """Print k movies similar to the movie that user_id rated highest.

    Reads the module-level ``ratings`` and ``movies`` data frames and calls
    ``similar_movies`` for the neighbour search.

    Parameters
    ----------
    user_id : userId to recommend for.
    x : movie-by-user rating matrix passed through to ``similar_movies``.
    user_mapper, movie_mapper, movie_inv_mapper : accepted to keep the call
        signature stable; this function does not read them.
    k : number of recommendations to print.

    Returns
    -------
    None. A message is printed instead of recommendations when the user has
    no ratings or when the chosen movie has no entry in ``movies``.
    """
    #all ratings left by this user
    user_ratings = ratings[ratings['userId'] == user_id]
    if user_ratings.empty:
        print(f'User {user_id} does not exist.')
        return
    #first movie that carries the user's top rating
    top_rating = user_ratings['rating'].max()
    movie_id = user_ratings.loc[user_ratings['rating'] == top_rating, 'movieId'].iloc[0]
    #lookup table from movieId to title
    movie_titles = dict(zip(movies['movieId'], movies['title']))
    #check the title before the neighbour search so a missing movie costs nothing
    if movie_id not in movie_titles:
        print(f'{movie_id} is not found')
        return
    movie_title = movie_titles[movie_id]
    #finding similar movies to the highest rated
    similar_ids = similar_movies(movie_id, x, k)

    print(f'Since you watched {movie_title}, you might also like:')
    print('---------------------------------------------------------------')
    for i in similar_ids:
        print(movie_titles.get(i, 'Movies not found'))
        

In [74]:
#trying the recommender on a user that exists in the ratings data
user_id = 128
recommend_movies_for_user(
    user_id,
    x,
    user_mapper,
    movie_mapper,
    movie_inv_mapper,
    k=10,
)

Since you watched Braveheart (1995), you might also like:
---------------------------------------------------------------
Jurassic Park (1993)
Terminator 2: Judgment Day (1991)
Fugitive, The (1993)
Forrest Gump (1994)
Pulp Fiction (1994)
True Lies (1994)
Apollo 13 (1995)
Dances with Wolves (1990)
Shawshank Redemption, The (1994)
Batman (1989)


In [71]:
#trying the recommender on a user id that is absent from the ratings data
user_id = 700
recommend_movies_for_user(
    user_id,
    x,
    user_mapper,
    movie_mapper,
    movie_inv_mapper,
    k=10,
)

User 700 does not exist.


### Write up:

   To begin the research we start by importing the libraries needed to complete the model. After importing the ratings and movies datasets we begin some exploratory analysis. To get a better understanding of the information we used the len function to find out how many ratings, different users and different movies were present. We also used the groupby function to find the mean rating of each movie as well as how often each user leaves reviews. Through this exploratory analysis we were able to see that some movies had a very low number of reviews. To help with this bias we utilized the Bayesian Average to add extra values for these movies. 
   
   Next comes the first function that must be created for this recommender system, the movie matrix. This is the integral part of the model: the matrix maps out user sentiment toward the various movies. We start by assigning the uni_user and uni_movie variables the number of different user IDs and movie IDs. The user and movie mapper variables assign each of these unique IDs to a specific index. The user and movie inv mappers then go the other way, mapping the indices back to the original user and movie IDs. Next, the user and movie index variables are lists that translate the user or movie ID on every rating row into its corresponding index, using the mapper dictionaries we created previously. Lastly, the csr matrix function is used to create a matrix with the user indices as columns, the movie indices as rows, and the ratings as the data that fills it.
   
   The next function we create finds movies similar to the one provided, based on how users have rated them in the past. We start by creating an empty list that our recommendations will be added to. The movie_ind variable uses the movie mapper dictionary to get the index of the given movie. Movie_vec uses the movie matrix to get that movie's feature vector, which is its row of user ratings. From there the kNN model is fitted and used to find the nearest neighbors of the given movie through the vector that was found earlier. The for loop is designed to iterate over the similar movies that were found and add them to the empty list we created at the beginning. Afterwards the initial movie is removed from the list because of course we can't recommend the same movie as the one they entered. In the similar ids variable the top 10 movie IDs are stored so they can be matched up with their titles and displayed. From there a test variable is assigned so the function can be run.
   
   This last function recommends the best movies for a user based on their rating history, as opposed to the previous function, which recommended movies similar to the one input. We start by seeing if the user has rated any movies previously. If they have no history then an if statement is ready to let them know, since we can't make a recommendation off no data. If the user does have a history, the movie_id variable picks out the user's highest rated movie. The movie titles variable creates a dictionary that maps movie IDs to their titles. Then the similar IDs variable uses the similar_movies function to find movies like their highest rated one. A print statement is then written to display the results. Similar to the last function, test variables are assigned to ensure the function works properly.

### Citations:

“Recommendation System in Python.” GeeksforGeeks, GeeksforGeeks, 20 Oct. 2023, www.geeksforgeeks.org/recommendation-system-in-python/. 