# Data Preparation and Preprocessing

In [2]:
import pandas as pd

# Load dataset
rating_data = pd.read_csv('Movie Lens Small Latest Dataset/ratings.csv')
movie_data = pd.read_csv('Movie Lens Small Latest Dataset/movies.csv')

# Display first few rows of the dataset
print("ratings:\n", rating_data.head())
print("------------------------\nmovies:\n", movie_data.head())

# show statistic
total_ratings = rating_data.shape[0]
print(f"------------------------\nTotal number of ratings: {total_ratings}")
unique_users = rating_data['userId'].nunique() # number of unique users
print(f"Number of unique users: {unique_users}")
unique_movies = rating_data['movieId'].nunique()
print(f"Number of unique movies: {unique_movies}")
unique_ratings = rating_data['rating'].nunique()
print(f"Number of unique ratings: {unique_ratings}")
average_ratings = rating_data['rating'].mean()
print(f"Average of ratings: {average_ratings}")

# Drop unnecessary column
rating_data = rating_data.drop('timestamp', axis = 1)
print("------------------------\nratings:\n", rating_data.head())


ratings:
    userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931
------------------------
movies:
    movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  
------------------------
Total number of ratings: 100836
Number of unique users: 610
Number of unique movies: 9724
Number of unique r

In [21]:

# create the User-Item matrix, which is a pivot table
# this matrix is very sparse and wide
# rows represent users, columns represent movies
# entries represent ratings, NaN means no rating
train_user_item_matrix = rating_data.pivot(index ='userId', columns='movieId', values='rating')
print("Train User-Item Matrix:\n", train_user_item_matrix.head())


# fill NaN with 0 for cosine similarity calculation
# train_user_item_matrix_filled = train_user_item_matrix.fillna(0)

# Correct Way: Calculate the mean of the NON-NaN values in each row (axis=1)
# and subtract that mean from the non-NaN values in that row.
train_user_item_matrix_centered = train_user_item_matrix.apply(
    lambda row: row - row.mean(), 
    axis=1
)

# Now, to prepare this centered matrix for a Cosine Similarity function 
# (which usually can't handle NaNs), you must replace the NaNs with 0.
# Since the NaNs represent ratings the user HASN'T made, and the centered values
# represent DEVIATION from the mean, setting unrated items to 0 (zero deviation) 
# is the standard and correct step here.

train_user_item_matrix_centered = train_user_item_matrix_centered.fillna(0)

print("------------------------\nTrain User-Item Matrix:\n", train_user_item_matrix_centered.head())


Train User-Item Matrix:
 movieId  1       2       3       4       5       6       7       8       \
userId                                                                    
1           4.0     NaN     4.0     NaN     NaN     4.0     NaN     NaN   
2           NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
3           NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
4           NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
5           4.0     NaN     NaN     NaN     NaN     NaN     NaN     NaN   

movieId  9       10      ...  193565  193567  193571  193573  193579  193581  \
userId                   ...                                                   
1           NaN     NaN  ...     NaN     NaN     NaN     NaN     NaN     NaN   
2           NaN     NaN  ...     NaN     NaN     NaN     NaN     NaN     NaN   
3           NaN     NaN  ...     NaN     NaN     NaN     NaN     NaN     NaN   
4           NaN     NaN  ...     NaN     NaN     

# Algorithm Implementation and Training

### Calculate User Similarity

In [4]:
# use collaborative filtering to predict ratings with cosine similarity
# cosine similarity compares the similarity of vectors and return the cosine angle
from sklearn.metrics.pairwise import cosine_similarity
user_similarity = cosine_similarity(train_user_item_matrix_centered)
print(user_similarity[:5])

# Convert the NumPy array result back into a Pandas DataFrame for easier indexing
user_similarity_df = pd.DataFrame(user_similarity, index=train_user_item_matrix.index, columns=train_user_item_matrix.index)

print("------------------------\nUser-User Similarity Matrix (First 5 users):\n", user_similarity_df.head())
print(f"Shape of Similarity Matrix: {user_similarity_df.shape}")

[[ 1.00000000e+00  1.26451574e-03  5.52577176e-04 ...  7.52238457e-02
  -2.57125541e-02  1.09323166e-02]
 [ 1.26451574e-03  1.00000000e+00  0.00000000e+00 ... -6.00082818e-03
  -6.00909967e-02  2.49992083e-02]
 [ 5.52577176e-04  0.00000000e+00  1.00000000e+00 ... -1.30006374e-02
   0.00000000e+00  1.95499646e-02]
 [ 4.84185976e-02 -1.71640209e-02 -1.12597776e-02 ... -3.75690926e-02
  -1.78835804e-02 -9.94998236e-04]
 [ 2.18465724e-02  2.17957137e-02 -3.15389233e-02 ... -1.75112132e-03
   9.38289218e-02 -2.78093207e-04]]
------------------------
User-User Similarity Matrix (First 5 users):
 userId       1         2         3         4         5         6         7    \
userId                                                                         
1       1.000000  0.001265  0.000553  0.048419  0.021847 -0.045497 -0.006200   
2       0.001265  1.000000  0.000000 -0.017164  0.021796 -0.021051 -0.011114   
3       0.000553  0.000000  1.000000 -0.011260 -0.031539  0.004800  0.000000   
4  

### Predict Ratings

In [45]:
from sklearn.metrics import mean_squared_error
import math 

average_of_all_movie = rating_data['rating'].mean() 

def predict_rating_func(user_id, movie_id, train_user_item_matrix, similarity_matrix, k_neighbors=10):
    # 1. Check if the movie and user are in the training data.
    if movie_id not in train_user_item_matrix.columns:
        print(f"Movie ID {movie_id} not found in training data.")
        return None
    # this is a "cold start" problem for user preference not in training data
    if user_id not in similarity_matrix.index:
        # return the average rating of that specfic movie in training data
        print(f"User ID {user_id} not found in training data. Returning average movie rating.")
        movie_ratings = train_user_item_matrix[movie_id] # col for pandas
        movie_average_rating = movie_ratings.mean()
        return movie_average_rating

    # 2. Get the target user's similarity scores with all other users.
    user_similarity = similarity_matrix.loc[user_id]

    # 3. Sort the similarity scores and select the Top K most similar users (neighbors).
    user_similarity = user_similarity.sort_values(ascending = False)
    # print(user_similarity) ## userId 1      1.00000  301    0.124799 ...
    TopKSimilarity = user_similarity[1:k_neighbors+1]
    # print(f"Top {k_neighbors} similar users to User {user_id}:\n", TopKSimilarity)

    # 4. Filter the neighbors' ratings for the target movie.
    neighbor_rating = []
    for neighbor_id, similarity_score in TopKSimilarity.items():
        neighbor_rating.append(train_user_item_matrix.loc[neighbor_id, movie_id])
        print(f"Neighbor ID: {neighbor_id}, Similarity Score: {similarity_score}, Rating: {train_user_item_matrix.loc[neighbor_id, movie_id]}")
        
    # 5. Calculate the weighted average of the neighbors' ratings for the target movie,
    #    using their similarity scores as weights.
    weighted_sum = 0
    sum_of_weights = 0
    for ID, rating in enumerate(neighbor_rating):
        if not pd.isna(rating):  # Only consider non-NaN ratings
            weight = TopKSimilarity.iloc[ID]
            weighted_sum += rating * weight
            sum_of_weights += weight
    if sum_of_weights > 0:
        predict_rating = weighted_sum / sum_of_weights
    else:
        # fall back to average rating of all movies in training data
        predict_rating = average_of_all_movie

    # 6. Return the predicted rating.
    return predict_rating



# -------- case study --------
# test the function with user 1 and movie 1
predict_rating = predict_rating_func(1,1,train_user_item_matrix,user_similarity_df,10)
print(f"Predicted rating for User 1 on Movie 1: {predict_rating}")
# get solution
print("The rating of User 1 on Movie 1 in training data is:", train_user_item_matrix.loc[1,1])





# -------- RMSE evaluation --------
# # The next step will be to apply this function to every row in your test_data
# # and then calculate the RMSE
# # print(train_user_item_matrix.head())
# # print("---------------------")
# # print(train_user_item_matrix.stack()) # stack removes NaN and converts to series
# # print("---------------------")
# actual_ratings = []
# predicted_ratings = []
# for (userID, movieID), rating in train_user_item_matrix.stack().items(): # .items() to iterate through series
#     # print(f"UserID: {userID}, MovieID: {movieID}, Rating: {rating}") ## UserID: 600, MovieID: 48082, Rating: 3.5 UserID: 600, MovieID: 48385, Rating: 4.5 ...
#     actual_ratings.append(rating)
#     predicted_ratings.append(predict_rating_func(userID, movieID, train_user_item_matrix, user_similarity_df, 10))

# # cal rmse
# mse = mean_squared_error(actual_ratings, predicted_ratings)
# print("RMSE=", math.sqrt(mse))




Neighbor ID: 301, Similarity Score: 0.12479906517911295, Rating: nan
Neighbor ID: 597, Similarity Score: 0.10263065929063635, Rating: 4.0
Neighbor ID: 414, Similarity Score: 0.10134803449460672, Rating: 4.0
Neighbor ID: 477, Similarity Score: 0.09921664240127351, Rating: 4.0
Neighbor ID: 57, Similarity Score: 0.09907007791369307, Rating: 5.0
Neighbor ID: 369, Similarity Score: 0.09829454350405938, Rating: nan
Neighbor ID: 206, Similarity Score: 0.09685159012412194, Rating: 5.0
Neighbor ID: 535, Similarity Score: 0.09649292687545011, Rating: nan
Neighbor ID: 590, Similarity Score: 0.09519062577565122, Rating: 4.0
Neighbor ID: 418, Similarity Score: 0.09415261937134864, Rating: nan
Predicted rating for User 1 on Movie 1: 4.329663726575109
The rating of User 1 on Movie 1 in training data is: 4.0


# Note:

1. Previously, in the predict_rating_func, when there were no similar users who rated the movie, we returned 0 as the predicted rating. This lead to a high RMSE(RMSE = 1.9) because 0 is far from the average rating. Now, we return the average rating of all movies in the training data as a fallback, the RMSE is now around 0.97.

2. In prediction function, the score is biased as some user might have a higher average rating than others. 

In [42]:
for i in range(len(actual_ratings)):
    print(f"act={actual_ratings[i]:.2f} pred={predicted_ratings[i]:.2f}")



act=4.00 pred=4.33
act=4.00 pred=3.34
act=4.00 pred=3.12
act=5.00 pred=3.87
act=5.00 pred=4.75
act=3.00 pred=1.51
act=5.00 pred=4.50
act=4.00 pred=3.79
act=5.00 pred=4.01
act=5.00 pred=4.00
act=5.00 pred=3.52
act=5.00 pred=3.13
act=3.00 pred=4.00
act=5.00 pred=3.19
act=4.00 pred=4.33
act=5.00 pred=4.67
act=3.00 pred=4.34
act=3.00 pred=3.00
act=5.00 pred=2.84
act=4.00 pred=3.52
act=4.00 pred=4.56
act=5.00 pred=3.50
act=4.00 pred=2.32
act=3.00 pred=2.00
act=4.00 pred=4.34
act=5.00 pred=4.65
act=4.00 pred=4.12
act=3.00 pred=3.26
act=5.00 pred=4.58
act=4.00 pred=3.50
act=4.00 pred=2.66
act=5.00 pred=4.00
act=4.00 pred=3.34
act=4.00 pred=3.88
act=4.00 pred=4.31
act=5.00 pred=4.00
act=5.00 pred=4.20
act=3.00 pred=3.07
act=5.00 pred=4.33
act=3.00 pred=1.61
act=4.00 pred=2.42
act=3.00 pred=2.36
act=3.00 pred=3.04
act=4.00 pred=3.00
act=5.00 pred=4.11
act=5.00 pred=4.51
act=5.00 pred=4.49
act=4.00 pred=3.50
act=5.00 pred=5.00
act=3.00 pred=2.00
act=5.00 pred=4.00
act=5.00 pred=4.00
act=5.00 pre