In [48]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns


# Read Data

In [49]:
# df1 is loaded from movies.csv and df2 from ratings.csv.
df1=pd.read_csv("/kaggle/input/movielens-dataset/movies.csv")
df2=pd.read_csv("/kaggle/input/movielens-dataset/ratings.csv")

In [50]:
df1.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [51]:
df1.shape

(10329, 3)

In [52]:
df1.isnull().sum()

movieId    0
title      0
genres     0
dtype: int64

In [53]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10329 entries, 0 to 10328
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  10329 non-null  int64 
 1   title    10329 non-null  object
 2   genres   10329 non-null  object
dtypes: int64(1), object(2)
memory usage: 242.2+ KB


In [54]:
df2.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,16,4.0,1217897793
1,1,24,1.5,1217895807
2,1,32,4.0,1217896246
3,1,47,4.0,1217896556
4,1,50,4.0,1217896523


In [55]:
df2.shape

(105339, 4)

In [56]:
df2.isnull().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [57]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105339 entries, 0 to 105338
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     105339 non-null  int64  
 1   movieId    105339 non-null  int64  
 2   rating     105339 non-null  float64
 3   timestamp  105339 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.2 MB


In [58]:
# Headline counts for the ratings table.
n_ratings = df2.shape[0]
n_movies = len(pd.unique(df2['movieId']))
n_users = len(pd.unique(df2['userId']))

# Separator line printed before the first statistic and after every one.
separator = "###################################################################################"
summary_lines = (
    f"Number of ratings: {n_ratings}",
    f"Number of unique movieId's: {n_movies}",
    f"Number of unique users: {n_users}",
    f"Average ratings per user: {round(n_ratings/n_users, 2)}",
    f"Average ratings per movie:{round(n_ratings/n_movies)}",
)

print(separator)
for summary_line in summary_lines:
    print(summary_line)
    print(separator)

###################################################################################
Number of ratings: 105339
###################################################################################
Number of unique movieId's: 10325
###################################################################################
Number of unique users: 668
###################################################################################
Average ratings per user: 157.69
###################################################################################
Average ratings per movie:10
###################################################################################


# User Rating Frequency

In [59]:
# One row per user with the number of ratings that user submitted.
user_freq = (
    df2.groupby("userId")["movieId"]
    .count()
    .reset_index(name="n_ratings")
)
user_freq.head()

Unnamed: 0,userId,n_ratings
0,1,113
1,2,29
2,3,73
3,4,124
4,5,68


# Movie Rating Analysis

In [60]:
# Mean rating of every movie, indexed by movieId (reused by the highest-rated lookup).
mean_rating = df2[["movieId", "rating"]].groupby("movieId").mean()

# idxmin returns the index label (here a movieId) of the first minimum only,
# so a single movie is selected even when several share the lowest mean.
lowest_rated = mean_rating["rating"].idxmin()
df1[df1["movieId"] == lowest_rated]

Unnamed: 0,movieId,title,genres
353,396,Fall Time (1995),Drama


# Highest rated movies

In [61]:

# idxmax returns the index label (here a movieId) of the first maximum only,
# so a single movie is selected even when several share the highest mean.
highest_rated = mean_rating["rating"].idxmax()
df1[df1["movieId"].eq(highest_rated)]

Unnamed: 0,movieId,title,genres
110,124,"Star Maker, The (Uomo delle stelle, L') (1995)",Drama


# Show the individual ratings behind the highest- and lowest-rated movies

In [62]:
# Every row of df2 whose movieId equals highest_rated.
highest_rated_movie=df2[df2["movieId"]==highest_rated]
highest_rated_movie

Unnamed: 0,userId,movieId,rating,timestamp
90313,601,124,5.0,938941812


In [63]:
# Every row of df2 whose movieId equals lowest_rated.
low_rated_movie=df2[df2['movieId']==lowest_rated]
low_rated_movie

Unnamed: 0,userId,movieId,rating,timestamp
99833,668,396,0.5,1153426883


In [64]:
# Number of ratings and mean rating for each movieId. Aggregating the
# [["rating"]] selection with a list of functions yields two-level columns
# (rating/count and rating/mean).
movie_stats=df2.groupby("movieId")[["rating"]].agg(["count","mean"])
movie_stats

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,count,mean
movieId,Unnamed: 1_level_2,Unnamed: 2_level_2
1,232,3.907328
2,92,3.353261
3,58,3.189655
4,11,2.818182
5,62,3.250000
...,...,...
146684,1,4.000000
146878,1,2.500000
148238,1,3.000000
148626,3,4.333333


In [65]:
# Replace the two-level column labels with flat names (modifies movie_stats in place).
movie_stats.columns = ["rating_count", "rating_mean"]
movie_stats

Unnamed: 0_level_0,rating_count,rating_mean
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,232,3.907328
2,92,3.353261
3,58,3.189655
4,11,2.818182
5,62,3.250000
...,...,...
146684,1,4.000000
146878,1,2.500000
148238,1,3.000000
148626,3,4.333333


# Now we create the user-item matrix as a SciPy CSR sparse matrix (rows are movies, columns are users)

# There are four dictionaries produced:
* user_mapper: Maps distinct user IDs to indices (user ID 1 becomes index 0 for example).
* movie_mapper: Converts distinct movie IDs into indices (movie ID 1 becomes index 0 for example).
* user_inv_mapper: Reverses user_mapper and maps indices back to user IDs.
* movie_inv_mapper: Reverses movie_mapper by mapping indices to movie IDs.

In [66]:

from scipy.sparse import csr_matrix

def create_matrix(df):
    """Build a sparse movie-by-user rating matrix and its ID/index lookup tables.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``userId``, ``movieId`` and ``rating``.

    Returns
    -------
    X : scipy.sparse.csr_matrix
        Matrix of shape ``(n_movies, n_users)``; entry ``[m, u]`` holds the
        rating given by user index ``u`` to movie index ``m``. Repeated
        (movie, user) pairs are summed by the sparse constructor.
    user_mapper : dict
        userId -> column index.
    movie_mapper : dict
        movieId -> row index.
    user_inv_mapper : dict
        column index -> userId.
    movie_inv_mapper : dict
        row index -> movieId.

    Indices follow the sorted order of the distinct IDs.
    """
    # np.unique returns the sorted distinct IDs; return_inverse also returns,
    # for every row of df, the position of its ID in that sorted array, which
    # is exactly the matrix index. This replaces repeated np.unique calls and
    # a per-row dictionary lookup with one pass per column.
    user_ids, user_index = np.unique(df["userId"].to_numpy(), return_inverse=True)
    movie_ids, movie_index = np.unique(df["movieId"].to_numpy(), return_inverse=True)

    N = len(user_ids)
    M = len(movie_ids)

    # Map IDs to indices
    user_mapper = dict(zip(user_ids, range(N)))
    movie_mapper = dict(zip(movie_ids, range(M)))

    # Map indices to IDs
    user_inv_mapper = dict(zip(range(N), user_ids))
    movie_inv_mapper = dict(zip(range(M), movie_ids))

    X = csr_matrix(
        (df["rating"].to_numpy(), (movie_index, user_index)),
        shape=(M, N),
    )

    return X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper
    
X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper = create_matrix(df2)

# Movie Similarity Analysis

In [67]:

from sklearn.neighbors import NearestNeighbors
def find_similar_movies(movie_id, X, k, metric='cosine', show_distance=False):
    """Return the ``k`` movies whose rating vectors are nearest to ``movie_id``.

    The function reads the notebook-level ``movie_mapper`` and
    ``movie_inv_mapper`` dictionaries to translate between movie IDs and row
    indices of ``X``.

    Parameters
    ----------
    movie_id
        ID of the query movie; must be a key of ``movie_mapper``.
    X
        Matrix with one row per movie, indexed consistently with
        ``movie_mapper``.
    k : int
        Number of neighbours to return. Fewer are returned when ``X`` has
        fewer than ``k + 1`` rows.
    metric : str
        Distance metric passed to ``NearestNeighbors``.
    show_distance : bool
        When False (default) return a list of movie IDs. When True return a
        list of ``(movie_id, distance)`` tuples.

    Returns
    -------
    list
        Neighbours ordered nearest-first, never containing ``movie_id`` itself.

    Raises
    ------
    KeyError
        If ``movie_id`` is not present in ``movie_mapper``.
    """
    movie_ind = movie_mapper[movie_id]
    movie_vec = X[movie_ind].reshape(1, -1)

    # The query movie is part of the fitted data, so ask for one extra
    # neighbour; cap at the number of rows because scikit-learn rejects
    # n_neighbors larger than the number of fitted samples.
    n_neighbors = min(k + 1, X.shape[0])
    kNN = NearestNeighbors(n_neighbors=n_neighbors, algorithm="brute", metric=metric)
    kNN.fit(X)

    # Always fetch distances so both return modes share one code path.
    distances, indices = kNN.kneighbors(movie_vec, return_distance=True)

    # Drop the query movie by index rather than by position: when other rows
    # tie with it at the same distance it is not guaranteed to come first.
    neighbours = [
        (movie_inv_mapper[ind], dist)
        for ind, dist in zip(indices[0].tolist(), distances[0].tolist())
        if ind != movie_ind
    ][:k]

    if show_distance:
        return neighbours
    return [neighbour_id for neighbour_id, _ in neighbours]


# Lookup table from movieId to title.
movie_titles = df1.set_index('movieId')['title'].to_dict()

# Demonstration: list the ten nearest neighbours of one movie.
movie_id = 3
movie_title = movie_titles[movie_id]
similar_ids = find_similar_movies(movie_id, X, k=10)

print(f"Since you watched {movie_title}")
for similar_id in similar_ids:
    print(movie_titles[similar_id])

Since you watched Grumpier Old Men (1995)
Father of the Bride Part II (1995)
Nutty Professor, The (1996)
Twister (1996)
Birdcage, The (1996)
Island of Dr. Moreau, The (1996)
Eraser (1996)
River Wild, The (1994)
Multiplicity (1996)
Phenomenon (1996)
Mr. Holland's Opus (1995)


In [68]:
# `movies` is a second name bound to the same DataFrame object as df1 (no copy is made).
movies=df1
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [69]:
def recommend_movies_for_user(user_id, X, user_mapper, movie_mapper, movie_inv_mapper, k=10):
    """Print ``k`` movies similar to the movie that ``user_id`` rated highest.

    Reads the notebook-level ``df2`` (ratings) and ``movies`` (titles)
    DataFrames and delegates the neighbour search to ``find_similar_movies``.

    Parameters
    ----------
    user_id
        ID looked up in ``df2['userId']``.
    X
        Matrix forwarded to ``find_similar_movies``.
    user_mapper, movie_mapper, movie_inv_mapper
        Accepted so existing calls keep working; not used inside this
        function, and not forwarded to ``find_similar_movies``.
    k : int
        Number of similar movies to print.

    Returns
    -------
    None
        Results are printed. A message is printed instead when the user has
        no ratings or the chosen movie has no title.

    Notes
    -----
    When several movies share the user's top rating, the first one in
    ``df2`` order is used. Printed movies are not filtered against what the
    user has already rated.
    """
    # Named user_ratings (not df1) so the notebook's movie table is not shadowed.
    user_ratings = df2[df2['userId'] == user_id]

    if user_ratings.empty:
        print(f"User with ID {user_id} does not exist.")
        return

    # Position of the first row holding this user's maximum rating.
    top_pos = user_ratings['rating'].argmax()
    movie_id = user_ratings['movieId'].iloc[top_pos]

    movie_titles = dict(zip(movies['movieId'], movies['title']))

    # Check membership directly instead of comparing against a sentinel
    # title, and do it before the neighbour search so no work is wasted.
    if movie_id not in movie_titles:
        print(f"Movie with ID {movie_id} not found.")
        return

    similar_ids = find_similar_movies(movie_id, X, k)

    print(f"Since you watched {movie_titles[movie_id]}, you might also like:")
    for similar_id in similar_ids:
        print(movie_titles.get(similar_id, "Movie not found"))

In [70]:
user_id = 120  # Replace with the desired user ID
recommend_movies_for_user(user_id, X, user_mapper, movie_mapper, movie_inv_mapper, k=10)

Since you watched Misérables, Les (1995), you might also like:
Endurance: Shackleton's Legendary Antarctic Expedition, The (2000)
Bread and Tulips (Pane e tulipani) (2000)
School of Flesh, The (École de la chair, L') (1998)
Puppet Master 4 (1993)
Minus Man, The (1999)
Two Women (Ciociara, La) (1960)
Nightmare City (a.k.a. City of the Walking Dead) (a.k.a. Invasión de los zombies atómicos, La) (Incubo sulla città contaminata) (1980)
Puppet Master II (1991)
Puppet Master III: Toulon's Revenge (1991)
Beyond the Poseidon Adventure (1979)
