In [6]:
import os
import pandas as pd 


# Read the movie catalogue and the user ratings from the Kaggle input directory.
movie_csv_path = '/kaggle/input/movielens-20m-dataset/movie.csv'
rating_csv_path = '/kaggle/input/movielens-20m-dataset/rating.csv'

movie = pd.read_csv(movie_csv_path)
rating = pd.read_csv(rating_csv_path)

In [11]:
# Left-join the ratings onto the movie catalogue by movieId, so every row of
# `movie` is kept even when no rating matches it.
df = pd.merge(movie, rating, on="movieId", how="left")

In [12]:
# Quick look at the merged frame. info() prints its summary directly; head() only
# returns a frame, and a notebook renders just the last expression of a cell,
# so only the info() summary shows up in the output.
df.head()
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 20000797 entries, 0 to 20000796
Data columns (total 6 columns):
 #   Column     Dtype  
---  ------     -----  
 0   movieId    int64  
 1   title      object 
 2   genres     object 
 3   userId     float64
 4   rating     float64
 5   timestamp  object 
dtypes: float64(2), int64(1), object(3)
memory usage: 1.0+ GB


In [13]:
# One row per title (titles form the index) with the number of rows of df
# carrying that title in a single column.
comment_counts = df["title"].value_counts().to_frame()


In [14]:
# Titles with at most 1000 rows in df are treated as rare.
# The count column is addressed by position because its label depends on the
# pandas version: value_counts() names it after the source column ("title")
# before pandas 2.0 and "count" from 2.0 onwards, where comment_counts["title"]
# raises KeyError.
rare_movies = comment_counts[comment_counts.iloc[:, 0] <= 1000].index

In [15]:
# Keep only the frequently rated movies (every title not listed in rare_movies)
common_movies = df[~df["title"].isin(rare_movies)]

# User x title matrix of ratings: one row per userId, one column per title.
# pivot_table aggregates with its default (mean) if a user/title pair repeats.
user_movie_df = common_movies.pivot_table(index=["userId"], columns=["title"], values="rating")

# Pick the user to build recommendations for; the fixed random_state makes the
# draw repeatable. .iloc[0] extracts the single sampled id as a scalar before
# int(): calling int() on a one-element array is deprecated since NumPy 1.25.
random_user = int(pd.Series(user_movie_df.index).sample(1, random_state=45).iloc[0])
print(random_user)


28941


In [16]:
# Determining the movies rated by the selected user.

# Single-row slice of the user-movie matrix holding the selected user's ratings
random_user_df = user_movie_df.loc[user_movie_df.index == random_user]

# Titles whose column holds a non-missing rating in that slice
movies_watched_random_user = random_user_df.loc[:, random_user_df.notna().any(axis=0)].columns.tolist()

# Number of movies rated by the selected user
len(movies_watched_random_user)

33

In [17]:
# Detecting the ids of other users who rated the same movies

# Restrict the user-movie matrix to the titles rated by the selected user
movies_watched_df = user_movie_df[movies_watched_random_user]

# For every user, count how many of those titles they have rated as well.
# Counting along axis=1 yields the same per-user totals as transposing first,
# without building a transposed copy of the frame.
user_movie_count = movies_watched_df.notna().sum(axis=1).reset_index()
user_movie_count.columns = ["userId", "movie_count"]

# Keep users who rated more than 60 percent of the selected user's movies
perc = len(movies_watched_random_user) * 60 / 100
users_same_movies = user_movie_count.loc[user_movie_count["movie_count"] > perc, "userId"]

# Displayed by the cell: the row labels of user_movie_count for the selected
# users (positions after reset_index, not the userId values themselves).
users_same_movies.index

Int64Index([    90,    129,    155,    157,    159,    183,    293,    294,
               297,    308,
            ...
            137884, 137948, 137975, 138018, 138161, 138207, 138278, 138381,
            138414, 138482],
           dtype='int64', length=4139)

In [18]:
# Determining the users who are most similar to the selected user

# Ratings of the overlapping users, limited to the selected user's movies
final_df = movies_watched_df[movies_watched_df.index.isin(users_same_movies)]

# Pairwise correlation between users, reshaped to one row per (user, user) pair
corr_df = final_df.T.corr().unstack().sort_values()
corr_df = pd.DataFrame(corr_df, columns=["corr"])
corr_df.index.names = ['user_id_1', 'user_id_2']
corr_df = corr_df.reset_index()

# Users whose correlation with the selected user is at least 0.65, strongest
# first. A pair of the selected user with itself also passes this filter.
# rename() returns a new frame, so the result is built in one chain instead of
# being mutated with inplace=True.
top_users = (
    corr_df.loc[
        (corr_df["user_id_1"] == random_user) & (corr_df["corr"] >= 0.65),
        ["user_id_2", "corr"],
    ]
    .reset_index(drop=True)
    .sort_values(by='corr', ascending=False)
    .rename(columns={"user_id_2": "userId"})
)

In [19]:
# Weighted average recommendation score

# Attach every rating given by the similar users. The join key is spelled out
# rather than left to merge's default of "all shared column names".
top_users_ratings = top_users.merge(rating[["userId", "movieId", "rating"]], on="userId", how='inner')

# Drop the selected user's own ratings and weight each remaining rating by the
# rater's correlation. assign() builds a new frame from the filtered rows, which
# avoids the SettingWithCopyWarning raised when a column is added to a slice.
top_users_ratings = top_users_ratings[top_users_ratings["userId"] != random_user].assign(
    weighted_rating=lambda frame: frame["corr"] * frame["rating"]
)

# Recommendation_df: mean of the correlation-weighted ratings per movie
recommendation_df = top_users_ratings.groupby('movieId').agg({"weighted_rating": "mean"})
recommendation_df = recommendation_df.reset_index()

# User_based recommendation: movies scoring above 3.5, best first, top five with titles
movies_to_be_recommend = recommendation_df[recommendation_df["weighted_rating"] > 3.5].sort_values(
    "weighted_rating", ascending=False
)
movies_to_be_recommend = movies_to_be_recommend.merge(movie[["movieId", "title"]], on="movieId").head(5)
print(movies_to_be_recommend)

   movieId  weighted_rating                           title
0       53         3.952023                 Lamerica (1994)
1     1922         3.763580                 Whatever (1998)
2     2057         3.763580  Incredible Journey, The (1963)
3     2485         3.763580           She's All That (1999)
4     3118         3.763580              Tumbleweeds (1999)


In [20]:
# Selection of the user's most recent highest rated movie
user = 108170

# Among this user's 5.0 ratings, take the movieId of the row with the largest
# timestamp. NOTE(review): the earlier df.info() output lists timestamp as
# object dtype, so this ordering presumably compares text; confirm the format
# sorts chronologically.
movie_id = (
    rating[(rating["userId"] == user) & (rating["rating"] == 5.0)]
    .sort_values(by="timestamp", ascending=False)["movieId"]
    .head(1)
    .values[0]
)
print(movie_id)

7044


In [23]:
# Item_based recommendation

def check_id(dataframe, id):
    """Print the title stored for a movie id.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with "movieId" and "title" columns.
    id : int
        Value looked up in the "movieId" column. The name shadows the builtin
        ``id``; it is kept so existing keyword calls keep working.

    Returns
    -------
    None
        The title is printed as a one-element list, e.g. ``['Hackers (1995)']``.

    Raises
    ------
    IndexError
        If no row of ``dataframe`` has the requested movieId.
    """
    matches = dataframe.loc[dataframe["movieId"] == id, ["title"]]
    if matches.empty:
        # Same exception type the bare .values[0] lookup would raise, but with a
        # message that names the missing id.
        raise IndexError(f"movieId {id!r} not found in dataframe")
    # Only the first matching row is reported.
    movie_name = matches.values[0].tolist()
    print(movie_name)

def item_based_recommender(movie_name, user_movie_df):
    """Return the titles whose rating columns correlate most with ``movie_name``.

    Parameters
    ----------
    movie_name : str
        Column label of the query movie in ``user_movie_df``.
    user_movie_df : pandas.DataFrame
        Matrix of ratings with one column per title.

    Returns
    -------
    pandas.Index
        Up to five column labels, ordered by descending correlation with the
        query movie's column. The query title itself is never included.

    Raises
    ------
    KeyError
        If ``movie_name`` is not a column of ``user_movie_df``.
    """
    target_ratings = user_movie_df[movie_name]
    correlations = user_movie_df.corrwith(target_ratings)
    # Remove the query title by label instead of skipping position 0 after the
    # sort: another title tied at correlation 1.0 (or a NaN self-correlation)
    # can push the query title out of first place, in which case a positional
    # skip discards a real match and may return the query title itself.
    correlations = correlations.drop(labels=[movie_name], errors="ignore")
    return correlations.sort_values(ascending=False).head(5).index

# Print the title stored under movieId 170, then list the titles whose rating
# columns correlate most strongly with 'Hackers (1995)'.
check_id(dataframe=movie, id=170)
item_based_recommender(movie_name='Hackers (1995)', user_movie_df=user_movie_df)

['Hackers (1995)']


Index(['Bloodsport 2 (a.k.a. Bloodsport II: The Next Kumite) (1996)',
       'Bad Girls (1994)', 'Next (2007)', 'Resident Evil: Apocalypse (2004)',
       'X-Men Origins: Wolverine (2009)'],
      dtype='object', name='title')