In [2]:
import pandas as pd

ratings=pd.read_csv(r'E:\Python\Movie Recomendation System\ml-25m\ml-25m\ratings.csv')
movies=pd.read_csv(r'E:\Python\Movie Recomendation System\ml-25m\ml-25m\movies.csv')

In [3]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [4]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


<h2> Data Cleaning </h2>

In [5]:
#drop timestamp
ratings=ratings.drop('timestamp', axis=1)
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,296,5.0
1,1,306,3.5
2,1,307,5.0
3,1,665,5.0
4,1,899,3.5


In [6]:
import re
#clean the movie title
def title_cleaner(title):
  title = re.sub("[^a-zA-Z0-9 ]", "", title)
  return title

#Split genres into separate columns (one-hot encoding)
movies['genres'] = movies['genres'].str.split('|')
movies_encoded = movies.join(movies['genres'].str.get_dummies())



In [7]:
movies['title']=movies['title'].apply(title_cleaner)
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story 1995,"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji 1995,"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men 1995,"[Comedy, Romance]"
3,4,Waiting to Exhale 1995,"[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II 1995,[Comedy]


In [8]:
#creating feature for   one hot encoding

from sklearn.preprocessing import MultiLabelBinarizer

mlb= MultiLabelBinarizer()

genre_matrix= pd.DataFrame(mlb.fit_transform(movies['genres']),columns=mlb.classes_,index=movies.index)

In [9]:
#Encoding results
genre_matrix

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62418,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
62419,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
62420,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0
62421,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [10]:

def create_user_profile(user_id):
    user_ratings = ratings[ratings['userId'] == user_id]
    
    # Get movies rated by the user and merge with genre data
    liked_movies = user_ratings[['movieId', 'rating']].merge(movies[['movieId', 'title']], on='movieId')
    # Get the genres for each liked movie
    user_genre_preference = genre_matrix.loc[liked_movies['movieId']]

    # Weight each genre by the user's rating (ratings are between 1-5)
    weighted_genre_preference = user_genre_preference.mul(liked_movies['rating'], axis=0)
    
    # Create the user profile by averaging the weighted genre preferences
    user_profile = weighted_genre_preference.mean()

    return user_profile


In [11]:
#check genre preferences of user 
user=1
user_profile=create_user_profile(user)
user_profile

(no genres listed)   NaN
Action               NaN
Adventure            NaN
Animation            NaN
Children             NaN
Comedy               NaN
Crime                NaN
Documentary          NaN
Drama                NaN
Fantasy              NaN
Film-Noir            NaN
Horror               NaN
IMAX                 NaN
Musical              NaN
Mystery              NaN
Romance              NaN
Sci-Fi               NaN
Thriller             NaN
War                  NaN
Western              NaN
dtype: float64

<h2> Recommending based on user profile</h2>

In [None]:
user=10
user_profile=create_user_profile(user)



Recommended movies:


Unnamed: 0,title,genres
13905,Aelita The Queen of Mars Aelita 1924,"[Action, Adventure, Drama, Fantasy, Romance, S..."
2338,Mighty Joe Young 1998,"[Action, Adventure, Drama, Fantasy, Thriller]"
10451,King Kong 2005,"[Action, Adventure, Drama, Fantasy, Thriller]"
23306,Dragonheart 2 A New Beginning 2000,"[Action, Adventure, Comedy, Drama, Fantasy, Th..."
56543,Gamera Guardian of the Universe 1995,"[Action, Adventure, Drama, Fantasy, Sci-Fi, Th..."


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def recommend_with_profile(user_profile, top_n=5):
    # Calculate cosine similarity between the user profile and the genre matrix
    similarity = cosine_similarity([user_profile], genre_matrix).flatten()
    
    # Get the top N movie indices with the highest similarity scores
    indices = np.argpartition(similarity, -top_n)[-top_n:]
    
    # Return the recommended movies
    recommended_movies = movies.iloc[indices].copy()
    recommended_movies['similarity'] = similarity[indices]
    return recommended_movies.sort_values(by='similarity', ascending=False)

recommend_with_profile(user_profile)

Unnamed: 0,movieId,title,genres,similarity
2338,2429,Mighty Joe Young 1998,"[Action, Adventure, Drama, Fantasy, Thriller]",0.866883
10451,41569,King Kong 2005,"[Action, Adventure, Drama, Fantasy, Thriller]",0.866883
32955,142086,Fallen The Beginning 2006,"[Action, Adventure, Children, Drama, Fantasy]",0.849192
4694,4800,King Solomons Mines 1937,"[Action, Adventure, Drama, Romance, Thriller]",0.8315
12128,57854,The Count of Monte Cristo 1934,"[Action, Adventure, Drama, Romance, Thriller]",0.8315


<h2>collaborative recommendation</h2>

In [15]:
def collaborative_filtering_by_user(user_id, ratings, movies, threshold=0.10, top_n=10):
    # Get movies rated highly by the target user
    target_user_rated_movies = ratings[(ratings["userId"] == user_id) & (ratings["rating"] > 4) ]["movieId"]

    if target_user_rated_movies.empty:
        return "No highly-rated movies found for this user."

    # Find users who have rated the same movies highly
    similar_users = ratings[(ratings["movieId"].isin(target_user_rated_movies)) & (ratings["rating"] > 4)]["userId"].unique()

    # Get movies rated highly by these similar users
    similar_user_recs = ratings[ (ratings["userId"].isin(similar_users)) & (ratings["rating"] > 4)]["movieId"]

    # Count how many similar users liked each movie and normalize
    similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

    # Remove movies already rated by the target user
    similar_user_recs = similar_user_recs[~similar_user_recs.index.isin(target_user_rated_movies)]

    # Filter out movies liked by fewer than the threshold of similar users
    similar_user_recs = similar_user_recs[similar_user_recs > threshold]

    # Calculate popularity scores among all users
    all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index)) & (ratings["rating"] > 4) ]
    all_user_recs = all_users["movieId"].value_counts() / len( all_users["userId"].unique())

    # SCombine the scores
    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)

    # Merge with movie titles and genres for the top recommendations
    recommendations = rec_percentages.head(top_n).merge( movies, left_index=True, right_on="movieId")[["movieId","score", "title", "genres"]]

    return recommendations


In [None]:

# Recommend movies for a specific user
user_id = 2001 # Replace with the desired userId
recommendations = collaborative_filtering_by_user( user_id=user_id, ratings=ratings, movies=movies, threshold=0.10, top_n=10)

recommendations

Unnamed: 0,movieId,score,title,genres
1258,1291,1.361518,Indiana Jones and the Last Crusade 1989,"[Action, Adventure]"
1013,1036,1.333575,Die Hard 1988,"[Action, Crime, Thriller]"
1207,1240,1.32768,Terminator The 1984,"[Action, Sci-Fi, Thriller]"
1168,1198,1.323631,Raiders of the Lost Ark Indiana Jones and the ...,"[Action, Adventure]"
1170,1200,1.314095,Aliens 1986,"[Action, Adventure, Horror, Sci-Fi]"
1237,1270,1.29486,Back to the Future 1985,"[Adventure, Comedy, Sci-Fi]"
3054,3147,1.277862,Green Mile The 1999,"[Crime, Drama]"
1939,2028,1.276457,Saving Private Ryan 1998,"[Action, Drama, War]"
3479,3578,1.267726,Gladiator 2000,"[Action, Adventure, Drama]"
1232,1265,1.266588,Groundhog Day 1993,"[Comedy, Fantasy, Romance]"
