In [1]:
# ======================================
# CONTENT-BASED RECOMMENDER SYSTEM LAB
# Dataset: MovieLens Movies (genres metadata)
# ======================================
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# ------------------------------------------
# 1. Load Dataset (Movies metadata)
# ------------------------------------------
print("Downloading MovieLens movies metadata...")
url = "https://files.grouplens.org/datasets/movielens/ml-100k/u.item"
# u.item format: movie_id | title | release date | video release | IMDb URL | genres (binary flags)
# We'll extract movie_id, title, and genres for simplicity
columns = ["movie_id", "title", "release_date", "video_release", "IMDb_URL",
 "unknown","Action","Adventure","Animation","Children","Comedy","Crime",
 "Documentary","Drama","Fantasy","Film-Noir","Horror","Musical","Mystery",
 "Romance","Sci-Fi","Thriller","War","Western"]
movies = pd.read_csv(url, sep="|", names=columns, encoding="latin-1")
movies = movies.iloc[:500] # Use first 500 movies for speed
# Create a genre string for each movie
genre_cols = columns[5:]
movies["genres"] = movies[genre_cols].apply(lambda x: " ".join([g for g, v in zip(genre_cols, x) if v == 1]), axis=1)
print("\nSample movies data:")
print(movies[["movie_id","title","genres"]].head())

Downloading MovieLens movies metadata...

Sample movies data:
   movie_id              title                     genres
0         1   Toy Story (1995)  Animation Children Comedy
1         2   GoldenEye (1995)  Action Adventure Thriller
2         3  Four Rooms (1995)                   Thriller
3         4  Get Shorty (1995)        Action Comedy Drama
4         5     Copycat (1995)       Crime Drama Thriller


In [3]:
# ------------------------------------------
# 2. Create TF-IDF Matrix based on Genres
# ------------------------------------------
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(movies["genres"])
print("\nTF-IDF Matrix Shape:", tfidf_matrix.shape)


TF-IDF Matrix Shape: (500, 21)


In [5]:
# ------------------------------------------
# 3. Compute Cosine Similarity
# ------------------------------------------
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
# ------------------------------------------
# 4. Build Recommendation Function
# ------------------------------------------
indices = pd.Series(movies.index, index=movies["title"]).drop_duplicates()
def recommend_movies(title, top_n=5):
    if title not in indices:
        return f"Movie '{title}' not found in the dataset."

    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:top_n+1]
    movie_indices = [i[0] for i in sim_scores]

    return movies.iloc[movie_indices][["title","genres"]]

In [6]:
# ------------------------------------------
# 5. Test the Recommender
# ------------------------------------------
print("\nRecommendations for 'Toy Story (1995)':")
print(recommend_movies("Toy Story (1995)"))
print("\nRecommendations for 'Star Wars (1977)':")
print(recommend_movies("Star Wars (1977)"))


Recommendations for 'Toy Story (1995)':
                                      title                             genres
421  Aladdin and the King of Thieves (1996)          Animation Children Comedy
101                  Aristocats, The (1970)                 Animation Children
403                        Pinocchio (1940)                 Animation Children
94                           Aladdin (1992)  Animation Children Comedy Musical
168              Wrong Trousers, The (1993)                   Animation Comedy

Recommendations for 'Star Wars (1977)':
                               title  \
180        Return of the Jedi (1983)   
171  Empire Strikes Back, The (1980)   
270         Starship Troopers (1997)   
120    Independence Day (ID4) (1996)   
497        African Queen, The (1951)   

                                        genres  
180        Action Adventure Romance Sci-Fi War  
171  Action Adventure Drama Romance Sci-Fi War  
270                Action Adventure Sci-Fi War  
120    